In [130]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [131]:
amount_data = pd.read_csv('dataset/Project Dataset/amount_data.csv')

In [132]:
amount_data['Date'] = pd.to_datetime(amount_data['Date'], dayfirst = True)
amount_data = amount_data.sort_values(by = 'Date')
amount_data = amount_data.reset_index().drop(columns=['index'])
amount_data

Unnamed: 0,Date,Amount
0,2022-04-01,0
1,2022-04-02,239400
2,2022-04-03,274140
3,2022-04-04,177000
4,2022-04-05,106000
...,...,...
756,2024-04-26,0
757,2024-04-27,582650
758,2024-04-28,367700
759,2024-04-29,451000


In [133]:
amount_data['day_of_year'] = amount_data['Date'].dt.dayofyear
amount_data['month'] = amount_data['Date'].dt.month
amount_data['day_of_week'] = amount_data['Date'].dt.dayofweek
amount_data

Unnamed: 0,Date,Amount,day_of_year,month,day_of_week
0,2022-04-01,0,91,4,4
1,2022-04-02,239400,92,4,5
2,2022-04-03,274140,93,4,6
3,2022-04-04,177000,94,4,0
4,2022-04-05,106000,95,4,1
...,...,...,...,...,...
756,2024-04-26,0,117,4,4
757,2024-04-27,582650,118,4,5
758,2024-04-28,367700,119,4,6
759,2024-04-29,451000,120,4,0


In [134]:
amount_data['lag_1'] = amount_data['Amount'].shift(1)
amount_data['lag_2'] = amount_data['Amount'].shift(2)

# Adding rolling statistics
amount_data['rolling_mean_7'] = amount_data['Amount'].rolling(window=7).mean()
amount_data['rolling_std_7'] = amount_data['Amount'].rolling(window=7).std()
amount_data.dropna(inplace=True)

In [135]:
# X = amount_data[['day_of_year', 'month', 'day_of_week']]
X = amount_data[['day_of_year', 'month', 'day_of_week', 'lag_1', 'lag_2', 'rolling_mean_7', 'rolling_std_7']]
y = amount_data['Amount']

In [136]:
X_initial, X_remaining, y_initial, y_remaining = train_test_split(X, y, train_size = 0.8, shuffle=False)

In [137]:
print(str(len(X_initial)) + " " + str(len(X_remaining)))

604 151


In [138]:
# Standardize features
scaler = StandardScaler()
X_initial = scaler.fit_transform(X_initial)
X_remaining = scaler.fit_transform(X_remaining)

In [139]:
X_initial

array([[-1.05280148e+00, -9.59400497e-01, -8.29016701e-04, ...,
        -3.65056741e-01, -2.38480927e-01, -1.14634904e-01],
       [-1.04217335e+00, -9.59400497e-01,  4.99897071e-01, ...,
        -5.51669367e-01, -1.74114724e-01, -2.66977721e-01],
       [-1.03154521e+00, -9.59400497e-01,  1.00062316e+00, ...,
        -4.74325510e-01, -2.03968785e-01, -3.22538090e-01],
       ...,
       [ 1.45543897e+00,  1.31433566e+00, -5.01555104e-01, ...,
        -5.92151471e-01, -9.61598522e-01, -9.57887570e-01],
       [ 1.46606711e+00,  1.31433566e+00, -8.29016701e-04, ...,
        -8.68614621e-01, -8.60532290e-01, -7.47802579e-01],
       [ 1.47669525e+00,  1.63915511e+00,  4.99897071e-01, ...,
        -6.18481295e-01, -8.60532290e-01, -7.47802579e-01]])

In [140]:
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_absolute_error

# Initialize the SGDRegressor model
model = SGDRegressor(max_iter=1000, tol=1e-3, learning_rate="adaptive")

# Fit the model on the initial data
model.fit(X_initial, y_initial)

# Variables to track incremental learning performance
predictions = []
true_values = []
mse_scores = []




In [141]:
for i in range(len(X_remaining)):
    # Get the next sample
    X_new = X_remaining[i].reshape(1, -1)
    y_new = y_remaining.iloc[i]

    # Update the model with the new data point
    model.partial_fit(X_new, [y_new])

    # Make a prediction
    y_pred = [0]
    if not y_new == 0:
        y_pred = model.predict(X_new)
    predictions.append(y_pred[0])
    true_values.append(y_new)

    # Track performance
    mse = mean_absolute_error([y_new], y_pred)
    mse_scores.append(mse)

    print(f"Sample {i+1}: True value = {y_new}, Predicted value = {y_pred[0]}, MSE = {mse}")

# Print overall MSE after incremental learning
overall_mse = mean_absolute_error(true_values, predictions)
print(f"Overall Mean Squared Error after incremental learning: {overall_mse}")


Sample 1: True value = 145830, Predicted value = 43434.00896118776, MSE = 102395.99103881224
Sample 2: True value = 112800, Predicted value = 83457.47384661972, MSE = 29342.526153380284
Sample 3: True value = 153500, Predicted value = 105363.54292021006, MSE = 48136.457079789936
Sample 4: True value = 111000, Predicted value = 138051.23809460292, MSE = 27051.238094602915
Sample 5: True value = 163100, Predicted value = 152058.69928213334, MSE = 11041.30071786666
Sample 6: True value = 58300, Predicted value = 117128.95175282373, MSE = 58828.951752823734
Sample 7: True value = 0, Predicted value = 0, MSE = 0.0
Sample 8: True value = 132800, Predicted value = 114935.81833744951, MSE = 17864.181662550487
Sample 9: True value = 399350, Predicted value = 193025.24862641853, MSE = 206324.75137358147
Sample 10: True value = 38250, Predicted value = 112091.76573137156, MSE = 73841.76573137156
Sample 11: True value = 75900, Predicted value = 28975.73672140384, MSE = 46924.26327859616
Sample 12: