## Load libraries

In [16]:
import pandas as pd
from pytz import timezone
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_squared_error
import pickle
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load the dataset

In [46]:
df = pd.read_excel('07.2022---05.2024.xlsx')
df

Unnamed: 0,date,time,category,amount
0,2022-07-06,05:57:10,Restuarant,5.50
1,2022-07-06,05:57:27,Market,2.00
2,2022-07-06,05:58:12,Coffe,30.10
3,2022-07-06,05:58:25,Market,17.33
4,2022-07-06,05:59:00,Restuarant,5.50
...,...,...,...,...
2774,2024-04-30,05:49:01,Restuarant,8.26
2775,2024-04-30,19:58:00,Restuarant,63.00
2776,2024-05-01,06:29:58,Restuarant,8.26
2777,2024-05-01,12:38:25,Restuarant,14.00


## Create time related columns

In [47]:
df['date'] = pd.to_datetime(df['date'])
df['time'] = pd.to_timedelta(df['time'].astype(str))
df['datetime'] = df['date'] + df['time']
df['datetime'] = pd.to_datetime(df['datetime']).dt.tz_localize('UTC')
current_datetime = datetime.now(timezone('UTC'))
df['time_diff'] = (current_datetime - df['datetime']).dt.days
df['hours'] = df['datetime'].dt.hour
df['weekday'] = df['datetime'].dt.weekday + 1
df['year'] = df['datetime'].dt.year
df['day_of_year'] = df['datetime'].dt.dayofyear
df['month'] = df['datetime'].dt.month
df['time_seconds'] = df['time'].apply(lambda x: x.total_seconds())
df.drop(['date', 'time', 'datetime'], axis=1, inplace=True)
df

Unnamed: 0,category,amount,time_diff,hours,weekday,year,day_of_year,month,time_seconds
0,Restuarant,5.50,671,5,3,2022,187,7,21430.0
1,Market,2.00,671,5,3,2022,187,7,21447.0
2,Coffe,30.10,671,5,3,2022,187,7,21492.0
3,Market,17.33,671,5,3,2022,187,7,21505.0
4,Restuarant,5.50,671,5,3,2022,187,7,21540.0
...,...,...,...,...,...,...,...,...,...
2774,Restuarant,8.26,7,5,2,2024,121,4,20941.0
2775,Restuarant,63.00,6,19,2,2024,121,4,71880.0
2776,Restuarant,8.26,6,6,3,2024,122,5,23398.0
2777,Restuarant,14.00,6,12,3,2024,122,5,45505.0


## Convert categorical variables into numerical format using one-hot encoding

In [48]:
df = pd.get_dummies(df, columns=['category'],drop_first=True)

Unnamed: 0,0
0,amount
1,time_diff
2,hours
3,weekday
4,year
5,day_of_year
6,month
7,time_seconds
8,category_Clothing
9,category_Coffe


## Splitting the data into training and testing sets

In [50]:
X = df.drop('amount', axis=1)  
y = df['amount']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Standardize the features

In [51]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Random Forest Regressor with Tuning

In [52]:
model = RandomForestRegressor(n_estimators=200, random_state=42)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print("Tuned RandomForestRegressor RMSE:", rmse)

Tuned RandomForestRegressor RMSE: 36.35246252412161


## Feature selection with SelectFromModel

In [53]:
selector = SelectFromModel(model, threshold='median')
selector.fit(X_train_scaled, y_train)
X_train_selected = selector.transform(X_train_scaled)
X_test_selected = selector.transform(X_test_scaled)

## Final 1.0.0 with selected features

In [54]:
final_model = RandomForestRegressor(n_estimators=200, random_state=42)
final_model.fit(X_train_selected, y_train)
y_pred_final = final_model.predict(X_test_selected)
rmse_final = mean_squared_error(y_test, y_pred_final, squared=False)
print("Selected Features RMSE:", rmse_final)

Selected Features RMSE: 38.719973787491206


## Save the final 1.0.0 to a .pkl fil

In [55]:
with open('final_model.pkl', 'wb') as file:
    pickle.dump(final_model, file)