In [43]:
# Imports
from itertools import product

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [44]:
# Read the two CSV exports: the base table and the table that also carries the menu columns
data_no_menu, data_menu = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/all+coupons.csv', '../data/all_with_onehot_new.csv')
)

In [45]:
def create_dataset(data_menu, data_no_menu, include_menu=True, one_hot=False):
    """Build the modelling table, optionally augmented with menu information.

    The "menu columns" are every column of ``data_menu`` that does not also
    appear in ``data_no_menu``.

    Parameters
    ----------
    data_menu : pandas.DataFrame
        Table containing the base columns plus the menu columns.
    data_no_menu : pandas.DataFrame
        Table containing only the base columns. It is combined with the menu
        information by index alignment (``pd.concat(..., axis=1)``).
    include_menu : bool, default True
        When False, a copy of ``data_no_menu`` is returned unchanged.
    one_hot : bool, default False
        When True, the menu columns are appended as they are. When False, each
        distinct combination of menu-column values is mapped to an integer
        (numbered from 0 in order of first appearance) and a single ``'menu'``
        column holding that integer is appended.

    Returns
    -------
    pandas.DataFrame
        A new frame; the inputs are neither modified nor returned by reference.
    """
    if not include_menu:
        # Return a copy so the caller cannot mutate the input through the result.
        return data_no_menu.copy()

    base_columns = data_no_menu.columns
    menu_columns = [col for col in data_menu.columns if col not in base_columns]
    menu = data_menu[menu_columns]

    if one_hot:
        return pd.concat([data_no_menu, menu], axis=1)

    if menu_columns:
        # Key each row by the tuple of its stringified cells. Keeping the cells
        # separate (instead of gluing them into one string) prevents distinct
        # rows such as ("1", "11") and ("11", "1") from sharing a code.
        combination_ids = {}
        codes = [
            combination_ids.setdefault(tuple(map(str, row)), len(combination_ids))
            for row in menu.itertuples(index=False, name=None)
        ]
    else:
        # No menu columns at all: every row has the same (empty) combination.
        codes = [0] * len(menu)

    # Reuse the menu frame's index so the codes line up with the base rows even
    # when the frames do not carry a default 0..n-1 index.
    menu_codes = pd.Series(codes, index=menu.index, name='menu')
    return pd.concat([data_no_menu, menu_codes], axis=1)

In [46]:
# Build the table with the menu encoded as a single integer 'menu' column
# (one_hot=False maps each distinct combination of menu columns to one code).
data = create_dataset(data_menu, data_no_menu, include_menu=True, one_hot=False)

In [47]:
# Preview the first rows of the assembled table
data.head()

Unnamed: 0,Date,Weekday,MealType,Paytm+Cash,Coupons,SemType,Holiday,CouponsMand,15_coupon_count,20_coupon_count,25_coupon_count,30_coupon_count,menu
0,2022-10-01,5.0,Lunch,333.0,75.0,Acad,0.0,1.0,28.0,83.0,4.0,0.0,0
1,2022-10-01,5.0,EveningSnacks,17.0,36.0,Acad,0.0,1.0,28.0,83.0,4.0,0.0,1
2,2022-10-01,5.0,Dinner,47.0,58.0,Acad,0.0,1.0,28.0,83.0,4.0,0.0,2
3,2022-10-02,6.0,BreakFast,69.0,71.0,Acad,1.0,1.0,61.0,93.0,5.0,0.0,3
4,2022-10-02,6.0,Lunch,106.0,86.0,Acad,1.0,1.0,61.0,93.0,5.0,0.0,4


In [48]:
# Preprocessing the dataset
def preprocess(data, to_onehot=False):
    """Clean the raw table and split it into features and the three targets.

    Steps: drop rows containing any missing value, parse ``'Date'`` as a
    datetime, derive ``'Day'``, ``'Month'`` and ``'Year'`` from it, encode the
    categorical columns, and separate the targets from the features.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain at least the columns ``'Date'``, ``'MealType'``,
        ``'SemType'``, ``'Paytm+Cash'`` and ``'Coupons'`` (plus ``'Weekday'``
        when ``to_onehot`` is True). The frame is not modified.
    to_onehot : bool, default False
        True: one-hot encode ``Weekday``, ``Month``, ``Year``, ``MealType`` and
        ``SemType`` with ``pd.get_dummies``.
        False: replace ``MealType`` and ``SemType`` by their integer category
        codes and leave the other columns numeric.

    Returns
    -------
    X : pandas.DataFrame
        All columns except the two targets. ``'Date'`` is still present so the
        caller can split chronologically.
        NOTE(review): every other input column is passed through unchanged
        (e.g. an exported row-number column, if the CSV has one) - confirm
        that such columns are meant to be model features.
    y_paytm : pandas.Series
        The ``'Paytm+Cash'`` column.
    y_coupons : pandas.Series
        The ``'Coupons'`` column.
    y_total : pandas.Series
        Element-wise sum of ``'Paytm+Cash'`` and ``'Coupons'``.
    """
    # Work on an explicit copy: assigning columns onto the bare result of
    # dropna() can raise SettingWithCopyWarning when rows were removed, and the
    # copy guarantees the caller's frame is never touched.
    data = data.dropna().copy()

    # Parse the date once and derive the calendar features from it
    data['Date'] = pd.to_datetime(data['Date'])
    date_parts = data['Date'].dt
    data['Day'] = date_parts.day
    data['Month'] = date_parts.month
    data['Year'] = date_parts.year

    # Encoding the categorical data
    if to_onehot:
        categorical_features = ['Weekday', 'Month', 'Year', 'MealType', 'SemType']
        data = pd.get_dummies(data, columns=categorical_features)
    else:
        # Label encoding: codes follow the sorted order of the category values
        for feature in ['MealType', 'SemType']:
            data[feature] = data[feature].astype('category').cat.codes

    # Splitting into X and y
    X = data.drop(columns=['Paytm+Cash', 'Coupons'])
    y_paytm = data['Paytm+Cash']
    y_coupons = data['Coupons']
    y_total = y_paytm + y_coupons

    return X, y_paytm, y_coupons, y_total

In [49]:
# Build both encodings of the base table, then split each chronologically into train and test (with vacations)
X_no_onehot, y_paytm_no_onehot, y_coupons_no_onehot, y_total_no_onehot = preprocess(data_no_menu, to_onehot=False)
X_onehot, y_paytm_onehot, y_coupons_onehot, y_total_onehot = preprocess(data_no_menu, to_onehot=True)

# Rows dated on or before the cutoff go to training, later rows to testing
cutoff_date = '2023-08-31'

# Label-encoded variant: compute the two row masks once and reuse them
train_rows_no_onehot = X_no_onehot['Date'] <= cutoff_date
test_rows_no_onehot = X_no_onehot['Date'] > cutoff_date

y_paytm_train_no_onehot = y_paytm_no_onehot[train_rows_no_onehot]
y_paytm_test_no_onehot = y_paytm_no_onehot[test_rows_no_onehot]
y_coupons_train_no_onehot = y_coupons_no_onehot[train_rows_no_onehot]
y_coupons_test_no_onehot = y_coupons_no_onehot[test_rows_no_onehot]
y_total_train_no_onehot = y_total_no_onehot[train_rows_no_onehot]
y_total_test_no_onehot = y_total_no_onehot[test_rows_no_onehot]

# One-hot variant
train_rows_onehot = X_onehot['Date'] <= cutoff_date
test_rows_onehot = X_onehot['Date'] > cutoff_date

y_paytm_train_onehot = y_paytm_onehot[train_rows_onehot]
y_paytm_test_onehot = y_paytm_onehot[test_rows_onehot]
y_coupons_train_onehot = y_coupons_onehot[train_rows_onehot]
y_coupons_test_onehot = y_coupons_onehot[test_rows_onehot]
y_total_train_onehot = y_total_onehot[train_rows_onehot]
y_total_test_onehot = y_total_onehot[test_rows_onehot]

# Feature matrices: select the rows, then remove the date, which was only needed for the split
X_train_no_onehot = X_no_onehot[train_rows_no_onehot].drop(columns=['Date'])
X_test_no_onehot = X_no_onehot[test_rows_no_onehot].drop(columns=['Date'])
X_train_onehot = X_onehot[train_rows_onehot].drop(columns=['Date'])
X_test_onehot = X_onehot[test_rows_onehot].drop(columns=['Date'])

In [50]:
# Hyper-parameter grid explored by the exhaustive search (one list of candidates per setting)
parameters = dict(
    n_estimators=[10, 25, 50, 100, 500],
    max_depth=[2, 4, 6, 8, 10, 15, 20, None],
    min_samples_split=[2, 4, 6, 10],
    min_samples_leaf=[1, 2, 4, 8],
    max_features=['sqrt', 'log2', 25, None],
)

In [51]:
def grid_search(X_train, y_train, X_test, y_test, parameters, n_jobs=-1):
    """Exhaustively tune a RandomForestRegressor and score the winner on the test set.

    The last 20% of the training rows (in the order given) are held out as a
    validation set. Every combination in ``parameters`` is fitted on the first
    80% and ranked by validation RMSE. The best combination is then refitted on
    the full training set and evaluated on the test set. All scores are printed.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and target. The validation split is positional, so
        the rows should already be in the desired (e.g. chronological) order.
    X_test, y_test : array-like
        Held-out data, used only for the final report.
    parameters : dict
        Must provide a list of candidate values under each of the keys
        ``'n_estimators'``, ``'max_depth'``, ``'min_samples_split'``,
        ``'min_samples_leaf'`` and ``'max_features'``.
    n_jobs : int or None, default -1
        Forwarded to ``RandomForestRegressor``.

    Returns
    -------
    tuple
        ``(n_estimators, max_depth, min_samples_split, min_samples_leaf,
        max_features)`` of the combination with the lowest validation RMSE.
        Ties go to the combination evaluated first.

    Raises
    ------
    ValueError
        If the grid is empty (some hyper-parameter has no candidate values).
    """
    param_names = ('n_estimators', 'max_depth', 'min_samples_split',
                   'min_samples_leaf', 'max_features')

    def take_rows(obj, rows):
        # Slice strictly by position, whatever index labels a pandas object carries
        return obj.iloc[rows] if hasattr(obj, 'iloc') else obj[rows]

    def build_model(combo):
        # Fixed seed so that every combination is compared on equal footing
        return RandomForestRegressor(**dict(zip(param_names, combo)),
                                     random_state=42,
                                     n_jobs=n_jobs)

    def rmse_and_r2(model, X, y):
        predictions = model.predict(X)
        return np.sqrt(mean_squared_error(y, predictions)), r2_score(y, predictions)

    # Hold out the last 20% of the training rows for validation
    split_at = int(len(X_train) * 0.8)
    X_fit = take_rows(X_train, slice(None, split_at))
    y_fit = take_rows(y_train, slice(None, split_at))
    X_val = take_rows(X_train, slice(split_at, None))
    y_val = take_rows(y_train, slice(split_at, None))

    n_features = np.shape(X_train)[1]

    # Keep only the best result; a list/dict of all results keyed by score
    # would let combinations with identical scores overwrite one another.
    best_scores, best_combo = None, None
    for combo in product(*(parameters[name] for name in param_names)):
        max_features = combo[-1]
        if (isinstance(max_features, (int, np.integer))
                and not isinstance(max_features, bool)
                and max_features > n_features):
            # An integer larger than the number of columns cannot select more
            # than all of them, and some scikit-learn releases reject it
            # outright, so cap it at the feature count.
            combo = combo[:-1] + (n_features,)

        model = build_model(combo).fit(X_fit, y_fit)
        val_rmse, val_r2 = rmse_and_r2(model, X_val, y_val)
        fit_rmse, fit_r2 = rmse_and_r2(model, X_fit, y_fit)

        if best_scores is None or val_rmse < best_scores[0]:
            best_scores = (val_rmse, val_r2, fit_rmse, fit_r2)
            best_combo = combo

    if best_combo is None:
        raise ValueError('parameters must list at least one value for every hyper-parameter')

    # Printing the best parameters with scores
    print('Best parameters:')
    print('Val RMSE:', best_scores[0])
    print('Val R2:', best_scores[1])
    print('Train RMSE:', best_scores[2])
    print('Train R2:', best_scores[3])
    print('Parameters:', best_combo)

    # Refit the winner on the full training set and evaluate on the test set
    final_model = build_model(best_combo).fit(X_train, y_train)

    rmse_train, r2_train = rmse_and_r2(final_model, X_train, y_train)
    print('Train (full) RMSE:', rmse_train)
    print('Train (full) R2:', r2_train)

    rmse_test, r2_test = rmse_and_r2(final_model, X_test, y_test)
    print('Test RMSE:', rmse_test)
    print('Test R2:', r2_test)

    return best_combo

In [52]:
# Tune the random forest for the 'Paytm+Cash' target on the one-hot encoded features.
# The grid holds 5 * 8 * 4 * 4 * 4 = 2560 combinations, each fitted once, so this cell is slow.
best_parameters_paytm_onehot = grid_search(X_train_onehot, y_paytm_train_onehot, X_test_onehot, y_paytm_test_onehot, parameters)

(245, 34)
(979, 34)


KeyboardInterrupt: 

In [None]:
# Tune the random forest for the 'Paytm+Cash' target on the label-encoded (non one-hot) features
best_parameters_paytm_no_onehot = grid_search(X_train_no_onehot, y_paytm_train_no_onehot, X_test_no_onehot, y_paytm_test_no_onehot, parameters)

In [None]:
# Tune the random forest for the 'Coupons' target on the one-hot encoded features
best_parameters_coupons_onehot = grid_search(X_train_onehot, y_coupons_train_onehot, X_test_onehot, y_coupons_test_onehot, parameters)

In [None]:
# Tune the random forest for the 'Coupons' target on the label-encoded (non one-hot) features
best_parameters_coupons_no_onehot = grid_search(X_train_no_onehot, y_coupons_train_no_onehot, X_test_no_onehot, y_coupons_test_no_onehot, parameters)

In [None]:
# Tune the random forest for the total ('Paytm+Cash' + 'Coupons') target on the one-hot encoded features
best_parameters_total_onehot = grid_search(X_train_onehot, y_total_train_onehot, X_test_onehot, y_total_test_onehot, parameters)

In [None]:
# Tune the random forest for the total ('Paytm+Cash' + 'Coupons') target on the label-encoded (non one-hot) features
best_parameters_total_no_onehot = grid_search(X_train_no_onehot, y_total_train_no_onehot, X_test_no_onehot, y_total_test_no_onehot, parameters)