In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# PREPROCESSING

In [2]:
train = pd.read_excel("train.xlsx")
train.drop_duplicates(inplace = True)
train.reset_index(drop = True, inplace = True)
train["Average_Cost"] = train.Average_Cost.apply(lambda x : ''.join(x.split('₹')[1:]))
train['Average_Cost'] = train['Average_Cost'].str.replace(',', '')
train['Average_Cost'] = pd.to_numeric(train['Average_Cost'])
train["Minimum_Order"] = train.Minimum_Order.apply(lambda x : ''.join(x.split('₹')[1:]))
train['Minimum_Order'] = pd.to_numeric(train['Minimum_Order'])
train['Rating'] = pd.to_numeric(train['Rating'], errors = 'coerce')
train['Votes'] = pd.to_numeric(train['Votes'], errors = 'coerce')
train['Reviews'] = pd.to_numeric(train['Reviews'], errors = 'coerce')
train['Average_Cost'] = train['Average_Cost'].fillna(train['Average_Cost'].mean())
train['Rating'] = train['Rating'].fillna(0)
train['Votes'] = train['Votes'].fillna(0)
train['Reviews'] = train['Reviews'].fillna(0)
train['Minimum_To_Cost_Ratio'] = train['Minimum_Order']/train['Average_Cost']
train['Branches'] = train['Restaurant'].map(train['Restaurant'].value_counts())
train['Restaurants_Count'] = train['Location'].map(train['Location'].value_counts())
train.drop(['Restaurant'], axis = 1, inplace = True)

In [3]:
city = []
for location in train.Location:
    if "Pune" in location:
        city.append("Pune")
    elif "Bangalore" in location:
        city.append("Bangalore")
    elif "Hyderabad" in location:
        city.append("Hyderabad")
    elif "Noida" in location:
        city.append("Noida")
    elif "Kolkata" in location:
        city.append("Kolkata")
    elif "Delhi" in location:
        city.append("Delhi")
    elif "Marathalli" in location:
        city.append("Bangalore")
    elif "Electronic City" in location:
        city.append("Bangalore")
    elif "Mumbai" in location:
        city.append("Mumbai")
    elif "Whitefield" in location:
        city.append("Bangalore")
    elif "Gurgaon" in location:
        city.append("Gurgaon")
    elif "Gurgoan" in location:
        city.append("Gurgaon")
    elif "Majestic" in location:
        city.append("Bangalore")

train['City'] = city

## WINSORIZING OUTLIERS

In [4]:
# import matplotlib.pyplot as plt
# import seaborn as sns
# plt.rcParams['figure.figsize'] = (15,7)
# f,(ax1, ax2, ax3, ax4, ax5) = plt.subplots(1,5)
# sns.boxplot(y = 'Average_Cost', data = train, ax = ax1, palette = 'coolwarm')
# sns.boxplot(y = 'Minimum_Order', data = train, ax = ax2)
# sns.boxplot(y = 'Rating', data = train, ax = ax3, palette = 'coolwarm')
# sns.boxplot(y = 'Votes', data = train, ax = ax4)
# sns.boxplot(y = 'Reviews', data = train, ax = ax5, palette = 'coolwarm')
# f.tight_layout()

In [5]:
# for column in ['Average_Cost', 'Minimum_Order', 'Rating', 'Votes', 'Reviews']:
#     IQR = train[column].quantile(0.75) - train[column].quantile(0.25)
#     Lower_fence = train[column].quantile(0.25) - (IQR * 1.5)
#     Upper_fence = train[column].quantile(0.75) + (IQR * 1.5)
#     print(f'{column} outliers are values < {round(Lower_fence,2)} or > {round(Upper_fence,2)}')

In [6]:
# train['Average_Cost'] = np.where(train['Average_Cost'] > 350.0, 350.0, train['Average_Cost'])
# train['Minimum_Order'] = np.where(train['Minimum_Order'] < 50.0, 50.0, train['Minimum_Order'])
# train['Minimum_Order'] = np.where(train['Minimum_Order'] > 50.0, 50.0, train['Minimum_Order'])
# train['Rating'] = np.where(train['Rating'] < 2.8, 2.8, train['Rating'])
# train['Rating'] = np.where(train['Rating'] > 4.4, 4.4, train['Rating'])
# train['Votes'] = np.where(train['Votes'] > 370.0, 370.0, train['Votes'])
# train['Reviews'] = np.where(train['Reviews'] > 150.0, 150.0, train['Reviews'])

## DUMMIFICATION, MIN-MAX SCALING, LABEL ENCODING

In [7]:
X1 = pd.get_dummies(train['City'], drop_first = True)

# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()

# scaled1 = scaler.fit_transform(train[['Average_Cost', 'Minimum_Order', 'Rating', 'Votes', 'Reviews']])
# scaled_df1 = pd.DataFrame(scaled1, columns = ['Average_Cost', 'Minimum_Order', 'Rating', 'Votes', 'Reviews'])

X1 = pd.concat([train[['Average_Cost', 'Minimum_Order', 'Rating', 'Votes', 'Reviews', 'Branches']], X1], axis = 1)
# X2 = pd.concat([scaled_df1, X1], axis = 1)

# from sklearn.preprocessing import LabelEncoder
# label_encoder = LabelEncoder()
# y = label_encoder.fit_transform(train['Delivery_Time'])

## RANDOM FOREST CLASSIFIER

In [8]:
# X = train[['Average_Cost', 'Minimum_Order', 'Rating', 'Votes', 'Reviews']]
y = train['Delivery_Time']

X_train, X_test, y_train, y_test = train_test_split(X1, y, test_size = 0.3, random_state = 1234, shuffle = True)

from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state = 1234)

model.fit(X_train, y_train)
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)
print("Train Accuracy:", accuracy_score(y_train, y_pred_train))
print("Test Accuracy:", accuracy_score(y_test, y_pred_test))

Train Accuracy: 0.9907276239536381
Test Accuracy: 0.7933313307299489


## RFECV

In [9]:
from sklearn.feature_selection import RFECV

rfecv = RFECV(estimator = model, step = 1, cv = 5, scoring = 'accuracy')
rfecv = rfecv.fit(X_train, y_train)

print("The optimal number of features:", rfecv.n_features_)
print("Best features:", X_train.columns[rfecv.support_])



The optimal number of features: 12
Best features: Index(['Average_Cost', 'Minimum_Order', 'Rating', 'Votes', 'Reviews',
       'Branches', 'Delhi', 'Gurgaon', 'Kolkata', 'Mumbai', 'Noida', 'Pune'],
      dtype='object')


In [10]:
X_train_selected = rfecv.transform(X_train)
X_test_selected = rfecv.transform(X_test)

model = RandomForestClassifier(random_state = 1234)
model.fit(X_train_selected, y_train)
y_pred_train = model.predict(X_train_selected)
y_pred_test = model.predict(X_test_selected)
print("Train Accuracy:", accuracy_score(y_train, y_pred_train))
print("Test Accuracy:", accuracy_score(y_test, y_pred_test))

Train Accuracy: 0.990856406954282
Test Accuracy: 0.7918293781916491


## OPTUNA

In [11]:
import optuna

def objective(trial):
    param_space = {
        'n_estimators': trial.suggest_int('n_estimators', 40, 300, step=20),
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 10, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 3, 10),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
        'class_weight': trial.suggest_categorical('class_weight', ['balanced', None])
    }
    
    model = RandomForestClassifier(random_state = 1234, **param_space)
    model.fit(X_train_selected, y_train)
    y_pred_test = model.predict(X_test_selected)
    return accuracy_score(y_test, y_pred_test)

study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 20, show_progress_bar = True)

[I 2023-06-20 01:42:17,034] A new study created in memory with name: no-name-21399203-0421-4d24-89ca-3fb3dbf2e827


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2023-06-20 01:42:18,252] Trial 0 finished with value: 0.7047161309702613 and parameters: {'n_estimators': 280, 'max_depth': 4, 'min_samples_split': 17, 'min_samples_leaf': 3, 'max_features': 'log2', 'criterion': 'entropy', 'bootstrap': False, 'class_weight': None}. Best is trial 0 with value: 0.7047161309702613.
[I 2023-06-20 01:42:19,583] Trial 1 finished with value: 0.6740762991889456 and parameters: {'n_estimators': 160, 'max_depth': 19, 'min_samples_split': 11, 'min_samples_leaf': 7, 'max_features': 'sqrt', 'criterion': 'entropy', 'bootstrap': True, 'class_weight': 'balanced'}. Best is trial 0 with value: 0.7047161309702613.
[I 2023-06-20 01:42:20,399] Trial 2 finished with value: 0.7266446380294382 and parameters: {'n_estimators': 160, 'max_depth': 7, 'min_samples_split': 16, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'criterion': 'gini', 'bootstrap': True, 'class_weight': None}. Best is trial 2 with value: 0.7266446380294382.
[I 2023-06-20 01:42:21,765] Trial 3 finished wi

In [12]:
print("Number of finished trials: {}".format(len(study.trials)))
print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Number of finished trials: 20
Best trial:
  Value: 0.7671973565635326
  Params: 
    n_estimators: 280
    max_depth: 20
    min_samples_split: 10
    min_samples_leaf: 4
    max_features: log2
    criterion: gini
    bootstrap: False
    class_weight: None


In [94]:
# rfr_params = {'n_estimators': 100,
#               'max_depth': 21,
#               'min_samples_split': 3,
#               'min_samples_leaf': 1,
#               'max_features': 'sqrt',
#               'criterion': 'entropy',
#               'bootstrap': True,
#               'class_weight': None
#              }

rfr_params = {'n_estimators': 70,
              'max_depth': 10,
              'min_samples_split': 10,
              'min_samples_leaf': 7,
              'max_features': 'log2',
              'criterion': 'gini',
              'bootstrap': False,
              'class_weight': None
             }

model = RandomForestClassifier(**rfr_params, random_state = 1234)
model.fit(X_train_selected, y_train)
y_pred_train = model.predict(X_train_selected)
y_pred_test = model.predict(X_test_selected)
print("Train Accuracy:", accuracy_score(y_train, y_pred_train))
print("Test Accuracy:", accuracy_score(y_test, y_pred_test))

Train Accuracy: 0.7886670959433355
Test Accuracy: 0.7413637729047762


# SUBMISSION

## PREPROCESSING TEST DATA

In [95]:
test = pd.read_excel("test.xlsx")
test["Average_Cost"] = test.Average_Cost.apply(lambda x : ''.join(x.split('₹')[1:]))
test['Average_Cost'] = test['Average_Cost'].str.replace(',', '')
test['Average_Cost'] = pd.to_numeric(test['Average_Cost'])
test["Minimum_Order"] = test.Minimum_Order.apply(lambda x : ''.join(x.split('₹')[1:]))
test['Minimum_Order'] = pd.to_numeric(test['Minimum_Order'])
test['Rating'] = pd.to_numeric(test['Rating'], errors = 'coerce')
test['Votes'] = pd.to_numeric(test['Votes'], errors = 'coerce')
test['Reviews'] = pd.to_numeric(test['Reviews'], errors = 'coerce')
test['Average_Cost'] = test['Average_Cost'].fillna(test['Average_Cost'].mean())
test['Rating'] = test['Rating'].fillna(0)
test['Votes'] = test['Votes'].fillna(0)
test['Reviews'] = test['Reviews'].fillna(0)
test['Minimum_To_Cost_Ratio'] = test['Minimum_Order']/test['Average_Cost']
test['Branches'] = test['Restaurant'].map(test['Restaurant'].value_counts())
test['Restaurants_Count'] = test['Location'].map(test['Location'].value_counts())
test.drop(['Restaurant'], axis = 1, inplace = True)

In [96]:
city = []
for location in test.Location:
    if "Pune" in location:
        city.append("Pune")
    elif "Bangalore" in location:
        city.append("Bangalore")
    elif "Hyderabad" in location:
        city.append("Hyderabad")
    elif "Noida" in location:
        city.append("Noida")
    elif "Kolkata" in location:
        city.append("Kolkata")
    elif "Delhi" in location:
        city.append("Delhi")
    elif "Marathalli" in location:
        city.append("Bangalore")
    elif "Electronic City" in location:
        city.append("Bangalore")
    elif "Mumbai" in location:
        city.append("Mumbai")
    elif "Whitefield" in location:
        city.append("Bangalore")
    elif "Gurgaon" in location:
        city.append("Gurgaon")
    elif "Gurgoan" in location:
        city.append("Gurgaon")
    elif "Majestic" in location:
        city.append("Bangalore")

test['City'] = city

## WINSORIZING OUTLIERS

In [97]:
# import matplotlib.pyplot as plt
# import seaborn as sns
# plt.rcParams['figure.figsize'] = (15,7)
# f,(ax1, ax2, ax3, ax4, ax5) = plt.subplots(1,5)
# sns.boxplot(y = 'Average_Cost', data = test, ax = ax1, palette = 'coolwarm')
# sns.boxplot(y = 'Minimum_Order', data = test, ax = ax2)
# sns.boxplot(y = 'Rating', data = test, ax = ax3, palette = 'coolwarm')
# sns.boxplot(y = 'Votes', data = test, ax = ax4)
# sns.boxplot(y = 'Reviews', data = test, ax = ax5, palette = 'coolwarm')
# f.tight_layout()

In [98]:
# for column in ['Average_Cost', 'Minimum_Order', 'Rating', 'Votes', 'Reviews']:
#     IQR = test[column].quantile(0.75) - test[column].quantile(0.25)
#     Lower_fence = test[column].quantile(0.25) - (IQR * 1.5)
#     Upper_fence = test[column].quantile(0.75) + (IQR * 1.5)
#     print(f'{column} outliers are values < {round(Lower_fence,2)} or > {round(Upper_fence,2)}')

In [99]:
# test['Average_Cost'] = np.where(test['Average_Cost'] > 350.0, 350.0, test['Average_Cost'])
# test['Minimum_Order'] = np.where(test['Minimum_Order'] < 50.0, 50.0, test['Minimum_Order'])
# test['Minimum_Order'] = np.where(test['Minimum_Order'] > 50.0, 50.0, test['Minimum_Order'])
# test['Rating'] = np.where(test['Rating'] < 2.4, 2.4, test['Rating'])
# test['Votes'] = np.where(test['Votes'] > 481.12, 481.12, test['Votes'])
# test['Reviews'] = np.where(test['Reviews'] > 212.0, 212.0, test['Reviews'])

## DUMMIFICATION, MIN-MAX SCALING

In [100]:
X2 = pd.get_dummies(test['City'], drop_first = True)

# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()

# scaled2 = scaler.fit_transform(test[['Average_Cost', 'Minimum_Order', 'Rating', 'Votes', 'Reviews']])
# scaled_df2 = pd.DataFrame(scaled2, columns = ['Average_Cost', 'Minimum_Order', 'Rating', 'Votes', 'Reviews'])

X2 = pd.concat([test[['Average_Cost', 'Minimum_Order', 'Rating', 'Votes', 'Reviews', 'Branches']], X2], axis = 1)

## WRITE INTO SUBMISSION FILE

In [101]:
X2 = rfecv.transform(X2)
test['Delivery_Time'] = model.predict(X2)
test['Delivery_Time'].to_excel('submission.xlsx', index = False)