In [0]:
# Mount Google Drive at /gdrive so files under "My Drive" can be read and written.
from google.colab import drive

drive.mount(mountpoint='/gdrive')

In [0]:
# Both competition CSVs live in the same Drive folder.
_competition_dir = "/gdrive/My Drive/machathon1-main-competition"
path = _competition_dir + "/train.csv"   # training rows (include the 'Ratings' label)
path2 = _competition_dir + "/test.csv"   # rows to predict for the submission

In [0]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import numpy as np
from scipy import stats
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

# Loading, cleaning, and preprocessing data

In [0]:
# Read the training and test CSVs into DataFrames.
data_train, data_test = (pd.read_csv(csv_path) for csv_path in (path, path2))

In [0]:
# Separate the label from the training features and set the test ids aside.
# DataFrame.pop returns the column and removes it from the frame in place.
target = data_train.pop('Ratings')
sup_id = data_test.pop('Id')

In [0]:
# Number of training rows; used later to slice the combined frame back apart.
data_train_len = data_train.shape[0]

In [0]:
# Stack the test rows under the training rows so both receive identical
# preprocessing; rows [0, data_train_len) remain the training rows.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
data_train = pd.concat([data_train, data_test], ignore_index=True)

In [0]:
# Missing restaurant_features values become the literal category "None".
data_train = data_train.assign(
    restaurant_features=data_train["restaurant_features"].fillna("None")
)

In [0]:
# Preview the first five rows of the combined frame.
data_train.head(n=5)

In [0]:
# Summary statistics of the combined frame (pandas' default column selection).
summary_stats = data_train.describe()
summary_stats

In [0]:
# Remove the restaurant and user identifier columns; they are not used as features.
data_train = data_train.drop(columns=['restaurant_uuid', 'user_uuid'])


In [0]:
# Missing-value count per column, smallest first (sort_values is ascending by default).
data_train.isna().sum().sort_values()

In [0]:
# Look at the first five rows again after the identifier columns were removed.
data_train.iloc[:5]

In [0]:
#one hot encoding data
oneHE1 = pd.get_dummies(data_train['area'],prefix='', drop_first=True)
oneHE2 = pd.get_dummies(data_train['cuisine'],prefix='', drop_first=True)
oneHE3 = pd.get_dummies(data_train['restaurant_features'],prefix='', drop_first=True)


In [0]:
# Attach the one-hot columns to the right of the existing columns.
encoded_frames = [oneHE1, oneHE2, oneHE3]
data_train = pd.concat([data_train, *encoded_frames], axis="columns", sort=False)

In [0]:
# The raw categorical columns are no longer needed once their dummies exist.
data_train = data_train.drop(columns=['area', 'cuisine', 'restaurant_features'])


# Model selection

In [0]:
# Candidate classifiers to compare, stored as [display name, unfitted estimator] pairs.
model_names = [
    "Nearest Neighbors",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
]
model_estimators = [
    KNeighborsClassifier(n_neighbors=3),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(n_estimators=10, max_depth=5, max_features=1),
    MLPClassifier(max_iter=1000, alpha=1),
    AdaBoostClassifier(),
]
models = [[label, estimator] for label, estimator in zip(model_names, model_estimators)]

In [0]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Split the labelled rows (the first data_train_len rows of the combined frame)
# into a training part and a held-out validation part.
# test_size is the fraction that is HELD OUT: the previous value of 0.8 left
# only 20% of the labelled data for fitting, so it is set to 0.2 here.
train_X, test_X, train_y, test_y = train_test_split(
    data_train.iloc[:data_train_len],
    target,
    test_size=0.2,
    random_state=0,  # fixed seed so the split is reproducible
)

In [0]:
model_data1 = []  # one dict of scores per model, in the order of `models`
models_w1 = []    # the fitted estimators, same order

# Fit every candidate model and record its weighted F1 on the training and
# held-out splits.
for name, curr_model in models:
    print(name)

    # Seed only estimators that actually have a random_state parameter;
    # assigning the attribute blindly put a meaningless field on KNN.
    if "random_state" in curr_model.get_params():
        curr_model.set_params(random_state=54)

    curr_model.fit(train_X, train_y)
    models_w1.append(curr_model)
    print('model ok')

    curr_model_data1 = {
        "Name": name,
        "train_f1_W": f1_score(train_y, curr_model.predict(train_X), average='weighted'),
        "test_f1_W": f1_score(test_y, curr_model.predict(test_X), average='weighted'),
    }
    print(curr_model_data1["train_f1_W"])
    # The second print used to repeat the train score; show the held-out score.
    print(curr_model_data1["test_f1_W"])
    model_data1.append(curr_model_data1)

scores = pd.DataFrame(model_data1)

In [0]:
# One row per model: its name plus train/test weighted F1.
scores = pd.DataFrame(data=model_data1)

In [0]:
# Display the per-model score table.
scores

# Grid search 

In [0]:
# Tune a k-nearest-neighbours classifier over neighbourhood size, vote
# weighting and distance metric using 3-fold cross-validation.
classifer = KNeighborsClassifier()

grid_parms = {
    'n_neighbors': [3, 5, 11, 19],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan'],
}

# Select hyper-parameters by weighted F1, the same metric the model comparison
# reports, instead of GridSearchCV's default (accuracy for classifiers).
gs = GridSearchCV(
    classifer,
    grid_parms,
    scoring='f1_weighted',
    verbose=1,
    cv=3,
    n_jobs=-1,  # use all available cores
)
gs_results = gs.fit(train_X, train_y)

In [0]:
# Best parameter combination found by the search (fit() returns the search
# object itself, so `gs` and `gs_results` are the same object).
gs.best_params_

In [0]:
# Rows after the first data_train_len rows of the combined frame are the test rows.
test_features = data_train.iloc[data_train_len:]
predections = gs_results.best_estimator_.predict(test_features)

# Convert the results to CSV for upload to Kaggle

In [0]:
# Number of predictions produced for the test rows.
lenn = predections.shape[0]

In [0]:
# Show the predictions as a (lenn, 1) column; the result is only displayed,
# `predections` itself is left unchanged.
predections.reshape((lenn, 1))

In [0]:
# Re-read the test file to recover its 'Id' column, attach the predictions,
# and write the two-column submission file.
dataid = pd.read_csv(path2)
dataid = dataid.assign(Ratings=predections.tolist())
ready = dataid.loc[:, ['Id', 'Ratings']].copy()
ready.to_csv('/gdrive/My Drive/machathon1-main-competition/Submission.csv', index=False)