# Classification

In [19]:
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

In [29]:
# Load the restaurant table and drop the 'Unnamed: 0' column in one step
# (presumably an index column written out by an earlier export — confirm).
# Chaining keeps the cell idempotent: `data` is bound exactly once.
data = pd.read_csv('../small_business/data/restaurants.csv').drop(columns='Unnamed: 0')

In [59]:
# Columns kept out of the feature matrix: the target ('rating') plus the
# other columns this analysis does not feed to the models.
excluded_columns = [
    'rating', 'name', 'address', 'label',
    'postal_code', 'no_del_exp', 'municipality', 'review_count',
]

y = data['rating']                       # target: the raw rating column
X = data.drop(columns=excluded_columns)  # features: every remaining column

In [60]:
# Discretise the rating into four ordered classes using fixed cut points.
# With pd.cut's defaults the intervals are right-closed:
# (0, 4.2], (4.2, 4.4], (4.4, 4.6], (4.6, 5]; values outside them become NaN.
rating_edges = [0, 4.2, 4.4, 4.6, 5]
rating_labels = ["below_avg", "2_q", "3_q", "4_q"]
y_class = pd.cut(y, bins=rating_edges, labels=rating_labels)

In [61]:
# Hold out 20% of the rows for the final evaluation; the fixed seed makes
# the split repeatable.
X_train, X_test, yc_train, yc_test = train_test_split(
    X,
    y_class,
    test_size=0.2,
    random_state=42,
)

In [67]:
# Preprocessing-only pipeline (no estimator attached yet):
#   * 'price'                -> fill missing values with the most frequent value
#   * 'neighborhood', 'type' -> one-hot encode; categories unseen at fit time
#                               are ignored at transform time
#   * every other column     -> passed through unchanged
cat_transformer = OneHotEncoder(handle_unknown='ignore')
price_transformer = SimpleImputer(strategy="most_frequent")

preproc_basic = make_column_transformer(
    (price_transformer, ['price']),
    (cat_transformer, ['neighborhood', 'type']),
    remainder='passthrough',
)

pipe = make_pipeline(preproc_basic)
pipe

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent'),
                                                  ['price']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['neighborhood', 'type'])]))])

In [73]:
# Fit the preprocessing pipeline on the training split and keep the
# transformed training matrix for the model comparison below.
X_train_t = pipe.fit_transform(X_train)

In [104]:
# Candidate classifiers and the hyper-parameter grid searched for each one.
# random_state is pinned on the estimators that use randomness so that a
# fresh "Restart & Run All" reproduces the same cross-validation scores.
models = {
    'KNN': {
        'model': KNeighborsClassifier(),
        'params': {'n_neighbors': [5, 10, 20, 50, 100]},
    },
    'RandomForest': {
        'model': RandomForestClassifier(random_state=42),
        'params': {'n_estimators': [50, 100]},
    },
    'SVC': {
        'model': LinearSVC(random_state=42),
        'params': {'C': [1, 2, 3]},
    },
}

# NOTE(review): X_train_t was produced by fitting the preprocessing on the
# whole training split, so the imputer/encoder have already seen each fold's
# validation rows. Searching over a pipeline (preprocessing + model) fitted on
# X_train would keep the folds fully independent.

# best[name] -> {'params': winning grid point, 'score': its mean CV accuracy}
best = {}
for key, value in models.items():
    grid_search = GridSearchCV(value['model'], param_grid=value['params'],
                               cv=5, scoring="accuracy", n_jobs=-1)
    grid_search.fit(X_train_t, yc_train)
    best[key] = {'params': grid_search.best_params_,
                 'score': grid_search.best_score_}



In [108]:
# Show, for each candidate model, the best grid point and its cross-validated accuracy.
best

{'KNN': {'params': {'n_neighbors': 50}, 'score': 0.4261172161172161},
 'RandomForest': {'params': {'n_estimators': 100},
  'score': 0.39333333333333337},
 'SVC': {'params': {'C': 1}, 'score': 0.33216117216117214}}

In [69]:
# Debugging aid: uncomment to list every tunable parameter name of `pipe`.
#pipe.get_params()

In [76]:
# Read the winning grid point from the `best` summary instead of the leftover
# `grid_search` variable. After the model-comparison loop, `grid_search` holds
# whichever model was searched last, so re-fitting it here would only report
# the KNN result when cells are run out of order.
best_model_name = max(best, key=lambda name: best[name]['score'])
best[best_model_name]['params']

{'n_neighbors': 50}

In [109]:
# Final model: the same preprocessing as before, followed by a KNN classifier.
price_transformer = SimpleImputer(strategy="most_frequent")
cat_transformer = OneHotEncoder(handle_unknown='ignore')

preproc_basic = make_column_transformer(
    (price_transformer, ['price']),
    (cat_transformer, ['neighborhood', 'type']),
    remainder='passthrough',
)

# Take the KNN hyper-parameters from the grid-search summary rather than
# retyping the number by hand, so this cell cannot drift out of sync with
# the search results.
pipe = make_pipeline(preproc_basic, KNeighborsClassifier(**best['KNN']['params']))
pipe

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent'),
                                                  ['price']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['neighborhood', 'type'])])),
                ('kneighborsclassifier', KNeighborsClassifier(n_neighbors=50))])

In [112]:
# Train the full pipeline on the training split, then report its accuracy on
# the held-out test split (fit returns the pipeline, so the calls chain).
pipe.fit(X_train, yc_train).score(X_test, yc_test)

0.5114503816793893

In [113]:
# Predict the rating class of every held-out test row with the fitted pipeline.
# NOTE(review): a bare assignment does not echo a value, so the array shown
# below presumably came from displaying y_pred separately — confirm.
y_pred = pipe.predict(X_test)

array(['3_q', '3_q', '3_q', '3_q', '3_q', '3_q', 'below_avg', '3_q',
       '3_q', '3_q', '3_q', 'below_avg', '3_q', '3_q', '3_q', 'below_avg',
       '3_q', '3_q', '3_q', 'below_avg', '3_q', 'below_avg', 'below_avg',
       '3_q', '3_q', '3_q', '3_q', '3_q', '3_q', '3_q', '3_q', '3_q',
       '3_q', '3_q', '3_q', '3_q', '3_q', '3_q', '3_q', '3_q', '3_q',
       '3_q', '3_q', '3_q', '3_q', '3_q', '3_q', 'below_avg', '3_q',
       '3_q', '3_q', '3_q', '3_q', '3_q', '3_q', '3_q', '3_q', '3_q',
       '3_q', '3_q', '3_q', '3_q', '3_q', '3_q', 'below_avg', 'below_avg',
       'below_avg', '3_q', '3_q', '4_q', '3_q', '3_q', '3_q', '3_q',
       '3_q', 'below_avg', '3_q', '3_q', '3_q', '3_q', '3_q', '3_q',
       'below_avg', '3_q', '3_q', '3_q', '3_q', '3_q', '3_q', 'below_avg',
       '3_q', '3_q', '3_q', '3_q', '3_q', '3_q', 'below_avg', '3_q',
       '4_q', '3_q', '3_q', '3_q', '3_q', '3_q', '3_q', '3_q', '3_q',
       '3_q', '3_q', '3_q', '3_q', '3_q', '3_q', 'below_avg', '3_q',
       