# Regression

In [11]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import math

In [12]:
# importing the newest data 
data=pd.read_csv('../small_business/data/restaurants.csv')
data=data.drop(columns='Unnamed: 0')

In [13]:
#Defining X and y and splitting between test and train 

X = data.drop(columns=['rating','name', 'address', 'label', 'postal_code', 'no_del_exp', 'municipality', 'review_count'])
y = data['rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=2)

In [14]:
data

Unnamed: 0,name,type,rating,review_count,price,address,label,latitude,longitude,dine_in,takeaway,delivery,drive_through,no_del_exp,curb_pickup,postal_code,municipality,neighborhood
0,Augusto Lisboa,brunch,4.8,1032.0,2,"Rua Santa M.nha 26, 1100-491 Lisboa, Portugal","['dine-in', 'takeaway', 'no delivery']",38.714376,-9.130176,1,1,0,0,1,0,1100-491,Lisboa,Graça
1,Tiffin Cafe & Restaurant Lisboa,cafe,4.9,139.0,2,"R. do Conde 32, 1200-637 Lisboa, Portugal","['dine-in', 'takeaway', 'delivery']",38.706143,-9.161868,1,1,1,0,0,0,1200-637,Lisboa,Prazeres
2,Crisfama,mediterranean,4.8,649.0,1,"Rua da Cruz de Santa Apolónia 56, 1100-188 Lis...","['dine-in', 'takeaway', 'no delivery']",38.717502,-9.120669,1,1,0,0,1,0,1100-188,Lisboa,Santa Engrácia
3,Alma,european,4.8,953.0,4,"R. Anchieta 15, 1200-224 Lisboa, Portugal","['dine-in', 'no takeaway', 'no delivery']",38.710140,-9.141088,1,1,0,0,1,0,1200-224,Lisboa,São Paulo
4,Restaurant CHULHO & Bar,chicken,4.9,180.0,1,"R. de São João da Mata 82, 1200-850 Lisboa, Po...","['dine-in', 'takeaway', 'delivery']",38.707959,-9.158521,1,1,1,0,0,0,1200-850,Lisboa,Lapa
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
647,CASA DOS AMIGOS,mexican,4.2,125.0,1,"R. dos Remédios 140, 1100-451 Lisboa","['dine-in', 'takeaway', 'no delivery']",38.712775,-9.126086,1,1,0,0,1,0,1100-451,Lisboa,Santo Estevão
648,El Paso Cantina Mexicana y Cocktail Bar,mexican,4.0,8.0,1,"R. das Portas de Santo Antão 125, 1150-312 Lisboa","['dine-in', 'takeaway']",38.716843,-9.141172,1,1,0,0,0,0,1150-312,Lisboa,Santa Justa
649,La fugitiva,mexican,4.3,22.0,1,"Rua de S. Paulo 186, 1200-058 Lisboa","['dine-in', 'takeaway']",38.708392,-9.145909,1,1,0,0,0,0,1200-058,Lisboa,São Paulo
650,Tico y Taco,mexican,4.8,4.0,1,"R. da Atalaia 4, 1200-050 Lisboa","['dine-in', 'takeaway']",38.710863,-9.144949,1,1,0,0,0,0,1200-050,Lisboa,Encarnação


In [15]:
#Creating a Pipeline with one One encoder + filling Na in prices 

price_transformer = SimpleImputer(strategy="most_frequent")
cat_transformer = OneHotEncoder(handle_unknown='ignore')

preproc_basic = make_column_transformer((price_transformer, ['price']),
                                       (cat_transformer, ['neighborhood', 'type']), remainder='passthrough')

pipe = make_pipeline(preproc_basic, KNeighborsRegressor(n_neighbors= 50))
pipe

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent'),
                                                  ['price']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['neighborhood', 'type'])])),
                ('kneighborsregressor', KNeighborsRegressor(n_neighbors=50))])

In [26]:
pipe.fit(X_train,y_train)
pipe.score(X_test,y_test)

0.06618303882377363

In [27]:
#X_train_t = pipe.fit_transform(X_train)

In [24]:
# Make a Grid Search to find the best params of the regression model 
models= {'KNN':{'model':KNeighborsRegressor(),
               'params':{'n_neighbors': [5, 10, 20, 50, 100]}},
        'SVR':{'model':SVR(),
                        'params':{'kernel':['rbf', 'poly']}},
         'DecTree':{'model':DecisionTreeRegressor(), 
                'params':{'max_depth':[1, 2,3]}}
        }

best = {}      
for key, value in models.items():
    grid_search = GridSearchCV(value['model'], param_grid= value['params'],
                               cv=5, scoring="r2", n_jobs = -1)
    grid_search.fit(X_train_t, y_train)
    best[key] = {'params':grid_search.best_params_,
                   'score':grid_search.best_score_}

NameError: name 'X_train_t' is not defined

In [19]:
#best

{}

In [28]:
y_pred = pipe.predict(X_test)

In [29]:
def pos(x):
    if x<0: 
        return 0 
    else: 
        return x
positive = np.vectorize(pos)

In [30]:
y_pred = positive(y_pred)

In [31]:
y_pred = y_pred.round(1)

In [32]:
from sklearn.metrics import r2_score
print(f'r2 :{r2_score(y_test, y_pred)}')
print(f'mse :{mean_squared_error(y_test, y_pred)}')
print(f'rmse :{math.sqrt(mean_squared_error(y_test, y_pred))}')

r2 :0.05616896815578143
mse :0.15381679389312977
rmse :0.39219484174722363
