In [1]:
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the restaurants dataset and discard the 'Unnamed: 0' column
# (presumably a row index written out by an earlier to_csv call; verify).
data = pd.read_csv('../small_business/data/restaurants.csv').drop(columns='Unnamed: 0')

In [3]:
# Preview the first two rows of the loaded data.
data.head(n=2)

Unnamed: 0,name,type,rating,review_count,price,address,label,dine_in,takeaway,delivery,drive_through,no_del_exp,curb_pickup,postal_code,municipality,neighborhood
0,Augusto Lisboa,brunch,4.8,1032.0,2,"Rua Santa M.nha 26, 1100-491 Lisboa, Portugal","['dine-in', 'takeaway', 'no delivery']",1,1,0,0,1,0,1100-491,Lisboa,Graça
1,Tiffin Cafe & Restaurant Lisboa,cafe,4.9,139.0,2,"R. do Conde 32, 1200-637 Lisboa, Portugal","['dine-in', 'takeaway', 'delivery']",1,1,1,0,0,0,1200-637,Lisboa,Prazeres


In [4]:
# Build the feature matrix X and the target y, then hold out 20% for testing.

# Columns excluded from the features: the target itself ('rating') plus
# columns that are not used as model inputs.
excluded_columns = [
    'rating', 'name', 'address', 'label',
    'postal_code', 'no_del_exp', 'municipality', 'review_count',
]
y = data['rating']
X = data.drop(columns=excluded_columns)

# A fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=2,
)

In [5]:
# Preview the first three rows of the feature matrix.
X.head(n=3)

Unnamed: 0,type,price,dine_in,takeaway,delivery,drive_through,curb_pickup,neighborhood
0,brunch,2,1,1,0,0,0,Graça
1,cafe,2,1,1,1,0,0,Prazeres
2,mediterranean,1,1,1,0,0,0,Santa Engrácia


In [6]:
# Preprocessing + regression pipeline:
#   * 'neighborhood' and 'type' are one-hot encoded; handle_unknown='ignore'
#     keeps transform from raising on categories not seen during fit.
#   * missing 'price' values are filled with the most frequent value.
#   * all remaining columns are passed through untouched.
cat_transformer = OneHotEncoder(handle_unknown='ignore')
price_transformer = SimpleImputer(strategy="most_frequent")

preproc_basic = make_column_transformer(
    (price_transformer, ['price']),
    (cat_transformer, ['neighborhood', 'type']),
    remainder='passthrough',
)

# make_pipeline names each step after its class, so the regressor's
# hyper-parameters are addressable as 'ridge__<param>'.
pipe = make_pipeline(preproc_basic, Ridge())
pipe

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('simpleimputer',
                                                  SimpleImputer(strategy='most_frequent'),
                                                  ['price']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['neighborhood', 'type'])])),
                ('ridge', Ridge())])

In [7]:
# Fit the baseline pipeline on the training split and report its score on
# the held-out test split (fit returns the pipeline itself, so the calls chain).
pipe.fit(X_train, y_train).score(X_test, y_test)

0.005632208301196262

In [8]:
# Grid search over the Ridge regularisation strength (alpha), using
# 5-fold cross-validation on the training split and R^2 as the metric.
# (Run `pipe.get_params()` in its own cell to list the tunable parameter names.)
#
# NOTE(review): in the recorded run the best alpha was 10, the largest value
# in the grid, so the optimum may lie beyond it; consider widening the grid.

grid_search = GridSearchCV(
    pipe,
    param_grid={'ridge__alpha': [0.1, 0.5, 1, 5, 10]},
    cv=5,
    scoring="r2",
)

grid_search.fit(X_train, y_train)

# Pipeline refitted on the whole training split with the best alpha found.
model = grid_search.best_estimator_

print(grid_search.best_params_, grid_search.best_score_)

{'ridge__alpha': 10} 0.011467726261738776


In [9]:
# Predicting score for the 2 example below: 
pipe.predict(X_test.iloc[0:2])

array([4.57204436, 4.46435614])

In [10]:
# Show the first two rows of the test features.
X_test.head(2)

Unnamed: 0,type,price,dine_in,takeaway,delivery,drive_through,curb_pickup,neighborhood
37,bistro,1,1,1,0,0,0,Anjos
444,bar,1,1,1,1,0,0,Mercês
