# REGRESSION

In [454]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import math
import itertools
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn import model_selection
from sklearn.model_selection import cross_validate
from sklearn.model_selection import learning_curve
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split


In [455]:
def linearReg(X, y):
    model = sm.OLS(endog=y, exog=X).fit() # Finds the best beta
    return model.summary(), model

In [456]:
def LinRegElastic(X, y):
    alphas = [0.01, 0.1, 1]
    l1_ratios = [0.2, 0.5, 0.8]
    hyperparams = itertools.product(alphas, l1_ratios)
    for hyperparam in hyperparams:
        alpha = hyperparam[0]
        l1_ratio = hyperparam[1]
        model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
        r2 = cross_val(model, X, y, cv=5).mean()
        print(f"alpha: {alpha}, l1_ratio: {l1_ratio},   r2: {r2}")

In [457]:
# Lasso, Ridge
def feature_perm(X, y, alpha = 0.2):
    ridge = Ridge(alpha=alpha).fit(X, y)
    lasso = Lasso(alpha=alpha).fit(X, y)
    return cross_val(ridge, X, y, cv=5, scoring=['r2', 'mse']), cross_val(lasso, X, y, cv=5, scoring=['r2', 'mse'])

# SGD Regression
def SGDReg(X, y, learning_rate, loss='mse', penalty='l2', alpha=0.0001, l1_ratio=0.15,
    max_iter = 1000, random_state= 42, *args, **kwargs):

    sgd_reg = SGDRegressor(loss=loss, penalty=penalty, alpha=alpha, l1_ratio=l1_ratio,
    max_iter = max_iter, random_state= random_state,
    learning_rate=learning_rate).fit(X, y)
    sgd_model_cv = cross_val(sgd_reg,
                              X,
                              y,
                              cv = 5,
                              scoring = ['r2','mse'] )
    return sgd_model_cv, sgd_reg

# KNN Regressor
def best_k( X, y):
    score = []
    neighbours = []

    for k in range(1,25):

        # Instanciate the model
        knn_model = KNeighborsRegressor(n_neighbors = k)

        # Train the model on the scaled Training data
        cv_results = cross_val(knn_model, X ,y)

        # Append the score and k
        score.append(cv_results['test_score'].mean())
        neighbours.append(k)
    best = dict(zip(neighbours, score))
    best_k  = min(best, key=best.get)
    return best_k

def KNNReg(X, y, best_k, *args, **kwargs):
    knn_reg = KNeighborsRegressor(n_neighbors=best_k, n_jobs = -1).fit(X, y)
    knn_reg_cv = cross_val(knn_reg,
                              X,
                              y,
                              cv = 5,
                              scoring = 'mse')
    return knn_reg_cv, knn_reg

#SVM
def SVMReg(X, y):
    svm_reg = SVR(epsilon=0.1, C=1, kernel='linear').fit(X, y)
    svm_reg_cv = cross_val(svm_reg,
                              X,
                              y,
                              cv = 5,
                              scoring = 'mse')
    return svm_reg_cv, svm_reg

# cross_val
def cross_val(model, X, y, cv=5, scoring=['r2'], *args, **kwargs):
    cv_results =  cross_validate(model, X, y, cv=cv, scoring=scoring,  n_jobs =-1)
    return pd.DataFrame(cv_results)

# Grid Search
def GridSearch(model, grid, X, y):
    search = GridSearchCV(model, grid,
                           scoring = 'r2',
                           cv = 5,
                           max_iter = 1000,
                           n_jobs =-1
                          )
    search.fit(X, y)
    return search.best_score_ , search.best_params_ , search.best_estimator_

# Learning Curves
def learning_curves(model, X, y):
    train_sizes, train_scores, test_scores = learning_curve(estimator = model,
                                                                X = X,
                                                                y = y,
                                                                train_sizes = [50,100,250,500,750,1000,1250],
                                                                cv = 10,
                                                                scoring = 'r2')
    train_scores_mean = np.mean(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)

    plt.plot(train_sizes, train_scores_mean, label = 'Training score')
    plt.plot(train_sizes, test_scores_mean, label = 'Test score')
    plt.ylabel('r2 score', fontsize = 14)
    plt.xlabel('Training set size', fontsize = 14)
    plt.title('Learning curves', fontsize = 18, y = 1.03)
    plt.legend()
    return plt.show()

def metrics(y, y_pred):
    mse = mean_squared_error(y, y_pred)
    rmse = math.sqrt(mse)
    mae = mean_absolute_error(y, y_pred)
    rsquared = r2_score(y, y_pred)
    print('MSE =', round(mse, 2))
    print('RMSE =', round(rmse, 2))
    print('MAE =', round(mae, 2))
    print('R2 =', round(rsquared, 2))

# Decision Tree

# Random Forest
#Boosters



In [458]:
def get_prediction(model, input):
    return model.predict(input)


In [459]:
data = pd.read_csv('../raw_data/Clean_data_1_12_v2.csv')

In [460]:
data.type.value_counts()

restaurant           282
cafe                  51
portuguese            27
fast food             22
indian                16
                    ... 
restaurante halal      1
pizaria                1
mexican                1
american               1
turkish                1
Name: type, Length: 86, dtype: int64

In [461]:
def deletespace(x):
    return x.strip()

def replace_type(x):
    result=x
    for k, v in dico.items():
        if x in v:
            result=k
    return result

In [462]:
dico={}
dico['restaurant']=['restaurant', 'restaurante', 'family']
dico['cafe']=['cafe', 'tea room', 'bubble tea store', 'torrefatores de café','coffee shop', 'coffee roasters', 'art cafe', 'café', 'espresso bar', 'coffee store']
dico['italian']=['italian']
dico['pizza']= ['pizza takeaway', 'pizza', 'pizaria']
dico['fast_food']=['fast food', 'hamburger', 'comida rápida', 'sandwich shop',  'kebab shop', 'hot dog stand', 'hot dog ', 'fried chicken takeaway', 'chicken']
dico['tapas']=['tapas', 'tapas bar']
dico['bar']=['pub', 'bar', 'cocktail bar', 'gastropub']
dico['brunch']=['brunch',  'breakfast', 'restaurante de brunch']
dico['show']=['fado', 'dinner theatre']
dico['bakery_pastry']=['pastry shop','dessert shop', 'bakery' ]
dico['grill']=['grill', 'barbecue','steak house' ]
dico['veggie_healthy']= ['health food', 'vegan', 'vegetarian']
dico['japanese']=['sushi', 'japanese']
dico['american']=['american', 'diner']
dico['african']=[ 'african', 'moroccan']
dico['asian']=['asian', 'pan-asian', 'vietnamese',  'thai',  'chinese']
dico['european']=['european', 'modern european']
dico['indian']=['indian muslim', 'indian', 'bangladeshi', 'restaurante nepalês', 'nepalese ']
dico['mexican']=['restaurante mexicano','mexican']
dico['south_am']=['peruvian', 'argentinian']
dico['middle_eastern']=['middle eastern', 'turkish','georgian', 'halal',  'restaurante halal' ]
dico['bistro']=['bistro']
dico['traditional']=['traditional']
dico['international_rest']=['belgian', 'austrian', 'australian', 'french']
dico['mediterranean']=['mediterranean', 'andalusian', 'basque']
dico['fine-dining']=['fine-dining']
dico['seafood']=['seafood']
dico['portuguese']=['portuguese']

In [463]:
data['type_gen']=data.type.map(deletespace)
data['type_gen']=data.type.map(replace_type)

In [464]:
data.columns

Index(['Unnamed: 0', 'name', 'type', 'description', 'rating', 'review_count',
       'price', 'address', 'label', 'dine_in', 'takeaway', 'delivery',
       'drive_through', 'no_del_exp', 'curb_pickup', 'postal_code',
       'municipality', 'neighborhood', 'type_gen'],
      dtype='object')

In [465]:
X = data[['review_count', 'price', 'dine_in', 'takeaway', 'delivery','drive_through', 'type_gen','no_del_exp', 'curb_pickup', 'neighborhood']]

In [466]:
y = data['rating']

In [467]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [468]:
X

Unnamed: 0,review_count,price,dine_in,takeaway,delivery,drive_through,type_gen,no_del_exp,curb_pickup,neighborhood
0,1032.0,2.0,1,1,0,0,brunch,1,0,Graça
1,139.0,2.0,1,1,1,0,brunch,0,0,Prazeres
2,649.0,1.0,1,1,0,0,restaurant,1,0,Santa Engrácia
3,953.0,4.0,1,1,0,0,fine dining,1,0,São Paulo
4,180.0,,1,1,1,0,restaurant,0,0,Lapa
...,...,...,...,...,...,...,...,...,...,...
580,116.0,1.0,1,1,0,0,restaurant,1,0,São Nicolau
581,2.0,,1,1,1,0,restaurant,0,0,Campolide
582,394.0,1.0,1,1,1,0,turkish,0,0,Alto do Pina
583,24.0,,1,0,1,1,restaurant,0,0,Santa Catarina


## Pipe

In [469]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="most_frequent") 

X_train['price']= imputer.fit_transform(X_train[['price']]); 


In [470]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler


In [471]:
X_train

Unnamed: 0,review_count,price,dine_in,takeaway,delivery,drive_through,type_gen,no_del_exp,curb_pickup,neighborhood
77,15.0,2.0,1,0,0,0,restaurant,0,1,Alto do Pina
436,492.0,2.0,1,1,1,0,restaurant,0,0,Lapa
211,267.0,2.0,1,1,0,0,vegetarian,0,0,São Paulo
192,595.0,1.0,1,1,1,0,indian,0,0,Anjos
450,428.0,2.0,1,0,0,0,cafe,1,0,São Cristóvão
...,...,...,...,...,...,...,...,...,...,...
71,1204.0,2.0,1,1,1,0,vegan,0,0,Mártires
106,972.0,2.0,1,1,1,0,italian,0,0,Madalena
270,264.0,1.0,1,1,0,0,restaurant,0,0,São Cristóvão
435,203.0,1.0,0,0,0,0,restaurant,0,0,Santo Estevão


In [472]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

ohe = OneHotEncoder(handle_unknown='ignore', sparse = False) 
train_X_encoded = pd.DataFrame(ohe.fit_transform(X_train[['price', 'neighborhood', 'type_gen']]))
train_X_encoded.columns = ohe.get_feature_names_out()


In [473]:
train_X_encoded

Unnamed: 0,price_1.0,price_2.0,price_3.0,price_4.0,neighborhood_Alcântara,neighborhood_Alto do Pina,neighborhood_Alvalade,neighborhood_Anjos,neighborhood_Beato,neighborhood_Campo Grande,...,type_gen_sushi,type_gen_tapas,type_gen_tapas.1,type_gen_thai,"type_gen_tour operator in lisbon, portugal",type_gen_traditional,type_gen_turkish,type_gen_vegan,type_gen_vegetarian,type_gen_vietnamese
0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
463,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
464,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
465,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
466,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [474]:
X_train= X_train.join(train_X_encoded, how='left')

In [475]:
X_train = X_train.drop(columns = ['neighborhood', 'type_gen', 'review_count'])

## Regression

In [476]:
X_train = X_train.fillna(0)

In [477]:
model = LinearRegression().fit(X_train, y_train)

In [478]:
linearReg(X_train, y_train)

(<class 'statsmodels.iolib.summary.Summary'>
 """
                                  OLS Regression Results                                
 Dep. Variable:                 rating   R-squared (uncentered):                   0.973
 Model:                            OLS   Adj. R-squared (uncentered):              0.965
 Method:                 Least Squares   F-statistic:                              123.2
 Date:                Wed, 01 Dec 2021   Prob (F-statistic):                   5.90e-233
 Time:                        18:00:33   Log-Likelihood:                         -514.60
 No. Observations:                 468   AIC:                                      1239.
 Df Residuals:                     363   BIC:                                      1675.
 Df Model:                         105                                                  
 Covariance Type:            nonrobust                                                  
                                                  coef    st

In [440]:
from xgboost import XGBRegressor(max_depth=10, n_estimators=100, learning_rate=0.1)

xgb_reg.fit(X_train, y_train,
    # evaluate loss at each iteration
    eval_set=[(X_train, y_train), (X_val, y_val)],  
    # stop iterating when eval loss increases 5 times in a row
    early_stopping_rounds=5
)

y_pred = xgb_reg.predict(X_val)

from sklearn.pipeline import make_pipeline

pipe_xgb = make_pipeline(xgb_reg)
cv_results = cross_validate(pipe_xgb,X,y,cv=10,scoring='r2')

SyntaxError: invalid syntax (4112532204.py, line 1)

In [449]:
def pipeline(data, test_size = 0.2):
    data['type_gen']=data.type.map(deletespace)
    data['type_gen']=data.type.map(replace_type)
    data=data[data.type_gen != 'out1']
    #data = data[data.type_gen != 'restaurant']
    X = data[['review_count', 'price', 'dine_in', 'takeaway', 'delivery','drive_through', 'type_gen','no_del_exp', 'curb_pickup', 'neighborhood']]
    y = data['rating']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, random_state = 42)
    imputer = SimpleImputer(strategy="most_frequent")
    imputer.fit(X_train[['price']])
    X_train['price']= imputer.transform(X_train[['price']])
    X_test['price']= imputer.transform(X_test[['price']])
    ohe = OneHotEncoder(handle_unknown='ignore', sparse = False)
    ohe.fit(X_train[['price', 'neighborhood', 'type_gen']])
    X_train_encoded = pd.DataFrame(ohe.transform(X_train[['price', 'neighborhood', 'type_gen']]))
    X_test_encoded = pd.DataFrame(ohe.transform(X_test[['price', 'neighborhood', 'type_gen']]))
    X_test_encoded.columns = ohe.get_feature_names_out()
    X_train_encoded.columns = ohe.get_feature_names_out()
    X_train= X_train.join(X_train_encoded, how='left')
    X_test = X_test.join(X_test_encoded, how='left')
    X_train = X_train.drop(columns = ['neighborhood', 'type_gen', 'review_count'])
    X_test = X_test.drop(columns = ['neighborhood', 'type_gen', 'review_count'])
    X_train = X_train.fillna(0)
    X_test = X_test.fillna(0)
    return  X_train, X_test, y_train, y_test


In [452]:
pipeline(data)[0]

Unnamed: 0,price,dine_in,takeaway,delivery,drive_through,no_del_exp,curb_pickup,price_1.0,price_2.0,price_3.0,...,type_gen_sushi,type_gen_tapas,type_gen_tapas.1,type_gen_thai,"type_gen_tour operator in lisbon, portugal",type_gen_traditional,type_gen_turkish,type_gen_vegan,type_gen_vegetarian,type_gen_vietnamese
77,2.0,1,0,0,0,0,1,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
436,2.0,1,1,1,0,0,0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
211,2.0,1,1,0,0,0,0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
192,1.0,1,1,1,0,0,0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
450,2.0,1,0,0,0,1,0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71,2.0,1,1,1,0,0,0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
106,2.0,1,1,1,0,0,0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
270,1.0,1,1,0,0,0,0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
435,1.0,0,0,0,0,0,0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
