# Regression

In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import math

In [2]:
# importing the newest data 
data=pd.read_csv('../small_business/data/restaurants.csv')
data=data.drop(columns='Unnamed: 0')

In [3]:
#Defining X and y and splitting between test and train 

X = data.drop(columns=['rating','name', 'address', 'label', 'postal_code', 'no_del_exp', 'municipality', 'review_count'])
y = data['rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=2)

In [5]:
#Creating a Pipeline with one One encoder + filling Na in prices 

price_transformer = SimpleImputer(strategy="most_frequent")
cat_transformer = OneHotEncoder(handle_unknown='ignore')

preproc_basic = make_column_transformer((price_transformer, ['price']),
                                       (cat_transformer, ['neighborhood', 'type']), remainder='passthrough')

preproc = make_pipeline(preproc_basic)


In [6]:
X_train_t = preproc.fit_transform(X_train)

In [9]:
# Make a Grid Search to find the best params of the regression model 
models= {'KNN':{'model':KNeighborsRegressor(),
               'params':{'n_neighbors': [5, 10, 20, 50, 100]}},
        'SVR':{'model':SVR(),
                        'params':{'kernel':['rbf', 'poly']}},
         'DecTree':{'model':DecisionTreeRegressor(), 
                'params':{'max_depth':[1, 2,3]}}
        }

best = {}      
for key, value in models.items():
    grid_search = GridSearchCV(value['model'], param_grid= value['params'],
                               cv=5, scoring="r2", n_jobs = -1)
    grid_search.fit(X_train_t, y_train)
    best[key] = {'params':grid_search.best_params_,
                   'score':grid_search.best_score_}
print(best)

{'KNN': {'params': {'n_neighbors': 50}, 'score': 0.0035246325871077567}, 'SVR': {'params': {'kernel': 'poly'}, 'score': -0.008222787461872504}, 'DecTree': {'params': {'max_depth': 1}, 'score': -0.08188233673063632}}


In [11]:
pipe = make_pipeline(preproc, KNeighborsRegressor(n_neighbors= 50))

In [19]:
pipe.fit(X_train,y_train)
pipe.score(X_test,y_test)

0.06618303882377363

In [13]:
y_pred = pipe.predict(X_test)

In [14]:
def pos(x):
    if x<0: 
        return 0 
    else: 
        return x
positive = np.vectorize(pos)

In [15]:
y_pred = positive(y_pred)

In [16]:
y_pred = y_pred.round(1)

In [17]:
from sklearn.metrics import r2_score
print(f'r2 :{r2_score(y_test, y_pred)}')
print(f'mse :{mean_squared_error(y_test, y_pred)}')
print(f'rmse :{math.sqrt(mean_squared_error(y_test, y_pred))}')

r2 :0.05616896815578143
mse :0.15381679389312977
rmse :0.39219484174722363


In [18]:
from sklearn.dummy import DummyRegressor
baseline_model = DummyRegressor(strategy="mean") # Baseline
baseline_model.fit(X_train, y_train) # Calculate value for stratgy
baseline_model.score(X_test, y_test)

-0.0079773820576583