In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# train test split the data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# import regression algorithms
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

#import grid search cv for cross validation
from sklearn.model_selection import GridSearchCV

# import preprocessors
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# load dataset
df = sns.load_dataset('tips')

In [3]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [5]:
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

we are going to predict how much the tip could be, therefore it is a regression task.

--------------------------------

<span style="color: yellow; font-size: 350%; font-weight: bold;">Machine Learning</span>

In [9]:
#selecting features and target
X = df.drop('tip', axis=1)
y = df['tip']

#label and encode categorical variables
le = LabelEncoder()
X['sex'] = le.fit_transform(X['sex'])
X['smoker'] = le.fit_transform(X['smoker'])
X['day'] = le.fit_transform(X['day'])
X['time'] = le.fit_transform(X['time'])

#split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# create dictionary of models
models = {
    'Linear Regression': LinearRegression(),
    'Support Vector Regression': SVR(),
    'Decision Tree Regression': DecisionTreeRegressor(),
    'Random Forest Regression': RandomForestRegressor(),
    'K-Nearest Neighbors Regression': KNeighborsRegressor(),
    'Gradient Boosting Regression': GradientBoostingRegressor(),
    'XGBoost Regression': XGBRegressor()
}

model_scores = []

#train, predict and evaluate the models with evaluation metrics
for name, model in models.items():
    
    #fit the model
    model.fit(X_train, y_train)
   
    #predict the model
    y_pred = model.predict(X_test)

    #evaluate the model
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    
    #append the scores
    model_scores.append([name, r2, mse, mae])

# create dataframe of model scores
model_scores_df = pd.DataFrame(model_scores, columns=['Model', 'R2 Score', 'MSE', 'MAE'])
model_scores_df


Unnamed: 0,Model,R2 Score,MSE,MAE
0,Linear Regression,0.444137,0.694813,0.670381
1,Support Vector Regression,0.569333,0.538322,0.57071
2,Decision Tree Regression,0.040652,1.199157,0.869592
3,Random Forest Regression,0.256075,0.929885,0.774627
4,K-Nearest Neighbors Regression,0.329403,0.838227,0.726245
5,Gradient Boosting Regression,0.350952,0.811291,0.727322
6,XGBoost Regression,0.408849,0.738922,0.67217


In [10]:
#sorting and printing the best model
model_scores_df.sort_values('R2 Score', ascending=False).head(1)


Unnamed: 0,Model,R2 Score,MSE,MAE
1,Support Vector Regression,0.569333,0.538322,0.57071


--------------------------

In [21]:
models = { 
          'LinearRegression' : (LinearRegression(), {}),
          'SVR' : (SVR(), {'kernel': ['rbf', 'poly', 'sigmoid']}),
          'DecisionTreeRegressor' : (DecisionTreeRegressor(), {'max_depth': [None, 5, 10]}),
          'RandomForestRegressor' : (RandomForestRegressor(), {'n_estimators': [10, 100]}),
          'KNeighborsRegressor' : (KNeighborsRegressor(), {'n_neighbors': np.arange(3, 100, 2)}),
          'GradientBoostingRegressor' : (GradientBoostingRegressor(), {'n_estimators': [10, 100]}),
          'XGBRegressor' : (XGBRegressor(), {'n_estimators': [10, 100]}),          
          }

In [29]:
#train, predict and evaluate the models with evaluation metrics
for name, (model, params) in models.items():
    #creating pipeline
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model)
    ])
    
    #create grid search
    grid = GridSearchCV(model, params, cv=5, n_jobs=-1)
    
    #fit the model
    grid.fit(X_train, y_train)
    
    #predict the model
    y_pred = grid.predict(X_test)
    
    #evaluate the model
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    
#print the scores
    print(f'{name}: R2 Score = {r2:.4f}, MSE = {mse:.4f}, MAE = {mae:.4f}')
    print(f'{name}: MSE= {mse:.4f}, MAE= {mae:.4f}')
    print(f'{name}: MAE= {mae:.4f}')
    print('***************************')

#choose the best model
best_model = grid.best_estimator_
print(f'The Best Model with Highest R2 score is : {best_model}')

LinearRegression: R2 Score = 0.4441, MSE = 0.6948, MAE = 0.6704
LinearRegression: MSE= 0.6948, MAE= 0.6704
LinearRegression: MAE= 0.6704
***************************
SVR: R2 Score = -0.1686, MSE = 1.4607, MAE = 0.8935
SVR: MSE= 1.4607, MAE= 0.8935
SVR: MAE= 0.8935
***************************
DecisionTreeRegressor: R2 Score = 0.2981, MSE = 0.8774, MAE = 0.7189
DecisionTreeRegressor: MSE= 0.8774, MAE= 0.7189
DecisionTreeRegressor: MAE= 0.7189
***************************
RandomForestRegressor: R2 Score = 0.2218, MSE = 0.9727, MAE = 0.7752
RandomForestRegressor: MSE= 0.9727, MAE= 0.7752
RandomForestRegressor: MAE= 0.7752
***************************
KNeighborsRegressor: R2 Score = 0.4687, MSE = 0.6641, MAE = 0.6204
KNeighborsRegressor: MSE= 0.6641, MAE= 0.6204
KNeighborsRegressor: MAE= 0.6204
***************************
GradientBoostingRegressor: R2 Score = 0.3514, MSE = 0.8107, MAE = 0.7658
GradientBoostingRegressor: MSE= 0.8107, MAE= 0.7658
GradientBoostingRegressor: MAE= 0.7658
**********