In [16]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Modelling
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

In [17]:
df = pd.read_csv(r"C:\Users\F2076TU\MLproject\notebook\data\raw.csv")
df.head()

Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


In [18]:
df.drop(columns = 'CustomerID', axis = 1, inplace = True)

In [19]:
df.head()

Unnamed: 0,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,Male,19,15,39
1,Male,21,15,81
2,Female,20,16,6
3,Female,23,16,77
4,Female,31,17,40


In [20]:
x = df.drop(columns = 'Spending Score (1-100)', axis = 1)

In [21]:
x.head()

Unnamed: 0,Gender,Age,Annual Income (k$)
0,Male,19,15
1,Male,21,15
2,Female,20,16
3,Female,23,16
4,Female,31,17


In [22]:
y = df['Spending Score (1-100)']
y.head()

0    39
1    81
2     6
3    77
4    40
Name: Spending Score (1-100), dtype: int64

In [23]:
num_col = x.select_dtypes(exclude = 'object').columns
cat_col = x.select_dtypes(include = 'object').columns

In [24]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [25]:
num_transformer = StandardScaler()
ohe_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", ohe_transformer, cat_col),
        ("StandardScaler", num_transformer, num_col)
    ]
)

In [26]:
X = preprocessor.fit_transform(x)

In [28]:
X

array([[ 0.        ,  1.        , -1.42456879, -1.73899919],
       [ 0.        ,  1.        , -1.28103541, -1.73899919],
       [ 1.        ,  0.        , -1.3528021 , -1.70082976],
       [ 1.        ,  0.        , -1.13750203, -1.70082976],
       [ 1.        ,  0.        , -0.56336851, -1.66266033],
       [ 1.        ,  0.        , -1.20926872, -1.66266033],
       [ 1.        ,  0.        , -0.27630176, -1.62449091],
       [ 1.        ,  0.        , -1.13750203, -1.62449091],
       [ 0.        ,  1.        ,  1.80493225, -1.58632148],
       [ 1.        ,  0.        , -0.6351352 , -1.58632148],
       [ 0.        ,  1.        ,  2.02023231, -1.58632148],
       [ 1.        ,  0.        , -0.27630176, -1.58632148],
       [ 1.        ,  0.        ,  1.37433211, -1.54815205],
       [ 1.        ,  0.        , -1.06573534, -1.54815205],
       [ 0.        ,  1.        , -0.13276838, -1.54815205],
       [ 0.        ,  1.        , -1.20926872, -1.54815205],
       [ 1.        ,  0.

In [29]:
X.shape

(200, 4)

In [30]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 42)
X_train.shape, X_test.shape

((160, 4), (40, 4))

In [45]:
def evaluate_model(true, predicted):
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(true, predicted)
    r2 = r2_score(true, predicted)

    return mse, rmse, mae, r2

In [51]:
models = {
    'Linear Regression': LinearRegression(), 
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'K-Neighbors Regressor': KNeighborsRegressor(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest Regressor': RandomForestRegressor(),
    'XGBoost Regressor': XGBRegressor(),
    'CatBoosting Regressor': CatBoostRegressor(verbose=False),
    'AdaBoost Regressor': AdaBoostRegressor()
    }

model_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    #Make Predictions
    y_train_cap = model.predict(X_train)
    y_test_cap = model.predict(X_test)

    #Evaluate Train and Test dataset
    model_train_mse, model_train_rmse, model_train_mae, model_train_r2 = evaluate_model(y_train, y_train_cap)
    model_test_mse, model_test_rmse, model_test_mae, model_test_r2 = evaluate_model(y_test, y_test_cap)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model performance fro Training set')
    print("- Mean Squared Error: ",model_train_mse)
    print("- Root mean squared error: ",model_train_rmse)
    print("- Mean Absolute error: ",model_train_mae)
    print("- R2 Score: ",model_train_r2)

    print("--------------------------------")

    print('Model performance fro Test set')
    print("- Mean Squared Error: ",model_test_mse)
    print("- Root mean squared error: ",model_test_rmse)
    print("- Mean Absolute error: ",model_test_mae)
    print("- R2 Score: ",model_test_r2)

    print("="*45)
    print('\n')

Linear Regression
Model performance fro Training set
- Mean Squared Error:  624.5327880859375
- Root mean squared error:  24.99065401476995
- Mean Absolute error:  20.887109375
- R2 Score:  0.09969063767288744
--------------------------------
Model performance fro Test set
- Mean Squared Error:  478.7537109375
- Root mean squared error:  21.88044128754034
- Mean Absolute error:  18.103125
- R2 Score:  0.029369655872394707


Lasso
Model performance fro Training set
- Mean Squared Error:  626.7168057926796
- Root mean squared error:  25.03431256880603
- Mean Absolute error:  20.80142938559346
- R2 Score:  0.09654221756368164
--------------------------------
Model performance fro Test set
- Mean Squared Error:  476.0894874851157
- Root mean squared error:  21.819474958969927
- Mean Absolute error:  17.92317185786002
- R2 Score:  0.03477113071706328


Ridge
Model performance fro Training set
- Mean Squared Error:  624.4766743066393
- Root mean squared error:  24.98953129425679
- Mean Absol