In [80]:
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
import matplotlib.pyplot as plt

In [81]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

In [82]:
df = pd.read_csv('data/stud.csv')

In [83]:
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [84]:
x = df.drop('math_score', axis=1)

In [85]:
y = df['math_score']

In [86]:
x.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75


In [87]:
y.head()

0    72
1    69
2    90
3    47
4    76
Name: math_score, dtype: int64

In [88]:
numerical_feature = x.select_dtypes(exclude='object').columns
categorical_feature = x.select_dtypes(include='object').columns

In [89]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

num_transformer = StandardScaler()
ohe_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", ohe_transformer, categorical_feature),
        ("StandardScaler", num_transformer, numerical_feature)
    ]
)

In [90]:
x = preprocessor.fit_transform(x)

In [91]:
x

array([[ 1.        ,  0.        ,  0.        , ...,  1.        ,
         0.19399858,  0.39149181],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         1.42747598,  1.31326868],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         1.77010859,  1.64247471],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.12547206, -0.20107904],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.60515772,  0.58901542],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         1.15336989,  1.18158627]])

In [92]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=33)
x_train.shape, x_test.shape

((800, 19), (200, 19))

In [93]:
def evaluate_model(true, pred):
    mae = mean_absolute_error(true, pred)
    mse = mean_squared_error(true, pred)
    rmse = np.sqrt(mean_squared_error(true, pred))
    r2 = r2_score(true, pred)

    return mae, rmse, r2

In [94]:
models = {
    "LinearRegression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbour Regressor": KNeighborsRegressor(),
    "Decission Tree Regressor": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBoostRegressor": XGBRegressor(),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor(),
    "SVR": SVR()
}

In [95]:
for i in range(len(list(models))):
    print(list(models.keys())[i])

LinearRegression
Lasso
Ridge
K-Neighbour Regressor
Decission Tree Regressor
Random Forest Regressor
XGBoostRegressor
CatBoosting Regressor
AdaBoost Regressor
SVR


In [96]:
for i in range(len(list(models))):
    print(list(models.values())[i])

LinearRegression()
Lasso()
Ridge()
KNeighborsRegressor()
DecisionTreeRegressor()
RandomForestRegressor()
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, ...)
<catboost.core.CatBoostRegressor object at 0x0000016C4AA9D7F0>
AdaBoostRegressor()
SVR()


In [97]:
model_list = []
r2_list = []

In [98]:
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(x_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    print('='*35)
    print('\n')

LinearRegression
Model performance for Training set
- Root Mean Squared Error: 5.2794
- Mean Absolute Error: 4.2115
- R2 Score: 0.8827
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5.5635
- Mean Absolute Error: 4.4650
- R2 Score: 0.8384


Lasso
Model performance for Training set
- Root Mean Squared Error: 6.5511
- Mean Absolute Error: 5.1795
- R2 Score: 0.8194
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 6.2339
- Mean Absolute Error: 4.8635
- R2 Score: 0.7971


Ridge
Model performance for Training set
- Root Mean Squared Error: 5.2768
- Mean Absolute Error: 4.2067
- R2 Score: 0.8828
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5.5690
- Mean Absolute Error: 4.4765
- R2 Score: 0.8381


K-Neighbour Regressor
Model performance for Training set
- Root Mean Squared Error: 5.7629
- Mean Absolute Error: 4.5608
- R2 Score: 0.8602
------------------------

In [99]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=['R2_Score'], ascending=False)

Unnamed: 0,Model Name,R2_Score
0,LinearRegression,0.83842
2,Ridge,0.838098
7,CatBoosting Regressor,0.812072
1,Lasso,0.797131
8,AdaBoost Regressor,0.796479
5,Random Forest Regressor,0.788535
9,SVR,0.776271
3,K-Neighbour Regressor,0.773947
6,XGBoostRegressor,0.766563
4,Decission Tree Regressor,0.692079


In [103]:
linear_model = LinearRegression()
linear_model.fit(x_train, y_train)
y_pred = linear_model.predict(x_test)
score = r2_score(y_test, y_pred)*100
print(f"Accuracy of the model is {int(score)}%")

Accuracy of the model is 83%
