In [1]:
# Import libraries
import numpy as np
import pandas as pd

# Data visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Model training, validation, and evaluation
# Split arrays or matrices into random train and test subsets.
# Evaluate a score by cross-validation.
from sklearn.model_selection import train_test_split, cross_val_score

# Model building
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

# Mean absolute error regression loss.
# Mean squared error regression loss.
# Root mean squared error regression loss.
# R2 (coefficient of determination) regression score function.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
# Load dataset
df = pd.read_csv('data/stud.csv')
# Diplay top & last 5 rows of data
df

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


In [3]:
# Add columns for "Total Score"
df['total_score'] = df['math_score'] + df['reading_score'] + df['writing_score']
# Display top 5 rows of data
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score,total_score
0,female,group B,bachelor's degree,standard,none,72,72,74,218
1,female,group C,some college,standard,completed,69,90,88,247
2,female,group B,master's degree,standard,none,90,95,93,278
3,male,group A,associate's degree,free/reduced,none,47,57,44,148
4,male,group C,some college,standard,none,76,78,75,229


In [4]:
# Split data into X, y features
X = df.drop(columns=['total_score'], axis=1)
y = df['total_score']

In [5]:
X.shape, y.shape

((1000, 8), (1000,))

In [6]:
# Check the categorical indexs
print("Categories in 'gender' variable:     ",end=" " )
print(df['gender'].unique())

print("Categories in 'race_ethnicity' variable:  ",end=" ")
print(df['race_ethnicity'].unique())

print("Categories in'parental level of education' variable:",end=" " )
print(df['parental_level_of_education'].unique())

print("Categories in 'lunch' variable:     ",end=" " )
print(df['lunch'].unique())

print("Categories in 'test preparation course' variable:     ",end=" " )
print(df['test_preparation_course'].unique())

Categories in 'gender' variable:      ['female' 'male']
Categories in 'race_ethnicity' variable:   ['group B' 'group C' 'group A' 'group D' 'group E']
Categories in'parental level of education' variable: ["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']
Categories in 'lunch' variable:      ['standard' 'free/reduced']
Categories in 'test preparation course' variable:      ['none' 'completed']


In [7]:
# Create ColumnTransformer with 3 types of transformers
num_features = X.select_dtypes(exclude= 'object').columns
cat_features = X.select_dtypes(include= 'object').columns

# Data preprocessing
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_trf = StandardScaler()
ohe_trf = OneHotEncoder(dtype= 'int64')

preprocess = ColumnTransformer(
    [
        ('OneHotEncoding', ohe_trf, cat_features),
        ('StandardScaler', numeric_trf, num_features)
    ]
)

In [8]:
# Apply on X 
X = preprocess.fit_transform(X)

In [9]:
# Split dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state= 42)
X_train.shape, X_test.shape

((800, 20), (200, 20))

In [10]:
# Calling different regression models
linear = LinearRegression()
lasso = Lasso()
ridge = Ridge()
elasticnet = ElasticNet()
knn = KNeighborsRegressor()
svr = SVR()
dt = DecisionTreeRegressor()
rf = RandomForestRegressor()
ada = AdaBoostRegressor()
catboost = CatBoostRegressor(verbose=False, metric_period=100)
xgboost = XGBRegressor()

In [11]:
# Convert model into dictory
models = {
    'Linear_Reg': linear,
    'Lasso_Reg': lasso,
    'Ridge_Reg': ridge,
    'ElasticNet': elasticnet,
    'KNeighbors': knn,
    'SVR': svr,
    'Decision Tree': dt,
    'Random Forest': rf,
    'AdaBoost': ada,
    'CatBoost': catboost,
    'XgBoost': xgboost,
}

In [12]:
# Check the testing of cross_val_score on all the models
def cross_val_src(model, x_trn, y_trn):
    src = np.round(cross_val_score(model, x_trn, y_trn, cv=5), 2)
    mean_src = np.round(np.mean(src), 2)
    return src, mean_src

In [13]:
# Testing with cross_val_score
result = []
mean_src = []

for model_name, model in models.items():
    current_rst, current_mean_src = cross_val_src(model, X_train, y_train)

    print('Model Name :', model_name)
    print('\n')
    print('Current Result :', current_rst)
    print('Current Mean Score :', current_mean_src)
    print('\n')

    result.append(current_rst)
    mean_src.append(current_mean_src)


Model Name : Linear_Reg


Current Result : [1. 1. 1. 1. 1.]
Current Mean Score : 1.0


Model Name : Lasso_Reg


Current Result : [1. 1. 1. 1. 1.]
Current Mean Score : 1.0


Model Name : Ridge_Reg


Current Result : [1. 1. 1. 1. 1.]
Current Mean Score : 1.0


Model Name : ElasticNet


Current Result : [0.97 0.97 0.97 0.97 0.97]
Current Mean Score : 0.97


Model Name : KNeighbors


Current Result : [0.96 0.97 0.96 0.96 0.96]
Current Mean Score : 0.96


Model Name : SVR


Current Result : [0.78 0.87 0.86 0.9  0.78]
Current Mean Score : 0.84


Model Name : Decision Tree


Current Result : [0.98 0.99 0.98 0.98 0.99]
Current Mean Score : 0.98


Model Name : Random Forest


Current Result : [1.   1.   1.   1.   0.99]
Current Mean Score : 1.0


Model Name : AdaBoost


Current Result : [0.97 0.98 0.98 0.98 0.98]
Current Mean Score : 0.98


Model Name : CatBoost


Current Result : [1. 1. 1. 1. 1.]
Current Mean Score : 1.0


Model Name : XgBoost


Current Result : [1. 1. 1. 1. 1.]
Current Mean Sc

In [14]:
X_test.shape[1]

20

In [15]:
len(y)

1000

In [16]:
# Evaluate Function to give all metrics after model Training 
def evaluate_model(true, predictor):
    mse = np.round(mean_squared_error(true, predictor), 2)
    rmse = np.round(np.sqrt(mean_squared_error(true, predictor)), 2)
    mae = np.round(mean_absolute_error(true, predictor), 2)
    r2 = np.round(r2_score(true, predictor), 2)
    adj_r2 = np.round(1 - ((1-r2) * (200 - 1) / (200-20-1)))

    return mse, rmse, mae, r2, adj_r2


In [17]:
# Train all models and get the traning loss/metrics evaluation
results = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    # Make prediction
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test dataset
    trn_mse, trn_rmse, trn_mae, trn_r2, trn_adj_r2 = evaluate_model(y_train, y_train_pred)
    tst_mse, tst_rmse, tst_mae, tst_r2, tst_adj_r2 = evaluate_model(y_test, y_test_pred)

    print(list(models.keys())[i])
    
    print('Model performance for Training set')
    print(f'Mean Squared Error : {trn_mse}')
    print(f'Root Mean Squared Error : {trn_rmse}')
    print(f"Mean Absolute Error: {trn_mae}")
    print(f"R2 Score: {trn_r2}")
    print(f'Adjusted R2 Score : {trn_adj_r2}')
    print('-'*35)

    print('Model performance for Test set')
    print(f'Mean Squared Error : {tst_mse}')
    print(f'Root Mean Squared Error : {tst_rmse}')
    print(f"Mean Absolute Error: {tst_mae}")
    print(f"R2 Score: {tst_r2}")
    print(f'Adjusted R2 Score : {tst_adj_r2}')
    print('-'*35)

    
    print('\n *** Model Detail ***')
    train = np.round(model.score(X_train, y_train), 2)
    test = np.round(model.score(X_test, y_test), 2)
    print(f" Training Model score :\n {train}")
    print(f" Testing Model score :\n {test}")
    print('='*35)
    print('\n')

    results.append({
        'model': model.__class__.__name__,
        'mse': trn_mse,
        'rmse': trn_rmse,
        'mae': trn_mae,
        'r2': trn_r2,
        'adj_r2': trn_adj_r2
    })

Linear_Reg
Model performance for Training set
Mean Squared Error : 0.0
Root Mean Squared Error : 0.0
Mean Absolute Error: 0.0
R2 Score: 1.0
Adjusted R2 Score : 1.0
-----------------------------------
Model performance for Test set
Mean Squared Error : 0.0
Root Mean Squared Error : 0.0
Mean Absolute Error: 0.0
R2 Score: 1.0
Adjusted R2 Score : 1.0
-----------------------------------

 *** Model Detail ***
 Training Model score :
 1.0
 Testing Model score :
 1.0


Lasso_Reg
Model performance for Training set
Mean Squared Error : 1.13
Root Mean Squared Error : 1.06
Mean Absolute Error: 0.85
R2 Score: 1.0
Adjusted R2 Score : 1.0
-----------------------------------
Model performance for Test set
Mean Squared Error : 1.24
Root Mean Squared Error : 1.11
Mean Absolute Error: 0.88
R2 Score: 1.0
Adjusted R2 Score : 1.0
-----------------------------------

 *** Model Detail ***
 Training Model score :
 1.0
 Testing Model score :
 1.0


Ridge_Reg
Model performance for Training set
Mean Squared Err

In [18]:
# All models scores convert into dataframe
performance_df = pd.DataFrame(results)
performance_df

Unnamed: 0,model,mse,rmse,mae,r2,adj_r2
0,LinearRegression,0.0,0.0,0.0,1.0,1.0
1,Lasso,1.13,1.06,0.85,1.0,1.0
2,Ridge,0.0,0.02,0.02,1.0,1.0
3,ElasticNet,50.64,7.12,5.69,0.97,1.0
4,KNeighborsRegressor,45.86,6.77,5.3,0.97,1.0
5,SVR,232.72,15.26,6.65,0.87,1.0
6,DecisionTreeRegressor,0.0,0.0,0.0,1.0,1.0
7,RandomForestRegressor,0.87,0.93,0.56,1.0,1.0
8,AdaBoostRegressor,25.77,5.08,3.94,0.99,1.0
9,CatBoostRegressor,0.1,0.32,0.26,1.0,1.0


In [20]:
df.to_csv('data/student.csv')