In [52]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from xgboost import XGBRegressor

In [53]:
df = pd.read_csv("data/stud.csv")

In [54]:
df

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


## Independent and Dependent Features

In [55]:
x = df.drop(['math_score'],axis=1)
y = df['math_score']

In [56]:
categorical = [col for col in x.columns if x[col].dtype == "O"]
numerical = [col for col in x.columns if x[col].dtype != 'O']

In [57]:
from sklearn.compose import ColumnTransformer
encoder = OneHotEncoder(drop='first')
scaler = StandardScaler()

preprocessor = ColumnTransformer(
    [
        ("one hot encoder",OneHotEncoder(),categorical),
        ("standard scaler",StandardScaler(),numerical)
    ]
)
x = preprocessor.fit_transform(x)

## Train and Test Split

In [58]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.20,random_state=35)

## Training Models

In [61]:
models = {
    "LinearRegression" : LinearRegression(),
    "Ridge" : Ridge(),
    "KNeighborsRegressor":KNeighborsRegressor(),
    "SVR":SVR(),
    "DecisionTreeRegressor":DecisionTreeRegressor(),
    "RandomForestRegressor":RandomForestRegressor(),
    "Lasso" : Lasso(),
    "AdaBoostRegressor" : AdaBoostRegressor(),
    "XGBRegressor" : XGBRegressor()
}

model_list = []
r2_list = []

for i in range(len(models)):
    model = list(models.values())[i]
    model.fit(x_train,y_train)

    y_pred_test = model.predict(x_test)
    y_pred_train = model.predict(x_train)

    model_train_mse = mean_squared_error(y_train,y_pred_train)
    model_train_mae = mean_absolute_error(y_train,y_pred_train)
    model_train_r2score = r2_score(y_train,y_pred_train)

    model_test_mse = mean_squared_error(y_test,y_pred_test)
    model_test_mae = mean_absolute_error(y_test,y_pred_test)
    model_test_r2score = r2_score(y_test,y_pred_test)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print("Training set performance:")
    print(f"MSE : {model_train_mse:.4f}")
    print(f"MAE : {model_train_mae:.4f}")
    print(f"R²  : {model_test_r2score:.4f}")

    print("------------------------")

    print("Testing set performance : ")
    print(f"MSE : {model_test_mse:.4f}")
    print(f"MAE : {model_test_mae:.4f}")
    print(f"R²  : {model_test_r2score:.4f}")
    r2_list.append(model_test_r2score)

    print("\n\n\n------------------------")

LinearRegression
Training set performance:
MSE : 29.5570
MAE : 4.3426
R²  : 0.8948
------------------------
Testing set performance : 
MSE : 23.7290
MAE : 3.8893
R²  : 0.8948



------------------------
Ridge
Training set performance:
MSE : 29.5598
MAE : 4.3413
R²  : 0.8951
------------------------
Testing set performance : 
MSE : 23.6656
MAE : 3.8855
R²  : 0.8951



------------------------
KNeighborsRegressor
Training set performance:
MSE : 33.2478
MAE : 4.5550
R²  : 0.7899
------------------------
Testing set performance : 
MSE : 47.4198
MAE : 5.5250
R²  : 0.7899



------------------------
SVR
Training set performance:
MSE : 48.8827
MAE : 5.0605
R²  : 0.8235
------------------------
Testing set performance : 
MSE : 39.8382
MAE : 4.6389
R²  : 0.8235



------------------------
DecisionTreeRegressor
Training set performance:
MSE : 0.0625
MAE : 0.0125
R²  : 0.7200
------------------------
Testing set performance : 
MSE : 63.1900
MAE : 6.2600
R²  : 0.7200



------------------------
Ra

In [64]:
pd.DataFrame({ 'Model Name' : model_list , 'R2_Score' : r2_list}).sort_values(['R2_Score'],ascending=False)

Unnamed: 0,Model Name,R2_Score
1,Ridge,0.895127
0,LinearRegression,0.894845
7,AdaBoostRegressor,0.839321
5,RandomForestRegressor,0.838367
6,Lasso,0.837284
3,SVR,0.823458
8,XGBRegressor,0.811096
2,KNeighborsRegressor,0.78986
4,DecisionTreeRegressor,0.719975


In [70]:
linmodel = LinearRegression()
linmodel.fit(x_train,y_train)
y_pred_test = linmodel.predict(x_test)
score = r2_score(y_test,y_pred_test) * 100
print("Accuracy of the model : %.2f"%score)

Accuracy of the model : 89.48
