In [34]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# modeling
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor



In [12]:
# Load the student-performance CSV and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="data/stud.csv")
df.head(n=5)


Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [25]:
# Print the number of distinct values held by each column of the frame.
for col in df.columns:
    # print's default single-space separator reproduces the two-space gap
    # between the "name : " prefix and the count.
    print(f"{col} : ", df[col].nunique())

gender :  2
race_ethnicity :  5
parental_level_of_education :  6
lunch :  2
test_preparation_course :  2
math_score :  81
reading_score :  72
writing_score :  77


Preparing the X and y variables

In [None]:
# The math score is the regression target; every other column is a feature.
y = df['math_score']
x = df.drop(columns=['math_score'])


In [15]:
# Row count for each parental education level.
df.loc[:, 'parental_level_of_education'].value_counts()

parental_level_of_education
some college          226
associate's degree    222
high school           196
some high school      179
bachelor's degree     118
master's degree        59
Name: count, dtype: int64

In [17]:
# Derived feature: element-wise sum of the reading and writing scores.
x['total_score'] = x['reading_score'].add(x['writing_score'])

In [30]:
# Build the preprocessing step: one-hot encode the categorical columns and
# standardise the numeric ones.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,StandardScaler

# Partition the feature columns by dtype so each group gets its own transformer.
cat_feature = x.select_dtypes(include='object').columns
num_feature = x.select_dtypes(exclude='object').columns

num_transform = StandardScaler()
# handle_unknown="ignore": a category that never appeared in the data the
# encoder was fitted on is encoded as all zeros at transform time instead of
# raising, so a rare level landing only in the test split cannot break scoring.
cat_transform = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("OneHotEncoding", cat_transform, cat_feature),
        ("Standard scaler", num_transform, num_feature),
    ]
)
 

In [31]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state pins the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
    

In [32]:
# Fit the preprocessor on the training features only, then apply that fitted
# transform to the test features. Both names are rebound to the transformed
# output, so this cell is meant to run once per fresh train/test split.
x_train, x_test = (
    preprocessor.fit_transform(x_train),
    preprocessor.transform(x_test),
)

Create an evaluation function that returns the metrics after model training

In [33]:
def evaluate_model(true,pred):
    """Score a set of predictions against the observed targets.

    Parameters
    ----------
    true : array-like
        Observed target values.
    pred : array-like
        Predicted target values, aligned element-wise with ``true``.

    Returns
    -------
    tuple
        ``(mae, score)``: the mean absolute error and the R^2 score.
    """
    # The result is named "mae" here and at every call site, so compute the
    # mean absolute error rather than the mean squared error.
    mae = mean_absolute_error(true,pred)
    score = r2_score(true,pred)
    return mae,score

In [40]:
# Candidate regressors, keyed by the label used in the printed report.
# The tree, forest and boosting estimators are given the same seed as the
# train/test split so that a fresh Restart & Run All reproduces the same scores.
models = {
    "LinearRegression" : LinearRegression(),
    "Lasso" : Lasso(),
    "Ridge" : Ridge(),
    "K-Neighbours Regressor" : KNeighborsRegressor(),
    "DecisionTreeRegressor" : DecisionTreeRegressor(random_state=42),
    "RandomForestRegressor" : RandomForestRegressor(random_state=42),
    "XGBRegressor" : XGBRegressor(random_state=42),
    "AdaBoostRegressor" : AdaBoostRegressor(random_state=42),
}

# Parallel lists: model_list[i] is the label whose test-set R^2 is r2_list[i].
model_list = []
r2_list = []
for name, model in models.items():
    print(f"Training {name} model")
    model.fit(x_train, y_train)

    # Predict on both splits so train and test metrics can be compared.
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    # evaluate_model returns (error metric, R^2) for the given split.
    model_train_mae, model_train_score = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_score = evaluate_model(y_test, y_test_pred)

    # Only the held-out R^2 is recorded for the comparison printed later.
    model_list.append(name)
    r2_list.append(model_test_score)
     

Training LinearRegression model
Training Lasso model
Training Ridge model
Training K-Neighbours Regressor model
Training DecisionTreeRegressor model
Training RandomForestRegressor model
Training XGBRegressor model
Training AdaBoostRegressor model


In [45]:
# Report each model's recorded test-set R^2, rounded to four decimal places.
paired_scores = zip(model_list, r2_list)
for model_name, score in paired_scores:
    print(f"{model_name} : {score:.4f}")

    

LinearRegression : 0.8804
Lasso : 0.8256
Ridge : 0.8806
K-Neighbours Regressor : 0.8018
DecisionTreeRegressor : 0.7562
RandomForestRegressor : 0.8422
XGBRegressor : 0.8327
AdaBoostRegressor : 0.8439
