In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [8]:
from sklearn.metrics import r2_score,mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

In [4]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp312-cp312-win_amd64.whl.metadata (1.2 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Downloading catboost-1.2.7-cp312-cp312-win_amd64.whl (101.7 MB)
   ---------------------------------------- 0.0/101.7 MB ? eta -:--:--
   ---------------------------------------- 0.0/101.7 MB 991.0 kB/s eta 0:01:43
   ---------------------------------------- 0.1/101.7 MB 1.4 MB/s eta 0:01:11
   ---------------------------------------- 0.3/101.7 MB 2.5 MB/s eta 0:00:40
   ---------------------------------------- 0.5/101.7 MB 3.5 MB/s eta 0:00:30
   ---------------------------------------- 0.6/101.7 MB 3.2 MB/s eta 0:00:33
   ---------------------------------------- 0.6/101.7 MB 2.6 MB/s eta 0:00:39
   ---------------------------------------- 0.6/101.7 MB 2.6 MB/s eta 0:00:40
   ---------------------------------------- 0.6/101.7 MB 2.3 MB/s eta 0:00:45
   ---------------------------------------- 0.

In [6]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.1-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.1/124.9 MB 1.8 MB/s eta 0:01:10
   ---------------------------------------- 0.3/124.9 MB 3.5 MB/s eta 0:00:36
   ---------------------------------------- 0.6/124.9 MB 4.3 MB/s eta 0:00:30
   ---------------------------------------- 0.8/124.9 MB 4.4 MB/s eta 0:00:29
   ---------------------------------------- 1.1/124.9 MB 5.2 MB/s eta 0:00:24
   ---------------------------------------- 1.1/124.9 MB 4.5 MB/s eta 0:00:28
   ---------------------------------------- 1.4/124.9 MB 5.1 MB/s eta 0:00:25
    --------------------------------------- 1.9/124.9 MB 5.9 MB/s eta 0:00:21
    --------------------------------------- 2.1/124.9 MB 6.1 MB/s eta 0:00:21
    ----

In [9]:
df=pd.read_csv("data/StudentsPerformance.csv")

In [10]:
df.head()


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [11]:
X=df.drop(columns=['math score'],axis=1)

In [12]:
Y=df['math score']

In [13]:
X.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75


In [19]:
numeric_features=X.select_dtypes(exclude='object').columns
categorical_features=X.select_dtypes(include='object').columns

In [20]:
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
num_scaler=StandardScaler()
oh=OneHotEncoder()

In [22]:
preprocessor=ColumnTransformer(
    [
        ("OneHotEncoder",oh,categorical_features),
        ("StandardScaler",num_scaler,numeric_features)
    ]
)

In [23]:
X=preprocessor.fit_transform(X)

In [25]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.2,random_state=42)

In [26]:
X_train.shape

(800, 19)

In [31]:
def evaluate_model(true,predicted):
    mae=mean_absolute_error(true,predicted)
    mse=mean_squared_error(true,predicted)
    rmse=np.sqrt(mse)
    r2=r2_score(true,predicted)
    return mae,rmse,r2

In [32]:
models={
    "LinearRegression":LinearRegression(),
    "Lasso":Lasso(),
    "AdaBoostRegressor":AdaBoostRegressor(),
    "KNeighborsRegressor":KNeighborsRegressor(),
    "RandomForestRegressor":RandomForestRegressor(),
    "Ridge":Ridge(),
    "CatBoostRegressor":CatBoostRegressor(verbose=False),
    "DecisionTreeRegressor":DecisionTreeRegressor(),
    "XGBRegressor":XGBRegressor()
}
model_list=[]
r2_list=[]
for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    y_train_pred=model.predict(X_train)
    y_test_pred=model.predict(X_test)

    train_mae,train_rmse,train_r2=evaluate_model(y_train,y_train_pred)
    test_mae,test_rmse,test_r2=evaluate_model(y_test,y_test_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print("Model preformance on training set")
    print("RMSE:{:.4f}".format(train_rmse))
    print("MAE:{:.4f}".format(train_mae))
    print("R2:{:.4f}".format(train_r2))

    print("--------------------------------")

    print("Model preformance on test set")
    print("RMSE:{:.4f}".format(test_rmse))
    print("MAE:{:.4f}".format(test_mae))
    print("R2:{:.4f}".format(test_r2))

    r2_list.append(test_r2)

    print("="*35)
    print("\n")



LinearRegression
Model preformance on training set
RMSE:5.3231
MAE:4.2667
R2:0.8743
--------------------------------
Model preformance on test set
RMSE:5.3940
MAE:4.2148
R2:0.8804


Lasso
Model preformance on training set
RMSE:6.5938
MAE:5.2063
R2:0.8071
--------------------------------
Model preformance on test set
RMSE:6.5197
MAE:5.1579
R2:0.8253


AdaBoostRegressor
Model preformance on training set
RMSE:5.7987
MAE:4.7580
R2:0.8509
--------------------------------
Model preformance on test set
RMSE:6.0245
MAE:4.6829
R2:0.8508


KNeighborsRegressor
Model preformance on training set
RMSE:5.7088
MAE:4.5177
R2:0.8554
--------------------------------
Model preformance on test set
RMSE:7.2494
MAE:5.6090
R2:0.7840


RandomForestRegressor
Model preformance on training set
RMSE:2.3006
MAE:1.8198
R2:0.9765
--------------------------------
Model preformance on test set
RMSE:5.9471
MAE:4.6123
R2:0.8547


Ridge
Model preformance on training set
RMSE:5.3233
MAE:4.2650
R2:0.8743
-------------------

In [33]:
pd.DataFrame(list(zip(model_list,r2_list)),columns=['Model_name','R2']).sort_values(by=['R2'],ascending=False)

Unnamed: 0,Model_name,R2
5,Ridge,0.880593
0,LinearRegression,0.880433
4,RandomForestRegressor,0.854656
6,CatBoostRegressor,0.851632
2,AdaBoostRegressor,0.850845
8,XGBRegressor,0.827797
1,Lasso,0.82532
3,KNeighborsRegressor,0.78403
7,DecisionTreeRegressor,0.741697
