In [17]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [5]:
df=pd.read_csv('../data/StudentsPerformance.csv')

In [6]:
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [7]:
df['parental level of education']=df['parental level of education'].str.replace("some high school","high school")

In [8]:
df["total score"]=df["math score"]+df["reading score"]+df["writing score"]
df["average score"]=df["total score"]/3
df["average score"]=df["average score"].round(2)
df["total score"]=df["total score"].astype(int)
df["average score"]=df["average score"].astype(float)
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,total score,average score
0,female,group B,bachelor's degree,standard,none,72,72,74,218,72.67
1,female,group C,some college,standard,completed,69,90,88,247,82.33
2,female,group B,master's degree,standard,none,90,95,93,278,92.67
3,male,group A,associate's degree,free/reduced,none,47,57,44,148,49.33
4,male,group C,some college,standard,none,76,78,75,229,76.33


In [9]:
X=df.drop(columns="total score",axis=1)
y=df["total score"]


In [12]:
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns
categorical_features = X.select_dtypes(include=["object"]).columns

scaler=StandardScaler()
encoder=OneHotEncoder(handle_unknown='ignore')
preproceessor=ColumnTransformer(
    [
        ("OneHotEncoder",encoder, categorical_features),
        ("StandardScaler",scaler, numeric_features)
    ])


In [13]:
X= preproceessor.fit_transform(X)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [20]:
model={
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "AdaBoost": AdaBoostRegressor(),
    "Support Vector Regressor": SVR(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "CatBoost Regressor": CatBoostRegressor(verbose=0),
    "XGBoost Regressor": XGBRegressor(use_label_encoder=False, eval_metric='rmse')
}

results = {}
for name, reg in model.items():
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[name] = {"MSE": mse, "R2": r2}
    print(f"{name} - MSE: {mse}, R2: {r2}")
results_df = pd.DataFrame(results).T
results_df.sort_values(by="R2", ascending=False, inplace=True)
results_df

Linear Regression - MSE: 1.952840948092719e-27, R2: 1.0
Ridge Regression - MSE: 0.00036947782977774055, R2: 0.9999998084911615
Lasso Regression - MSE: 1.1135660971599657, R2: 0.9994228131360451
Decision Tree - MSE: 4.64, R2: 0.9975949815142712
Random Forest - MSE: 6.773805499999998, R2: 0.9964889811538293
Gradient Boosting - MSE: 4.5447322265845695, R2: 0.9976443609875811
AdaBoost - MSE: 12.433851573514733, R2: 0.9935552493786398
Support Vector Regressor - MSE: 442.53987792483866, R2: 0.7706214251979703
K-Neighbors Regressor - MSE: 62.64379999999999, R2: 0.9675302808154527
CatBoost Regressor - MSE: 7.667642971861498, R2: 0.9960256846790311
XGBoost Regressor - MSE: 4.889644622802734, R2: 0.9974656105041504


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Unnamed: 0,MSE,R2
Linear Regression,1.952841e-27,1.0
Ridge Regression,0.0003694778,1.0
Lasso Regression,1.113566,0.999423
Gradient Boosting,4.544732,0.997644
Decision Tree,4.64,0.997595
XGBoost Regressor,4.889645,0.997466
Random Forest,6.773805,0.996489
CatBoost Regressor,7.667643,0.996026
AdaBoost,12.43385,0.993555
K-Neighbors Regressor,62.6438,0.96753
