In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

In [2]:
# Load the raw student-performance table; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="data/StudentsPerformance.csv")

In [3]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [4]:
# Feature frame: every column except the regression target.
X = df.drop(columns="math score")

In [5]:
# Preview the feature frame to confirm the target column is gone.
X.head(n=5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75


In [6]:
# Show the distinct values of the gender column on one line (no space after the label).
print('categories in Gender:', df['gender'].unique(), sep="")

categories in Gender:['female' 'male']


In [7]:
# Regression target: the math score column.
y = df.loc[:, 'math score']

In [8]:
# Display the target series (pandas truncates the middle rows in the rendered output).
y

0      72
1      69
2      90
3      47
4      76
       ..
995    88
996    62
997    59
998    68
999    77
Name: math score, Length: 1000, dtype: int64

In [9]:
# Partition the feature columns by dtype: object columns are treated as
# categorical, everything else as numeric.
cat_var = X.select_dtypes(include=["object"]).columns
num_var = X.select_dtypes(exclude=["object"]).columns

In [10]:
# Per-column preprocessing: one-hot encode the categorical columns and
# standardize the numeric ones.
scScaler = StandardScaler()
ohEncoder = OneHotEncoder()

# Transformer order determines output column order: encoded categoricals
# first, scaled numerics last.
preprocessor = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", ohEncoder, cat_var),
        ("StandardScaler", scScaler, num_var),
    ],
)

In [11]:
# Transform from the original frame instead of from `X` itself: the previous
# `X = preprocessor.fit_transform(X)` consumed its own output, so re-running the
# cell failed once `X` had been replaced by the transformed matrix. Selecting
# the categorical + numeric columns from `df` yields the same feature set and
# keeps this cell idempotent.
# NOTE(review): the preprocessor is fitted on all rows before the train/test
# split, so the scaler's statistics include the test rows (data leakage).
# Consider splitting first and calling fit_transform on the training rows only.
X = preprocessor.fit_transform(df[[*cat_var, *num_var]])

In [12]:
# 80/20 hold-out split with a fixed seed so the partition is reproducible.
# `train_test_split` is already imported in the notebook's import cell, so the
# redundant re-import that used to live here has been dropped.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape

((800, 19), (200, 19))

In [13]:
# Candidate regressors, keyed by the display name used in the results table.
# The scikit-learn tree and ensemble estimators get an explicit `random_state`
# so that repeated runs of the notebook produce the same metrics; without it
# their scores vary from run to run.
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor": RandomForestRegressor(random_state=42),
    "XGBRegressor": XGBRegressor(),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor(random_state=42),
}

In [14]:
# Fit every candidate on the training split, score it on the held-out split,
# print a per-model report, and collect the metrics for the summary table.
results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Hold-out metrics, each computed exactly once; RMSE is derived from MSE.
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)

    print(f"Model: {name}")
    print(f"R2 score: {r2:.4f}")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
    print("-" * 50)
    print("\n")

    # One record per model; the keys become the summary table's column names.
    results.append({'model Name': name,
                    'R2': r2,
                    "Mean Absolute Error (MAE)": mae,
                    "Mean Squared Error (MSE)": mse,
                    "Root Mean Squared Error (RMSE)": rmse
                    })

Model: Linear Regression
R2 score: 0.8801
Mean Absolute Error (MAE): 4.2184
Mean Squared Error (MSE): 29.1696
Root Mean Squared Error (RMSE): 5.4009
--------------------------------------------------


Model: Lasso
R2 score: 0.8253
Mean Absolute Error (MAE): 5.1579
Mean Squared Error (MSE): 42.5064
Root Mean Squared Error (RMSE): 6.5197
--------------------------------------------------


Model: Ridge
R2 score: 0.8806
Mean Absolute Error (MAE): 4.2111
Mean Squared Error (MSE): 29.0563
Root Mean Squared Error (RMSE): 5.3904
--------------------------------------------------


Model: K-Neighbors Regressor
R2 score: 0.7835
Mean Absolute Error (MAE): 5.6370
Mean Squared Error (MSE): 52.6834
Root Mean Squared Error (RMSE): 7.2583
--------------------------------------------------


Model: Decision Tree
R2 score: 0.7439
Mean Absolute Error (MAE): 6.2100
Mean Squared Error (MSE): 62.3300
Root Mean Squared Error (RMSE): 7.8949
--------------------------------------------------


Model: Random 

In [15]:
# Summary table: one row per model, one column per collected metric.
results_df = pd.DataFrame(data=results)
results_df

Unnamed: 0,model Name,R2,Mean Absolute Error (MAE),Mean Squared Error (MSE),Root Mean Squared Error (RMSE)
0,Linear Regression,0.880127,4.218389,29.16964,5.400892
1,Lasso,0.82532,5.157882,42.506417,6.519695
2,Ridge,0.880593,4.211101,29.056272,5.390387
3,K-Neighbors Regressor,0.783497,5.637,52.6834,7.258333
4,Decision Tree,0.743855,6.21,62.33,7.894935
5,Random Forest Regressor,0.851836,4.633408,36.054021,6.0045
6,XGBRegressor,0.827797,5.057731,41.903708,6.473307
7,CatBoosting Regressor,0.851632,4.612532,36.103658,6.008632
8,AdaBoost Regressor,0.848149,4.740875,36.951233,6.078753
