In [80]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# models
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR 
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

In [81]:
# Load the student dataset from the project-relative data folder and preview
# the first rows as the cell output.
df = pd.read_csv("./data/stud.csv")
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [82]:
# Split the column names into two lists in a single pass: columns whose dtype
# is "object" are treated as categorical, every other column as numerical.
# NOTE(review): this reads whatever columns `df` has when the cell runs, so
# re-running it after derived numeric columns are added to `df` changes the
# numerical list.
numerical_features, categorical_features = [], []
for column_name in df.columns:
    if df[column_name].dtype == "object":
        categorical_features.append(column_name)
    else:
        numerical_features.append(column_name)

print("Numerical Features : ", numerical_features)
print("Categorical Features : ", categorical_features)

Numerical Features :  ['math_score', 'reading_score', 'writing_score']
Categorical Features :  ['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course']


In [83]:
# Derive two summary columns on `df`: the sum of the three subject scores and
# that sum divided by three.
df["Total_Score"] = df["math_score"].add(df["reading_score"]).add(df["writing_score"])
df["Average_Score"] = df["Total_Score"].div(3)
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score,Total_Score,Average_Score
0,female,group B,bachelor's degree,standard,none,72,72,74,218,72.666667
1,female,group C,some college,standard,completed,69,90,88,247,82.333333
2,female,group B,master's degree,standard,none,90,95,93,278,92.666667
3,male,group A,associate's degree,free/reduced,none,47,57,44,148,49.333333
4,male,group C,some college,standard,none,76,78,75,229,76.333333


In [84]:
# Feature frame: everything in `df` except the two derived score columns.
# Target: the average score.
# NOTE(review): X still contains math_score, reading_score and writing_score,
# and Average_Score is exactly their sum divided by 3, so the target is fully
# determined by the features (target leakage) — confirm this is intended.
X = df.drop(columns=["Total_Score", "Average_Score"])
y = df["Average_Score"]

sc = StandardScaler()
ohe = OneHotEncoder()

# One-hot encode the categorical columns and standardise the numerical ones.
preprocessor = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", ohe, categorical_features),
        ("StandardScaler", sc, numerical_features),
    ]
)

In [85]:
# Sanity check: show the container type of X before it is preprocessed.
type(X)

pandas.core.frame.DataFrame

In [86]:
# Encode/scale the raw feature columns and wrap the resulting array in a
# DataFrame (columns are labelled with integer positions).
# The input frame is rebuilt from `df` instead of being read from `X`: this
# cell overwrites `X` with integer-labelled columns, so transforming `X` itself
# would fail on a second run, once the named columns the preprocessor selects
# are gone. On a first run the result is the same as transforming `X`.
# NOTE(review): the preprocessor is fitted on all rows before the train/test
# split, so the fitted scaler has seen the rows that later become the test
# set — consider splitting first and fitting on the training part only.
X = pd.DataFrame(
    preprocessor.fit_transform(df.drop(columns=["Total_Score", "Average_Score"]))
)
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.390024,0.193999,0.391492
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.192076,1.427476,1.313269
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.577711,1.770109,1.642475
3,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,-1.259543,-0.833899,-1.583744
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.653954,0.605158,0.457333


In [87]:
# Hold out 20% of the rows as a test set; random_state is fixed so the split
# is the same on every run. The last line displays the four resulting shapes.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((800, 20), (200, 20), (800,), (200,))

In [88]:
def evaluate_model(true, predicted):
    """Compute regression metrics for one set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Predicted values, in the same order as ``true``.

    Returns
    -------
    tuple
        ``(mae, mse, rmse, r2_val)``: mean absolute error, mean squared
        error, root mean squared error and R2 score, in that order.
    """
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    # Reuse the MSE computed above instead of evaluating it a second time.
    rmse = np.sqrt(mse)
    r2_val = r2_score(true, predicted)
    return (mae, mse, rmse, r2_val)

In [None]:
# Candidate regressors, all with library-default hyperparameters.
# NOTE(review): the tree/ensemble/boosting models are created without a seed,
# so their metrics may differ from run to run — consider fixing random_state.
models = {
    "LinearRegression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "KNeighborsRegressor": KNeighborsRegressor(),
    "DecisionTreeRegressor": DecisionTreeRegressor(),
    "RandomForestRegressor": RandomForestRegressor(),
    "AdaBoostRegressor": AdaBoostRegressor(),
    "XGBRegressor": XGBRegressor(),
    "CatBoostRegressor": CatBoostRegressor(verbose=False)
}

# Parallel lists: model class name and its test-set R2, in fitting order.
model_list=[]
r2_list=[]

for model in models.values():
    # Fit on the training split, then score both splits so train and test
    # metrics can be compared side by side.
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    y_train_mae, y_train_mse, y_train_rmse, y_train_r2 = evaluate_model(y_train, y_train_pred)
    y_test_mae, y_test_mse, y_test_rmse, y_test_r2 = evaluate_model(y_test, y_test_pred)

    print(f"Model: {model.__class__.__name__}")
    model_list.append(model.__class__.__name__)

    print("Model Performance on Training Set")
    print("-- Mean Absolute Error: ", y_train_mae)
    print("-- Mean Squared Error: ", y_train_mse)
    print("-- Root Mean Squared Error: ", y_train_rmse)
    print("-- R2 Score: ", y_train_r2)
    print("--------------------------------------------")
    print("Model Performance on Test Set")
    print("-- Mean Absolute Error: ", y_test_mae)
    # Report the test MSE here; the RMSE has its own line below.
    print("-- Mean Squared Error: ", y_test_mse)
    print("-- Root Mean Squared Error: ", y_test_rmse)
    print("-- R2 Score: ", y_test_r2)

    r2_list.append(y_test_r2)
    print("====================================================\n\n")

ValueError: Expected a 2-dimensional container but got <class 'pandas.core.series.Series'> instead. Pass a DataFrame containing a single row (i.e. single sample) or a single column (i.e. single feature) instead.