In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error,root_mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
import warnings

warnings.filterwarnings("ignore")

In [2]:
df=pd.read_csv('C:\project\course_project1\data\stud.csv')

In [3]:
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [7]:
x=df.drop(columns=['math_score','reading_score','writing_score'])
y1=df['math_score']+df['reading_score']+df['writing_score']
y2=y1/3

In [8]:
x

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course
0,female,group B,bachelor's degree,standard,none
1,female,group C,some college,standard,completed
2,female,group B,master's degree,standard,none
3,male,group A,associate's degree,free/reduced,none
4,male,group C,some college,standard,none
...,...,...,...,...,...
995,female,group E,master's degree,standard,completed
996,male,group C,high school,free/reduced,none
997,female,group C,high school,free/reduced,completed
998,female,group D,some college,standard,completed


In [9]:
y1

0      218
1      247
2      278
3      148
4      229
      ... 
995    282
996    172
997    195
998    223
999    249
Length: 1000, dtype: int64

In [10]:
y2

0      72.666667
1      82.333333
2      92.666667
3      49.333333
4      76.333333
         ...    
995    94.000000
996    57.333333
997    65.000000
998    74.333333
999    83.000000
Length: 1000, dtype: float64

In [11]:
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race_ethnicity               1000 non-null   object
 2   parental_level_of_education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test_preparation_course      1000 non-null   object
dtypes: object(5)
memory usage: 39.2+ KB


In [12]:
cat_features=x.select_dtypes(include='object').columns
num_features=x.select_dtypes(exclude='object').columns

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder,StandardScaler
oh_encoder=OneHotEncoder()
scaler=StandardScaler()

preprocessor=ColumnTransformer(
    [("OneHotEncoder",oh_encoder,cat_features),
     ("StanderScaler",scaler,num_features)]
)


In [13]:
from sklearn.model_selection import train_test_split
x_train,x_test,y1_train,y1_test=train_test_split(x,y1,test_size=0.25,random_state=42)
x_train,x_test,y2_train,y2_test=train_test_split(x,y1,test_size=0.25,random_state=42)

In [14]:
x_train_scaled = preprocessor.fit_transform(x_train)

# Use the same scaler (same mean/std) for test data
x_test_scaled = preprocessor.transform(x_test)

In [15]:
x_train_scaled

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 3750 stored elements and shape (750, 17)>

In [17]:
def evaluate_model(true,predicted):
    mae=mean_absolute_error(true,predicted)
    mse=mean_squared_error(true,predicted)
    R2_score =r2_score(true,predicted)
    rmse=root_mean_squared_error(true,predicted)
    return mae,mse,rmse,R2_score

In [None]:
models ={
    "Linear Regression":LinearRegression(),
    "Lasso":Lasso(),
    "Ridge":Ridge(),
    "K Nearest Neighbor":KNeighborsRegressor(),
    "Decision Tree":DecisionTreeRegressor(),
    "AdaBoost Regressor":AdaBoostRegressor(),
    "Random Forest Regressor":RandomForestRegressor(),
    "XGBRegressor":XGBRegressor(),
    "CatBoost Regressor":CatBoostRegressor(),
}
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model1=list(models.values())[i]
    model1.fit(x_train_scaled,y1_train)

    model2=list(models.values())[i]
    model2.fit(x_train_scaled,y2_train)

    y1_train_pred=model1.predict(x_train_scaled)
    y1_test_pred=model1.predict(x_test_scaled)

    y2_train_pred=model2.predict(x_train_scaled)
    y2_test_pred=model2.predict(x_test_scaled)

    model1_train_mae,model1_train_mse,model1_train_R2_score,model1_train_rmse=evaluate_model(y1_train,y1_train_pred)
    model1_test_mae,model1_test_mse,model1_test_R2_score,model1_test_rmse=evaluate_model(y1_test,y1_test_pred)

    model2_train_mae,model2_train_mse,model2_train_R2_score,model2_train_rmse=evaluate_model(y2_train,y2_train_pred)
    model2_test_mae,model2_test_mse,model2_test_R2_score,model2_test_rmse=evaluate_model(y2_test,y2_test_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    # r2_list.append()

    print("MODEL1 PERFORMANCE FOR TRAINING SET")
    print(f"MEAN SQUARE ERROR: {model1_train_mse}")
    print(f"MEAN ABSOLUTE ERROR: {model1_train_mae}")
    print(f"ROOT MEAN SQUARE ERROR: {model1_train_rmse}")
    print(f"R2 SCORE: {model1_train_R2_score}")
    print("\n")
    print("MODEL1 PERFORMANCE FOR TESTING SET")
    print(f"MEAN SQUARE ERROR: {model1_test_mse}")
    print(f"MEAN ABSOLUTE ERROR: {model1_test_mae}")
    print(f"ROOT MEAN SQUARE ERROR: {model1_test_rmse}")
    print(f"R2 SCORE: {model1_test_R2_score}")
    print("\n")
    print("MODEL2 PERFORMANCE FOR TRAINING SET")
    print(f"MEAN SQUARE ERROR: {model2_train_mse}")
    print(f"MEAN ABSOLUTE ERROR: {model2_train_mae}")
    print(f"ROOT MEAN SQUARE ERROR: {model2_train_rmse}")
    print(f"R2 SCORE: {model2_train_R2_score}")
    print("\n")
    print("MODEL2 PERFORMANCE FOR TESTING SET")
    print(f"MEAN SQUARE ERROR: {model2_test_mse}")
    print(f"MEAN ABSOLUTE ERROR: {model2_test_mae}")
    print(f"ROOT MEAN SQUARE ERROR: {model2_test_rmse}")
    print(f"R2 SCORE: {model2_test_R2_score}")
    print("\n")
    print("=======================================================")




Linear Regression
MODEL1 PERFORMANCE FOR TRAINING SET
MEAN SQUARE ERROR: 1322.1835612484663
MEAN ABSOLUTE ERROR: 29.621875136432646
ROOT MEAN SQUARE ERROR: 0.24611405417245136
R2 SCORE: 36.36184210471832


MODEL1 PERFORMANCE FOR TESTING SET
MEAN SQUARE ERROR: 1619.0864522661543
MEAN ABSOLUTE ERROR: 31.629421906103918
ROOT MEAN SQUARE ERROR: 0.1958971983337261
R2 SCORE: 40.237873356654354


MODEL2 PERFORMANCE FOR TRAINING SET
MEAN SQUARE ERROR: 1322.1835612484663
MEAN ABSOLUTE ERROR: 29.621875136432646
ROOT MEAN SQUARE ERROR: 0.24611405417245136
R2 SCORE: 36.36184210471832


MODEL2 PERFORMANCE FOR TESTING SET
MEAN SQUARE ERROR: 1619.0864522661543
MEAN ABSOLUTE ERROR: 31.629421906103918
ROOT MEAN SQUARE ERROR: 0.1958971983337261
R2 SCORE: 40.237873356654354


Lasso
MODEL1 PERFORMANCE FOR TRAINING SET
MEAN SQUARE ERROR: 1374.5577704269785
MEAN ABSOLUTE ERROR: 29.980511004727585
ROOT MEAN SQUARE ERROR: 0.21625119595764353
R2 SCORE: 37.07502893359597


MODEL1 PERFORMANCE FOR TESTING SET
MEA

In [21]:
pd.DataFrame(list(zip(model_list,r2_list)),columns=['model name','r2_score']).sort_values(by=['r2_score'],ascending=False)

Unnamed: 0,model name,r2_score
