In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Models

from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

In [3]:
df=pd.read_csv("data/new_Stud.csv")

In [4]:
df.columns

Index(['Unnamed: 0', 'gender', 'race_ethnicity', 'parental_level_of_education',
       'lunch', 'test_preparation_course', 'math_score', 'reading_score',
       'writing_score', 'total_score', 'average'],
      dtype='object')

In [5]:
y=df['average']

In [6]:
X=df.drop(["average","Unnamed: 0","total_score"],axis=1)             

In [7]:
X.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [9]:
num_features=X.select_dtypes(exclude="object").columns
cat_features=X.select_dtypes(include="object").columns

from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer

num_transformer=StandardScaler()
cat_transformer=OneHotEncoder()

## pipeline

preprocessor=ColumnTransformer(
    [
        ("OneHotEncoder",cat_transformer,cat_features),
        ("StandardScaler",num_transformer,num_features)
    ]
)

In [10]:
X_new=preprocessor.fit_transform(X)

In [11]:
X_new

array([[ 1.        ,  0.        ,  0.        , ...,  0.39002351,
         0.19399858,  0.39149181],
       [ 1.        ,  0.        ,  0.        , ...,  0.19207553,
         1.42747598,  1.31326868],
       [ 1.        ,  0.        ,  0.        , ...,  1.57771141,
         1.77010859,  1.64247471],
       ...,
       [ 1.        ,  0.        ,  0.        , ..., -0.46775108,
         0.12547206, -0.20107904],
       [ 1.        ,  0.        ,  0.        , ...,  0.12609287,
         0.60515772,  0.58901542],
       [ 1.        ,  0.        ,  0.        , ...,  0.71993682,
         1.15336989,  1.18158627]])

In [12]:
X_new.shape

(1000, 20)

In [13]:
X.shape

(1000, 8)

In [14]:
y

0      72.666667
1      82.333333
2      92.666667
3      49.333333
4      76.333333
         ...    
995    94.000000
996    57.333333
997    65.000000
998    74.333333
999    83.000000
Name: average, Length: 1000, dtype: float64

In [15]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X_new,y,test_size=0.2,random_state=42)

In [16]:
X_train.shape

(800, 20)

In [17]:
X_test.shape

(200, 20)

In [18]:
y_train

29     69.000000
535    77.333333
695    84.666667
557    64.666667
836    64.666667
         ...    
106    95.666667
270    64.333333
860    56.000000
435    50.333333
102    88.333333
Name: average, Length: 800, dtype: float64

In [19]:
def evaluate_model(true,pred):
    mae=mean_absolute_error(true,pred)
    mse=mean_squared_error(true,pred)
    rmse=np.sqrt(mse)
    r2=r2_score(true,pred)
    return mae,rmse,r2

In [20]:
models={
    "LinearRegression":LinearRegression(),
    "Lasso":Lasso(),
    "Ridge":Ridge(),
    "KNN":KNeighborsRegressor(),
    "Decision_Tree":DecisionTreeRegressor(),
    "Random":RandomForestRegressor(),
    "Boosting":XGBRegressor(),
    "catBoost":CatBoostRegressor(),
    "adaboost":AdaBoostRegressor()
}

model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)
    
    ## Make Prediction
    y_pred_train=model.predict(X_train)
    y_pred_test=model.predict(X_test)
    
    ## Evaluate
    
    mae_train,rmse_train,r2_train=evaluate_model(y_train,y_pred_train)
    mae_test,rmse_test,r2_test=evaluate_model(y_test,y_pred_test)
    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print("Model Performance of Training Set")
    print("1.RMSE -> {:.4f}".format(rmse_train))
    print("2.MAE -> {:.4f}".format(mae_train))
    print("3.R2 -> {:.4f}".format(r2_train))
    
    print("-"*35)
          
    print("Model Performance of Test Set")
    print("1.RMSE -> {:.4f}".format(rmse_test))
    print("2.MAE -> {:.4f}".format(mae_test))
    print("3.R2 -> {:.4f}".format(r2_test))
    
    r2_list.append(r2_test)
    
    print("="*35)
    print('\n')

LinearRegression
Model Performance of Training Set
1.RMSE -> 0.0000
2.MAE -> 0.0000
3.R2 -> 1.0000
-----------------------------------
Model Performance of Test Set
1.RMSE -> 0.0000
2.MAE -> 0.0000
3.R2 -> 1.0000


Lasso
Model Performance of Training Set
1.RMSE -> 1.0645
2.MAE -> 0.8506
3.R2 -> 0.9943
-----------------------------------
Model Performance of Test Set
1.RMSE -> 1.1142
2.MAE -> 0.8769
3.R2 -> 0.9942


Ridge
Model Performance of Training Set
1.RMSE -> 0.0080
2.MAE -> 0.0065
3.R2 -> 1.0000
-----------------------------------
Model Performance of Test Set
1.RMSE -> 0.0088
2.MAE -> 0.0067
3.R2 -> 1.0000


KNN
Model Performance of Training Set
1.RMSE -> 2.2573
2.MAE -> 1.7680
3.R2 -> 0.9745
-----------------------------------
Model Performance of Test Set
1.RMSE -> 3.0706
2.MAE -> 2.2833
3.R2 -> 0.9560


Decision_Tree
Model Performance of Training Set
1.RMSE -> 0.0000
2.MAE -> 0.0000
3.R2 -> 1.0000
-----------------------------------
Model Performance of Test Set
1.RMSE -> 1.8

In [47]:
pd.DataFrame(list(zip(model_list,r2_list)),columns=["Model_name","R2_score"]).sort_values(by=["R2_score"],ascending=False)

Unnamed: 0,Model_name,R2_score
0,LinearRegression,1.0
2,Ridge,1.0
6,Boosting,0.994748
1,Lasso,0.994209
5,Random,0.993989
7,catBoost,0.993447
4,Decision_Tree,0.984616
8,adaboost,0.977623
3,KNN,0.956016
