In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [33]:
from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings


In [34]:
df=pd.read_csv('StudentsPerformance_updated.csv')
df.head(2)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,Total_score,AvgScore
0,female,group B,bachelor's degree,standard,none,72,72,74,218,72.666667
1,female,group C,some college,standard,completed,69,90,88,247,82.333333


In [35]:
df.drop(['math score','reading score','writing score'],axis=1,inplace=True)
df.head(2)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,Total_score,AvgScore
0,female,group B,bachelor's degree,standard,none,218,72.666667
1,female,group C,some college,standard,completed,247,82.333333


In [36]:
x=df.drop('AvgScore',axis=1)
y=df['AvgScore']

In [37]:
x.head(2)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,Total_score
0,female,group B,bachelor's degree,standard,none,218
1,female,group C,some college,standard,completed,247


In [38]:
y

0      72.666667
1      82.333333
2      92.666667
3      49.333333
4      76.333333
         ...    
995    94.000000
996    57.333333
997    65.000000
998    74.333333
999    83.000000
Name: AvgScore, Length: 1000, dtype: float64

In [39]:
num_values=x.select_dtypes(exclude='object').columns
cat_values=x.select_dtypes(include='object').columns

In [40]:
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
scaler=StandardScaler()
encoder=OneHotEncoder()

### creating a pipeline where the categorical values are converted to numerical _values and then the numerical values are scaled with standardScaler


In [41]:
preprocessor=ColumnTransformer(
    [
        ("OneHotEncoder",encoder,cat_values),
        ("StandardScaler",scaler,num_values)
    ]
)

In [42]:
# Apply the preprocessing pipeline
x = preprocessor.fit_transform(x)


In [44]:
x.shape

(1000, 18)

In [45]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.30,random_state=42)

In [47]:
x_train.shape

(700, 18)

In [None]:
def PerformanceCaluculation(test,pred):
    mae=mean_absolute_error(test,pred)
    mse=mean_squared_error(test,pred)
    rmse=np.sqrt(mse)
    r2Score=r2_score(test,pred)
    return mae,mse,rmse,r2Score

In [55]:
models={
    "LinearRegression":LinearRegression(),
    "Lasso":Lasso(),
    "Ridge":Ridge(),
    "KNeighborsRegressor":KNeighborsRegressor(),
    "DecisionTreeRegressor":DecisionTreeRegressor(),
    "XGBRegressor":XGBRegressor(),
    "CatBoostRegressor":CatBoostRegressor(),
    "AdaBoostRegressor":AdaBoostRegressor()
}
models.keys()


dict_keys(['LinearRegression', 'Lasso', 'Ridge', 'KNeighborsRegressor', 'DecisionTreeRegressor', 'XGBRegressor', 'CatBoostRegressor', 'AdaBoostRegressor'])

In [57]:
model_list=[]
r2_list=[]
for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(x_train,y_train)

    y_train_predict=model.predict(x_train)
    y_test_predict=model.predict(x_test)

    train_mae,train_mse,train_rmse,train_r2score=PerformanceCaluculation(y_train,y_train_predict)
    test_mae,test_mse,test_rmse,test_r2score=PerformanceCaluculation(y_test,y_test_predict)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print("Performance Matrix for Training data:")
    print("- Root Mean Sqaured Error: {:.4f}".format(train_rmse))
    print("- mean absolute error: {:.4f}".format(train_mae))
    print("- R2 score: {:.4f}".format(train_r2score))

    print("-----------------------------------------------------------------------")

    print("Performance Matrix for Test data:")
    print("- Root Mean Sqaured Error: {:.4f}".format(test_rmse))
    print("- mean absolute error: {:.4f}".format(test_mae))
    print("- R2 score: {:.4f}".format(test_r2score))
    r2_list.append(test_r2score)

    print('='*35)
    print('\n')







LinearRegression
Performance Matrix for Training data:
- Root Mean Sqaured Error: 0.0000
- mean absolute error: 0.0000
- R2 score: 1.0000
-----------------------------------------------------------------------
Performance Matrix for Test data:
- Root Mean Sqaured Error: 0.0000
- mean absolute error: 0.0000
- R2 score: 1.0000


Lasso
Performance Matrix for Training data:
- Root Mean Sqaured Error: 1.0277
- mean absolute error: 0.8232
- R2 score: 0.9945
-----------------------------------------------------------------------
Performance Matrix for Test data:
- Root Mean Sqaured Error: 1.1227
- mean absolute error: 0.8870
- R2 score: 0.9944


Ridge
Performance Matrix for Training data:
- Root Mean Sqaured Error: 0.0239
- mean absolute error: 0.0194
- R2 score: 1.0000
-----------------------------------------------------------------------
Performance Matrix for Test data:
- Root Mean Sqaured Error: 0.0262
- mean absolute error: 0.0209
- R2 score: 1.0000


KNeighborsRegressor
Performance Mat

In [58]:
pd.DataFrame(list(zip(model_list,r2_list)),columns=['Model Name','R2 Score']).sort_values(by=['R2 Score'],ascending=False)

Unnamed: 0,Model Name,R2 Score
0,LinearRegression,1.0
2,Ridge,0.999997
4,DecisionTreeRegressor,0.998408
5,XGBRegressor,0.997606
1,Lasso,0.994408
7,AdaBoostRegressor,0.9942
6,CatBoostRegressor,0.990644
3,KNeighborsRegressor,0.875596
