In [27]:
#basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#modelling
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error,r2_score
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')



In [13]:
df=pd.read_csv('data/stud.csv')

In [14]:
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


preparing X and Y

In [15]:
x=df.drop(columns=['math_score'],axis=1)

In [16]:
y=df['math_score']

In [17]:
y

0      72
1      69
2      90
3      47
4      76
       ..
995    88
996    62
997    59
998    68
999    77
Name: math_score, Length: 1000, dtype: int64

In [18]:
##exploring data
print("categories in 'gender' variable:   ",end="")
print(df['gender'].unique())
print("categories in 'race/ethnicity' variable:   ",end="")
print(df['race_ethnicity'].unique())
print("categories in 'parental level of education' variable:   ",end="")
print(df['parental_level_of_education'].unique())
print("categories in 'lunch' variable:   ",end="")
print(df['lunch'].unique())
print("categories in 'test_preparation_course' variable:   ",end="")
print(df['test_preparation_course'].unique())

categories in 'gender' variable:   ['female' 'male']
categories in 'race/ethnicity' variable:   ['group B' 'group C' 'group A' 'group D' 'group E']
categories in 'parental level of education' variable:   ["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']
categories in 'lunch' variable:   ['standard' 'free/reduced']
categories in 'test_preparation_course' variable:   ['none' 'completed']


In [19]:
#create column transformer with 3 types of transformer
num_features=x.select_dtypes(exclude="object").columns
cat_features=x.select_dtypes(include="object").columns

from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer([
    ("onehotencoder",oh_transformer,cat_features),
    ("standardscaler",numeric_transformer,num_features),
])

In [20]:
x=preprocessor.fit_transform(x)

In [21]:
x

array([[ 1.        ,  0.        ,  0.        , ...,  1.        ,
         0.19399858,  0.39149181],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         1.42747598,  1.31326868],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         1.77010859,  1.64247471],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.12547206, -0.20107904],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.60515772,  0.58901542],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         1.15336989,  1.18158627]], shape=(1000, 19))

In [22]:
x.shape

(1000, 19)

In [26]:
#train test split

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)
x_train.shape,x_test.shape

((800, 19), (200, 19))

In [45]:
#create and evaluate function to give all metics after model training
def evaluate_model(true,predicted):
    mae=mean_absolute_error(true,predicted)
    mse=mean_squared_error(true,predicted)
    rsme=np.sqrt(mean_squared_error(true,predicted))
    r2_sco=r2_score(true,predicted)
    return mae,mse,rsme,r2_score

In [50]:
models={
    "linear regression":LinearRegression(),
    "ridge regression":Ridge(),
    "lasso regression":Lasso(),
    "knn regression":KNeighborsRegressor(),

    "random forest":RandomForestRegressor(),
    "decision tree":DecisionTreeRegressor(),
    "XGBoost":XGBRegressor(),
    "catboost":CatBoostRegressor(),
    "AdaBoost":AdaBoostRegressor()


}
model_lst=[]
r2_lst=[]
for i in range(len(models)):
    model=list(models.values())[i]
    model.fit(x_train,y_train)

    #make predictions
    y_train_pred=model.predict(x_train)
    y_test_pred=model.predict(x_test)

    #evaluate train and test dataset
    model_train_mae,model_train_mse,model_train_rmse,model_train_r2=evaluate_model(y_train,y_train_pred)
    model_test_mae,model_test_mse,model_test_rmse,model_test_r2=evaluate_model(y_test,y_test_pred)

    print(list(models.keys())[i])
    model_lst.append(list(models.keys())[i])
    print("model performance for training set")
    print("-root mean squared error: {:.4f} ",format(model_train_rmse))
    print("-mean absolute error: {:.4f} ",format(model_train_mae))
    print("-r2 score: {:.4f} ",format(model_test_r2))

    print('--------------------------------------------')

    print("model performance for testing set")
    print("-root mean squared error: {:.4f} ",format(model_test_rmse))
    print("-mean absolute error: {:.4f} ",format(model_test_mae))
    print("-r2 score: {:.4f} ",format(model_test_r2))
    r2_lst.append(model_test_r2)

    print('='*35)
    print('\n')

linear regression
model performance for training set
-root mean squared error: {:.4f}  5.323050852720514
-mean absolute error: {:.4f}  4.266711846071957
-r2 score: {:.4f}  <function r2_score at 0x000002912E1A0CC0>
--------------------------------------------
model performance for testing set
-root mean squared error: {:.4f}  5.393993869732843
-mean absolute error: {:.4f}  4.21476314247485
-r2 score: {:.4f}  <function r2_score at 0x000002912E1A0CC0>


ridge regression
model performance for training set
-root mean squared error: {:.4f}  5.323324922741654
-mean absolute error: {:.4f}  4.264987823725981
-r2 score: {:.4f}  <function r2_score at 0x000002912E1A0CC0>
--------------------------------------------
model performance for testing set
-root mean squared error: {:.4f}  5.390387016935642
-mean absolute error: {:.4f}  4.211100688014261
-r2 score: {:.4f}  <function r2_score at 0x000002912E1A0CC0>


lasso regression
model performance for training set
-root mean squared error: {:.4f}  6.59