In [53]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as snb
import os

# modeling
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import warnings
warnings.filterwarnings('ignore')


In [54]:
df = pd.read_csv("students.csv")

In [55]:
df.head(5)

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [56]:
#independent feature (what we will use to predict)
y = df['math_score']
x= df.drop(columns=['math_score'], axis=1)

In [57]:
y.head(5)

0    72
1    69
2    90
3    47
4    76
Name: math_score, dtype: int64

In [58]:
x.head(5)

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75


In [59]:
#creat column transformer with 3 types of trnasformers

numerical_features = x.select_dtypes(exclude=['object']).columns
categorical_features = x.select_dtypes(include=['object']).columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

#initialize standard scalar and onehotencoder
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

#combine as pipeline 
preprocessor = ColumnTransformer(

    [
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)


In [60]:
#x.head(5)
X = preprocessor.fit_transform(x)

In [61]:
X.shape
X

array([[ 0.19399858,  0.39149181,  1.        , ...,  1.        ,
         0.        ,  1.        ],
       [ 1.42747598,  1.31326868,  1.        , ...,  1.        ,
         1.        ,  0.        ],
       [ 1.77010859,  1.64247471,  1.        , ...,  1.        ,
         0.        ,  1.        ],
       ...,
       [ 0.12547206, -0.20107904,  1.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.60515772,  0.58901542,  1.        , ...,  1.        ,
         1.        ,  0.        ],
       [ 1.15336989,  1.18158627,  1.        , ...,  0.        ,
         0.        ,  1.        ]])

In [62]:
#split X dataset into train, test sets
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((800, 19), (200, 19), (800,), (200,))

In [66]:
# create evaluation function to calculate rmse and r2 score
def evaluate_module(true, predected):
    mae = mean_absolute_error(true, predected)
    mse = mean_squared_error(true, predected)

    rmse = np.sqrt(mse)

    r2 = r2_score(true, predected)
    return mae, rmse, r2



In [67]:
models = {
    "Linear Regression":LinearRegression(),
    "Lasso":Lasso(),
    "Ridge":Ridge(),
    "K-Neighbours Regressor": KNeighborsRegressor(),
    "Decission Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(),
    "CatBoostRegressor": CatBoostRegressor(),
    "AddABoostRegressor": AdaBoostRegressor()
    }
model_list = []
r2_list = []

In [68]:
print(list(models.keys()))
for i in range(len(list(models))):
    print(list(models.keys())[i])

['Linear Regression', 'Lasso', 'Ridge', 'K-Neighbours Regressor', 'Decission Tree', 'Random Forest Regressor', 'XGBRegressor', 'CatBoostRegressor', 'AddABoostRegressor']
Linear Regression
Lasso
Ridge
K-Neighbours Regressor
Decission Tree
Random Forest Regressor
XGBRegressor
CatBoostRegressor
AddABoostRegressor


In [69]:
for i in range (len(list(models))):
    model = list(models.values())[i]
    model.fit(x_train, y_train) #train model
    #model prediction
    y_train_pred = model.predict(x_train)
    y_test_predict = model.predict(x_test)

    #evaluate train and test datasets
    model_train_mae, model_train_rmse, model_train_r2_error = evaluate_module(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2_error = evaluate_module(y_test, y_test_predict)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    r2_list.append(model_test_r2_error)


    print('model performance for training set')
    print('root mean square error:{:.4f}'.format(model_train_rmse))
    print('root abslute error:{:.4f}'.format(model_train_mae))
    print('model r2 error:{:.4f}'.format(model_train_r2_error))

    print("-------------------------------------------------")
    print("model performance for test set")
    print('root mean square error:{:.4f}'.format(model_test_rmse))
    print('root abslute error:{:.4f}'.format(model_test_mae))
    print('model r2 error:{:.4f}'.format(model_test_r2_error))

    print('='*35)
    print('\n')


    



Linear Regression
model performance for training set
root mean square error:5.3244
root abslute error:4.2691
model r2 error:0.8743
-------------------------------------------------
model performance for test set
root mean square error:5.3987
root abslute error:4.2186
model r2 error:0.8802


Lasso
model performance for training set
root mean square error:6.5938
root abslute error:5.2063
model r2 error:0.8071
-------------------------------------------------
model performance for test set
root mean square error:6.5197
root abslute error:5.1579
model r2 error:0.8253


Ridge
model performance for training set
root mean square error:5.3233
root abslute error:4.2650
model r2 error:0.8743
-------------------------------------------------
model performance for test set
root mean square error:5.3904
root abslute error:4.2111
model r2 error:0.8806


K-Neighbours Regressor
model performance for training set
root mean square error:5.7165
root abslute error:4.5215
model r2 error:0.8550
------------

In [71]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)
# linear regression achieved the lowest score 

Unnamed: 0,Model Name,R2_Score
2,Ridge,0.880593
0,Linear Regression,0.880223
8,AddABoostRegressor,0.852374
7,CatBoostRegressor,0.851632
5,Random Forest Regressor,0.849077
1,Lasso,0.82532
6,XGBRegressor,0.82122
3,K-Neighbours Regressor,0.78377
4,Decission Tree,0.752382


In [75]:
#how to use dataclass class
from dataclasses import dataclass

#@dataclass
class Fruit:
    def __init__(self, name: str, calories: int):
        self.name = name
        self.calories = calories
    # self.name = name
    # calories:int 
    def __str__(self):
        return f'{self.name}: {self.calories} calories'
    

apple = Fruit('Ali', 20)

print(apple)


Ali: 20 calories
