In [1]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# For Modelling
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.metrics import mean_absolute_error
import warnings

In [3]:
df = pd.read_csv("data/stud.csv")
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [4]:
df.isna().sum()

gender                         0
race_ethnicity                 0
parental_level_of_education    0
lunch                          0
test_preparation_course        0
math_score                     0
reading_score                  0
writing_score                  0
dtype: int64

In [5]:
for label, content in df.items():
    if not pd.api.types.is_numeric_dtype(content):
        print(label)

gender
race_ethnicity
parental_level_of_education
lunch
test_preparation_course


In [6]:
for label, content in df.items():
    if pd.api.types.is_numeric_dtype(content):
        if pd.isnull(content).sum():
            print(label)
# no null values in numeric dtype features


In [7]:
for label,content in df.items():
    if not pd.api.types.is_numeric_dtype(content):
        print(label)

gender
race_ethnicity
parental_level_of_education
lunch
test_preparation_course


In [8]:
# Turn categorical features 
x = df.drop('math_score',axis=1)
y = df['math_score']

In [83]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer

# cat_imputer = SimpleImputer(strategy ='constant',
#                            fill_value='missing')


# cat_features = ['gender','race_ethnicity','parental_level_of_education','lunch','test_preparation_course']

# imputer = ColumnTransformer([
#     ("cat_imputer", cat_imputer,cat_features)
# ])

# filled_x = imputer.fit_transform(x)
# filled_x

In [84]:
num_features = x.select_dtypes(exclude='object').columns
cat_features = x.select_dtypes(include = 'object').columns

one_hot = OneHotEncoder()
std = StandardScaler()

preprocessor = ColumnTransformer([
    ('OneHotEncoder',one_hot,cat_features),
    ('StandardScaler',std,num_features)
    ])

transformed_x = preprocessor.fit_transform(df)
transformed_x

array([[ 1.        ,  0.        ,  0.        , ...,  1.        ,
         0.19399858,  0.39149181],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         1.42747598,  1.31326868],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         1.77010859,  1.64247471],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.12547206, -0.20107904],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.60515772,  0.58901542],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         1.15336989,  1.18158627]])

In [85]:
x_train,x_test,y_train,y_test = train_test_split(transformed_x,y,test_size=0.2, random_state=42)

In [86]:
def evaluate_model(y_test,y_preds):
    MAE = mean_absolute_error(y_test,y_preds)
    R2 = r2_score(y_test,y_preds)
    MSE = mean_squared_error(y_test,y_preds)
    RMSE = np.sqrt( mean_squared_error(y_test,y_preds))
    model_scores = [
        print(f"mae is {MAE}"),
        print(f'r2 is {R2}'),
        print(f'mse is {MSE}'),
        print(f"rmse is {RMSE}")
    ]
    return model_scores

In [87]:
def training_model(x_train,y_train):
    models = [RandomForestRegressor(),
             KNeighborsRegressor(),
             LinearRegression(),
             Ridge(),
             Lasso(),
             CatBoostRegressor(verbose = False),
             XGBRegressor(verbose=False),
             DecisionTreeRegressor(),
             SVR(),
             AdaBoostRegressor()]
    for model in models:
        if model.fit(x_train,y_train):
            y_preds = model.predict(x_test)
            print(model)
            evaluation=evaluate_model(y_test,y_preds)
            print('---------------------')
        
    return evaluation

In [88]:
training_model(x_train,y_train)

RandomForestRegressor()
mae is 4.626829166666666
r2 is 0.8542571297231208
mse is 35.46483686458333
rmse is 5.955236088064296
---------------------
KNeighborsRegressor()
mae is 5.621
r2 is 0.7838129945787431
mse is 52.6066
rmse is 7.253040741647602
---------------------
LinearRegression()
mae is 4.2158203125
r2 is 0.8803449074540941
mse is 29.116678771972655
rmse is 5.3959872842671395
---------------------
Ridge()
mae is 4.2111006880142625
r2 is 0.8805931485028737
mse is 29.056272192348324
rmse is 5.390387016935642
---------------------
Lasso()
mae is 5.157881810347763
r2 is 0.8253197323627853
mse is 42.5064168384116
rmse is 6.519694535667419
---------------------
<catboost.core.CatBoostRegressor object at 0x000001F6217EB790>
mae is 4.612531714976557
r2 is 0.8516318920747058
mse is 36.10365799356841
rmse is 6.008631956907363
---------------------
Parameters: { "verbose" } are not used.

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, cols

[None, None, None, None]

In [89]:
model1 =  LinearRegression()
model1.fit(x_train,y_train)

In [93]:
model1.score(x_test,y_test)

0.8803449074540941

In [95]:
import logging

In [97]:
logging.basicConfig(filename='logs',
                   filemode="w",
                   level=logging.INFO)