In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [12]:
from sklearn.model_selection  import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics  import  mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model  import  LinearRegression, Ridge, Lasso
from sklearn.tree  import  DecisionTreeRegressor
from sklearn.ensemble  import  RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm  import  SVR
from sklearn.neighbors  import  KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [4]:
data = pd.read_csv('data/stud.csv')
data.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [5]:
x = data.drop(columns = ['math_score'], axis = 1)
y = data['math_score']

num_features = x.select_dtypes(exclude = ['object']).columns
cat_features = x.select_dtypes(include = ['object']).columns

num_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("standardscaler", num_transformer, num_features),
        ("onehotencoder", oh_transformer, cat_features)
    ]
)

preprocessor

In [6]:
x_preprocessed = preprocessor.fit_transform(x)
x_preprocessed[1]

array([1.42747598, 1.31326868, 1.        , 0.        , 0.        ,
       0.        , 1.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 1.        , 0.        ,
       0.        , 1.        , 1.        , 0.        ])

In [7]:
print(data.shape)
print(x.shape)
print(x_preprocessed.shape)
print(y.shape)

(1000, 8)
(1000, 7)
(1000, 19)
(1000,)


In [8]:
x_train, x_test, y_train, y_test = train_test_split(x_preprocessed, y, test_size = 0.2, random_state = 43)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((800, 19), (200, 19), (800,), (200,))

In [9]:
def model_evaluation_score(true, predicted):
    mse = mean_squared_error(true, predicted)
    mae = mean_absolute_error(true, predicted)
    rmse = np.sqrt(mse)
    r2 = r2_score(true, predicted)
    return mae, rmse, r2

In [18]:
models = {
    "Linear Regression" : LinearRegression(),
    "Lasso" : Lasso(),
    "Ridge" : Ridge(),
    "K-Neareset Neighbors" : KNeighborsRegressor(),
    "Decision Tree" : DecisionTreeRegressor(),
    "Random Forest" : RandomForestRegressor(),
    "Gradient Boosting" : GradientBoostingRegressor(),
    "Support Vector Machine" : SVR(),
    "AdaBoost Regressor" : AdaBoostRegressor()
}

model_list = []
r2 = []

for i in range(len(models)):
    model = list(models.values())[i]
    model.fit(x_train, y_train)
    
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)
    
    train_mae, train_rmse, train_r2 = model_evaluation_score(y_train, y_train_pred)
    test_mae, test_rmse, test_r2 = model_evaluation_score(y_test, y_test_pred)
    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print("PREDICTION FOR TRAIN DATA")
    print("MAE: {:.4f}", train_mae)
    print("RMSE: {:.4f}", train_rmse)
    print("R2: {:.4f}", train_r2)
    
    print('\n')
    
    print("PREDICTION FOR TEST DATA")
    print("MAE: {:.4f}", test_mae)
    print("RMSE: {:.4f}", test_rmse)
    print("R2: {:.4f}", test_r2)
    
    r2.append(test_r2)
    print('\n' + '-'*50 + '\n')

Linear Regression
PREDICTION FOR TRAIN DATA
MAE: {:.4f} 4.223669212026629
RMSE: {:.4f} 5.31104495850797
R2: {:.4f} 0.8773599321481419


PREDICTION FOR TEST DATA
MAE: {:.4f} 4.350269129511031
RMSE: {:.4f} 5.398628193862824
R2: {:.4f} 0.8703918752942987

--------------------------------------------------

Lasso
PREDICTION FOR TRAIN DATA
MAE: {:.4f} 5.131455041851829
RMSE: {:.4f} 6.541538656090209
R2: {:.4f} 0.8139488962528082


PREDICTION FOR TEST DATA
MAE: {:.4f} 5.524797826756963
RMSE: {:.4f} 6.843618129877163
R2: {:.4f} 0.7917251039811901

--------------------------------------------------

Ridge
PREDICTION FOR TRAIN DATA
MAE: {:.4f} 4.222963892769963
RMSE: {:.4f} 5.311265883015809
R2: {:.4f} 0.8773497289733


PREDICTION FOR TEST DATA
MAE: {:.4f} 4.351644940211582
RMSE: {:.4f} 5.401384028007416
R2: {:.4f} 0.8702595195746656

--------------------------------------------------

K-Neareset Neighbors
PREDICTION FOR TRAIN DATA
MAE: {:.4f} 4.7852500000000004
RMSE: {:.4f} 6.056694643120124
R

In [26]:
pd.DataFrame(list(zip(model_list, r2)), columns = ["Models", "R2 Score"]).sort_values(by = ["R2 Score"], ascending = False)

Unnamed: 0,Models,R2 Score
0,Linear Regression,0.870392
2,Ridge,0.87026
6,Gradient Boosting,0.85258
5,Random Forest,0.846915
8,AdaBoost Regressor,0.829545
7,Support Vector Machine,0.793198
1,Lasso,0.791725
3,K-Neareset Neighbors,0.790061
4,Decision Tree,0.76182
