In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [6]:
# Import regression algorithms from scikit-learn
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.model_selection import RandomizedSearchCV

# Import evaluation metrics from scikit-learn
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [10]:
df = pd.read_csv('data/stud.csv')

In [11]:
x = df.drop(columns=['math_score'],axis=1)
y = df['math_score']

In [12]:
num_features = [features for features in df.columns if df[features].dtype !='O']
cat_features = [features for features in df.columns if df[features].dtype =='O']

print('numerical featues',len(num_features),num_features)
print('categorical featues',len(cat_features),cat_features)

numerical featues 3 ['math_score', 'reading_score', 'writing_score']
categorical featues 5 ['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course']


In [13]:
num_features = x.select_dtypes(exclude='object').columns
cat_features = x.select_dtypes(include='object').columns

from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
categoric_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder",categoric_transformer,cat_features),
        ("StandardScaler",numeric_transformer,num_features),
    ]
)

x = preprocessor.fit_transform(x)

In [14]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.33,random_state=42)

In [17]:
def evaluate(true,predicted):
    mse = mean_squared_error(true,predicted)
    mae = mean_absolute_error(true,predicted)
    rmse = np.sqrt(mse)
    r2 = r2_score(true,predicted)
    return mae,rmse,r2


In [23]:
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "Elastic Net": ElasticNet(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "AdaBoost": AdaBoostRegressor(),
    "SVM": SVR(),
    "KNN": KNeighborsRegressor(),
    "XGBoost": XGBRegressor(),
}
model_list = []
r2_list = []

for i in range(len(models)):
    model = list(models.values())[i]
    model.fit(x_train,y_train)
    
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    model_train_mae,model_train_rmse,model_train_r2 = evaluate(y_train,y_train_pred)

    model_test_mae,model_test_rmse,model_test_r2 = evaluate(y_test,y_test_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    r2_list.append(model_test_r2)

    print("model performance on training set")

    print("Training: MAE: {:.2f}, RMSE: {:.2f}, R2: {:.2f}".format(model_train_mae,model_train_rmse,model_train_r2))

    print("-"*50)

    print("model performance on test set")

    print("Test: MAE: {:.2f}, RMSE: {:.2f}, R2: {:.2f}".format(model_test_mae,model_test_rmse,model_test_r2))

    print("="*50)
    print("\n")




Linear Regression
model performance on training set
Training: MAE: 4.21, RMSE: 5.27, R2: 0.88
--------------------------------------------------
model performance on test set
Test: MAE: 4.38, RMSE: 5.50, R2: 0.88


Ridge Regression
model performance on training set
Training: MAE: 4.21, RMSE: 5.27, R2: 0.88
--------------------------------------------------
model performance on test set
Test: MAE: 4.37, RMSE: 5.50, R2: 0.88


Lasso Regression
model performance on training set
Training: MAE: 5.18, RMSE: 6.54, R2: 0.81
--------------------------------------------------
model performance on test set
Test: MAE: 5.35, RMSE: 6.81, R2: 0.81


Elastic Net
model performance on training set
Training: MAE: 6.11, RMSE: 7.69, R2: 0.73
--------------------------------------------------
model performance on test set
Test: MAE: 6.42, RMSE: 7.97, R2: 0.74


Decision Tree
model performance on training set
Training: MAE: 0.02, RMSE: 0.31, R2: 1.00
--------------------------------------------------
model p

In [21]:
model_list

['Linear Regression',
 'Ridge Regression',
 'Lasso Regression',
 'Elastic Net',
 'Decision Tree',
 'Random Forest',
 'Gradient Boosting',
 'AdaBoost',
 'SVM',
 'KNN',
 'XGBoost']

In [24]:
r2_list

[0.8753252282107616,
 0.8756024232806043,
 0.8091873451472333,
 0.7383422365076477,
 0.7087688473377418,
 0.8479939219423879,
 0.8671864311761569,
 0.8358749028768204,
 0.7261533399139886,
 0.775936396012358,
 0.8258627653121948]

In [25]:
pd.DataFrame(list(zip(model_list,r2_list)),columns=['Model','R2 Score']).sort_values(by='R2 Score',ascending=False)

Unnamed: 0,Model,R2 Score
1,Ridge Regression,0.875602
0,Linear Regression,0.875325
6,Gradient Boosting,0.867186
5,Random Forest,0.847994
7,AdaBoost,0.835875
10,XGBoost,0.825863
2,Lasso Regression,0.809187
9,KNN,0.775936
3,Elastic Net,0.738342
8,SVM,0.726153
