In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.model_selection import KFold,cross_val_score
import warnings
warnings.simplefilter(action='ignore')

In [4]:
df = pd.read_csv('Port_preprocessed.csv')

In [5]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
def calc_vif(X):

    # Calculating VIF
    vif = pd.DataFrame()
    vif["variables"] = X.columns
    vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

    return(vif)

In [28]:
X = df.iloc[:,:-1]
calc_vif(X)

Unnamed: 0,variables,VIF
0,Medu,14.271689
1,Fedu,10.661153
2,studytime,2.633045
3,famrel,9.63937
4,school_GP,3.84659
5,sex_F,2.781735
6,address_U,3.988242
7,famsize_LE3,1.566942
8,Pstatus_T,8.205285
9,Mjob_health,1.433492


In [30]:
var_inf= pd.DataFrame(calc_vif(X))

In [31]:
drop_cols = var_inf.loc[var_inf['VIF']>7]['variables']
t=list(drop_cols)

In [32]:
df1 = df.drop([t[0],t[1],t[2],t[3],t[4],t[5]],axis = 1)
df1.head()

Unnamed: 0,studytime,school_GP,sex_F,address_U,famsize_LE3,Mjob_health,Mjob_services,Mjob_teacher,Fjob_health,Fjob_teacher,reason_home,reason_reputation,guardian_father,famsup_yes,activities_yes,nursery_yes,higher_yes,internet_yes,romantic_no,Grade
0,0.333333,1,1,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,1,7.333333
1,0.333333,1,1,1,0,0,0,0,0,0,0,0,1,1,0,0,1,1,1,10.333333
2,0.333333,1,1,1,1,0,0,0,0,0,0,0,0,0,0,1,1,1,1,12.333333
3,0.666667,1,1,1,0,1,0,0,0,0,1,0,0,1,1,1,1,1,0,14.0
4,0.333333,1,1,1,0,0,0,0,0,0,1,0,1,1,0,1,1,0,1,12.333333


In [33]:
X = df1.iloc[:,:-1]
calc_vif(X)

Unnamed: 0,variables,VIF
0,studytime,2.618255
1,school_GP,3.752235
2,sex_F,2.613552
3,address_U,3.832599
4,famsize_LE3,1.457318
5,Mjob_health,1.266234
6,Mjob_services,1.485736
7,Mjob_teacher,1.437371
8,Fjob_health,1.142836
9,Fjob_teacher,1.176902


In [20]:
#Identify input and target columns
input_cols,target_cols=df1.columns[:-1],df1.columns[-1]
input_df,targets=df1[input_cols].copy(),df1[target_cols].copy()

In [6]:
#X_train,X_test,y_train,y_test=train_test_split(input_df,targets,test_size=0.10,random_state=42)

In [21]:
#y_train.mean()
#Linear Regression
#from sklearn.metrics import r2_score
scores= [] #to store r squared
mse_list= [] #to store RMSE
mae_list = []
#r2 = []
model1 = LinearRegression()
cv = KFold(n_splits=4, random_state=42, shuffle=True)
for train_index, test_index in cv.split(input_df):
    X_train, X_test, y_train, y_test = input_df.loc[train_index], input_df.loc[test_index], targets.loc[train_index], targets.loc[test_index]
    model1.fit(X_train, y_train)
    y_predict = model1.predict(X_test)
    scores.append(model1.score(X_test, y_test))
    #r2.append(r2_score(y_test, y_predict))
    mse_fold = mean_squared_error(y_test, y_predict)
    mse_list.append(mse_fold)
    mae_fold = mean_absolute_error(y_test, y_predict)
    mae_list.append(mae_fold)
print('MSE',np.mean(mse_list))
print('MAE',np.mean(mae_list))
print('R-squared',np.mean(scores))

MSE 6.829265287575714
MAE 2.0201961302216125
R-squared 0.12729122524411587


In [22]:
#SVR
scores= [] #to store r squared
mse_list= [] #to store RMSE
mae_list = []
model2 = LinearSVR()
cv = KFold(n_splits=5, random_state=42, shuffle=True)
for train_index, test_index in cv.split(input_df):
    X_train, X_test, y_train, y_test = input_df.loc[train_index], input_df.loc[test_index], targets.loc[train_index], targets.loc[test_index]
    model2.fit(X_train, y_train)
    y_predict = model2.predict(X_test)
    scores.append(model2.score(X_test, y_test))
    mse_fold = mean_squared_error(y_test, y_predict)
    mse_list.append(mse_fold)
    mae_fold = mean_absolute_error(y_test, y_predict)
    mae_list.append(mae_fold)
print('MSE',np.mean(mse_list))
print('MAE',np.mean(mae_list))
print('R-squared',np.mean(scores))

MSE 6.790504748656889
MAE 2.0148024955929347
R-squared 0.13311114200154853


In [23]:
scores= [] #to store r squared
mse_list= [] #to store RMSE
mae_list = []
model4=DecisionTreeRegressor()
cv = KFold(n_splits=5, random_state=42, shuffle=True)
for train_index, test_index in cv.split(input_df):
    X_train, X_test, y_train, y_test = input_df.loc[train_index], input_df.loc[test_index], targets.loc[train_index], targets.loc[test_index]
    model4.fit(X_train, y_train)
    y_predict = model4.predict(X_test)
    scores.append(model4.score(X_test, y_test))
    mse_fold = mean_squared_error(y_test, y_predict)
    mse_list.append(mse_fold)
    mae_fold = mean_absolute_error(y_test, y_predict)
    mae_list.append(mae_fold)
print('MSE',np.mean(mse_list))
print('MAE',np.mean(mae_list))
print('R-squared',np.mean(scores))

MSE 13.932073257030803
MAE 2.853832240216285
R-squared -0.7968622017851993


In [24]:
scores= [] #to store r squared
mse_list= [] #to store RMSE
mae_list = []
model5=XGBRegressor()
cv = KFold(n_splits=5, random_state=42, shuffle=True)
for train_index, test_index in cv.split(input_df):
    X_train, X_test, y_train, y_test = input_df.loc[train_index], input_df.loc[test_index], targets.loc[train_index], targets.loc[test_index]
    model5.fit(X_train, y_train)
    y_predict = model5.predict(X_test)
    scores.append(model5.score(X_test, y_test))
    mse_fold = mean_squared_error(y_test, y_predict)
    mse_list.append(mse_fold)
    mae_fold = mean_absolute_error(y_test, y_predict)
    mae_list.append(mae_fold)
print('MSE',np.mean(mse_list))
print('MAE',np.mean(mae_list))
print('R-squared',np.mean(scores))

MSE 10.172614237410128
MAE 2.5016240450619796
R-squared -0.30150172375333123


In [25]:
scores= [] #to store r squared
mse_list= [] #to store RMSE
mae_list = []
model6=RandomForestRegressor()
cv = KFold(n_splits=5, random_state=42, shuffle=True)
for train_index, test_index in cv.split(input_df):
    X_train, X_test, y_train, y_test = input_df.loc[train_index], input_df.loc[test_index], targets.loc[train_index], targets.loc[test_index]
    model6.fit(X_train, y_train)
    y_predict = model6.predict(X_test)
    scores.append(model6.score(X_test, y_test))
    mse_fold = mean_squared_error(y_test, y_predict)
    mse_list.append(mse_fold)
    mae_fold = mean_absolute_error(y_test, y_predict)
    mae_list.append(mae_fold)
print('MSE',np.mean(mse_list))
print('MAE',np.mean(mae_list))
print('R-squared',np.mean(scores))

MSE 7.703926050795182
MAE 2.135062378057745
R-squared 0.02254130459695771


In [26]:
scores= [] #to store r squared
mse_list= [] #to store RMSE
mae_list = []
model7=AdaBoostRegressor()
cv = KFold(n_splits=5, random_state=42, shuffle=True)
for train_index, test_index in cv.split(input_df):
    X_train, X_test, y_train, y_test = input_df.loc[train_index], input_df.loc[test_index], targets.loc[train_index], targets.loc[test_index]
    model7.fit(X_train, y_train)
    y_predict = model7.predict(X_test)
    scores.append(model7.score(X_test, y_test))
    mse_fold = mean_squared_error(y_test, y_predict)
    mse_list.append(mse_fold)
    mae_fold = mean_absolute_error(y_test, y_predict)
    mae_list.append(mae_fold)
print('MSE',np.mean(mse_list))
print('MAE',np.mean(mae_list))
print('R-squared',np.mean(scores))

MSE 7.4205302133839455
MAE 2.1042854174078505
R-squared 0.055951367944390776


In [27]:
scores= [] #to store r squared
mse_list= [] #to store RMSE
mae_list = []
model8=GradientBoostingRegressor()
cv = KFold(n_splits=20, random_state=42, shuffle=True)
for train_index, test_index in cv.split(input_df):
    X_train, X_test, y_train, y_test = input_df.loc[train_index], input_df.loc[test_index], targets.loc[train_index], targets.loc[test_index]
    model8.fit(X_train, y_train)
    y_predict = model8.predict(X_test)
    scores.append(model8.score(X_test, y_test))
    mse_fold = mean_squared_error(y_test, y_predict)
    mse_list.append(mse_fold)
    mae_fold = mean_absolute_error(y_test, y_predict)
    mae_list.append(mae_fold)
print('MSE',np.mean(mse_list))
print('MAE',np.mean(mae_list))
print('R-squared',np.mean(scores))

MSE 7.308372292178989
MAE 2.058145996940518
R-squared 0.05350798579171974


Support Vector regressor performs the best for this dataset.