In [21]:
# Import module/s
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from math import sqrt
import statsmodels.api as sm

In [22]:
# Load the admissions dataset; row 0 of the CSV supplies the column names
model_data = pd.read_csv('Admission_Predict.csv', header=0)

# Normalise the column names: every space becomes an underscore.
# A trailing space in a header therefore turns into a trailing underscore,
# which appears to be why later cells use names such as 'LOR_' and 'Chance_of_Admit_'.
model_data.columns = [column_name.replace(' ', '_') for column_name in model_data.columns]

In [23]:
# Final model specification: GRE Score, TOEFL Score, LOR and CGPA as predictors
y_var = 'Chance_of_Admit_'
x_vars = ['GRE_Score', 'TOEFL_Score', 'LOR_', 'CGPA']
# Candidates currently left out of x_vars:
# LN_GRE, LN_TOEFL, University_Rating, SOP, Research

# Dependent variable (y): the column named by y_var
y_values = model_data[y_var]

# Independent variables (x): the columns named in x_vars, with a constant
# column added so the fitted regression includes an intercept term
x_values = sm.add_constant(model_data[x_vars])

x_values

Unnamed: 0,const,GRE_Score,TOEFL_Score,LOR_,CGPA
0,1.0,337,118,4.5,9.65
1,1.0,324,107,4.5,8.87
2,1.0,316,104,3.5,8.00
3,1.0,322,110,2.5,8.67
4,1.0,314,103,3.0,8.21
...,...,...,...,...,...
395,1.0,324,110,3.5,9.04
396,1.0,325,107,3.5,9.11
397,1.0,330,116,4.5,9.45
398,1.0,312,103,4.0,8.78


In [24]:
# Hold out 30% of the rows for testing and train on the remaining 70%;
# random_state is fixed so the same split is produced on every run
x_train, x_test, y_train, y_test = train_test_split(
    x_values,
    y_values,
    test_size=0.3,
    random_state=1234,
)

# Fit an ordinary least squares regression on the training rows only
model = sm.OLS(endog=y_train, exog=x_train).fit()

# Model predictions of y for the training rows and for the held-out testing rows
y_pred_train, y_pred_test = model.predict(x_train), model.predict(x_test)

# Slope coefficients (the m in y = mx + b), one per predictor listed in x_vars;
# the intercept ('const') is not part of x_vars and is therefore excluded
m_value = model.params.loc[x_vars]

model.summary()

0,1,2,3
Dep. Variable:,Chance_of_Admit_,R-squared:,0.807
Model:,OLS,Adj. R-squared:,0.804
Method:,Least Squares,F-statistic:,286.9
Date:,"Mon, 13 Feb 2023",Prob (F-statistic):,8.18e-97
Time:,15:07:07,Log-Likelihood:,380.72
No. Observations:,280,AIC:,-751.4
Df Residuals:,275,BIC:,-733.3
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.4693,0.126,-11.653,0.000,-1.718,-1.221
GRE_Score,0.0021,0.001,3.140,0.002,0.001,0.003
TOEFL_Score,0.0034,0.001,2.840,0.005,0.001,0.006
LOR_,0.0236,0.006,4.123,0.000,0.012,0.035
CGPA,0.1248,0.014,9.228,0.000,0.098,0.151

0,1,2,3
Omnibus:,60.232,Durbin-Watson:,2.036
Prob(Omnibus):,0.0,Jarque-Bera (JB):,120.188
Skew:,-1.098,Prob(JB):,7.970000000000001e-27
Kurtosis:,5.341,Cond. No.,11300.0


Interpretation:

R-squared:	0.807   -> The model can explain 80.7% of the variation in the values of Chance_of_Admit.

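Prob (F-statistic):	8.18e-97    -> Far below 0.05, so the regression as a whole is statistically significant (at least one predictor has a non-zero coefficient).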


p-value for the t-tests
    const = 0.000
    GRE_Score = 0.002
    TOEFL_Score = 0.005
    LOR_ = 0.000
    CGPA = 0.000

All p-values are below 0.05, so every coefficient is individually significant: each predictor has a statistically significant relationship with the dependent variable.

In [25]:
# Test for multicollinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Build the VIF table in a single step: one row per column of x_values
# (the 'const' column included), pairing each column's VIF with its name
vif = pd.DataFrame({
    "VIF Factor": [
        variance_inflation_factor(x_values.values, position)
        for position, _ in enumerate(x_values.columns)
    ],
    "Predictors": x_values.columns,
})

# Inspect the VIF factors, rounded to one decimal place. VIF should be <10
vif.round(1)

Unnamed: 0,VIF Factor,Predictors
0,1077.3,const
1,4.2,GRE_Score
2,4.1,TOEFL_Score
3,1.8,LOR_
4,4.8,CGPA


Interpretation:

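All predictors (x's) have VIF < 10, so there are no multicollinearity issues. The large VIF reported for const belongs to the intercept term, which is not a predictor, and is ignored.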

In [26]:
# Test for Homoscedasticity
# White's test on the fitted model. p-value should be >= 0.05
from statsmodels.stats.diagnostic import het_white

# Names for the four values returned by het_white, in the order they are returned
labels = ['Test Statistic', 'Test Statistic p-value', 'F-Statistic', 'F-Test p-value']

# Run the test on the model's residuals against its design matrix
white_test = het_white(model.resid, model.model.exog)

# Show each result next to its label
print({label: value for label, value in zip(labels, white_test)})

{'Test Statistic': 21.571992734494398, 'Test Statistic p-value': 0.08784708876377695, 'F-Statistic': 1.5800416125640413, 'F-Test p-value': 0.08452839082423688}


Interpretation:

The p-value (0.0878) is not less than 0.05. There is not sufficient evidence to conclude that heteroskedasticity is present, so the homoscedasticity assumption is retained.

In [27]:
# Assumption of Independent Errors
# Checked with the Durbin-Watson statistic on the model residuals;
# a value of 2, or close to it, is ideal.
residuals = model.resid

# Show the residuals, then leave the statistic as the cell's displayed result
print(residuals)
sm.stats.stattools.durbin_watson(residuals)

217    0.031221
349    0.005329
188    0.052519
267    0.064531
222   -0.036459
         ...   
204    0.007276
53     0.048519
294   -0.022347
211   -0.018276
303   -0.001932
Length: 280, dtype: float64


2.036016702634327

Interpretation:

The Durbin-Watson statistic (2.0360) is close to 2, so there is no evidence of autocorrelation in the residuals; the error terms can be treated as independent of each other.

In [35]:
# Compute error metrics (RMSE, MAPE, MAE) on the training dataset
rmse_train = sqrt(mean_squared_error(y_train, y_pred_train))
mape_train = mean_absolute_percentage_error(y_train, y_pred_train)
mae_train = mean_absolute_error(y_train, y_pred_train)

# Compute the same error metrics on the held-out testing dataset
rmse_test = sqrt(mean_squared_error(y_test, y_pred_test))
mape_test = mean_absolute_percentage_error(y_test, y_pred_test)
mae_test = mean_absolute_error(y_test, y_pred_test)

# MAPE is returned as a fraction, so multiply by 100 to report it as a percentage.
# The second line reports the testing-set value and is labelled accordingly
# (it was previously mislabelled as "Train").
print('MAPE (Train): ',mape_train*100)
print('MAPE (Test): ',mape_test*100)

MAPE (Train):  7.184923961652341
MAPE (Train):  7.834492108967156


Interpretation:

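The Mean Absolute Percentage Errors (MAPE) are low (below 10%) on both the training dataset (7.18%) and the testing dataset (7.83%); the small gap between the two suggests the model is not overfitting.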