In [28]:
# Import module/s
from math import sqrt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from matplotlib.backends.backend_pdf import PdfPages
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from statsmodels.stats.diagnostic import het_white
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [29]:
# Load the admissions data (the first row of the file holds the column names)
# and replace every space in a column label with an underscore.
# NOTE(review): labels used later ('LOR_', 'Chance_of_Admit_') end in an
# underscore, presumably from trailing spaces in the CSV header - confirm.
model_data = pd.read_csv('Admission_Predict.csv', header=0).rename(
    columns=lambda label: label.replace(' ', '_')
)

In [30]:
# Final Model: GRE Score, TOEFL Score, LOR, CGPA, Research
# Excluded candidates: LN_GRE, LN_TOEFL, University_Rating, SOP
y_var = 'Chance_of_Admit_'
x_vars = ['GRE_Score', 'TOEFL_Score', 'LOR_', 'CGPA', 'Research']

# Dependent variable (y)
y_values = model_data[y_var]

# Independent variables (x) with a 'const' column of ones added, because
# sm.OLS does not include an intercept unless the design matrix carries one
x_values = sm.add_constant(model_data[x_vars])

x_values

Unnamed: 0,const,GRE_Score,TOEFL_Score,LOR_,CGPA,Research
0,1.0,337,118,4.5,9.65,1
1,1.0,324,107,4.5,8.87,1
2,1.0,316,104,3.5,8.00,1
3,1.0,322,110,2.5,8.67,1
4,1.0,314,103,3.0,8.21,0
...,...,...,...,...,...,...
395,1.0,324,110,3.5,9.04,1
396,1.0,325,107,3.5,9.11,1
397,1.0,330,116,4.5,9.45,1
398,1.0,312,103,4.0,8.78,0


In [31]:
# Hold out 30% of the rows for testing and train on the remaining 70%;
# the fixed random_state keeps the split reproducible between runs
x_train, x_test, y_train, y_test = train_test_split(
    x_values,
    y_values,
    test_size=0.3,
    random_state=1234,
)

# Ordinary least squares, estimated on the training rows only
model = sm.OLS(y_train, x_train).fit()

# Fitted values for the training rows and predictions for the held-out rows
y_pred_train, y_pred_test = model.predict(x_train), model.predict(x_test)

# Estimated coefficient (slope) of each predictor; the intercept 'const' is not in x_vars
m_value = model.params.loc[x_vars]

model.summary()

0,1,2,3
Dep. Variable:,Chance_of_Admit_,R-squared:,0.812
Model:,OLS,Adj. R-squared:,0.808
Method:,Least Squares,F-statistic:,236.0
Date:,"Mon, 13 Feb 2023",Prob (F-statistic):,4.4000000000000003e-97
Time:,16:55:48,Log-Likelihood:,384.32
No. Observations:,280,AIC:,-756.6
Df Residuals:,274,BIC:,-734.8
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.3014,0.140,-9.320,0.000,-1.576,-1.027
GRE_Score,0.0015,0.001,2.173,0.031,0.000,0.003
TOEFL_Score,0.0036,0.001,2.976,0.003,0.001,0.006
LOR_,0.0219,0.006,3.841,0.000,0.011,0.033
CGPA,0.1245,0.013,9.303,0.000,0.098,0.151
Research,0.0248,0.009,2.671,0.008,0.007,0.043

0,1,2,3
Omnibus:,55.941,Durbin-Watson:,2.034
Prob(Omnibus):,0.0,Jarque-Bera (JB):,104.283
Skew:,-1.055,Prob(JB):,2.2699999999999997e-23
Kurtosis:,5.119,Cond. No.,12700.0


Interpretation:

R-squared:	0.812   -> The model can explain 81.2% of the variation in the values of Chance_of_Admit.

Prob (F-statistic):	4.40e-97    -> significant


p-value for the t-tests
    const = 0.000
    GRE_Score = 0.031
    TOEFL_Score = 0.003
    LOR_ = 0.000
    CGPA = 0.000
    Research = 0.008

All p-values are below 0.05. There is a significant relationship between the predictors and the dependent variable.

In [32]:
# Test for multicollinearity
# Variance inflation factor (VIF) of every column of the design matrix.
# The constant stays in the matrix so that each predictor's VIF comes from an
# auxiliary regression that has an intercept; the (very large) value reported
# for 'const' itself is not a multicollinearity diagnostic.
design_matrix = x_values.values
vif = pd.DataFrame({
    "VIF Factor": [
        variance_inflation_factor(design_matrix, col_idx)
        for col_idx in range(design_matrix.shape[1])
    ],
    "Predictors": x_values.columns,
})

# Inspect VIF Factors. VIF should be <10
vif.round(1)

Unnamed: 0,VIF Factor,Predictors
0,1354.4,const
1,4.6,GRE_Score
2,4.1,TOEFL_Score
3,1.8,LOR_
4,4.8,CGPA
5,1.5,Research


Interpretation:

All predictors (x's) have VIF < 10. No multicollinearity issues.

In [33]:
# Test for Homoscedasticity
# White's test on the residuals of the fitted model. The null hypothesis is
# constant error variance, so a p-value >= 0.05 means there is no evidence of
# heteroskedasticity, while a p-value < 0.05 indicates heteroskedasticity.
white_test = het_white(model.resid, model.model.exog)

# het_white returns, in order: LM statistic, LM p-value, F statistic, F p-value
labels = ['Test Statistic', 'Test Statistic p-value', 'F-Statistic', 'F-Test p-value']

# Print each result next to its label
print(dict(zip(labels, white_test)))

{'Test Statistic': 33.19385383202266, 'Test Statistic p-value': 0.02283123542421026, 'F-Statistic': 1.8404391100860198, 'F-Test p-value': 0.019074334059984627}


Interpretation:

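The p-value (0.0228; F-test p-value 0.0191) is less than 0.05, so the null hypothesis of homoscedasticity is rejected: there is evidence that heteroskedasticity is present in the data. The coefficient standard errors and t-tests above should therefore be interpreted with caution (e.g. by using heteroskedasticity-robust standard errors).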

In [34]:
# Assumption of Independent Errors
# Durbin-Watson statistic of the model residuals: a value of 2, or close to
# it, is ideal.
residuals = model.resid
print(residuals)
sm.stats.stattools.durbin_watson(residuals)

217    0.023937
349    0.017035
188    0.049582
267    0.052179
222   -0.017834
         ...   
204    0.011507
53     0.039680
294   -0.035265
211   -0.021556
303   -0.008840
Length: 280, dtype: float64


2.034497860947805

Interpretation:

Value (2.0345) is close to 2. The residual error terms are independent of each other.

In [35]:
# Compute error metrics using the train dataset (in-sample fit)
rmse_train = sqrt(mean_squared_error(y_train, y_pred_train))
mape_train = mean_absolute_percentage_error(y_train, y_pred_train)
mae_train = mean_absolute_error(y_train, y_pred_train)

# Compute the same error metrics using the test dataset (held-out rows)
rmse_test = sqrt(mean_squared_error(y_test, y_pred_test))
mape_test = mean_absolute_percentage_error(y_test, y_pred_test)
mae_test = mean_absolute_error(y_test, y_pred_test)

# MAPE is returned as a fraction, so multiply by 100 to report it in percent.
# The second line reports the test-set value and is labelled accordingly.
print('MAPE (Train): ',mape_train*100)
print('MAPE (Test): ',mape_test*100)

MAPE (Train):  7.075864631499861
MAPE (Train):  7.941236025373702


Interpretation:

Mean Absolute Percentage Errors (MAPE) are low (below 10%).