In [3]:
# Importing the Necessary Python Packages Required For Predictive Analysis
import pandas as pd  
import numpy as np  
from sklearn.model_selection import train_test_split 
import statsmodels.formula.api as smf
from statsmodels.formula.api import logit
import statsmodels.api as sm

In [4]:
data = pd.read_csv('dropout_data.csv') #Reading In The Data
data.head() #Check that data has been successfully imported

Unnamed: 0.1,Unnamed: 0,moqual,faqual,admingrade,eduspecneeds,prevgrade,ownschshp,prevqual,displaced,debtor,paidfeetodate,gender,secsemgrade,target,course
0,0,19,12,127.3,0,122.0,0,1,1,0,1,1,0.0,0,171
1,1,1,3,142.5,0,160.0,0,1,1,0,0,1,13.666667,1,9254
2,2,37,37,124.8,0,122.0,0,1,1,0,0,1,0.0,0,9070
3,3,38,37,119.6,0,122.0,0,1,1,0,1,0,12.4,1,9773
4,4,37,38,141.5,0,100.0,0,1,0,0,1,0,13.0,1,8014


In [5]:
y = data['target']  # response variable
x = data  # explanatory variables
x.shape
# split the data into the training data and test data
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size = 0.80, 
    shuffle = True,
    random_state = 42,  # Set a seed for reproducible output
)
# fit logistic regression model with dropping no variables
NoDroppedVariablesFormula = "target ~ moqual +faqual + admingrade + eduspecneeds + prevgrade + ownschshp + prevqual + debtor + paidfeetodate + gender + secsemgrade + course + displaced"
NoDroppedVariablesModel = smf.glm(formula = NoDroppedVariablesFormula, data = x_train, family = sm.families.Binomial())
NoDroppedVariablesResults = NoDroppedVariablesModel.fit()
print(NoDroppedVariablesResults.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                 target   No. Observations:                 2904
Model:                            GLM   Df Residuals:                     2890
Model Family:                Binomial   Df Model:                           13
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -974.04
Date:                Fri, 24 Feb 2023   Deviance:                       1948.1
Time:                        22:44:19   Pearson chi2:                 4.32e+03
No. Iterations:                     7                                         
Covariance Type:            nonrobust                                         
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept        -3.9898      0.772     -5.168

In [6]:
# Making Predictions for test data for the model where no variables have been dropped(unknown data to the model)
x_test["ProbabilityNoDroppedVariables"] = 1/(1+np.exp(-(-3.9898 + -0.0106*x_test["moqual"] + 0.0039*x_test["faqual"]+ 0.0130*x_test["admingrade"] + -0.0430*x_test["eduspecneeds"] + -0.0031*x_test["prevgrade"] + 1.2680*x_test["ownschshp"] + -0.0119*x_test["prevqual"] + -0.8303*x_test["debtor"] + 3.0597*x_test["paidfeetodate"] + -0.7536*x_test["gender"] + 0.4861*x_test["secsemgrade"] + -0.0005*x_test["course"] + 0.0014*x_test["displaced"]))) 
x_test[:5]

Unnamed: 0.1,Unnamed: 0,moqual,faqual,admingrade,eduspecneeds,prevgrade,ownschshp,prevqual,displaced,debtor,paidfeetodate,gender,secsemgrade,target,course,ProbabilityNoDroppedVariables
693,847,19,19,122.2,0,118.0,0,1,0,0,1,0,10.25,1,9147,0.637037
3220,3927,38,37,122.3,0,130.0,0,1,1,1,0,0,12.0,0,9773,0.049477
432,524,5,3,183.5,0,180.0,0,1,1,0,1,0,0.0,0,9556,0.01922
183,223,19,1,150.0,0,127.0,1,1,0,0,1,0,12.192857,1,9500,0.945923
2620,3198,19,19,110.0,0,110.0,0,3,0,0,1,1,0.0,0,9254,0.004565


In [7]:
# ProbabilityNoDroppedVariables > 0.5 translates to 1 (High Risk Of Dropping Out)
# ProbabilityNoDroppedVariables < 0.5 translates to 0 (Low Risk Of Dropping Out)
x_test["PredictionNoDroppedVariables"]=np.where(x_test["ProbabilityNoDroppedVariables"] > 0.5, 1, 0)
x_test[:5]

Unnamed: 0.1,Unnamed: 0,moqual,faqual,admingrade,eduspecneeds,prevgrade,ownschshp,prevqual,displaced,debtor,paidfeetodate,gender,secsemgrade,target,course,ProbabilityNoDroppedVariables,PredictionNoDroppedVariables
693,847,19,19,122.2,0,118.0,0,1,0,0,1,0,10.25,1,9147,0.637037,1
3220,3927,38,37,122.3,0,130.0,0,1,1,1,0,0,12.0,0,9773,0.049477,0
432,524,5,3,183.5,0,180.0,0,1,1,0,1,0,0.0,0,9556,0.01922,0
183,223,19,1,150.0,0,127.0,1,1,0,0,1,0,12.192857,1,9500,0.945923,1
2620,3198,19,19,110.0,0,110.0,0,3,0,0,1,1,0.0,0,9254,0.004565,0


In [8]:
# Comparing the predictions (newly added col) with actual (x_test)
# TruePredictionNoDroppedVariables => 1 if they are same and 0 if different
x_test["TruePredictionNoDroppedVariables"]= np.where(x_test["PredictionNoDroppedVariables"] == x_test["target"], 1, 0)
x_test[:5]

Unnamed: 0.1,Unnamed: 0,moqual,faqual,admingrade,eduspecneeds,prevgrade,ownschshp,prevqual,displaced,debtor,paidfeetodate,gender,secsemgrade,target,course,ProbabilityNoDroppedVariables,PredictionNoDroppedVariables,TruePredictionNoDroppedVariables
693,847,19,19,122.2,0,118.0,0,1,0,0,1,0,10.25,1,9147,0.637037,1,1
3220,3927,38,37,122.3,0,130.0,0,1,1,1,0,0,12.0,0,9773,0.049477,0,1
432,524,5,3,183.5,0,180.0,0,1,1,0,1,0,0.0,0,9556,0.01922,0,1
183,223,19,1,150.0,0,127.0,1,1,0,0,1,0,12.192857,1,9500,0.945923,1,1
2620,3198,19,19,110.0,0,110.0,0,3,0,0,1,1,0.0,0,9254,0.004565,0,1


In [9]:
print("True Prediction: ", len(x_test.query("TruePredictionNoDroppedVariables == 1")))
print("False Prediction: ", len(x_test.query("TruePredictionNoDroppedVariables == 0")))
print("Prediction Accuracy: ", len(x_test.query("TruePredictionNoDroppedVariables == 1"))/len(y_test))

True Prediction:  617
False Prediction:  109
Prediction Accuracy:  0.849862258953168


In [10]:
y = data['target']  # response variable
x = data  # explanatory variables
x.shape
# split the data into the training data and test data
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size = 0.80, 
    shuffle = True,
    random_state = 42,  # Set a seed for reproducible output
)
#Fit logistic regression model for the Dropped Variables with a lower significance level 
DroppedVariablesFormula = "target ~ moqual + admingrade +  ownschshp + debtor + paidfeetodate + gender + secsemgrade "
DroppedVariablesModel = smf.glm(formula = DroppedVariablesFormula, data = x_train, family = sm.families.Binomial())
DroppedVariablesResults = DroppedVariablesModel.fit()
print(DroppedVariablesResults.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                 target   No. Observations:                 2904
Model:                            GLM   Df Residuals:                     2896
Model Family:                Binomial   Df Model:                            7
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -1070.2
Date:                Fri, 24 Feb 2023   Deviance:                       2140.4
Time:                        22:44:19   Pearson chi2:                 2.88e+03
No. Iterations:                     6                                         
Covariance Type:            nonrobust                                         
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept        -7.3790      0.598    -12.348

In [11]:
# Making Predictions for test data for the model with dropped 
x_test["ProbabilityDroppedVariables"] = 1/(1+np.exp(-(-7.3790 + -0.0101*x_test["moqual"] + 0.0206*x_test["admingrade"] + 1.4003*x_test["ownschshp"] + -0.8712*x_test["debtor"] + 2.9573*x_test["paidfeetodate"] + -0.7166*x_test["gender"] + 0.2743*x_test["secsemgrade"]))) 
x_test[:5]

Unnamed: 0.1,Unnamed: 0,moqual,faqual,admingrade,eduspecneeds,prevgrade,ownschshp,prevqual,displaced,debtor,paidfeetodate,gender,secsemgrade,target,course,ProbabilityDroppedVariables
693,847,19,19,122.2,0,118.0,0,1,0,0,1,0,10.25,1,9147,0.67157
3220,3927,38,37,122.3,0,130.0,0,1,1,1,0,0,12.0,0,9773,0.056093
432,524,5,3,183.5,0,180.0,0,1,1,0,1,0,0.0,0,9556,0.333566
183,223,19,1,150.0,0,127.0,1,1,0,0,1,0,12.192857,1,9500,0.961624
2620,3198,19,19,110.0,0,110.0,0,3,0,0,1,1,0.0,0,9254,0.044608


In [12]:
# ProbabilityDroppedVariables > 0.5 translates to 1 (High Risk Of Dropping Out)
# ProbabilityDroppedVariables < 0.5 translates to 0 (Low Risk Of Dropping Out)
x_test["PredictionDroppedVariables"]=np.where(x_test["ProbabilityDroppedVariables"] > 0.5, 1, 0)
x_test[:5]

Unnamed: 0.1,Unnamed: 0,moqual,faqual,admingrade,eduspecneeds,prevgrade,ownschshp,prevqual,displaced,debtor,paidfeetodate,gender,secsemgrade,target,course,ProbabilityDroppedVariables,PredictionDroppedVariables
693,847,19,19,122.2,0,118.0,0,1,0,0,1,0,10.25,1,9147,0.67157,1
3220,3927,38,37,122.3,0,130.0,0,1,1,1,0,0,12.0,0,9773,0.056093,0
432,524,5,3,183.5,0,180.0,0,1,1,0,1,0,0.0,0,9556,0.333566,0
183,223,19,1,150.0,0,127.0,1,1,0,0,1,0,12.192857,1,9500,0.961624,1
2620,3198,19,19,110.0,0,110.0,0,3,0,0,1,1,0.0,0,9254,0.044608,0


In [13]:
# Comparing the predictions (newly added col) with actual (x_test)
# TruePredictionDroppedVariables => 1 if they are same and 0 if different
x_test["TruePredictionDroppedVariables"]= np.where(x_test["PredictionDroppedVariables"] == x_test["target"], 1, 0)
x_test[:5]

Unnamed: 0.1,Unnamed: 0,moqual,faqual,admingrade,eduspecneeds,prevgrade,ownschshp,prevqual,displaced,debtor,paidfeetodate,gender,secsemgrade,target,course,ProbabilityDroppedVariables,PredictionDroppedVariables,TruePredictionDroppedVariables
693,847,19,19,122.2,0,118.0,0,1,0,0,1,0,10.25,1,9147,0.67157,1,1
3220,3927,38,37,122.3,0,130.0,0,1,1,1,0,0,12.0,0,9773,0.056093,0,1
432,524,5,3,183.5,0,180.0,0,1,1,0,1,0,0.0,0,9556,0.333566,0,1
183,223,19,1,150.0,0,127.0,1,1,0,0,1,0,12.192857,1,9500,0.961624,1,1
2620,3198,19,19,110.0,0,110.0,0,3,0,0,1,1,0.0,0,9254,0.044608,0,1


In [14]:
  print("True Prediction: ", len(x_test.query("TruePredictionDroppedVariables == 1")))
print("False Prediction: ", len(x_test.query("TruePredictionDroppedVariables == 0")))
print("Prediction Accuracy: ", len(x_test.query("TruePredictionDroppedVariables == 1"))/len(y_test))

True Prediction:  610
False Prediction:  116
Prediction Accuracy:  0.8402203856749312
