In [56]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
sns.set()  # switch matplotlib to seaborn's default plot styling

from scipy import stats
# Compatibility shim: expose `stats.chisqprob` as the chi-square survival
# function (1 - CDF).
# NOTE(review): presumably needed because the installed statsmodels calls
# scipy.stats.chisqprob inside summary(), and the installed SciPy no longer
# ships it -- confirm against the pinned versions; drop the shim if unused.
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)

In [57]:
# Load the training data; the file provides the columns SAT, Admitted and Gender.
raw_data = pd.read_csv('../Part_5_ML/S36_L243/binary_predictors.csv')

In [58]:
# Inspect the raw frame: Admitted is Yes/No and Gender is Male/Female (still text).
raw_data

Unnamed: 0,SAT,Admitted,Gender
0,1363,No,Male
1,1792,Yes,Female
2,1954,Yes,Female
3,1653,No,Male
4,1593,No,Male
...,...,...,...
163,1722,Yes,Female
164,1750,Yes,Male
165,1555,No,Male
166,1524,No,Male


In [59]:
# Build an encoded copy of the raw data; raw_data itself is left untouched.
# Reference (0) categories: Gender == 'Male' and Admitted == 'No'.
data = raw_data.assign(
    Gender=raw_data['Gender'].map({'Female':1, 'Male':0}),
    Admitted=raw_data['Admitted'].map({'Yes':1, 'No':0}),
)
data

Unnamed: 0,SAT,Admitted,Gender
0,1363,0,0
1,1792,1,1
2,1954,1,1
3,1653,0,0
4,1593,0,0
...,...,...,...
163,1722,1,1
164,1750,1,0
165,1555,0,0
166,1524,0,0


In [60]:
# Summary statistics of the encoded data. Because Admitted and Gender are 0/1,
# their means are the share of admitted applicants and of female applicants.
data.describe()

Unnamed: 0,SAT,Admitted,Gender
count,168.0,168.0,168.0
mean,1695.27381,0.559524,0.464286
std,183.019017,0.497928,0.500214
min,1334.0,0.0,0.0
25%,1547.5,0.0,0.0
50%,1691.5,1.0,0.0
75%,1844.5,1.0,1.0
max,2050.0,1.0,1.0


In [61]:
# Model 1 inputs: predict Admitted (target) from Gender alone.
y = data['Admitted']
x1 = data['Gender']

In [62]:
# Model 1: logistic regression of Admitted on Gender only.
x = sm.add_constant(x1)  # prepend the intercept column ('const')
reg_log = sm.Logit(y,x)
results_log = reg_log.fit()
results_log.summary()
# Interpretation of the summary displayed by this cell:
# - The LLR p-value is far below 0.05, so the model as a whole is significant.
# - Gender is significant (P>|z| = 0.000).
# - log(odds) = -0.64 + 2.08 * Gender      (Gender: 1 = Female, 0 = Male)
# - log(odds_female / odds_male) = 2.08 * (1 - 0) = 2.08
# - odds_female = exp(2.08) * odds_male ~= 7.99 * odds_male
# - So the ODDS of admission (not the probability) are about 8 times higher
#   for a female applicant than for a male applicant in this sample.

Optimization terminated successfully.
         Current function value: 0.572260
         Iterations 5


0,1,2,3
Dep. Variable:,Admitted,No. Observations:,168.0
Model:,Logit,Df Residuals:,166.0
Method:,MLE,Df Model:,1.0
Date:,"Tue, 27 Jun 2023",Pseudo R-squ.:,0.1659
Time:,23:18:55,Log-Likelihood:,-96.14
converged:,True,LL-Null:,-115.26
Covariance Type:,nonrobust,LLR p-value:,6.283e-10

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.6436,0.222,-2.901,0.004,-1.078,-0.209
Gender,2.0786,0.363,5.727,0.000,1.367,2.790


In [63]:
# Odds ratio female vs. male for the Gender-only model.
# Read the coefficient from the fitted results instead of retyping it from the
# summary table, so the value cannot go stale if the data or model changes.
np.exp(results_log.params['Gender'])

7.993270498536442

In [64]:
# Model 2 inputs: predict Admitted from both SAT score and Gender.
# Note: this rebinds y and x1, replacing the Model 1 inputs.
y = data['Admitted']
x1 = data[['SAT','Gender']]

In [65]:
# Model 2: logistic regression of Admitted on SAT and Gender.
# This rebinds x, reg_log and results_log; later cells use this second model.
x = sm.add_constant(x1)
reg_log = sm.Logit(y,x)
results_log = reg_log.fit()
results_log.summary()
# Compared with the Gender-only model:
# - The log-likelihood increased (-96.14 -> -20.18) and the LLR p-value
#   decreased, so this model is considerably more significant.
# - The p-value of Gender rose from 0.000 to 0.022: Gender is still
#   significant, but part of its apparent effect is now explained by SAT,
#   which is an outstanding predictor of admission.

Optimization terminated successfully.
         Current function value: 0.120117
         Iterations 10


0,1,2,3
Dep. Variable:,Admitted,No. Observations:,168.0
Model:,Logit,Df Residuals:,165.0
Method:,MLE,Df Model:,2.0
Date:,"Tue, 27 Jun 2023",Pseudo R-squ.:,0.8249
Time:,23:18:56,Log-Likelihood:,-20.18
converged:,True,LL-Null:,-115.26
Covariance Type:,nonrobust,LLR p-value:,5.1180000000000006e-42

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-68.3489,16.454,-4.154,0.000,-100.598,-36.100
SAT,0.0406,0.010,4.129,0.000,0.021,0.060
Gender,1.9449,0.846,2.299,0.022,0.287,3.603


In [66]:
# Odds ratio female vs. male in the SAT + Gender model, i.e. holding SAT fixed.
# Taken from the fitted results rather than a hand-copied coefficient.
np.exp(results_log.params['Gender'])
# Given the same SAT score, the ODDS of admission are about 7 times higher for
# a female applicant than for a male applicant.
#
# In general, the more relevant predictors are available (SAT, Gender, skills,
# etc.), the better the estimation and the interpretation.

6.992932526814459

In [67]:
# Assessing the accuracy of the model

In [68]:
# Predicted probabilities of admission from the fitted model; called without
# arguments, so the predictions are for the data the model was fitted on.
results_log.predict()

array([0.00, 1.00, 1.00, 0.23, 0.02, 0.99, 1.00, 1.00, 1.00, 0.01, 1.00,
       1.00, 0.76, 0.00, 0.60, 1.00, 0.11, 0.12, 0.51, 1.00, 1.00, 1.00,
       0.00, 0.01, 0.97, 1.00, 0.48, 0.99, 1.00, 0.99, 0.00, 0.83, 0.25,
       1.00, 1.00, 1.00, 0.31, 1.00, 0.23, 0.00, 0.02, 0.45, 1.00, 0.00,
       0.99, 0.00, 0.99, 0.00, 0.00, 0.01, 0.00, 1.00, 0.92, 0.02, 1.00,
       0.00, 0.37, 0.98, 0.12, 1.00, 0.00, 0.78, 1.00, 1.00, 0.98, 0.00,
       0.00, 0.00, 1.00, 0.00, 0.78, 0.12, 0.00, 0.99, 1.00, 1.00, 0.00,
       0.30, 1.00, 1.00, 0.00, 1.00, 1.00, 0.85, 1.00, 1.00, 0.00, 1.00,
       1.00, 0.89, 0.83, 0.00, 0.98, 0.97, 0.00, 1.00, 1.00, 0.03, 0.99,
       0.96, 1.00, 0.00, 1.00, 0.01, 0.01, 1.00, 1.00, 1.00, 0.00, 0.00,
       0.02, 0.33, 0.00, 1.00, 0.09, 0.00, 0.97, 0.00, 0.75, 1.00, 1.00,
       0.01, 0.01, 0.00, 1.00, 0.00, 0.99, 0.57, 0.54, 0.87, 0.83, 0.00,
       1.00, 0.00, 0.00, 0.00, 1.00, 0.04, 0.00, 0.01, 1.00, 0.99, 0.52,
       1.00, 1.00, 0.05, 0.00, 0.00, 0.00, 0.68, 1.

In [69]:
# Show floats with two decimals from here on (this is a global NumPy setting).
np.set_printoptions(formatter={'float': "{0:0.2f}".format})
# Predicted values produced by the model: probabilities between 0 and 1.
# To turn them into class labels, values below 0.5 are rounded down to 0
# and values above 0.5 are rounded up to 1.
results_log.predict()

array([0.00, 1.00, 1.00, 0.23, 0.02, 0.99, 1.00, 1.00, 1.00, 0.01, 1.00,
       1.00, 0.76, 0.00, 0.60, 1.00, 0.11, 0.12, 0.51, 1.00, 1.00, 1.00,
       0.00, 0.01, 0.97, 1.00, 0.48, 0.99, 1.00, 0.99, 0.00, 0.83, 0.25,
       1.00, 1.00, 1.00, 0.31, 1.00, 0.23, 0.00, 0.02, 0.45, 1.00, 0.00,
       0.99, 0.00, 0.99, 0.00, 0.00, 0.01, 0.00, 1.00, 0.92, 0.02, 1.00,
       0.00, 0.37, 0.98, 0.12, 1.00, 0.00, 0.78, 1.00, 1.00, 0.98, 0.00,
       0.00, 0.00, 1.00, 0.00, 0.78, 0.12, 0.00, 0.99, 1.00, 1.00, 0.00,
       0.30, 1.00, 1.00, 0.00, 1.00, 1.00, 0.85, 1.00, 1.00, 0.00, 1.00,
       1.00, 0.89, 0.83, 0.00, 0.98, 0.97, 0.00, 1.00, 1.00, 0.03, 0.99,
       0.96, 1.00, 0.00, 1.00, 0.01, 0.01, 1.00, 1.00, 1.00, 0.00, 0.00,
       0.02, 0.33, 0.00, 1.00, 0.09, 0.00, 0.97, 0.00, 0.75, 1.00, 1.00,
       0.01, 0.01, 0.00, 1.00, 0.00, 0.99, 0.57, 0.54, 0.87, 0.83, 0.00,
       1.00, 0.00, 0.00, 0.00, 1.00, 0.04, 0.00, 0.01, 1.00, 0.99, 0.52,
       1.00, 1.00, 0.05, 0.00, 0.00, 0.00, 0.68, 1.

In [70]:
# Actual (observed) outcomes: 1 if the applicant was admitted, 0 otherwise.
np.array(data['Admitted'])

array([0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0], dtype=int64)

In [71]:
# Table comparing actual (rows) with predicted (columns) classes on the
# training data.
results_log.pred_table()
# - 69 out of 74 actual 0s are predicted correctly
# - 90 out of 94 actual 1s are predicted correctly
# - overall accuracy is (69 + 90) / (74 + 94), i.e. close to 95%

array([[69.00, 5.00],
       [4.00, 90.00]])

In [72]:
# Confusion matrix of the training data as a labelled DataFrame
# (rows = actual class, columns = predicted class).
cm_df = pd.DataFrame(
    results_log.pred_table(),
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
cm_df
# The model is wrong ("confused") for the 5 + 4 = 9 off-diagonal predictions.

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,69.0,5.0
Actual 1,4.0,90.0


In [73]:
# Training accuracy = correctly classified observations (the diagonal of the
# confusion matrix) divided by all observations: (69 + 90) / 168 = 159 / 168.
cm = np.array(cm_df)
accuracy_train = sum(cm[i, i] for i in range(2)) / cm.sum()
accuracy_train

0.9464285714285714

In [74]:
# Test the model: load a separate data set with the same columns
# (SAT, Admitted, Gender) to evaluate the fitted model on.
test = pd.read_csv('../Part_5_ML/S36_L248/test_dataset.csv')
test

Unnamed: 0,SAT,Admitted,Gender
0,1323,No,Male
1,1725,Yes,Female
2,1762,Yes,Female
3,1777,Yes,Male
4,1665,No,Male
5,1556,Yes,Female
6,1731,Yes,Female
7,1809,Yes,Female
8,1930,Yes,Female
9,1708,Yes,Male


In [75]:
# Encode the test set exactly like the training set (Yes/Female -> 1,
# No/Male -> 0).
# Caution: `test` is overwritten, so re-running this cell on the already
# encoded frame would map every value to NaN -- reload the CSV first.
test = test.assign(
    Admitted=test['Admitted'].map({'Yes':1, 'No':0}),
    Gender=test['Gender'].map({'Female':1, 'Male':0}),
)
test
# Next steps:
# - use the model to make predictions based on the test data
# - compare those with the actual (observed) outcomes
# - calculate the accuracy
# - create a confusion matrix

Unnamed: 0,SAT,Admitted,Gender
0,1323,0,0
1,1725,1,1
2,1762,1,1
3,1777,1,0
4,1665,0,0
5,1556,1,1
6,1731,1,1
7,1809,1,1
8,1930,1,1
9,1708,1,0


In [76]:
# Inspect the training design matrix: the test inputs must have the same
# columns in the same order (const, SAT, Gender).
x

Unnamed: 0,const,SAT,Gender
0,1.0,1363,0
1,1.0,1792,1
2,1.0,1954,1
3,1.0,1653,0
4,1.0,1593,0
...,...,...,...
163,1.0,1722,1
164,1.0,1750,0
165,1.0,1555,0
166,1.0,1524,0


In [79]:
# Split the test set into observed outcomes and model inputs.
test_actual = test['Admitted']  # actual outcomes that were observed
test_data = test.drop(['Admitted'], axis=1)  # drop the outputs
# has_constant='add' forces the intercept column to be added. With the default
# ('skip'), add_constant silently adds nothing when some column of the test
# data happens to be constant (e.g. a small test set with a single gender).
test_data = sm.add_constant(test_data, has_constant='add')
# Align the columns with the training design matrix `x`: predict() matches
# coefficients to inputs by position, so the order must be identical.
test_data = test_data[x.columns.values]
test_data

Unnamed: 0,const,SAT,Gender
0,1.0,1323,0
1,1.0,1725,1
2,1.0,1762,1
3,1.0,1777,0
4,1.0,1665,0
5,1.0,1556,1
6,1.0,1731,1
7,1.0,1809,1
8,1.0,1930,1
9,1.0,1708,0


In [82]:
def confusion_matrix(data, actual_values, model, threshold=0.5):
    """Build a 2x2 confusion matrix and the accuracy for a fitted binary model.

    Parameters
    ----------
    data : data frame or array
        Model inputs, formatted the same way as the inputs used for fitting
        but without the actual (observed) values: const, var1, var2, ...
        The column order is important.
    actual_values : data frame or array
        Actual (observed) outcomes for ``data``. For logistic regression this
        is a single column of 0s and 1s.
    model : fitted model
        Any object with a ``predict(data)`` method that returns one predicted
        probability per observation (e.g. a fitted statsmodels Logit result).
    threshold : float, default 0.5
        Cut-off for the predicted probabilities: values below it count as
        class 0, values at or above it as class 1. Must lie strictly between
        0 and 1.

    Returns
    -------
    cm : numpy.ndarray of shape (2, 2)
        ``cm[i, j]`` is the number of observations whose actual class is ``i``
        and whose predicted class is ``j``.
    accuracy : float
        Share of correctly classified observations; ``nan`` for empty input.

    Raises
    ------
    ValueError
        If ``threshold`` is not strictly between 0 and 1, or if the number of
        predictions differs from the number of actual values.
    """
    if not 0 < threshold < 1:
        raise ValueError("threshold must be strictly between 0 and 1")

    # Flatten both inputs so lists, Series and single-column frames all work.
    pred_values = np.asarray(model.predict(data), dtype=float).ravel()
    actual = np.asarray(actual_values, dtype=float).ravel()
    if pred_values.shape != actual.shape:
        raise ValueError(
            "number of predictions ({}) does not match number of actual "
            "values ({})".format(pred_values.size, actual.size)
        )

    # A 2-D histogram does the cross-tabulation: actual 0/1 values are split
    # at 0.5, predicted probabilities are split at the chosen threshold.
    actual_edges = np.array([0, 0.5, 1])
    pred_edges = np.array([0, threshold, 1])
    cm = np.histogram2d(actual, pred_values, bins=[actual_edges, pred_edges])[0]

    total = cm.sum()
    # Avoid a 0/0 runtime warning when there is nothing to score.
    accuracy = (cm[0,0]+cm[1,1])/total if total else float('nan')
    return cm, accuracy
    

In [85]:
# Evaluate the fitted model on the test set.
# Note: despite its name, `cm` holds a tuple: (confusion matrix, accuracy).
cm = confusion_matrix(test_data, test_actual, results_log) # cm : cm and accuracy
cm
# The test accuracy (~0.895) is lower than the training accuracy (~0.946).
# Some drop is expected on unseen data and may point to mild overfitting, but
# the test set is small, so this estimate is noisy and does not prove it.

(array([[5.00, 1.00],
        [1.00, 12.00]]),
 0.8947368421052632)

In [87]:
# Confusion matrix of the test data as a labelled DataFrame
# (cm[0] is the matrix part of the (matrix, accuracy) tuple).
cm_df = pd.DataFrame(
    cm[0],
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)
cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5.0,1.0
Actual 1,1.0,12.0


In [88]:
# Misclassification rate = wrong (off-diagonal) predictions / all predictions.
# Computed from the test confusion matrix instead of hand-typed counts, so it
# stays correct if the test data or the model changes.
# `cm` is the (matrix, accuracy) tuple returned by confusion_matrix.
test_cm = cm[0]
print('Missclassification rate:' +str((test_cm[0,1] + test_cm[1,0]) / test_cm.sum()))

Missclassification rate:0.10526315789473684
