# Binary Predictors with SM

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

# Compatibility shim for statsmodels: define stats.chisqprob as the chi-square
# survival function, for statsmodels code that still looks it up on scipy.stats
# (presumably only needed with older statsmodels releases -- confirm against
# the installed versions).
from scipy import stats
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)

In [2]:
raw_data = pd.read_csv('2.02. Binary predictors.csv')

In [3]:
# The dataset has one additional piece of information: Gender.

raw_data.head()

Unnamed: 0,SAT,Admitted,Gender
0,1363,No,Male
1,1792,Yes,Female
2,1954,Yes,Female
3,1653,No,Male
4,1593,No,Male


In [4]:
# Encode both categorical columns as 0/1 dummies on a fresh frame, leaving
# raw_data untouched. Male is coded 0, so it is the baseline (reference) group.

data = raw_data.assign(
    Admitted=raw_data['Admitted'].map({'Yes': 1, 'No': 0}),
    Gender=raw_data['Gender'].map({'Female': 1, 'Male': 0}),
)

In [5]:
data.head()

Unnamed: 0,SAT,Admitted,Gender
0,1363,0,0
1,1792,1,1
2,1954,1,1
3,1653,0,0
4,1593,0,0


In [6]:
y = data['Admitted']

In [7]:
x1 = data['Gender']

In [8]:
# The LLR p-value is very small, so the model as a whole is significant.
# The Gender coefficient is significant too.
# The fitted model is log(odds) = -0.64 + 2.08 * Gender.
# Gender takes only two values, 0 or 1,
# so the only possible change is a one-unit change.
# Taking the difference between the two groups simplifies to:
# log(odds_female / odds_male) = 2.08
# so odds_female = exp(2.08) * odds_male, i.e. about 7.99 * odds_male.
# This is how the coefficient of a binary predictor is interpreted.
# The previous analysis showed that SAT and Admitted are strongly related.

x = sm.add_constant(x1)
reg_log = sm.Logit(y,x)
results_log = reg_log.fit()
results_log.summary()

Optimization terminated successfully.
         Current function value: 0.572260
         Iterations 5


  return ptp(axis=axis, out=out, **kwargs)


0,1,2,3
Dep. Variable:,Admitted,No. Observations:,168.0
Model:,Logit,Df Residuals:,166.0
Method:,MLE,Df Model:,1.0
Date:,"Thu, 09 Jan 2020",Pseudo R-squ.:,0.1659
Time:,16:51:09,Log-Likelihood:,-96.14
converged:,True,LL-Null:,-115.26
Covariance Type:,nonrobust,LLR p-value:,6.283e-10

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.6436,0.222,-2.901,0.004,-1.078,-0.209
Gender,2.0786,0.363,5.727,0.000,1.367,2.790


In [9]:
# Odds ratio for Gender, read from the fitted model instead of a hand-copied,
# rounded coefficient, so it stays correct if the data or the model changes.
np.exp(results_log.params['Gender'])

7.993270498536442

In [10]:
y = data['Admitted']

In [11]:
# The new logistic regression includes both predictors: SAT and Gender.

x1 = data[['SAT','Gender']]

In [12]:
# Compared with the Gender-only model, the log-likelihood is much higher
# (-20.18 versus -96.14), so the model with both predictors is the better one.
# Gender's p-value is no longer 0.000; it is now 0.022.
# The new Gender coefficient is 1.9449.
# So, given the same SAT score, a female has about 6.99 times the odds of
# being admitted compared with a male.
# In this particular university or degree it therefore appears to be much
# easier for females to be admitted.

x = sm.add_constant(x1)
reg_log = sm.Logit(y,x)
results_log = reg_log.fit()
results_log.summary()

Optimization terminated successfully.
         Current function value: 0.120117
         Iterations 10


0,1,2,3
Dep. Variable:,Admitted,No. Observations:,168.0
Model:,Logit,Df Residuals:,165.0
Method:,MLE,Df Model:,2.0
Date:,"Thu, 09 Jan 2020",Pseudo R-squ.:,0.8249
Time:,16:51:09,Log-Likelihood:,-20.18
converged:,True,LL-Null:,-115.26
Covariance Type:,nonrobust,LLR p-value:,5.1180000000000006e-42

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-68.3489,16.454,-4.154,0.000,-100.598,-36.100
SAT,0.0406,0.010,4.129,0.000,0.021,0.060
Gender,1.9449,0.846,2.299,0.022,0.287,3.603


In [13]:
# Odds ratio for Gender at a fixed SAT score, read from the fitted model
# instead of a hand-copied, rounded coefficient.
np.exp(results_log.params['Gender'])

6.992932526814459

In [14]:
# The output contains values that print as 0.00 or 1.00,
# as well as values such as 0.78, 0.37, ...
# These values are probabilities:
# in this model, the predicted probability of being admitted.
# A value below 0.5 means a less than 50% chance of admission;
# a value above 0.5 means a more than 50% chance of admission.
# Each probability is therefore rounded to the nearest class, 0 or 1.

# Print array floats with two decimals (this setting also applies to later array output).
np.set_printoptions(formatter = {'float': lambda x: '{0:0.2f}'.format(x)})
results_log.predict()

array([0.00, 1.00, 1.00, 0.23, 0.02, 0.99, 1.00, 1.00, 1.00, 0.01, 1.00,
       1.00, 0.76, 0.00, 0.60, 1.00, 0.11, 0.12, 0.51, 1.00, 1.00, 1.00,
       0.00, 0.01, 0.97, 1.00, 0.48, 0.99, 1.00, 0.99, 0.00, 0.83, 0.25,
       1.00, 1.00, 1.00, 0.31, 1.00, 0.23, 0.00, 0.02, 0.45, 1.00, 0.00,
       0.99, 0.00, 0.99, 0.00, 0.00, 0.01, 0.00, 1.00, 0.92, 0.02, 1.00,
       0.00, 0.37, 0.98, 0.12, 1.00, 0.00, 0.78, 1.00, 1.00, 0.98, 0.00,
       0.00, 0.00, 1.00, 0.00, 0.78, 0.12, 0.00, 0.99, 1.00, 1.00, 0.00,
       0.30, 1.00, 1.00, 0.00, 1.00, 1.00, 0.85, 1.00, 1.00, 0.00, 1.00,
       1.00, 0.89, 0.83, 0.00, 0.98, 0.97, 0.00, 1.00, 1.00, 0.03, 0.99,
       0.96, 1.00, 0.00, 1.00, 0.01, 0.01, 1.00, 1.00, 1.00, 0.00, 0.00,
       0.02, 0.33, 0.00, 1.00, 0.09, 0.00, 0.97, 0.00, 0.75, 1.00, 1.00,
       0.01, 0.01, 0.00, 1.00, 0.00, 0.99, 0.57, 0.54, 0.87, 0.83, 0.00,
       1.00, 0.00, 0.00, 0.00, 1.00, 0.04, 0.00, 0.01, 1.00, 0.99, 0.52,
       1.00, 1.00, 0.05, 0.00, 0.00, 0.00, 0.68, 1.

In [15]:
# Compare the predictions with the values actually observed.
# If 80% of the predicted values coincide with the actual values, the model has 80% accuracy.

np.array(data['Admitted'])

array([0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0])

In [16]:
results_log.pred_table()

array([[69.00, 5.00],
       [4.00, 90.00]])

In [17]:
# pred_table() is the confusion matrix: it shows how "confused" the model is.
# Rows are labelled with the actual class, columns with the predicted class.
#   - 69 observations: the model predicted 0 and the true value was 0
#   - 90 observations: the model predicted 1 and the true value was 1
#     -> the cases in which the model did its job well
#   - 4 observations: the model predicted 0 while the true value was 1
#   - 5 observations: the model predicted 1 while the true value was 0
#     -> the cases in which the model got confused
# Correct: 69 + 90 = 159 cases; incorrect: 4 + 5 = 9 cases.
# Overall the model is right in 159 out of 168 cases, i.e. 94.6% accuracy.

cm_df = pd.DataFrame(
    results_log.pred_table(),
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,69.0,5.0
Actual 1,4.0,90.0


In [18]:
# Training accuracy: correctly classified cases (the diagonal of the
# confusion matrix) divided by the total number of cases.

cm = np.array(cm_df)
accuracy_train = np.trace(cm) / cm.sum()
accuracy_train

0.9464285714285714

In [19]:
test = pd.read_csv('2.03. Test dataset.csv')

In [20]:
# The test set comes from the same data,
# which has already been split.
# The test part is 10% of the data, so it holds 19 observations.

test

Unnamed: 0,SAT,Admitted,Gender
0,1323,No,Male
1,1725,Yes,Female
2,1762,Yes,Female
3,1777,Yes,Male
4,1665,No,Male
5,1556,Yes,Female
6,1731,Yes,Female
7,1809,Yes,Female
8,1930,Yes,Female
9,1708,Yes,Male


In [21]:
# Encode the labels as 0/1, in place on `test`.
# The lookup falls back to the current value, so re-running this cell on the
# already-encoded frame is a no-op; a plain dict .map would turn every 0/1
# into NaN on the second run because those keys are not in the mapping.
admitted_codes = {'Yes': 1, 'No': 0}
gender_codes = {'Female': 1, 'Male': 0}
test['Admitted'] = test['Admitted'].map(lambda label: admitted_codes.get(label, label))
test['Gender'] = test['Gender'].map(lambda label: gender_codes.get(label, label))

# Fail loudly if anything other than the expected labels was present.
assert test[['Admitted', 'Gender']].isin([0, 1]).all().all(), 'unexpected label in test set'

In [22]:
test

Unnamed: 0,SAT,Admitted,Gender
0,1323,0,0
1,1725,1,1
2,1762,1,1
3,1777,1,0
4,1665,0,0
5,1556,1,1
6,1731,1,1
7,1809,1,1
8,1930,1,1
9,1708,1,0


In [23]:
# Use the model to make predictions based on the test data,
# compare those predictions with the actual outcomes,
# then calculate the accuracy and create a confusion matrix.
# The test data must look the same as the input data on which the regression was trained.
# Column order is very important, because the fitted coefficients expect that order;
# if the columns are not delivered in the correct order, the predictions will be wrong.

x

Unnamed: 0,const,SAT,Gender
0,1.0,1363,0
1,1.0,1792,1
2,1.0,1954,1
3,1.0,1653,0
4,1.0,1593,0
...,...,...,...
163,1.0,1722,1
164,1.0,1750,0
165,1.0,1555,0
166,1.0,1524,0


In [24]:
test_actual = test['Admitted']

In [25]:
# Select the predictors explicitly, in the same order used for training
# (SAT, Gender), rather than relying on whatever is left after dropping the
# target column.
test_data = test[['SAT', 'Gender']]

In [26]:
test_data = sm.add_constant(test_data)

In [27]:
# The test data now looks exactly like the input data,
# and the variables remain in the same order.

test_data

Unnamed: 0,const,SAT,Gender
0,1.0,1323,0
1,1.0,1725,1
2,1.0,1762,1
3,1.0,1777,0
4,1.0,1665,0
5,1.0,1556,1
6,1.0,1731,1
7,1.0,1809,1
8,1.0,1930,1
9,1.0,1708,0


In [28]:
# confusion_matrix takes the input data, the actual values and the fitted model
# (plus an optional classification threshold).
# It uses the already fitted regression model to make predictions for the data
# and summarizes actual versus predicted classes in a 2x2 table.
# The table is built with a 2-D histogram: predictions in [0, threshold) count
# as 0, predictions in [threshold, 1] count as 1.

def confusion_matrix(data, actual_values, model, threshold=0.5):
    """Build a 2x2 confusion matrix and the accuracy for a fitted binary model.

    Parameters
    ----------
    data : array-like
        Inputs passed unchanged to ``model.predict``.
    actual_values : array-like
        Observed classes, coded 0 or 1, one per row of ``data``.
    model : object
        Anything exposing ``predict(data)`` that returns one predicted
        probability per observation.
    threshold : float, optional
        Cut-off between the two predicted classes; must lie strictly between
        0 and 1. Defaults to 0.5.

    Returns
    -------
    cm : numpy.ndarray
        2x2 array of counts; ``cm[i, j]`` is the number of observations whose
        actual class is ``i`` and predicted class is ``j``.
    accuracy : float
        Share of observations on the diagonal of ``cm``.

    Raises
    ------
    ValueError
        If ``threshold`` is not strictly between 0 and 1.
    """
    if not 0 < threshold < 1:
        raise ValueError('threshold must lie strictly between 0 and 1')

    pred_values = model.predict(data)

    # Separate edges per axis: the actual classes are always split at 0.5,
    # while the predicted probabilities are split at the chosen threshold.
    # Values outside [0, 1] fall outside the bins and are not counted.
    actual_bins = np.array([0, 0.5, 1])
    pred_bins = np.array([0, threshold, 1])
    cm = np.histogram2d(actual_values, pred_values, bins=[actual_bins, pred_bins])[0]

    accuracy = (cm[0, 0] + cm[1, 1]) / cm.sum()
    return cm, accuracy

In [29]:
# The result is a tuple: the first element is the confusion matrix, the second is the accuracy.
# The training accuracy is almost always higher than the test accuracy
# because of overfitting:
# the regression fitted the training data as well as possible,
# which does not mean its predictions hold for all values from the population.
# That is why the model must never see the test data during training.

cm = confusion_matrix(test_data, test_actual, results_log)
cm

(array([[5.00, 1.00],
        [1.00, 12.00]]), 0.8947368421052632)

In [30]:
# Label the test confusion matrix (first element of the `cm` tuple):
# rows are the actual classes, columns the predicted classes.
cm_df = pd.DataFrame(
    cm[0],
    index=['Actual 0', 'Actual 1'],
    columns=['Predicted 0', 'Predicted 1'],
)

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,5.0,1.0
Actual 1,1.0,12.0


In [31]:
# Misclassification rate = off-diagonal (wrong) predictions / all test cases,
# computed from the test confusion matrix instead of hand-typed counts.
test_cm = cm[0]
print('Missclassification rate:' + str(float((test_cm[0, 1] + test_cm[1, 0]) / test_cm.sum())))

Missclassification rate:0.10526315789473684
