In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

import sklearn
from sklearn import metrics 
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

%matplotlib inline

In [2]:
df = sm.datasets.fair.load_pandas().data

In [3]:
df.head()

Unnamed: 0,rate_marriage,age,yrs_married,children,religious,educ,occupation,occupation_husb,affairs
0,3.0,32.0,9.0,3.0,3.0,17.0,2.0,5.0,0.111111
1,3.0,27.0,13.0,3.0,1.0,14.0,3.0,4.0,3.230769
2,4.0,22.0,2.5,0.0,1.0,16.0,3.0,5.0,1.4
3,4.0,37.0,16.5,4.0,3.0,16.0,5.0,5.0,0.727273
4,5.0,27.0,9.0,1.0,1.0,14.0,3.0,4.0,4.666666


In [4]:
# Create check function
def affair_check(x):
    if x != 0:
        return 1
    else:
        return 0

# Apply to DataFrame
df['Had_Affair'] = df['affairs'].apply(affair_check)

In [5]:
df.head()

Unnamed: 0,rate_marriage,age,yrs_married,children,religious,educ,occupation,occupation_husb,affairs,Had_Affair
0,3.0,32.0,9.0,3.0,3.0,17.0,2.0,5.0,0.111111,1
1,3.0,27.0,13.0,3.0,1.0,14.0,3.0,4.0,3.230769,1
2,4.0,22.0,2.5,0.0,1.0,16.0,3.0,5.0,1.4,1
3,4.0,37.0,16.5,4.0,3.0,16.0,5.0,5.0,0.727273,1
4,5.0,27.0,9.0,1.0,1.0,14.0,3.0,4.0,4.666666,1


In [6]:
# Groupby Had Affair column
df.groupby('Had_Affair').mean()

Unnamed: 0_level_0,rate_marriage,age,yrs_married,children,religious,educ,occupation,occupation_husb,affairs
Had_Affair,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,4.329701,28.390679,7.989335,1.238813,2.504521,14.322977,3.405286,3.833758,0.0
1,3.647345,30.537019,11.15246,1.728933,2.261568,13.972236,3.463712,3.884559,2.187243


In [7]:
# Create new DataFrames for the Categorical Variables
occ_dummies = pd.get_dummies(df['occupation'])
hus_occ_dummies = pd.get_dummies(df['occupation_husb'])

# Let's take a quick look at the results
occ_dummies.head()

Unnamed: 0,1.0,2.0,3.0,4.0,5.0,6.0
0,0,1,0,0,0,0
1,0,0,1,0,0,0
2,0,0,1,0,0,0
3,0,0,0,0,1,0
4,0,0,1,0,0,0


In [8]:
# Create column names for the new DataFrames
occ_dummies.columns = ['occ1','occ2','occ3','occ4','occ5','occ6']
hus_occ_dummies.columns = ['hocc1','hocc2','hocc3','hocc4','hocc5','hocc6']

In [9]:
# Set X as new DataFrame without the occupation columns or the Y target
X = df.drop(['occupation','occupation_husb','Had_Affair'],axis=1)

In [10]:
# Concat the dummy DataFrames Together
dummies = pd.concat([occ_dummies,hus_occ_dummies],axis=1)

In [12]:
# Now Concat the X DataFrame with the dummy variables
X = pd.concat([X,dummies],axis=1)

# Preview of Result
X.head()

Unnamed: 0,rate_marriage,age,yrs_married,children,religious,educ,affairs,occ1,occ2,occ3,...,occ3.1,occ4,occ5,occ6,hocc1,hocc2,hocc3,hocc4,hocc5,hocc6
0,3.0,32.0,9.0,3.0,3.0,17.0,0.111111,0,1,0,...,0,0,0,0,0,0,0,0,1,0
1,3.0,27.0,13.0,3.0,1.0,14.0,3.230769,0,0,1,...,1,0,0,0,0,0,0,1,0,0
2,4.0,22.0,2.5,0.0,1.0,16.0,1.4,0,0,1,...,1,0,0,0,0,0,0,0,1,0
3,4.0,37.0,16.5,4.0,3.0,16.0,0.727273,0,0,0,...,0,0,1,0,0,0,0,0,1,0
4,5.0,27.0,9.0,1.0,1.0,14.0,4.666666,0,0,1,...,1,0,0,0,0,0,0,1,0,0


In [13]:
# Set Y as Target class, Had Affair
Y = df.Had_Affair

# Preview
Y.head()

0    1
1    1
2    1
3    1
4    1
Name: Had_Affair, dtype: int64

In [14]:
X.corr()

Unnamed: 0,rate_marriage,age,yrs_married,children,religious,educ,affairs,occ1,occ2,occ3,...,occ3.1,occ4,occ5,occ6,hocc1,hocc2,hocc3,hocc4,hocc5,hocc6
rate_marriage,1.0,-0.111127,-0.128978,-0.129161,0.078794,0.079869,-0.178068,0.017372,-0.019697,-0.053082,...,-0.053082,0.068882,-0.002109,0.008878,0.042022,-0.038992,-0.022514,0.003303,0.003256,0.039561
age,-0.111127,1.0,0.894082,0.673902,0.136598,0.02796,-0.089964,-0.042701,-0.034223,-0.066371,...,-0.066371,0.040982,0.079533,0.030676,-0.147273,-0.057368,0.01161,-0.048989,0.105525,0.083212
yrs_married,-0.128978,0.894082,1.0,0.772806,0.132683,-0.109058,-0.087737,-0.036117,0.004668,-0.021261,...,-0.021261,-0.026816,0.07682,-0.004912,-0.147531,-0.033451,0.008046,-0.031121,0.092462,0.042921
children,-0.129161,0.673902,0.772806,1.0,0.141845,-0.141918,-0.070278,-0.025718,0.081182,-0.063298,...,-0.063298,-0.003235,0.033274,-0.02683,-0.140584,0.00119,-0.005538,-0.008032,0.053965,0.02426
religious,0.078794,0.136598,0.132683,0.141845,1.0,0.032245,-0.125933,-0.012237,-0.013129,-0.034986,...,-0.034986,0.043996,0.00426,0.011784,-0.021699,0.00999,0.00817,-0.008491,-6.3e-05,0.006558
educ,0.079869,0.02796,-0.109058,-0.141918,0.032245,1.0,-0.01774,0.028309,-0.217719,-0.335615,...,-0.335615,0.477505,-0.022121,0.22692,0.069309,-0.160756,-0.052723,-0.031422,0.04254,0.223167
affairs,-0.178068,-0.089964,-0.087737,-0.070278,-0.125933,-0.01774,1.0,-0.010209,0.002542,0.019951,...,0.019951,-0.043153,0.01808,0.02929,-0.004192,0.013502,0.013706,0.003795,-0.025392,0.004696
occ1,0.017372,-0.042701,-0.036117,-0.025718,-0.012237,0.028309,-0.010209,1.0,-0.031798,-0.070957,...,-0.070957,-0.051217,-0.0292,-0.010627,0.089898,-0.021502,-0.001148,-0.00874,-0.019507,0.018385
occ2,-0.019697,-0.034223,0.004668,0.081182,-0.013129,-0.217719,0.002542,-0.031798,1.0,-0.348075,...,-0.348075,-0.251243,-0.143237,-0.052128,-0.03185,0.183782,-0.020904,-0.009786,-0.093292,-0.059107
occ3,-0.053082,-0.066371,-0.021261,-0.063298,-0.034986,-0.335615,0.019951,-0.070957,-0.348075,1.0,...,1.0,-0.560645,-0.319631,-0.116322,-0.012093,-0.000638,0.090043,0.011248,0.003021,-0.101673


In [None]:
# Dropping one column of each dummy variable set to avoid multicollinearity
X = X.drop('occ1',axis=1)
X = X.drop('hocc1',axis=1)

# Drop affairs column so Y target makes sense
X = X.drop('affairs',axis=1)

# Preview
X.head()

In [None]:
logit_model=sm.Logit(Y,sm.add_constant(X))

In [None]:
result=logit_model.fit()


In [None]:
stats1=result.summary()
stats2=result.summary2()
print(stats1)
print(stats2)

In [None]:
X = X.drop(['children','educ','occ2','occ3','occ4','hocc2','hocc3','hocc4','hocc4','hocc5','hocc6'],axis=1)

# Preview
X.head()

In [None]:
logit_model2=sm.Logit(Y,sm.add_constant(X))
result2=logit_model2.fit()
stats1=result2.summary()
stats2=result2.summary2()
print(stats1)
print(stats2)

In [None]:
preds = result2.pred_table()

In [None]:
confmtrx = np.array(preds)

In [None]:
confusion = pd.DataFrame(confmtrx, index=['Had Affair', 'Did Not Have Affair'],
columns=['predicted_affair', 'predicted_not_affair'])
confusion

In [None]:
#class_report=classification_report(Y,result2.predict(X))
#print(class_report)
#result2.predict(X)

In [None]:
#result2.predict(X)

In [None]:
TP = confusion.loc['Had Affair', 'predicted_affair']
FP = confusion.loc['Did Not Have Affair', 'predicted_affair']
TN = confusion.loc['Did Not Have Affair', 'predicted_not_affair']
FN = confusion.loc['Had Affair', 'predicted_not_affair']

In [None]:
TPR=(float(TP) / (TP + FN))
TPN=(float(TN) / (TN + FP)) 
PPV=(float(TP) / (TP + FP)) 
NPV=(float(TN) / (TN + FN)) 
FNR=(float(FN) / (FN + TP))
FPR=(float(FP) / (FP + TN))
FDR=(float(FP) / (FP + TP))
FOR=(float(FN) / (FN + TN))
TS=(float(TP) / (TP+FN + FP))
ACC=(float(TP+TN) / (TP+FP+FN + TN))  #print((TP + TN) / float(len(y_test)))
F1=2*TP/(2*TP+FP+FN)

print ("sensitivity, recall, hit rate, or true positive rate (TPR): ",TPR)
print ("specificity, selectivity or true negative rate (TNR): ",TPN)
print ("precision or positive predictive value (PPV): ",PPV)
print ("negative predictive value (NPV): ",NPV)
print ("miss rate or false negative rate (FNR): ",FNR)
print ("fall-out or false positive rate (FPR): ",FPR)
print ("false discovery rate (FDR): ",FDR)
print ("false omission rate (FOR): ",FOR)
print ("Threat score (TS) or Critical Success Index (CSI): ",TS)
print("")
print ("accuracy (ACC): ",ACC)
print ("F1: ",F1)
