# 1. Set up and module installation

In [158]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import os
import seaborn as sns
import statsmodels.api as sm
import numpy as np

In [159]:
#load our cleaned data in

In [160]:
data_path = "02_cleaned_data.csv"

In [161]:
data = pd.read_csv(data_path)

In [162]:
data.head()

Unnamed: 0.1,Unnamed: 0,femaleh,hhsize,rural,Low_education,Low_income,Sumatra,Java and Bali,Kalimantan,Sulawesi,Childlabour_05,age_0t6,age_7t12,age_13t15,age_16t18,age_19t60,age_61,Childlabour_06
0,0,0,10,1,1.0,1.0,1,0,0,0,0,0.0,0.0,0.1,0.2,0.7,0.0,0
1,1,0,7,1,1.0,1.0,1,0,0,0,0,0.0,0.142857,0.0,0.142857,0.714286,0.0,1
2,2,0,6,1,1.0,1.0,1,0,0,0,1,0.0,0.333333,0.166667,0.166667,0.333333,0.0,0
3,3,0,5,0,1.0,0.0,1,0,0,0,1,0.0,0.0,0.0,0.4,0.6,0.0,1
4,4,0,3,0,1.0,0.0,1,0,0,0,0,0.0,0.0,0.0,0.0,1.0,0.0,0


In [163]:
data=data.drop(["Unnamed: 0"], axis=1)

In [164]:
data.head()

Unnamed: 0,femaleh,hhsize,rural,Low_education,Low_income,Sumatra,Java and Bali,Kalimantan,Sulawesi,Childlabour_05,age_0t6,age_7t12,age_13t15,age_16t18,age_19t60,age_61,Childlabour_06
0,0,10,1,1.0,1.0,1,0,0,0,0,0.0,0.0,0.1,0.2,0.7,0.0,0
1,0,7,1,1.0,1.0,1,0,0,0,0,0.0,0.142857,0.0,0.142857,0.714286,0.0,1
2,0,6,1,1.0,1.0,1,0,0,0,1,0.0,0.333333,0.166667,0.166667,0.333333,0.0,0
3,0,5,0,1.0,0.0,1,0,0,0,1,0.0,0.0,0.0,0.4,0.6,0.0,1
4,0,3,0,1.0,0.0,1,0,0,0,0,0.0,0.0,0.0,0.0,1.0,0.0,0


# 2. Get test and train splits, scale data

In [165]:
# Assign X (data) and y (target), reshape the y array
X = data.drop(["Childlabour_06"], axis=1)
y = data["Childlabour_06"]
print(X.shape, y.shape)

(4353, 16) (4353,)


In [166]:
#Split our data into training and testing

In [167]:
from sklearn.model_selection import train_test_split

In [168]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [169]:
type(X_test)

pandas.core.frame.DataFrame

In [170]:
X_test.shape

(1089, 16)

In [171]:
X_train.shape

(3264, 16)

In [172]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(max_iter = 10000)
classifier

LogisticRegression(max_iter=10000)

In [173]:
#Fit (train) or model using the training data

In [174]:
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=10000)

# 3. Check model fit

In [175]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.9384191176470589
Testing Data Score: 0.9403122130394858


In [176]:
predictions = classifier.predict(X_test)
print(f"First 10 Predictions:   {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

First 10 Predictions:   [0 0 0 0 0 0 0 0 0 0]
First 10 Actual labels: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [177]:
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
1084,0,0
1085,0,1
1086,0,0
1087,0,0


# 4. Analysis of first model

In [178]:
#Let's get a statistical summary of our logistic model
import statsmodels.api as sm
logit_model=sm.Logit(y,X)
result=logit_model.fit()
print(result.summary2())

Optimization terminated successfully.
         Current function value: 0.175908
         Iterations 8
                         Results: Logit
Model:              Logit            Pseudo R-squared: 0.220     
Dependent Variable: Childlabour_06   AIC:              1563.4512 
Date:               2020-10-26 02:42 BIC:              1665.5092 
No. Observations:   4353             Log-Likelihood:   -765.73   
Df Model:           15               LL-Null:          -981.98   
Df Residuals:       4337             LLR p-value:      9.9903e-83
Converged:          1.0000           Scale:            1.0000    
No. Iterations:     8.0000                                       
-----------------------------------------------------------------
                  Coef.  Std.Err.    z     P>|z|   [0.025  0.975]
-----------------------------------------------------------------
femaleh           0.1430   0.2593   0.5516 0.5812 -0.3652  0.6513
hhsize            0.1204   0.0462   2.6046 0.0092  0.0298  0.2111


In [179]:
#Hmmmmm... Low r-squared score. Damn
#Consider scaling household size?

In [180]:
#confusion matrix

In [181]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, predictions)
print(confusion_matrix)

[[1017   10]
 [  55    7]]


In [182]:
# Referring to documentation www.scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
#the confusion matrix returns an output array as below for a binary target variable:
#  [(tn, fp, 
 #   fn, tp)]
    

In [183]:
#So the confusion matrix tells us of 1089 predictions in the X_test and y_test data set, there were 1014 True Negative results, 
#and 7 True Positives for a total of 1021 correct predictions


In [184]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97      1027
           1       0.41      0.11      0.18        62

    accuracy                           0.94      1089
   macro avg       0.68      0.55      0.57      1089
weighted avg       0.92      0.94      0.92      1089



In [185]:
#So overall our initial model doesn't seem that great.
#We can implement some changes to make it work better

# 5. Implementing k-folds cross validation

In [186]:
# 
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from numpy import mean
from numpy import std

In [187]:
# 
# prepare the cross-validation procedure
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# evaluate model
scores_kfold = cross_val_score(classifier, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(scores_kfold), std(scores_kfold)))


Accuracy: 0.938 (0.006)


In [188]:
# 
y_pred = cross_val_predict(classifier, X, y, cv=cv, n_jobs=-1)
conf_mat_kfold = confusion_matrix(y, y_pred)
conf_mat_kfold

array([[4063,   31],
       [ 237,   22]], dtype=int64)

In [189]:
# 
classification_report_kfold=classification_report(y, y_pred)
print(classification_report_kfold)

              precision    recall  f1-score   support

           0       0.94      0.99      0.97      4094
           1       0.42      0.08      0.14       259

    accuracy                           0.94      4353
   macro avg       0.68      0.54      0.55      4353
weighted avg       0.91      0.94      0.92      4353



In [190]:
# 
#Hmmm, interesting. So it seems recall has actually fallen slightly, whilst precision has increased. 
#Overall F1 and accuracy seems unchanged.
#This is likely because of the highly imbalanced nature of the data set- kfolds cross validation fails on highly imbalanced datasets
#To retify this we will create stratified k-fold cross-validation

# 6. Stratified k-fold cross validation

In [191]:
from sklearn.model_selection import StratifiedKFold

In [192]:
kstrat = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

In [193]:
# evaluate model
scores_kstrat = cross_val_score(classifier, X, y, scoring='accuracy', cv=kstrat, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(scores_kstrat), std(scores_kstrat)))


Accuracy: 0.938 (0.001)


In [194]:
y_pred = cross_val_predict(classifier, X, y, cv=kstrat, n_jobs=-1)
conf_mat_kstrat = confusion_matrix(y, y_pred)
conf_mat_kstrat

array([[4064,   30],
       [ 238,   21]], dtype=int64)

In [195]:
classification_report_kstrat=classification_report(y, y_pred)
print(classification_report_kstrat)

              precision    recall  f1-score   support

           0       0.94      0.99      0.97      4094
           1       0.41      0.08      0.14       259

    accuracy                           0.94      4353
   macro avg       0.68      0.54      0.55      4353
weighted avg       0.91      0.94      0.92      4353



In [196]:
#So k-stratification didn't seem to help that much.
#Whilst we have ensured that there are an equal amount of y=1 observations in all our splits, still high imbalanced data set.
#So, we have tried to deal with imbalanced data by stratified sampling, but our data just seems too imbalanced to really fix this.
#So in our next step, we will attempt to train the model with more observation where y=1

# 7. Second model, using SMOTE

In [197]:
#Our quite bad at actually correctly identifying/predicting households where children go on to enter the labour market.
#This is likely due to the highly imblanced dataset we are using. We have tried k-folds cross validation, but this does not seem to have helped much
#Recalling our Exploratory Data Analysis only 5% of households actually have household labour- according to our survey earlier

In [198]:
from imblearn.over_sampling import SMOTE

In [199]:
os = SMOTE(random_state=0)


In [200]:
y_train.value_counts()

0    3067
1     197
Name: Childlabour_06, dtype: int64

In [201]:
smt = SMOTE()
X_train_os, y_train_os = smt.fit_sample(X_train, y_train)

In [202]:
np.bincount(y_train_os)

array([3067, 3067], dtype=int64)

In [203]:
#We now have perfectly balanced sample sizes in our training data set

In [204]:
classifier.fit(X_train_os, y_train_os)

LogisticRegression(max_iter=10000)

In [205]:
print(f"Training Data Score: {classifier.score(X_train_os, y_train_os)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.778284969025106
Testing Data Score: 0.7741046831955923


In [206]:
predictions = classifier.predict(X_test)

In [207]:
type(predictions)

numpy.ndarray

In [208]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.98      0.78      0.87      1027
           1       0.17      0.74      0.27        62

    accuracy                           0.77      1089
   macro avg       0.57      0.76      0.57      1089
weighted avg       0.93      0.77      0.83      1089



In [209]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, predictions)
print(confusion_matrix)

[[797 230]
 [ 16  46]]


In [210]:
#Hmmm, so our accuracy and our precision have decreased somewhat, 
#However, we are picking up more true positives and our recall has improved after using SMOTE, without too much decrease in F1 score

In [211]:
#Let's get a statistical summary of our logistic model
import statsmodels.api as sm
logit_model2=sm.Logit(y_train_os, X_train_os)
result2=logit_model2.fit()
print(result2.summary2())

Optimization terminated successfully.
         Current function value: 0.474015
         Iterations 6
                         Results: Logit
Model:              Logit            Pseudo R-squared: 0.316    
Dependent Variable: Childlabour_06   AIC:              5847.2180
Date:               2020-10-26 02:43 BIC:              5954.7636
No. Observations:   6134             Log-Likelihood:   -2907.6  
Df Model:           15               LL-Null:          -4251.8  
Df Residuals:       6118             LLR p-value:      0.0000   
Converged:          1.0000           Scale:            1.0000   
No. Iterations:     6.0000                                      
----------------------------------------------------------------
                 Coef.  Std.Err.    z     P>|z|   [0.025  0.975]
----------------------------------------------------------------
femaleh         -1.0320   0.1559  -6.6202 0.0000 -1.3375 -0.7265
hhsize           0.1079   0.0236   4.5722 0.0000  0.0617  0.1542
rural        

# 8. Combining SMOTE and repeated K-folds validation

In [212]:
#So our minority over-sampling approach using SMOTE seemed to work well.
#Now that we have balanced data, we can also be a bit more confident that any cross-validation will also likely work better
#Let's see if we can't combine repeated kfolds validation and SMOTE to get a better model fitting process.

In [213]:
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score

In [214]:
#We could simply load in our oversampled dataset created in Section 7 and then apply kfolds.
#However, to avoid overfitting we want to split into training and testing folds. 
#and then on each fold we want to oversample, train classifier and then validate
#Happily, we can actually create code that runs this all for us, using pipeline

In [215]:
cv_os = KFold(n_splits=10, random_state=1, shuffle=True)

In [216]:
from imblearn.pipeline import Pipeline, make_pipeline

In [217]:
imb_pipeline = make_pipeline(SMOTE(random_state=42), 
                              LogisticRegression(max_iter = 1000))
cross_val_score(imb_pipeline, X_train, y_train, scoring='recall', cv=cv_os)

array([0.55555556, 0.80952381, 0.57142857, 0.83333333, 0.82608696,
       0.76923077, 0.75      , 0.78947368, 0.63157895, 0.52941176])

In [218]:
result3=imb_pipeline.fit( X_train, y_train)

In [219]:
y_test_os_predict=imb_pipeline.predict(X_test)


In [220]:
y_os_kf_pred=pd.Series(y_test_os_predict)

In [221]:
print(classification_report(y_test, y_os_kf_pred))

              precision    recall  f1-score   support

           0       0.98      0.78      0.87      1027
           1       0.17      0.76      0.28        62

    accuracy                           0.78      1089
   macro avg       0.58      0.77      0.57      1089
weighted avg       0.94      0.78      0.83      1089



In [222]:
from sklearn.metrics import confusion_matrix
confusion_matrix_hat = confusion_matrix(y_test, y_os_kf_pred)
print(confusion_matrix_hat)

[[798 229]
 [ 15  47]]


In [223]:
print(result3.summary())

AttributeError: 'Pipeline' object has no attribute 'summary'

In [224]:
#-------------------------------------------------------------------------------------------------------
#What happens if we just use cross-validation on previously created over-sampled datset?

In [225]:
# prepare the cross-validation procedure
hat2 = KFold(n_splits=10, random_state=1, shuffle=True)
# evaluate model
logreg=LogisticRegression(max_iter = 1000)
scores_kfold = cross_val_score(logreg, X_train_os, y_train_os, scoring='accuracy', cv=hat2, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(scores_kfold), std(scores_kfold)))

Accuracy: 0.774 (0.017)


In [226]:
# 
y_pred_os = cross_val_predict(logreg, X_test, y_test, cv=cv, n_jobs=-1)
conf_mat_hat2 = confusion_matrix(y_test, y_pred_os)
conf_mat_hat2

array([[1022,    5],
       [  60,    2]], dtype=int64)

In [227]:
# 
classification_report_hat2=classification_report(y_test, y_pred_os)
print(classification_report_hat2)

              precision    recall  f1-score   support

           0       0.94      1.00      0.97      1027
           1       0.29      0.03      0.06        62

    accuracy                           0.94      1089
   macro avg       0.62      0.51      0.51      1089
weighted avg       0.91      0.94      0.92      1089



# 9. Feature Selection using Recursive Feature Elimination

In [228]:
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV

In [229]:
logreg_rfe = LogisticRegression(max_iter = 1000)

In [230]:
rfe = RFE(logreg_rfe, n_features_to_select=10)

In [231]:
rfe = rfe.fit(X_train, y_train.values.ravel())
print(rfe.support_)
print(rfe.ranking_)

[False False  True  True False False False  True False  True  True  True
  True  True  True  True]
[3 4 1 1 6 2 5 1 7 1 1 1 1 1 1 1]


In [232]:
# summarize all features
for i in range(X_train_os.shape[1]):
    print('Column: %d, Selected %s, Rank: %.3f' % (i, rfe.support_[i], rfe.ranking_[i]))

Column: 0, Selected False, Rank: 3.000
Column: 1, Selected False, Rank: 4.000
Column: 2, Selected True, Rank: 1.000
Column: 3, Selected True, Rank: 1.000
Column: 4, Selected False, Rank: 6.000
Column: 5, Selected False, Rank: 2.000
Column: 6, Selected False, Rank: 5.000
Column: 7, Selected True, Rank: 1.000
Column: 8, Selected False, Rank: 7.000
Column: 9, Selected True, Rank: 1.000
Column: 10, Selected True, Rank: 1.000
Column: 11, Selected True, Rank: 1.000
Column: 12, Selected True, Rank: 1.000
Column: 13, Selected True, Rank: 1.000
Column: 14, Selected True, Rank: 1.000
Column: 15, Selected True, Rank: 1.000


In [156]:
type(X_train_os)

pandas.core.frame.DataFrame

In [233]:
X_train_os.iloc[:, [2,3,7,9,10,11,12,13,14,15]].head()

Unnamed: 0,rural,Low_education,Kalimantan,Childlabour_05,age_0t6,age_7t12,age_13t15,age_16t18,age_19t60,age_61
0,1,1.0,0,0,0.0,0.5,0.0,0.0,0.5,0.0
1,0,1.0,0,0,0.0,0.0,0.0,0.0,1.0,0.0
2,0,1.0,0,0,0.285714,0.0,0.0,0.0,0.714286,0.0
3,0,1.0,0,1,0.0,0.4,0.0,0.2,0.4,0.0
4,0,0.0,0,0,0.0,0.2,0.2,0.2,0.4,0.0
