# Set up and module installation

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import os
import seaborn as sns
import statsmodels.api as sm
import numpy as np

In [2]:
#load our cleaned data in

In [3]:
data_path = "cleaned_data.csv"

In [4]:
data = pd.read_csv(data_path)

In [5]:
data.head()

Unnamed: 0.1,Unnamed: 0,femaleh,hhsize,f0t6,m0t6,f7t12,m7t12,f13t15,f16t18,f19t60,...,Quintile 1,Quintile 2,Quintile 3,Quintile 4,Sumatra,Java and Bali,Kalimantan,Sulawesi,Childlabour_05,Childlabour_06
0,0,0,10,0.0,0.0,0.0,0.0,0.1,0.1,0.5,...,1,0,0,0,1,0,0,0,0,0
1,1,0,7,0.0,0.0,0.142857,0.0,0.0,0.142857,0.571429,...,1,0,0,0,1,0,0,0,0,1
2,2,0,6,0.0,0.0,0.0,0.333333,0.166667,0.166667,0.166667,...,1,0,0,0,1,0,0,0,1,0
3,3,0,5,0.0,0.0,0.0,0.0,0.0,0.2,0.2,...,0,1,0,0,1,0,0,0,1,1
4,4,0,3,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,...,0,0,0,0,1,0,0,0,0,0


In [6]:
data=data.drop(["Unnamed: 0"], axis=1)

In [7]:
data.head()

Unnamed: 0,femaleh,hhsize,f0t6,m0t6,f7t12,m7t12,f13t15,f16t18,f19t60,m13t15,...,Quintile 1,Quintile 2,Quintile 3,Quintile 4,Sumatra,Java and Bali,Kalimantan,Sulawesi,Childlabour_05,Childlabour_06
0,0,10,0.0,0.0,0.0,0.0,0.1,0.1,0.5,0.0,...,1,0,0,0,1,0,0,0,0,0
1,0,7,0.0,0.0,0.142857,0.0,0.0,0.142857,0.571429,0.0,...,1,0,0,0,1,0,0,0,0,1
2,0,6,0.0,0.0,0.0,0.333333,0.166667,0.166667,0.166667,0.0,...,1,0,0,0,1,0,0,0,1,0
3,0,5,0.0,0.0,0.0,0.0,0.0,0.2,0.2,0.0,...,0,1,0,0,1,0,0,0,1,1
4,0,3,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,...,0,0,0,0,1,0,0,0,0,0


# Get test and train splits, scale data

In [8]:
# Assign X (data) and y (target), reshape the y array
X = data.drop(["Childlabour_06"], axis=1)
y = data["Childlabour_06"]
print(X.shape, y.shape)

(4353, 28) (4353,)


In [9]:
#Split our data into training and testing

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [12]:
type(X_test)

pandas.core.frame.DataFrame

In [13]:
X_test.shape

(1089, 28)

In [14]:
X_train.shape

(3264, 28)

In [16]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(max_iter = 10000)
classifier

LogisticRegression(max_iter=10000)

In [17]:
#Fit (train) or model using the training data

In [18]:
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=10000)

# Check model fit

In [19]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.9402573529411765
Testing Data Score: 0.9375573921028466


In [20]:
predictions = classifier.predict(X_test)
print(f"First 10 Predictions:   {predictions[:10]}")
print(f"First 10 Actual labels: {y_test[:10].tolist()}")

First 10 Predictions:   [0 0 0 0 0 0 0 0 0 0]
First 10 Actual labels: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [21]:
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
1084,0,0
1085,0,1
1086,0,0
1087,0,0


In [22]:
#Let's get a statistical summary of our logistic model
import statsmodels.api as sm
logit_model=sm.Logit(y,X)
result=logit_model.fit()
print(result.summary2())

Optimization terminated successfully.
         Current function value: 0.173464
         Iterations 9
                            Results: Logit
Model:                 Logit             Pseudo R-squared:  0.231     
Dependent Variable:    Childlabour_06    AIC:               1566.1812 
Date:                  2020-10-24 11:03  BIC:               1744.7826 
No. Observations:      4353              Log-Likelihood:    -755.09   
Df Model:              27                LL-Null:           -981.98   
Df Residuals:          4325              LLR p-value:       5.0302e-79
Converged:             1.0000            Scale:             1.0000    
No. Iterations:        9.0000                                         
----------------------------------------------------------------------
                        Coef.  Std.Err.    z    P>|z|   [0.025  0.975]
----------------------------------------------------------------------
femaleh                 0.4588   0.2975  1.5422 0.1230 -0.1243  1.0418
hhs

In [23]:
#Hmmmmm... Low r-squared score. Damn
#Consider scaling household size?

# Analysis of initial model performance

In [24]:
#confusion matrix

In [25]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, predictions)
print(confusion_matrix)

[[1014   13]
 [  55    7]]


In [26]:
# Referring to documentation www.scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
#the confusion matrix returns an output array as below for a binary target variable:
#  [(tn, fp, 
 #   fn, tp)]
    

In [27]:
#So the confusion matrix tells us of 1089 predictions in the X_test and y_test data set, there were 1014 True Negative results, 
#and 7 True Positives for a total of 1021 correct predictions


In [28]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97      1027
           1       0.35      0.11      0.17        62

    accuracy                           0.94      1089
   macro avg       0.65      0.55      0.57      1089
weighted avg       0.91      0.94      0.92      1089



In [29]:
#So overall our initial model doesn't seem that great.
#We can implement some changes to make it work better

# 1 Implementing k-folds cross validation

In [61]:
# 2
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from numpy import mean
from numpy import std

In [62]:
# 3
# prepare the cross-validation procedure
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# evaluate model
scores_kfold = cross_val_score(classifier, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(scores_kfold), std(scores_kfold)))


Accuracy: 0.937 (0.006)


In [63]:
# 4
y_pred = cross_val_predict(classifier, X, y, cv=cv, n_jobs=-1)
conf_mat_kfold = confusion_matrix(y, y_pred)
conf_mat_kfold

array([[4059,   35],
       [ 238,   21]], dtype=int64)

In [64]:
# 5
classification_report_kfold=classification_report(y, y_pred)
print(classification_report_kfold)

              precision    recall  f1-score   support

           0       0.94      0.99      0.97      4094
           1       0.38      0.08      0.13       259

    accuracy                           0.94      4353
   macro avg       0.66      0.54      0.55      4353
weighted avg       0.91      0.94      0.92      4353



In [None]:
# 6
#Hmmm, interesting. So it seems recall has actually fallen slightly, whilst precision has increased. 
#Overall F1 and accuracy seems unchanged.
#This is likely because of the highly imbalanced nature of the data set- kfolds cross validation fails on highly imbalanced datasets
#To retify this we will create stratified k-fold cross-validation

# Stratified k-fold cross validation

In [84]:
from sklearn.model_selection import StratifiedKFold

In [85]:
kstrat = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

In [86]:
# evaluate model
scores_kstrat = cross_val_score(classifier, X, y, scoring='accuracy', cv=kstrat, n_jobs=-1)
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))


Accuracy: 0.937 (0.006)


In [87]:
y_pred = cross_val_predict(classifier, X, y, cv=kstrat, n_jobs=-1)
conf_mat_kstrat = confusion_matrix(y, y_pred)
conf_mat_kstrat

array([[4063,   31],
       [ 240,   19]], dtype=int64)

In [88]:
classification_report_kstrat=classification_report(y, y_pred)
print(classification_report_kstrat)

              precision    recall  f1-score   support

           0       0.94      0.99      0.97      4094
           1       0.38      0.07      0.12       259

    accuracy                           0.94      4353
   macro avg       0.66      0.53      0.55      4353
weighted avg       0.91      0.94      0.92      4353



In [95]:
#So k-stratification didn't seem to help that much.
#Whilst we have ensured that there are an equal amount of y=1 observations in all our splits, still high imbalanced data set.
#So, we have tried to deal with imbalanced data by stratified sampling, but our data just seems too imbalanced to really fix this.
#So in our next step, we will attempt to train the model with more observation where y=1

# Second model, using SMOTE

In [96]:
#It seems quite bad at actually correctly identifying/predicting households where children go on to enter the labour market.
#This may be due to the highly imblanced dataset we are using, 
#Recalling our Exploratory Data Analysis only 5% of households actually have household labour- according to our survey earlier

In [97]:
from imblearn.over_sampling import SMOTE

In [98]:
os = SMOTE(random_state=0)


In [99]:
y_train.value_counts()

0    3067
1     197
Name: Childlabour_06, dtype: int64

In [100]:
smt = SMOTE()
X_train_os, y_train_os = smt.fit_sample(X_train, y_train)

In [102]:
np.bincount(y_train_os)

array([3067, 3067], dtype=int64)

In [103]:
#We now have perfectly balanced sample sizes in our training data set

In [104]:
classifier.fit(X_train_os, y_train_os)

LogisticRegression(max_iter=10000)

In [106]:
print(f"Training Data Score: {classifier.score(X_train_os, y_train_os)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.8177372024779915
Testing Data Score: 0.7805325987144169


In [107]:
predictions = classifier.predict(X_test)

In [108]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.97      0.79      0.87      1027
           1       0.14      0.58      0.23        62

    accuracy                           0.78      1089
   macro avg       0.56      0.69      0.55      1089
weighted avg       0.92      0.78      0.84      1089



In [109]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, predictions)
print(confusion_matrix)

[[814 213]
 [ 26  36]]


In [None]:
#Hmmm, so our accuracy has precision somewhat, 
#However, we are picking up more true positives and our recall has improved after using SMOTE, without too much decrease in F1 score