In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

# A. Data

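Read the cleaned data, in which missing values have already been imputed. The features listed below were identified as the most important ones by the Random Forest model; this notebook loads only `DL0`, `GN4` and `MT2`, plus the `is_female` target from the training file: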

* DL0 -  Who is the main income earner in your household? 
* DG6 - How are you related to the household head? 
* DL1 - In the past 12 months, were you mainly...? (Employed, unemployed, student, etc.)
* MT1A - Who decides on who should have a phone in your household?
* FL4 - What or who do you depend on the most for financial advice?
* DG3 - What is your marital status?
* GN5 - Who decides what kind of financial services you can personally use? 
* GN3 - Who controls assets (i.e., savings, land, and livestock) in your household? 
* GN4 - Who decides what kind of financial services your household uses? 
* MT2 - Do you personally own a mobile phone? 
* MT10 - Do you personally have an active/working SIM card? 
* DG1 - What year were you born?
* GN2 - Who in your household decides what purchases are made to meet daily household needs like food, clothing, and cleaning supplies? 
* DG4 - What is your highest level of education? 
* DG8a - How many adults do you have in the household? 

In [2]:
# Load only the columns this notebook works with. The training file also
# supplies the `is_female` target; the test file has predictors only.
train = pd.read_csv(
    "../data/train-cleaned.csv",
    usecols=['DL0', 'GN4', 'MT2', 'is_female'],
    low_memory=False,
)
test = pd.read_csv(
    "../data/test-cleaned.csv",
    usecols=['DL0', 'GN4', 'MT2'],
    low_memory=False,
)

In [3]:
# Preview the first five training rows (5 is the default for head()).
train.head(n=5)

Unnamed: 0,is_female,DL0,MT2,GN4
0,1.0,1.0,2.0,99.0
1,1.0,2.0,2.0,2.0
2,1.0,2.0,2.0,2.0
3,1.0,2.0,1.0,99.0
4,1.0,2.0,2.0,1.0


In [4]:
# Preview the first five test rows (5 is the default for head()).
test.head(n=5)

Unnamed: 0,DL0,MT2,GN4
0,2.0,2.0,3.0
1,2.0,1.0,1.0
2,2.0,2.0,2.0
3,2.0,2.0,2.0
4,2.0,2.0,1.0


In [5]:
# (rows, columns) of each frame.
# A parenthesised single-argument print gives the same output on Python 2
# and, unlike the print statement, is valid syntax on Python 3.
print(train.shape)
print(test.shape)

(18255, 4)
(27285, 3)


# B. Hypothesis

If a person owns a mobile phone (MT2) and decides which financial services the household uses (GN4), then that person is male. This person is also likely to be the household head.

In [6]:
# Frequency of each answer code for the two variables in the hypothesis.
# print() is called with one argument so the cell runs on Python 2 and 3;
# the GN4 counts are shown as the cell's last-expression output.
print(train['MT2'].value_counts())
train['GN4'].value_counts()

2.0    9333
1.0    8922
Name: MT2, dtype: int64


1.0     5218
3.0     4793
2.0     4679
4.0     2302
99.0    1177
96.0      86
Name: GN4, dtype: int64

In [7]:
# Merge GN4 codes 4, 96 and 99 into the single 'other' category (code 4).
# Any code missing from the lookup would become NaN.
train['GN4'] = train['GN4'].map({
    1: 1, 2: 2, 3: 3, 4: 4,   # kept as they are
    96: 4, 99: 4,             # folded into 'other'
})
train['GN4'].value_counts()

1    5218
3    4793
2    4679
4    3565
Name: GN4, dtype: int64

In [8]:
# Build one categorical label per (MT2, GN4) combination by concatenating
# the two codes as strings, e.g. MT2 == 2 and GN4 == 4 gives '24'.
# MT2 is cast to int first so the label reads '2', not '2.0'.
MT2_GN4 = train['MT2'].map(int).map(str) + train['GN4'].map(str)
# Single-argument print(): same output on Python 2, valid on Python 3.
print(MT2_GN4[:5])
MT2_GN4.unique()

0    24
1    22
2    22
3    14
4    21
dtype: object


array(['24', '22', '14', '21', '13', '12', '11', '23'], dtype=object)

In [9]:
# One-hot encode the combined MT2/GN4 labels. The labels are reshaped into
# a single column (n_rows, 1) before being passed to the encoder.
# NOTE: MT2_GN4 is rebound here from the label Series to the encoder's
# output, which is densified below with .toarray().
oneHot = OneHotEncoder()
label_column = MT2_GN4.values.reshape(-1, 1)
MT2_GN4 = oneHot.fit_transform(label_column)
MT2_GN4.toarray()

array([[ 0.,  0.,  0., ...,  0.,  0.,  1.],
       [ 0.,  0.,  0., ...,  1.,  0.,  0.],
       [ 0.,  0.,  0., ...,  1.,  0.,  0.],
       ..., 
       [ 1.,  0.,  0., ...,  0.,  0.,  0.],
       [ 1.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  1., ...,  0.,  0.,  0.]])

In [10]:
# Label household head as 1: DL0 == 1 becomes 1, every other code becomes 0.
# A comparison is used instead of .map({1: 1, 2: 0}) so the cell is
# idempotent: re-running it on the already-encoded 0/1 column leaves the
# values unchanged, whereas the dict lookup would turn every 0 into NaN.
train['DL0'] = (train['DL0'] == 1).astype(int)
train['DL0'].head()

0    1
1    0
2    0
3    0
4    0
Name: DL0, dtype: int64

In [11]:
# Design matrix: the dense one-hot MT2/GN4 columns, with the DL0 flag
# appended as one extra column on the right.
x = np.concatenate(
    (MT2_GN4.toarray(), train['DL0'].values.reshape(-1, 1)),
    axis=1,
)
x

array([[ 0.,  0.,  0., ...,  0.,  1.,  1.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 1.,  0.,  0., ...,  0.,  0.,  1.],
       [ 1.,  0.,  0., ...,  0.,  0.,  1.],
       [ 0.,  0.,  1., ...,  0.,  0.,  0.]])

## Linear Models

* Fit and predict logistic regression on MT2 and GN4 after they have been rescaled.

In [12]:
# Rescale MT2 and GN4 to [0, 1].
# The scaler learns its min/max from the training frame only and is then
# reused on the test frame, so both frames go through the same
# transformation. Calling fit_transform on the test frame would re-learn
# the range from the test data and map the same code to different values.
scaler = MinMaxScaler()
train[['MT2', 'GN4']] = scaler.fit_transform(train[['MT2', 'GN4']])
# Fold GN4 codes 96 and 99 into 4 in the test frame as well, mirroring the
# recoding already applied to the training frame, so that no test value
# falls outside the range the scaler was fitted on.
test['GN4'] = test['GN4'].replace({96: 4, 99: 4})
test[['MT2', 'GN4']] = scaler.transform(test[['MT2', 'GN4']])

In [13]:
# 50/50 hold-out split of the rescaled MT2/GN4 columns against the target.
# The fixed random_state makes the split, and therefore the AUC computed
# from it, reproducible across kernel restarts.
xTrainVal, xTestVal, yTrainVal, yTestVal = train_test_split(
    train[['MT2', 'GN4']], train['is_female'], test_size=.5, random_state=42)

In [14]:
# Logistic Regression: fit on the training half, then score the held-out
# half by ROC AUC using column 1 of predict_proba.
lreg = LogisticRegression().fit(xTrainVal, yTrainVal)
yPredVal = lreg.predict_proba(xTestVal)[:, 1]
roc_auc_score(y_true=yTestVal, y_score=yPredVal)

0.69046064957775011

* Fit and predict logistic regression, gradient boosting, and SVM models on the 1-hot encoded values of the cartesian product of MT2 and GN4, together with the DL0 indicator.

In [15]:
# 50/50 hold-out split of the one-hot design matrix `x` against the target.
# The fixed random_state keeps the split identical from run to run, so the
# AUCs of the models fitted below can be reproduced and compared.
xTrainVal, xTestVal, yTrainVal, yTestVal = train_test_split(
    x, train['is_female'], test_size=.5, random_state=42)

In [16]:
# Logistic Regression on the one-hot design matrix; the hold-out ROC AUC
# is computed from column 1 of predict_proba.
lreg = LogisticRegression().fit(xTrainVal, yTrainVal)
yPredVal = lreg.predict_proba(xTestVal)[:, 1]
roc_auc_score(y_true=yTestVal, y_score=yPredVal)

0.88937974147739318

In [17]:
# Gradient Boosting on the one-hot design matrix, scored by hold-out ROC AUC
# from column 1 of predict_proba.
# random_state pins the estimator's internal randomness so the reported
# score can be reproduced.
gb = GradientBoostingClassifier(random_state=42)
gb.fit(xTrainVal, yTrainVal)
yPredVal = gb.predict_proba(xTestVal)[:,1]
roc_auc_score(yTestVal, yPredVal)

0.89450825070097684

In [18]:
# SVM on the one-hot design matrix, scored by hold-out ROC AUC from column 1
# of predict_proba.
# probability=True enables predict_proba; the probability calibration
# shuffles the data internally, so random_state is fixed to make the
# probabilities (and the AUC) reproducible.
svc = SVC(probability=True, random_state=42)
svc.fit(xTrainVal, yTrainVal)
yPredVal = svc.predict_proba(xTestVal)[:,1]
roc_auc_score(yTestVal, yPredVal)

0.84963801339087708