In [1]:
import pandas as pd

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# A. Data

Read the data, which have already been imputed for missing values, keeping only the top features identified by the Random Forest model:

* DL0 -  Who is the main income earner in your household? 
* DG6 - How are you related to the household head? 
* DL1 - In the past 12 months, were you mainly...? (Employed, unemployed, student, etc.)
* MT1A - Who decides on who should have a phone in your household?
* FL4 - What or who do you depend on the most for financial advice?
* DG3 - What is your marital status?
* GN5 - Who decides what kind of financial services you can personally use? 
* GN3 - Who controls assets (i.e., savings, land, and livestock) in your household? 
* GN4 - Who decides what kind of financial services your household uses? 
* MT2 - Do you personally own a mobile phone? 
* MT10 - Do you personally have an active/working SIM card? 
* DG1 - What year were you born?
* GN2 - Who in your household decides what purchases are made to meet daily household needs like food, clothing, and cleaning supplies? 
* DG4 - What is your highest level of education? 
* DG8a - How many adults do you have in the household? 

In [2]:
# Survey columns to load: the top features listed in the notes above.
topFeat = [
    'DL0', 'DG6', 'DL1', 'MT1A', 'FL4',
    'DG3', 'GN5', 'GN3', 'GN4', 'MT2',
    'MT10', 'DG1', 'GN2', 'DG4', 'DG8a',
]
# The training frame also reads the target column `is_female`;
# only the feature columns are read from the test file.
train = pd.read_csv("../data/train-cleaned.csv", usecols=topFeat + ['is_female'], low_memory=False)
test = pd.read_csv("../data/test-cleaned.csv", usecols=topFeat, low_memory=False)

In [3]:
# Preview the first five training rows to sanity-check the loaded columns.
train.head(n=5)

Unnamed: 0,DG1,is_female,DG3,DG4,DG6,DG8a,DL0,DL1,MT1A,MT2,MT10,FL4,GN2,GN3,GN4,GN5
0,1975.0,1.0,3.0,5.0,2.0,4.0,1.0,1.0,99.0,2.0,2.0,99.0,99.0,99.0,99.0,99.0
1,1981.0,1.0,8.0,5.0,2.0,4.0,2.0,7.0,4.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0
2,1995.0,1.0,3.0,2.0,7.0,4.0,2.0,7.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
3,1980.0,1.0,3.0,5.0,2.0,2.0,2.0,7.0,2.0,1.0,1.0,2.0,2.0,2.0,99.0,99.0
4,1958.0,1.0,3.0,6.0,2.0,99.0,2.0,7.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0


In [4]:
# Preview the first five test rows to sanity-check the loaded columns.
test.head(n=5)

Unnamed: 0,DG1,DG3,DG4,DG6,DG8a,DL0,DL1,MT1A,MT2,MT10,FL4,GN2,GN3,GN4,GN5
0,1979.0,8.0,1.0,2.0,3.0,2.0,4.0,2.0,2.0,2.0,2.0,1.0,3.0,3.0,3.0
1,1993.0,1.0,6.0,3.0,5.0,2.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,1.0,1.0
2,1980.0,3.0,6.0,2.0,2.0,2.0,7.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
3,1991.0,3.0,1.0,2.0,2.0,2.0,4.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0
4,1985.0,3.0,6.0,1.0,2.0,2.0,7.0,1.0,2.0,2.0,99.0,1.0,1.0,1.0,1.0


In [5]:
# Report (rows, columns) for both frames.
# The call form of print works on Python 3 and, with a single argument,
# prints the same text on Python 2 as the old print statement did.
print(train.shape)
print(test.shape)

(18255, 16)
(27285, 15)


# B. Hypothesis

If a person owns a mobile phone (MT2) and makes financial decisions (GN4), then that person is male.

In [6]:
# Frequency of each answer code for the two columns used in the hypothesis.
# print is written as a call so the cell runs on Python 3 as well as Python 2;
# the second count is left as the cell's last expression so it is displayed.
print(train['MT2'].value_counts())
train['GN4'].value_counts()

2.0    9333
1.0    8922
Name: MT2, dtype: int64


1.0     5218
3.0     4793
2.0     4679
4.0     2302
99.0    1177
96.0      86
Name: GN4, dtype: int64

In [7]:
# Coalesce 4, 96, 99 as 'other' category (code 4); codes 1-3 are kept as-is.
GN4_RECODE = {1: 1, 2: 2, 3: 3, 4: 4, 96: 4, 99: 4}

# Apply the identical recoding to both frames: test['GN4'] is rescaled together
# with train['GN4'] later in the notebook, so the two must share one encoding.
# NOTE: Series.map turns any code that is absent from the mapping into NaN.
train['GN4'] = train['GN4'].map(GN4_RECODE)
test['GN4'] = test['GN4'].map(GN4_RECODE)

train['GN4'].value_counts()

1    5218
3    4793
2    4679
4    3565
Name: GN4, dtype: int64

In [8]:
# Build one combined categorical label per row by concatenating the MT2 code
# with the GN4 code as strings (e.g. MT2=2 and GN4=4 give "24").
mt2_codes = train['MT2'].astype(int).astype(str)
gn4_codes = train['GN4'].astype(str)
MT2_GN4 = mt2_codes + gn4_codes
MT2_GN4.head()

0    24
1    22
2    22
3    14
4    21
dtype: object

In [9]:
# 1-hot encode the combined MT2/GN4 labels.
# The encoder expects a 2-D input, so the labels are reshaped into a single column.
oneHot = OneHotEncoder()
combined_labels = MT2_GN4.values.reshape(-1, 1)
# MT2_GN4 is rebound here from the label Series to the encoded matrix.
MT2_GN4 = oneHot.fit_transform(combined_labels)
MT2_GN4.toarray()

array([[ 0.,  0.,  0., ...,  0.,  0.,  1.],
       [ 0.,  0.,  0., ...,  1.,  0.,  0.],
       [ 0.,  0.,  0., ...,  1.,  0.,  0.],
       ..., 
       [ 1.,  0.,  0., ...,  0.,  0.,  0.],
       [ 1.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  1., ...,  0.,  0.,  0.]])

## Linear Models

* Fit and predict logistic regression on MT2 and GN4 after they have been rescaled.

In [10]:
# Min-max rescale MT2 and GN4.
# The scaler is fitted on the training frame only; the test frame is then
# transformed with those fitted parameters. Refitting on the test frame would
# map the same raw code to different scaled values in the two frames.
scaler = MinMaxScaler()
train[['MT2', 'GN4']] = scaler.fit_transform(train[['MT2', 'GN4']])
test[['MT2', 'GN4']] = scaler.transform(test[['MT2', 'GN4']])

In [11]:
# Hold out half of the training rows for validation.
# A fixed random_state makes the split, and therefore the reported AUC, reproducible.
xTrainVal, xTestVal, yTrainVal, yTestVal = train_test_split(
    train[['MT2', 'GN4']], train['is_female'], test_size=.5, random_state=42)

In [12]:
# Logistic regression on the two rescaled columns, scored by ROC AUC on the held-out half.
lreg = LogisticRegression().fit(xTrainVal, yTrainVal)
# Column 1 of predict_proba is the probability of the second class in lreg.classes_.
val_probabilities = lreg.predict_proba(xTestVal)
yPredVal = val_probabilities[:, 1]
roc_auc_score(y_true=yTestVal, y_score=yPredVal)

0.69065671188898337

* Fit and predict logistic regression and SVM on 1-hot encoded values of the cartesian product of MT2 and GN4.

In [13]:
# Hold out half of the 1-hot encoded rows for validation.
# A fixed random_state makes the split, and therefore the reported AUCs, reproducible.
xTrainVal, xTestVal, yTrainVal, yTestVal = train_test_split(
    MT2_GN4, train['is_female'], test_size=.5, random_state=42)

In [14]:
# Logistic regression on the 1-hot encoded MT2/GN4 labels, scored by ROC AUC on the held-out half.
lreg = LogisticRegression().fit(xTrainVal, yTrainVal)
# Keep only column 1 of predict_proba (probability of the second class in lreg.classes_).
yPredVal = lreg.predict_proba(xTestVal)[:, 1]
roc_auc_score(y_true=yTestVal, y_score=yPredVal)

0.78220303103341382

In [15]:
# SVM on the 1-hot encoded MT2/GN4 labels, scored by ROC AUC on the held-out half.
# With probability=True scikit-learn estimates class probabilities through an
# internal, randomized cross-validation; fixing random_state keeps the
# predicted probabilities (and the AUC) stable between runs.
svc = SVC(C=1, gamma=1, probability=True, random_state=42)
svc.fit(xTrainVal, yTrainVal)
# Column 1 of predict_proba is the probability of the second class in svc.classes_.
yPredVal = svc.predict_proba(xTestVal)[:, 1]
roc_auc_score(yTestVal, yPredVal)

0.72551603406992271