In [1]:
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score

# A. Data

Read data that were imputed for missing values and also were discovered as the top features from the Random Forest model:

* DL0 -  Who is the main income earner in your household? 
* DG6 - How are you related to the household head? 
* DL1 - In the past 12 months, were you mainly...? (Employed, unemployed, student, etc.)
* MT1A - Who decides on who should have a phone in your household?
* FL4 - What or who do you depend on the most for financial advice?
* DG3 - What is your marital status?
* GN5 - Who decides what kind of financial services you can personally use? 
* GN3 - Who controls assets (i.e., savings, land, and livestock) in your household? 
* GN4 - Who decides what kind of financial services your household uses? 
* MT2 - Do you personally own a mobile phone? 
* MT10 - Do you personally have an active/working SIM card? 
* DG1 - What year were you born?
* GN2 - Who in your household decides what purchases are made to meet daily household needs like food, clothing, and cleaning supplies? 
* DG4 - What is your highest level of education? 
* DG8a - How many adults do you have in the household? 
* AA15 
* AA14

In [2]:
topFeat = ['DL0', 'DG6', 'DL1', 'MT1A', 'FL4', 'DG3', 'GN5', 'GN3', 'GN4', 'MT2', 'MT10', 'DG1', 'GN2', 'DG4', 'DG8a']
train = pd.read_csv("../data/train-cleaned.csv", low_memory=False, usecols=topFeat+['is_female'])
test = pd.read_csv("../data/test-cleaned.csv", low_memory=False, usecols=topFeat)

In [3]:
train.head()

Unnamed: 0,DG1,is_female,DG3,DG4,DG6,DG8a,DL0,DL1,MT1A,MT2,MT10,FL4,GN2,GN3,GN4,GN5
0,1975.0,1.0,3.0,5.0,2.0,4.0,1.0,1.0,99.0,2.0,2.0,99.0,99.0,99.0,99.0,99.0
1,1981.0,1.0,8.0,5.0,2.0,4.0,2.0,7.0,4.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0
2,1995.0,1.0,3.0,2.0,7.0,4.0,2.0,7.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
3,1980.0,1.0,3.0,5.0,2.0,2.0,2.0,7.0,2.0,1.0,1.0,2.0,2.0,2.0,99.0,99.0
4,1958.0,1.0,3.0,6.0,2.0,99.0,2.0,7.0,1.0,2.0,2.0,1.0,1.0,1.0,1.0,1.0


In [4]:
test.head()

Unnamed: 0,DG1,DG3,DG4,DG6,DG8a,DL0,DL1,MT1A,MT2,MT10,FL4,GN2,GN3,GN4,GN5
0,1979.0,8.0,1.0,2.0,3.0,2.0,4.0,2.0,2.0,2.0,2.0,1.0,3.0,3.0,3.0
1,1993.0,1.0,6.0,3.0,5.0,2.0,1.0,1.0,1.0,1.0,3.0,1.0,1.0,1.0,1.0
2,1980.0,3.0,6.0,2.0,2.0,2.0,7.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
3,1991.0,3.0,1.0,2.0,2.0,2.0,4.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0
4,1985.0,3.0,6.0,1.0,2.0,2.0,7.0,1.0,2.0,2.0,99.0,1.0,1.0,1.0,1.0


In [5]:
print train.shape
print test.shape

(18255, 16)
(27285, 15)


## 1. Coalesce Data Values

### General Profile

* DG6 - How are you related to the household head?

Replace data values by [consanguinity](https://en.wikipedia.org/wiki/Consanguinity). That is, parents and children are 1st degree relatives to you; grandparents, siblings, and grandchildren are 2nd degree relatives to you. Also, set 'myself' = 0 and 'spouse' = -1.

In [6]:
train['DG6'] = train['DG6'].map({1:0, 2:-1, 3:1, 4:1, 5:2, 6:2, 7:4, 9:9, 99:9})
test['DG6'] = test['DG6'].map({1:0, 2:-1, 3:1, 4:1, 5:2, 6:2, 7:4, 9:9, 99:9})

* DG3 - What is your marital status?

Set polygamy = 0, monogamy = -1, divorced = 1, separated = 2, living together = 3, single = 4, widow = 5, other = 9

In [7]:
train['DG3'] = train['DG3'].map({1:4, 2:0, 3:-1, 4:1, 5:2, 6:5, 7:3, 8:3, 96:9, 99:9})
test['DG3'] = test['DG3'].map({1:4, 2:0, 3:-1, 4:1, 5:2, 6:5, 7:3, 8:3, 96:9, 99:9})

* DG4 - What is your highest level of education?

The main categories are: illiterate, below primary school, primary school, middle school, secondary school, post-secondary, vocational degree, graduate, post-graduate, other

In [8]:
train['DG4'] = train['DG4'].map({1:0, 2:-1, 3:-1, 4:1, 5:2, 6:3, 7:4, 8:5, 9:5, 10:6, 11:7, 96:9, 99:9})
test['DG4'] = test['DG4'].map({1:0, 2:-1, 3:-1, 4:1, 5:2, 6:3, 7:4, 8:5, 9:5, 10:6, 11:7, 96:9, 99:9})

* DG1 - What year were you born?

Calculate the person's age, then cut age into 7 bins as follows: (15,25] = 1, (25,35] = 2, (35,45] = 3, (45,55] = 4, (55,65] = 5, (65,75] = 6, (75,100] = 7

In [9]:
ageTr = train['DG1'].apply(lambda x: 2017-x)
ageTr.describe()

count    18255.000000
mean        38.926815
std         14.740675
min         16.000000
25%         27.000000
50%         36.000000
75%         48.000000
max        100.000000
Name: DG1, dtype: float64

In [10]:
ageTe = test['DG1'].apply(lambda x: 2017-x)
ageTe.describe()

count    27285.000000
mean        39.001539
std         14.933928
min         16.000000
25%         27.000000
50%         36.000000
75%         49.000000
max        100.000000
Name: DG1, dtype: float64

In [11]:
train['DG1'] = pd.cut(ageTr, bins=[15,25,35,45,55,65,75,100], labels=[1,2,3,4,5,6,7])
test['DG1'] = pd.cut(ageTe, bins=[15,25,35,45,55,65,75,100], labels=[1,2,3,4,5,6,7])

* DG8a - How many adults do you have in the household?

In [12]:
print train['DG8a'].value_counts()
test['DG8a'].value_counts()

2.0     7649
3.0     3920
4.0     2892
1.0     1439
5.0     1187
6.0      563
99.0     229
7.0      220
8.0       88
9.0       41
10.0      23
0.0        3
13.0       1
Name: DG8a, dtype: int64


2.0     11579
3.0      5811
4.0      4231
1.0      2155
5.0      1697
6.0       877
99.0      382
7.0       316
8.0       140
9.0        63
10.0       28
0.0         4
11.0        2
Name: DG8a, dtype: int64

In [13]:
train['DG8a'] = train['DG8a'].map(lambda x: 10 if x>=7 or x==0 else x)
test['DG8a'] = test['DG8a'].map(lambda x: 10 if x>=7 or x==0 else x)

* Not relabeling DL0 - Who is the main income earner in your household?

### Employment

* In the past 12 months, were you mainly...? (Employed, unemployed, student, etc.)

Combine all work descriptions for which the salary is regular, all work yielding irregular salary, and all reasons preventing from work.

In [14]:
train['DL1'] = train['DL1'].map({1:0, 2:0, 3:3, 4:3, 5:1, 6:4, 7:2, 8:5, 9:6, 10:6, 96:10, 99:10})
test['DL1'] = test['DL1'].map({1:0, 2:0, 3:3, 4:3, 5:1, 6:4, 7:2, 8:5, 9:6, 10:6, 96:10, 99:10})

### Mobile Phone

* MT1A - Who decides on who should have a phone in your household?

Replace data values by consanguinity.

In [15]:
train['MT1A'] = train['MT1A'].map({1:0, 2:-1, 3:1, 4:1, 5:2, 8:2, 99:9})
test['MT1A'] = test['MT1A'].map({1:0, 2:-1, 3:1, 4:1, 5:2, 8:2, 99:9})

Not re-labeling:

MT2 - Do you personally own a mobile phone?  
MT10 - Do you personally have an active/working SIM card?

### Financials

* FL4 - What or who do you depend on the most for financial advice?

The main categories are: myself, spouse, for-profit institutions (bank, insurance companies, MFI), other people (lending groups, friends, family), superstition (religion, supernatural being), ads (radio, TV, internet, newspaper, billboards, leaflets), non-profit institutions (NGO, government), other

In [16]:
train['FL4'] = train['FL4'].map({1:0, 2:-1, 3:1, 4:1, 5:1, 6:2, 8:2, 9:3, 10:3, 11:3, 12:3, 13:3, 16:3, 14:4, 15:4, 7:5, 17:5, 96:9, 99:9})
test['FL4'] = test['FL4'].map({1:0, 2:-1, 3:1, 4:1, 5:1, 6:2, 8:2, 9:3, 10:3, 11:3, 12:3, 13:3, 16:3, 14:4, 15:4, 7:5, 17:5, 96:9, 99:9})

### General Decisions

Set decisions made by self = 0, spouse = -1, joint decision = 1, parents = 2, other = 9

In [17]:
train['GN5'] = train['GN5'].map({1:0, 2:-1, 3:1, 4:2, 96:9, 99:9})
test['GN5'] = test['GN5'].map({1:0, 2:-1, 3:1, 4:2, 96:9, 99:9})

In [18]:
train['GN4'] = train['GN4'].map({1:0, 2:-1, 3:1, 4:2, 96:9, 99:9})
test['GN4'] = test['GN4'].map({1:0, 2:-1, 3:1, 4:2, 96:9, 99:9})

In [19]:
train['GN3'] = train['GN3'].map({1:0, 2:-1, 3:1, 4:2, 96:9, 99:9})
test['GN3'] = test['GN3'].map({1:0, 2:-1, 3:1, 4:2, 96:9, 99:9})

In [20]:
train['GN2'] = train['GN2'].map({1:0, 2:-1, 3:1, 4:2, 96:9, 99:9})
test['GN2'] = test['GN2'].map({1:0, 2:-1, 3:1, 4:2, 96:9, 99:9})

In [21]:
train.head()

Unnamed: 0,DG1,is_female,DG3,DG4,DG6,DG8a,DL0,DL1,MT1A,MT2,MT10,FL4,GN2,GN3,GN4,GN5
0,3,1.0,-1,2,-1,4.0,1.0,0,9,2.0,2.0,9,9,9,9,9
1,3,1.0,3,2,-1,4.0,2.0,2,1,2.0,2.0,-1,0,-1,-1,-1
2,1,1.0,-1,-1,4,4.0,2.0,2,-1,2.0,2.0,-1,-1,-1,-1,-1
3,3,1.0,-1,2,-1,2.0,2.0,2,-1,1.0,1.0,-1,-1,-1,9,9
4,5,1.0,-1,3,-1,10.0,2.0,2,0,2.0,2.0,0,0,0,0,0


In [22]:
test.head()

Unnamed: 0,DG1,DG3,DG4,DG6,DG8a,DL0,DL1,MT1A,MT2,MT10,FL4,GN2,GN3,GN4,GN5
0,3,3,0,-1,3.0,2.0,3,-1,2.0,2.0,-1,0,1,1,1
1,1,4,3,1,5.0,2.0,0,0,1.0,1.0,1,0,0,0,0
2,3,-1,3,-1,2.0,2.0,2,-1,2.0,2.0,-1,-1,-1,-1,-1
3,2,-1,0,-1,2.0,2.0,3,-1,2.0,1.0,-1,-1,-1,-1,-1
4,2,-1,3,0,2.0,2.0,2,0,2.0,2.0,9,0,0,0,0


## 2. Rescale Data Values

Since non-tree based models are not scale invariant, use MinMaxScaler() to rescale the data from [0,1].

In [23]:
scaler = MinMaxScaler()
train[train.columns] = scaler.fit_transform(train[train.columns]) 
test[test.columns] = scaler.fit_transform(test[test.columns]) 

In [24]:
train.head()

Unnamed: 0,DG1,is_female,DG3,DG4,DG6,DG8a,DL0,DL1,MT1A,MT2,MT10,FL4,GN2,GN3,GN4,GN5
0,0.333333,1.0,0.0,0.3,0.0,0.333333,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,0.333333,1.0,0.4,0.3,0.0,0.333333,1.0,0.2,0.2,1.0,1.0,0.0,0.1,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.5,0.333333,1.0,0.2,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.333333,1.0,0.0,0.3,0.0,0.111111,1.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
4,0.666667,1.0,0.0,0.4,0.0,1.0,1.0,0.2,0.1,1.0,1.0,0.1,0.1,0.1,0.1,0.1


In [25]:
test.head()

Unnamed: 0,DG1,DG3,DG4,DG6,DG8a,DL0,DL1,MT1A,MT2,MT10,FL4,GN2,GN3,GN4,GN5
0,0.333333,0.4,0.1,0.0,0.222222,1.0,0.3,0.0,1.0,1.0,0.0,0.1,0.2,0.2,0.2
1,0.0,0.5,0.4,0.2,0.444444,1.0,0.0,0.1,0.0,0.0,0.2,0.1,0.1,0.1,0.1
2,0.333333,0.0,0.4,0.0,0.111111,1.0,0.2,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.166667,0.0,0.1,0.0,0.111111,1.0,0.3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.166667,0.0,0.4,0.1,0.111111,1.0,0.2,0.1,1.0,1.0,1.0,0.1,0.1,0.1,0.1


# B. SVM

* The regularization parameter (C) determines how much space should be around the decision boundary to prevent misclassification. 
* The gamma parameter in SVM dictates how far should it reach to the training samples to determine the decision boundary. If decreasing gamma increases accuracy, it means that the data are sparse and far away from the decision boundary. 
* Kernels calculate the separation line in higher dimension.

In [26]:
xTrain = train.loc[:, ~train.columns.isin(["is_female"])]
yTrain = train["is_female"]

In [27]:
# gridSVC = GridSearchCV(SVC(), 
#                        param_grid={
#                           "C": [1, 10, 100, 1000],
#                           "gamma": [1, .1, .01, .001]})
# gridSVC.fit(xTrain, yTrain)
# gridSVC.best_params_

In [28]:
xTrainVal, xTestVal, yTrainVal, yTestVal = train_test_split(xTrain, yTrain, test_size=.5)

In [29]:
model = SVC(C=1000, gamma=.1, probability=True)
model.fit(xTrainVal, yTrainVal)

SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [30]:
yPredVal = model.predict_proba(xTestVal)[:,1]
roc_auc_score(yTestVal, yPredVal)

0.9272308095706896

In [31]:
yTest = model.predict_proba(test)[:,1]
submissionSVC = pd.DataFrame({"test_id": range(0, len(yTest)), "is_female": yTest})
submissionSVC = submissionSVC[["test_id", "is_female"]]  # change column order
submissionSVC.to_csv("../submission/submissionSVC.csv", index=False)