In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split

# A. Read Data

In [2]:
train = pd.read_csv("../data/train.csv", low_memory=False)
train.drop(["train_id"], axis=1, inplace=True)
train.head()

Unnamed: 0,AA3,AA4,AA5,AA6,AA7,AA14,AA15,DG1,is_female,DG3,...,GN1,GN1_OTHERS,GN2,GN2_OTHERS,GN3,GN3_OTHERS,GN4,GN4_OTHERS,GN5,GN5_OTHERS
0,3,32,3.0,,323011,3854,481,1975,1,3,...,99.0,,99,,99,,99,,99,
1,2,26,,8.0,268131,2441,344,1981,1,8,...,,,1,,2,,2,,2,
2,1,16,,7.0,167581,754,143,1995,1,3,...,1.0,,2,,2,,2,,2,
3,4,44,5.0,,445071,5705,604,1980,1,3,...,,,2,,2,,99,,99,
4,4,43,,6.0,436161,5645,592,1958,1,3,...,,,1,,1,,1,,1,


In [3]:
# Train data dimensions
train.shape

(18255, 1234)

In [4]:
test = pd.read_csv("../data/test.csv", low_memory=False)
test.drop(["test_id"], axis=1, inplace=True)
test.head()

Unnamed: 0,AA3,AA4,AA5,AA6,AA7,AA14,AA15,DG1,DG3,DG3A,...,GN1,GN1_OTHERS,GN2,GN2_OTHERS,GN3,GN3_OTHERS,GN4,GN4_OTHERS,GN5,GN5_OTHERS
0,4,41,,7.0,417211,4479,535,1979,8,1,...,2.0,,1,,3,,3,,3,
1,3,32,2.0,,322011,3803,476,1993,1,4,...,1.0,,1,,1,,1,,1,
2,3,36,5.0,,365011,5610,585,1980,3,99,...,2.0,,2,,2,,2,,2,
3,2,24,,7.0,247061,2550,350,1991,3,2,...,2.0,,2,,2,,2,,2,
4,3,35,,8.0,358071,3233,400,1985,3,4,...,1.0,,1,,1,,1,,1,


In [5]:
test.shape

(27285, 1233)

In [6]:
# Variables listed in the data dictionary
var = list(pd.read_excel("../data/data-dictionary.xlsx", 
                         sheet_name="Codebook")["Column Name"]) + \
      list(pd.read_excel("../data/data-dictionary.xlsx", 
                         sheet_name="AA Locations", 
                         header=None)[0].dropna())
len(var)

1113

## 1. Discrepancy Between Data and its Dictionary

In [7]:
# Number of variables present in train, but NOT present in dictionary (A-B)
len(set(train.columns) - set(var))

155

In [8]:
# Variables present in test, but NOT present in train (A-B)
set(test.columns) - set(train.columns)

set()

In [9]:
# Variables present in train, but NOT present in test
set(train.columns) - set(test.columns)

{'is_female'}

In [10]:
# Number of variables present in dictionary, but NOT present in train
len(set(var) - set(train.columns))

30

## 2. Discover NaNs

In [11]:
# Empty columns with no data
emptyCol = train.columns[train.isnull().sum()==len(train)]
emptyCol

Index([u'DG4_OTHERS', u'G2P2_2_OTHERS', u'G2P2_10_OTHERS', u'G2P2_12_OTHERS',
       u'G2P2_15_OTHERS', u'MT12_99', u'MT13_4_OTHERS', u'MT13_96_OTHERS',
       u'MT14_3_OTHERS', u'MT14_5_OTHERS', u'MT14_7_OTHERS', u'MM3_15',
       u'MM3_16', u'MM4_16', u'MM5_4', u'MM5_5', u'MM5_15', u'MM5_16',
       u'MM5A_4', u'MM5A_5', u'MM5A_15', u'MM5A_16', u'MM6_16', u'MM7_4',
       u'MM7_5', u'MM7_15', u'MM7_16', u'MM8_15', u'MM8_16', u'MM11_4',
       u'MM11_5', u'MM11_5_OTHERS', u'MM11_11_OTHERS', u'MM11_15', u'MM11_16',
       u'MM15_OTHERS', u'MM17_13', u'MM17_15', u'MM17_17', u'MM17_19',
       u'MM17_22', u'MM17_96', u'MM17A', u'MM38_OTHERS', u'MM40_14',
       u'MM40_96', u'MMP4_7', u'MMP4_8', u'MMP4_96', u'FB28_3_OTHERS'],
      dtype='object')

In [12]:
# Remove empty columns from train and test data
train.drop(emptyCol, axis=1, inplace=True)
test.drop(emptyCol, axis=1, inplace=True)
print train.shape
print test.shape

(18255, 1184)
(27285, 1183)


In [13]:
# Columns where atleast 50% of the data are missing
sparseCol = train.columns[train.isnull().sum()>=len(train)/2.]
sparseCol

Index([u'AA5', u'DG3A_OTHERS', u'DG9b', u'DG9c', u'DG10b', u'DG10c', u'DG11b',
       u'DG11c', u'DG12B_1', u'DG12B_2',
       ...
       u'FB28_1_OTHERS', u'FB28_2_OTHERS', u'FB28_4_OTHERS', u'FB28_96_OTHERS',
       u'FB29_OTHERS', u'GN1_OTHERS', u'GN2_OTHERS', u'GN3_OTHERS',
       u'GN4_OTHERS', u'GN5_OTHERS'],
      dtype='object', length=823)

In [14]:
# NaN counts for columns with atleast 50% missing data
train[sparseCol].isnull().sum().unique()

array([12602, 18205, 10018, 11451, 11678, 12862, 12002, 13158, 14691,
       18246, 17508, 18222, 17974,  9257, 18153, 18021, 17947, 10985,
       15815, 18238, 18144, 14264, 14628, 18249, 15645, 18182, 17704,
       18232, 18241, 18179, 18235, 18208, 17524, 17965, 17860, 18212,
       18011, 18200, 18001, 18251, 18227, 18254, 18253, 18250,  9333,
       18244, 18230, 15095, 18240, 12082, 18234, 11243, 18233, 17672,
       15596, 17805, 18046, 18218, 18245, 16572, 18247, 18003, 16681,
       18055, 18252, 17572, 11285, 14806, 18199, 18135, 11496, 18248,
       17876, 18229, 11310,  9987, 18048, 18101, 18155, 18215, 18112,
       18133, 18049, 18217, 18124, 18116, 18170, 18242, 18209, 18183,
       17788, 17890, 17352, 18169, 18187, 18054, 18198, 18143, 18211,
       18194, 18125, 18180, 17752, 18157, 18239, 18223, 18220, 18216,
       18236, 18128, 18181, 17182, 18243, 17183, 18204, 18190, 18172,
       18191, 18219, 18201, 18224, 18226, 18207, 18210, 18214, 18031,
       18225, 18062,

In [15]:
# Remove columns with atleast 50% missing data from train and test data
train.drop(sparseCol, axis=1, inplace=True)
test.drop(sparseCol, axis=1, inplace=True)
print train.shape
print test.shape

(18255, 361)
(27285, 360)


In [16]:
# Variables present in train, but NOT present in data dictionary (A-B)
set(train.columns) - set(var)

{'DL14', 'DL4_96', 'DL4_99', 'LN2_RIndLngBEOth', 'LN2_WIndLngBEOth'}

According to the data dictionary:  
DL4_96 = DL4_24,  
DL4_99 = DL4_25,   
LN2_RIndLngBEOth = LN2_3 Reading specification,   
LN2_WIndLngBEOth = LN2_4 Writing specification,  
DL14 description is unknown

Let's rename columns DL4_96 and DL4_99 in train and test data to match with the dictionary labels.

In [17]:
# Rename columns in train and test data to match with dictionary labels
train.rename(columns={"DL4_96": "DL4_24", "DL4_99": "DL4_25"}, inplace=True)
test.rename(columns={"DL4_96": "DL4_24", "DL4_99": "DL4_25"}, inplace=True)

In [18]:
# 123 columns have NaN in train data
missColTr = train.columns[train.isnull().sum()>0]
missColTr

Index([u'AA6', u'DG9a', u'DL5', u'MT1A', u'MT7', u'MT15', u'MT17_1', u'MT17_2',
       u'MT17_3', u'MT17_4',
       ...
       u'FB19B_2', u'FB19B_3', u'FB19B_4', u'FB19B_5', u'FB19B_96', u'FB20',
       u'FB24', u'LN2_RIndLngBEOth', u'LN2_WIndLngBEOth', u'GN1'],
      dtype='object', length=123)

In [19]:
# Counts of NaN in train data
train[missColTr].isnull().sum().unique()

array([5653,  232, 6806,  757, 8922, 7012, 6173, 6759, 6380, 7839, 8670,
         86,  425,   37,   11,    6,   47,   21,   10,    7,   34,   20,
        159,   16, 2129, 5293, 7319, 7397, 4049, 6580,  769,  444,  297,
        247,  263,   63, 1494, 5541, 6914, 6911, 4025])

In [20]:
# 123 columns have NaN in test data
missColTe = test.columns[test.isnull().sum()>0]
missColTe

Index([u'AA6', u'DG9a', u'DL5', u'MT1A', u'MT7', u'MT15', u'MT17_1', u'MT17_2',
       u'MT17_3', u'MT17_4',
       ...
       u'FB19B_2', u'FB19B_3', u'FB19B_4', u'FB19B_5', u'FB19B_96', u'FB20',
       u'FB24', u'LN2_RIndLngBEOth', u'LN2_WIndLngBEOth', u'GN1'],
      dtype='object', length=123)

In [21]:
# Counts of NaN in test data
test[missColTe].isnull().sum().unique()

array([ 8459,   386, 10207,  1158, 13479, 10590,  8976, 10134,  9622,
       11736, 12977,   141,   658,    35,     5,     8,    93,    32,
          51,    13,    15,    68,    31,   272,    25,  3169,  7846,
       10911, 10946,  5999,  9869,  1127,   716,   462,   374,   381,
          74,  2227,  8281, 10204, 10224,  6143])

In [22]:
# 123 columns with NaN are the same in train and test data
set(missColTr).symmetric_difference(set(missColTe))

set()

## 3. Clean Strings

In [23]:
# Variables that contain strings
print train.columns[train.dtypes==object]
print test.columns[test.dtypes==object]

Index([u'LN2_RIndLngBEOth', u'LN2_WIndLngBEOth'], dtype='object')
Index([u'LN2_RIndLngBEOth', u'LN2_WIndLngBEOth'], dtype='object')


### LN2_RIndLngBEOth

In [24]:
# frequency table
train["LN2_RIndLngBEOth"].value_counts()

Hindi                        5198
Marathi                      1023
Tamil                         941
Telugu                        914
Bengali                       891
Oriya                         478
Gujarati                      410
Kannada                       399
Malayalam                     293
Punjabi                       194
Assamese                      180
Marathi & Hindi                85
English                        43
Urdu                           39
Manipuri                       36
Chattisgari                    31
None                           21
Rajasthani                     21
Konkani                        13
Tribal Language                12
Bhojpuri                       11
Hindi & Marathi                11
Nepali                         10
Mewari                         10
Thadou                         10
Hindi & Urdu                    7
Hindi & Rajasthani              6
Govan                           6
Karbi                           5
sanskrit      

In [25]:
# Convert all strings to lower case to solve Hindi vs HIndi issue
train["LN2_RIndLngBEOth"] = train["LN2_RIndLngBEOth"].str.lower()
train["LN2_WIndLngBEOth"] = train["LN2_WIndLngBEOth"].str.lower()
test["LN2_RIndLngBEOth"] = test["LN2_RIndLngBEOth"].str.lower()
test["LN2_WIndLngBEOth"] = test["LN2_WIndLngBEOth"].str.lower()

In [26]:
# If multiple languages listed, coalesce to the one with highest frequency
train["LN2_RIndLngBEOth"].loc[train["LN2_RIndLngBEOth"].str.contains("hindi", na=False)] = "hindi"
train["LN2_RIndLngBEOth"].loc[train["LN2_RIndLngBEOth"].str.contains("marathi", na=False)] = "marathi"
train["LN2_RIndLngBEOth"].loc[train["LN2_RIndLngBEOth"].str.contains("kannada", na=False)] = "kannada"
# train["LN2_RIndLngBEOth"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [27]:
test["LN2_RIndLngBEOth"].loc[test["LN2_RIndLngBEOth"].str.contains("hindi", na=False)] = "hindi"
test["LN2_RIndLngBEOth"].loc[test["LN2_RIndLngBEOth"].str.contains("marathi", na=False)] = "marathi"
test["LN2_RIndLngBEOth"].loc[test["LN2_RIndLngBEOth"].str.contains("tamil", na=False)] = "tamil"
test["LN2_RIndLngBEOth"].loc[test["LN2_RIndLngBEOth"].str.contains("kannada", na=False)] = "kannada"
# test["LN2_RIndLngBEOth"].value_counts()

In [28]:
# Coalesce languages with frequency <100, into 'other' category
# This takes care of test data having languages that train data doesn't have
readTr = train["LN2_RIndLngBEOth"].value_counts()
train["LN2_RIndLngBEOth"].replace(readTr[readTr<=100].index, "other", inplace=True)
train["LN2_RIndLngBEOth"].value_counts()

hindi        5327
marathi      1027
tamil         941
telugu        914
bengali       891
oriya         478
gujarati      410
kannada       405
malayalam     293
other         281
punjabi       194
assamese      180
Name: LN2_RIndLngBEOth, dtype: int64

In [29]:
readTe = test["LN2_RIndLngBEOth"].value_counts()
test["LN2_RIndLngBEOth"].replace(readTe[readTe<=100].index, "other", inplace=True)
test["LN2_RIndLngBEOth"].value_counts()

hindi        7847
marathi      1641
tamil        1455
telugu       1406
bengali      1360
oriya         685
gujarati      665
kannada       589
other         455
malayalam     453
punjabi       273
assamese      252
Name: LN2_RIndLngBEOth, dtype: int64

In [30]:
# Languages in test data that are NOT in train data
set(test["LN2_RIndLngBEOth"]) - set(train["LN2_RIndLngBEOth"])

set()

### LN2_WIndLngBEOth

In [31]:
train["LN2_WIndLngBEOth"].loc[train["LN2_WIndLngBEOth"].str.contains("hindi", na=False)] = "hindi"
train["LN2_WIndLngBEOth"].loc[train["LN2_WIndLngBEOth"].str.contains("kannada", na=False)] = "kannada"
train["LN2_WIndLngBEOth"].loc[train["LN2_WIndLngBEOth"].str.contains("marathi", na=False)] = "marathi"
# train["LN2_WIndLngBEOth"].value_counts()

In [32]:
test["LN2_WIndLngBEOth"].loc[test["LN2_WIndLngBEOth"].str.contains("hindi", na=False)] = "hindi"
test["LN2_WIndLngBEOth"].loc[test["LN2_WIndLngBEOth"].str.contains("marathi", na=False)] = "marathi"
test["LN2_WIndLngBEOth"].loc[test["LN2_WIndLngBEOth"].str.contains("telugu", na=False)] = "telugu"
test["LN2_WIndLngBEOth"].loc[test["LN2_WIndLngBEOth"].str.contains("tamil", na=False)] = "tamil"
# test["LN2_WIndLngBEOth"].value_counts()

In [33]:
writeTr = train["LN2_WIndLngBEOth"].value_counts()
train["LN2_WIndLngBEOth"].replace(writeTr[writeTr<=100].index, "other", inplace=True)
train["LN2_WIndLngBEOth"].value_counts()

hindi        5380
bengali      1083
tamil         949
telugu        895
marathi       834
oriya         504
gujarati      469
kannada       419
other         293
malayalam     255
assamese      134
punjabi       129
Name: LN2_WIndLngBEOth, dtype: int64

In [34]:
writeTe = test["LN2_WIndLngBEOth"].value_counts()
test["LN2_WIndLngBEOth"].replace(writeTe[writeTe<=100].index, "other", inplace=True)
test["LN2_WIndLngBEOth"].value_counts()

hindi        7966
bengali      1691
tamil        1466
telugu       1397
marathi      1382
oriya         710
gujarati      688
kannada       587
malayalam     396
other         391
punjabi       197
assamese      190
Name: LN2_WIndLngBEOth, dtype: int64

In [35]:
set(test["LN2_WIndLngBEOth"]) - set(train["LN2_WIndLngBEOth"])

set()

### Convert Strings to Float

Sklearn requires strings to be converted to floats. One such way to do it is to use the pd.factorize() function. 

In [36]:
# Since the Imputer only accepts numeric data, fill missing languages with Hindi and 
# then convert those strings to float
train["LN2_RIndLngBEOth"].fillna("hindi", inplace=True)
train["LN2_WIndLngBEOth"].fillna("hindi", inplace=True)
test["LN2_RIndLngBEOth"].fillna("hindi", inplace=True)
test["LN2_WIndLngBEOth"].fillna("hindi", inplace=True)

In [37]:
# Change strings to digit keys
train["LN2_R"] = pd.factorize(train["LN2_RIndLngBEOth"])[0]
train["LN2_W"] = pd.factorize(train["LN2_WIndLngBEOth"])[0]
test["LN2_R"] = pd.factorize(test["LN2_RIndLngBEOth"])[0]
test["LN2_W"] = pd.factorize(test["LN2_WIndLngBEOth"])[0]

In [38]:
# 12 keys for 12 languages
test["LN2_W"].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

## 4. Impute NaNs

Algorithms in Sklearn can't handle NaNs, so impute them.

In [39]:
# Train data predictors
xTr = train.loc[:, ~train.columns.isin(["is_female", "LN2_RIndLngBEOth", "LN2_WIndLngBEOth"])]
xTr.head()

Unnamed: 0,AA3,AA4,AA6,AA7,AA14,AA15,DG1,DG3,DG3A,DG4,...,LN2_2,LN2_3,LN2_4,GN1,GN2,GN3,GN4,GN5,LN2_R,LN2_W
0,3,32,,323011,3854,481,1975,3,4,5,...,1,1,1,99.0,99,99,99,99,0,0
1,2,26,8.0,268131,2441,344,1981,8,4,5,...,1,3,4,,1,2,2,2,1,1
2,1,16,7.0,167581,754,143,1995,3,2,2,...,1,2,2,1.0,2,2,2,2,0,0
3,4,44,,445071,5705,604,1980,3,4,5,...,1,4,5,,2,2,99,99,2,2
4,4,43,6.0,436161,5645,592,1958,3,4,6,...,4,4,4,,1,1,1,1,3,3


In [40]:
# Test data predictors
xTe = test.loc[:, ~test.columns.isin(["is_female", "LN2_RIndLngBEOth", "LN2_WIndLngBEOth"])]
xTe.head()

Unnamed: 0,AA3,AA4,AA6,AA7,AA14,AA15,DG1,DG3,DG3A,DG4,...,LN2_2,LN2_3,LN2_4,GN1,GN2,GN3,GN4,GN5,LN2_R,LN2_W
0,4,41,7.0,417211,4479,535,1979,8,1,1,...,1,1,1,2.0,1,3,3,3,0,0
1,3,32,,322011,3803,476,1993,1,4,6,...,5,5,5,1.0,1,1,1,1,1,0
2,3,36,,365011,5610,585,1980,3,99,6,...,5,5,5,2.0,2,2,2,2,2,1
3,2,24,7.0,247061,2550,350,1991,3,2,1,...,1,1,1,2.0,2,2,2,2,0,0
4,3,35,8.0,358071,3233,400,1985,3,4,6,...,1,1,1,1.0,1,1,1,1,0,0


In [41]:
# Since all data are categorical, replace NaNs with the most frequent levels
imputer = Imputer(missing_values="NaN", strategy="most_frequent", axis=0, copy=False)

In [42]:
xTrain = imputer.fit_transform(xTr)
xTest = imputer.fit_transform(xTe)

In [43]:
# Imputed train data predictors
xTrain = pd.DataFrame(xTrain, columns=list(xTr))
xTrain.head()

Unnamed: 0,AA3,AA4,AA6,AA7,AA14,AA15,DG1,DG3,DG3A,DG4,...,LN2_2,LN2_3,LN2_4,GN1,GN2,GN3,GN4,GN5,LN2_R,LN2_W
0,3.0,32.0,6.0,323011.0,3854.0,481.0,1975.0,3.0,4.0,5.0,...,1.0,1.0,1.0,99.0,99.0,99.0,99.0,99.0,0.0,0.0
1,2.0,26.0,8.0,268131.0,2441.0,344.0,1981.0,8.0,4.0,5.0,...,1.0,3.0,4.0,1.0,1.0,2.0,2.0,2.0,1.0,1.0
2,1.0,16.0,7.0,167581.0,754.0,143.0,1995.0,3.0,2.0,2.0,...,1.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,0.0,0.0
3,4.0,44.0,6.0,445071.0,5705.0,604.0,1980.0,3.0,4.0,5.0,...,1.0,4.0,5.0,1.0,2.0,2.0,99.0,99.0,2.0,2.0
4,4.0,43.0,6.0,436161.0,5645.0,592.0,1958.0,3.0,4.0,6.0,...,4.0,4.0,4.0,1.0,1.0,1.0,1.0,1.0,3.0,3.0


In [44]:
# Imputed test data predictors
xTest = pd.DataFrame(xTest, columns=list(xTe))
xTest.head()

Unnamed: 0,AA3,AA4,AA6,AA7,AA14,AA15,DG1,DG3,DG3A,DG4,...,LN2_2,LN2_3,LN2_4,GN1,GN2,GN3,GN4,GN5,LN2_R,LN2_W
0,4.0,41.0,7.0,417211.0,4479.0,535.0,1979.0,8.0,1.0,1.0,...,1.0,1.0,1.0,2.0,1.0,3.0,3.0,3.0,0.0,0.0
1,3.0,32.0,6.0,322011.0,3803.0,476.0,1993.0,1.0,4.0,6.0,...,5.0,5.0,5.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
2,3.0,36.0,6.0,365011.0,5610.0,585.0,1980.0,3.0,99.0,6.0,...,5.0,5.0,5.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0
3,2.0,24.0,7.0,247061.0,2550.0,350.0,1991.0,3.0,2.0,1.0,...,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0
4,3.0,35.0,8.0,358071.0,3233.0,400.0,1985.0,3.0,4.0,6.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0


In [45]:
# No more NaNs
print xTrain.columns[xTrain.isnull().sum()>0]
print xTest.columns[xTest.isnull().sum()>0]

Index([], dtype='object')
Index([], dtype='object')


# B. Run Classification Models

## 1. Random Forests

In [46]:
# Train data response
yTrain = train["is_female"]
yTrain.head()

0    1
1    1
2    1
3    1
4    1
Name: is_female, dtype: int64

### Train RF

In [47]:
rf =  RandomForestClassifier(n_estimators=100)

In [48]:
# Create validation dataset with 50% split
xTrainVal, xTestVal, yTrainVal, yTestVal = train_test_split(xTrain, yTrain, test_size=.5)

In [49]:
# Train on validation train data
rf.fit(X=xTrainVal, y=yTrainVal)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [50]:
# Predict on validation test data
yPredVal = rf.predict(xTestVal)

In [51]:
# Confusion matrix. More diagonal entries - good!
confusion_matrix(yTestVal, yPredVal, labels=[0,1])

array([[3696,  527],
       [ 527, 4378]])

In [52]:
print(classification_report(yTestVal, yPredVal))

             precision    recall  f1-score   support

          0       0.88      0.88      0.88      4223
          1       0.89      0.89      0.89      4905

avg / total       0.88      0.88      0.88      9128



The true positive rate (Precision) is 89% for each category. The Random Forest model correctly classifies 3688 out of 4230 males and 4396 out of 4898 females.

In [53]:
# Variable importance
sorted(zip(map(lambda x: round(x, 4), rf.feature_importances_), xTrain.columns), reverse=True)

[(0.0972, 'DL0'),
 (0.0971, 'DG6'),
 (0.0576, 'DL1'),
 (0.0557, 'MT1A'),
 (0.0253, 'FL4'),
 (0.0239, 'GN5'),
 (0.0201, 'DG3'),
 (0.0191, 'GN4'),
 (0.0159, 'DG1'),
 (0.0149, 'MT10'),
 (0.0149, 'GN2'),
 (0.0137, 'MT2'),
 (0.0123, 'GN3'),
 (0.0118, 'DG4'),
 (0.0115, 'DG8a'),
 (0.0115, 'AA14'),
 (0.0109, 'AA7'),
 (0.0108, 'DL5'),
 (0.0107, 'AA15'),
 (0.0078, 'DG5_4'),
 (0.0077, 'DL14'),
 (0.0073, 'GN1'),
 (0.0073, 'AA4'),
 (0.0061, 'DG9a'),
 (0.006, 'FL10'),
 (0.0057, 'FB20'),
 (0.0057, 'DL24'),
 (0.0055, 'LN1A'),
 (0.0055, 'IFI14_2'),
 (0.0053, 'FL9B'),
 (0.0051, 'MT17_3'),
 (0.005, 'MT1'),
 (0.005, 'FF2A'),
 (0.0049, 'DL15'),
 (0.0048, 'IFI14_1'),
 (0.0048, 'FL9A'),
 (0.0047, 'IFI16_2'),
 (0.0046, 'LN2_1'),
 (0.0046, 'IFI15_2'),
 (0.0045, 'LN1B'),
 (0.0043, 'FL9C'),
 (0.0043, 'FL8_1'),
 (0.0042, 'IFI17_1'),
 (0.0042, 'IFI16_1'),
 (0.0042, 'IFI15_1'),
 (0.0041, 'LN2_3'),
 (0.0041, 'FL8_7'),
 (0.0041, 'FL8_5'),
 (0.004, 'LN2_2'),
 (0.004, 'FL8_2'),
 (0.004, 'FL11'),
 (0.0039, 'LN2_4'),
 (0

### Predict with RF

In [54]:
# Predict on test data
yTest = rf.predict(xTest)

In [55]:
# Write prediction to CSV file
submissionRF = pd.DataFrame({"test_id": range(0, len(yTest)), "is_female": yTest})
submissionRF = submissionRF[["test_id", "is_female"]]
submissionRF.to_csv("../submission/submissionRF.csv", index=False)

This model yielded **ROC score of 0.88142** on Kaggle submission.