In [24]:
'''
Load the AllState training data.
Each row consists of one index, 116 categorical predictors, 14 continuous 
predictors, and one continuous response variable called loss.
'''
import pandas as pd
import numpy as np

# Read the training CSV; the first row is the header and the first column
# becomes the row index.
df0 = pd.read_csv("data/train.csv", delimiter=",", header=0, index_col=0)
print("Shape of training data frame: %s\n" % (df0.shape,))

# Map every categorical column name (cat1 .. cat116) to the number of
# distinct values it takes in the training frame.
catdict = {
    name: len(df0[name].unique())
    for name in ("cat{0}".format(number) for number in range(1, 117))
}
print("Total number of categorical feature levels: {0}".format(sum(catdict.values())))

Shape of training data frame: (188318, 131)

Total number of categorical feature levels: 1139


In [25]:
'''
One-hot encode the categorical columns of df0 into binary indicator columns.
Every level is kept at this stage (drop_first=False); dropping one indicator
per category is postponed until after the low-variance variable removal.
'''
df1 = pd.get_dummies(data=df0, drop_first=False)

# A single print call: the two messages are separated by a newline, and the
# second message carries its own leading blank line.
print(
    "Shape of converted data frame: {0}".format(df1.shape),
    "\nConverted data frame head: {0}".format(df1.head()),
    sep="\n",
)

Shape of converted data frame: (188318, 1154)

Converted data frame head:        cont1     cont2     cont3     cont4     cont5     cont6     cont7  \
id                                                                         
1   0.726300  0.245921  0.187583  0.789639  0.310061  0.718367  0.335060   
2   0.330514  0.737068  0.592681  0.614134  0.885834  0.438917  0.436585   
5   0.261841  0.358319  0.484196  0.236924  0.397069  0.289648  0.315545   
10  0.321594  0.555782  0.527991  0.373816  0.422268  0.440945  0.391128   
11  0.273204  0.159990  0.527991  0.473202  0.704268  0.178193  0.247408   

      cont8    cont9   cont10    ...     cat116_P  cat116_Q  cat116_R  \
id                               ...                                    
1   0.30260  0.67135  0.83510    ...            0         0         0   
2   0.60087  0.35127  0.43919    ...            0         0         0   
5   0.27320  0.26076  0.32446    ...            0         0         0   
10  0.31796  0.32128  0.4446

In [26]:
'''
Eliminate categorical features with low variance.
'''
from sklearn.feature_selection import VarianceThreshold

# Partition the encoded column names by prefix: binary indicators ("cat...")
# versus continuous predictors ("cont...").
cats = [name for name in df1.columns.values if name.startswith("cat")]
conts = [name for name in df1.columns.values if name.startswith("cont")]

# prob * (1 - prob) is the variance of a 0/1 column that takes one of its
# values in a fraction `prob` of the rows; it is used as the variance cut-off.
prob = 0.95
binvar = prob * (1.0 - prob)

# Fit the selector on the indicator columns only and collect the positions
# (within `cats`) of the columns it keeps.
sel = VarianceThreshold(threshold=binvar).fit(df1[cats])
retain = sel.get_support(indices=True)

# Surviving indicators first, then all continuous predictors, then the response.
features = [cats[position] for position in retain]
features.extend(conts)
features.append("loss")

df2 = df1[features]
print("\nShape of reduced data frame: {0}".format(df2.shape))
print("\nList of remaining features (df2): \n{0}".format(df2.columns.values))


Shape of reduced data frame: (188318, 194)

List of remaining features (df2): 
['cat1_A' 'cat1_B' 'cat2_A' 'cat2_B' 'cat3_A' 'cat3_B' 'cat4_A' 'cat4_B'
 'cat5_A' 'cat5_B' 'cat6_A' 'cat6_B' 'cat8_A' 'cat8_B' 'cat9_A' 'cat9_B'
 'cat10_A' 'cat10_B' 'cat11_A' 'cat11_B' 'cat12_A' 'cat12_B' 'cat13_A'
 'cat13_B' 'cat23_A' 'cat23_B' 'cat25_A' 'cat25_B' 'cat26_A' 'cat26_B'
 'cat27_A' 'cat27_B' 'cat36_A' 'cat36_B' 'cat37_A' 'cat37_B' 'cat38_A'
 'cat38_B' 'cat44_A' 'cat44_B' 'cat50_A' 'cat50_B' 'cat53_A' 'cat53_B'
 'cat71_A' 'cat71_B' 'cat72_A' 'cat72_B' 'cat73_A' 'cat73_B' 'cat75_A'
 'cat75_B' 'cat79_B' 'cat79_D' 'cat80_B' 'cat80_D' 'cat81_B' 'cat81_D'
 'cat82_A' 'cat82_B' 'cat82_D' 'cat83_A' 'cat83_B' 'cat83_D' 'cat84_A'
 'cat84_C' 'cat86_B' 'cat86_C' 'cat86_D' 'cat87_B' 'cat87_D' 'cat88_A'
 'cat88_D' 'cat90_A' 'cat90_B' 'cat91_A' 'cat91_B' 'cat91_G' 'cat92_A'
 'cat92_H' 'cat93_C' 'cat93_D' 'cat94_B' 'cat94_C' 'cat94_D' 'cat95_C'
 'cat95_D' 'cat95_E' 'cat96_E' 'cat97_A' 'cat97_C' 'cat97_E' 'ca

In [27]:
'''
Eliminate one dummy binary per category not affected by the low-variance variable removal.

A category whose indicators all survived the variance filter still has as many
indicator columns as it has levels (catdict), so one of them is redundant; the
first one is dropped. Categories that already lost at least one indicator are
left alone.
'''
remove = []
for key, nlevels in catdict.items():
    # Match on the exact "<category>_" prefix rather than on a substring.
    prefix = key + "_"
    binlist = [feature for feature in features if feature.startswith(prefix)]
    if len(binlist) == nlevels:
        remove.append(binlist[0])
keep = [feature for feature in features if feature not in remove]

# Select from df1 (the frame `features` was taken from) instead of from df2
# itself, so that re-running this cell gives the same result even after df2
# has been modified further down. The explicit copy makes df2 an independent
# frame, so later column assignments on it do not raise SettingWithCopyWarning.
df2 = df1[keep].copy()
print("\nShape of reduced data frame: {0}".format(df2.shape))
print("\nList of remaining features (df2): \n{0}".format(df2.columns.values))


Shape of reduced data frame: (188318, 170)

List of remaining features (df2): 
['cat1_B' 'cat2_B' 'cat3_B' 'cat4_B' 'cat5_B' 'cat6_B' 'cat8_B' 'cat9_B'
 'cat10_B' 'cat11_B' 'cat12_B' 'cat13_B' 'cat23_B' 'cat25_B' 'cat26_B'
 'cat27_B' 'cat36_B' 'cat37_B' 'cat38_B' 'cat44_B' 'cat50_B' 'cat53_B'
 'cat71_B' 'cat72_B' 'cat73_A' 'cat73_B' 'cat75_A' 'cat75_B' 'cat79_B'
 'cat79_D' 'cat80_B' 'cat80_D' 'cat81_B' 'cat81_D' 'cat82_A' 'cat82_B'
 'cat82_D' 'cat83_A' 'cat83_B' 'cat83_D' 'cat84_A' 'cat84_C' 'cat86_B'
 'cat86_C' 'cat86_D' 'cat87_B' 'cat87_D' 'cat88_A' 'cat88_D' 'cat90_A'
 'cat90_B' 'cat91_A' 'cat91_B' 'cat91_G' 'cat92_A' 'cat92_H' 'cat93_C'
 'cat93_D' 'cat94_B' 'cat94_C' 'cat94_D' 'cat95_C' 'cat95_D' 'cat95_E'
 'cat96_E' 'cat97_A' 'cat97_C' 'cat97_E' 'cat97_G' 'cat98_A' 'cat98_C'
 'cat98_D' 'cat98_E' 'cat99_P' 'cat99_R' 'cat99_T' 'cat100_F' 'cat100_G'
 'cat100_H' 'cat100_I' 'cat100_J' 'cat100_K' 'cat100_L' 'cat101_A'
 'cat101_C' 'cat101_D' 'cat101_F' 'cat101_G' 'cat102_A' 'cat103_A'
 

In [28]:
'''
Replace loss by log-loss.

The 'loss' column is replaced by 'logloss', its natural logarithm, appended as
the last column of df2.
'''
# df2 was obtained by column selection, so writing a new column into it in
# place can raise pandas' SettingWithCopyWarning; assign() builds a new frame
# instead. The guard makes the cell safe to run more than once: after the
# first run 'loss' is gone and there is nothing left to convert.
if "loss" in df2.columns:
    df2 = df2.assign(logloss=np.log(df2["loss"])).drop(["loss"], axis=1)
print("Shape of df2 data frame: \n{0}".format(df2.shape))
print("\nHead of df2 data frame: \n{0}".format(df2.head(1)))

Shape of df2 data frame: 
(188318, 170)

Head of df2 data frame: 
    cat1_B  cat2_B  cat3_B  cat4_B  cat5_B  cat6_B  cat8_B  cat9_B  cat10_B  \
id                                                                            
1        0       1       0       1       0       0       0       1        0   

    cat11_B    ...        cont6    cont7   cont8    cont9  cont10    cont11  \
id             ...                                                            
1         1    ...     0.718367  0.33506  0.3026  0.67135  0.8351  0.569745   

      cont12    cont13    cont14   logloss  
id                                          
1   0.594646  0.822493  0.714843  7.702186  

[1 rows x 170 columns]


In [29]:
'''
Check mutual information between continuous variables and log-loss.
'''
from sklearn.feature_selection import mutual_info_regression

# mutual_info_regression is a randomized estimator; pinning random_state makes
# the scores printed below reproducible from one run to the next.
mi = mutual_info_regression(df2[conts], df2["logloss"], random_state=0)

# Rescale so that the most informative continuous feature scores 1.0; the
# values are in the same order as `conts`.
mi /= np.max(mi)
print("\nMutual Informations: {0}\n".format(mi))


Mutual Informations: [ 0.73119671  1.          0.51198742  0.50042624  0.2597866   0.58276073
  0.64161673  0.42195019  0.8663861   0.65851771  0.91686439  0.97052901
  0.80057641  0.84759527]

