In [22]:
import numpy as np
import pandas as pd
import time
import os

paths = ['data/X_train.csv', 'data/Y_train.csv', 'data/X_test.csv' ]

# read all that is relevant for 
ALL_FEATURES = pd.read_csv(paths[0], sep = ',')
ALL_LABELS = pd.read_csv(paths[1], sep = ',')

# read test data
TEST = pd.read_csv(paths[2], sep = ',')

# Filling NaN values
print 'Filling NaN values:'
print ALL_FEATURES.isnull().sum()
print TEST.isnull().sum()
ALL_FEATURES.fillna(-1, inplace=True)
TEST.fillna(-1, inplace=True)
print

Filling NaN values:
Unnamed: 0                           0
CustomerMD5Key                       0
ReceivedDateTime                     0
SCID                                 4
SelectedPackage                      0
FirstDriverMaritalStatus             0
CarAnnualMileage                     0
CarFuelId                            0
CarUsageId                           0
FirstDriverAge                       0
CarInsuredValue                      0
CarAge                               0
FirstDriverDrivingLicenseNumberY     0
VoluntaryExcess                      0
CarParkingTypeId                     0
PolicyHolderNoClaimDiscountYears     0
FirstDriverDrivingLicenceType        0
CoverIsNoClaimDiscountSelected       0
CarDrivingEntitlement                0
CarTransmissionId                    0
SocioDemographicId                   0
PolicyHolderResidencyArea            0
AllDriversNbConvictions              0
TodayDate                            0
RatedDriverNumber                    0
IsPol

### Select features

In [23]:
FEATURES = ['CustomerMD5Key', 'SCID', 'SelectedPackage', 'FirstDriverMaritalStatus', 
            'CarAnnualMileage', 'CarFuelId', 'CarUsageId', 'FirstDriverAge', 'CarInsuredValue', 
            'CarAge', 'FirstDriverDrivingLicenseNumberY', 'VoluntaryExcess', 'CarParkingTypeId', 
            'PolicyHolderNoClaimDiscountYears', 'FirstDriverDrivingLicenceType', 'CoverIsNoClaimDiscountSelected', 
            'CarDrivingEntitlement', 'CarTransmissionId', 'SocioDemographicId', 'PolicyHolderResidencyArea', 
            'AllDriversNbConvictions', 'RatedDriverNumber', 'IsPolicyholderAHomeowner', 'CarMakeId', 
            'DaysSinceCarPurchase', 'NameOfPolicyProduct', 'AffinityCodeId']

ALL_FEATURES = ALL_FEATURES[ FEATURES ]
TEST = TEST[ FEATURES ]

### Convert categorical features to integers

In [24]:
ALL_FEATURES['age_dummy'] = (ALL_FEATURES['FirstDriverAge'] < 25) * 0.0
ALL_FEATURES['age_dummy'] += ((ALL_FEATURES['FirstDriverAge'] > 25) & (ALL_FEATURES['FirstDriverAge'] < 40)) * 1.0
ALL_FEATURES['age_dummy'] += ((ALL_FEATURES['FirstDriverAge'] > 40) & (ALL_FEATURES['FirstDriverAge'] < 55)) * 2.0
ALL_FEATURES['age_dummy'] += ((ALL_FEATURES['FirstDriverAge'] > 55) & (ALL_FEATURES['FirstDriverAge'] < 70)) * 3.0
ALL_FEATURES['age_dummy'] += (ALL_FEATURES['FirstDriverAge'] > 70) * 4.0

ALL_FEATURES['age_broker_interaction'] = ALL_FEATURES["SCID"].map(str) + ALL_FEATURES["age_dummy"].map(str)

TEST['age_dummy'] = (TEST['FirstDriverAge'] < 25) * 0.0
TEST['age_dummy'] += ((TEST['FirstDriverAge'] > 25) & (TEST['FirstDriverAge'] < 40)) * 1.0
TEST['age_dummy'] += ((TEST['FirstDriverAge'] > 40) & (TEST['FirstDriverAge'] < 55)) * 2.0
TEST['age_dummy'] += ((TEST['FirstDriverAge'] > 55) & (TEST['FirstDriverAge'] < 70)) * 3.0
TEST['age_dummy'] += (TEST['FirstDriverAge'] > 70) * 4.0

TEST['age_broker_interaction'] = TEST["SCID"].map(str) + TEST["age_dummy"].map(str)

In [26]:
CATEGORICAL_FEATURES = ['SCID', 'NameOfPolicyProduct', 'CarMakeId', 'age_broker_interaction']

for feature in CATEGORICAL_FEATURES:
    strings = ALL_FEATURES[feature].unique()
    mapping = dict(zip( strings, np.arange(len(strings)) ))
    
    ALL_FEATURES[ feature ] = ALL_FEATURES[ feature ].map(mapping)
    TEST[ feature ] = TEST[ feature ].map(mapping)

print ALL_FEATURES['age_broker_interaction']

# drop the age dummy
ALL_FEATURES.drop('age_dummy', axis=1, inplace=True)
TEST.drop('age_dummy', axis=1, inplace=True)



0            0
1            1
2            2
3            3
4            4
5            5
6            6
7            7
8            8
9            9
10          10
11          11
12          12
13          13
14          14
15          15
16           8
17          16
18          17
19          18
20          19
21          20
22          19
23          21
24          22
25          23
26          24
27          25
28          26
29          27
          ... 
2911288    139
2911289     20
2911290     14
2911291     37
2911292     20
2911293     42
2911294    115
2911295     19
2911296     88
2911297     52
2911298    115
2911299    159
2911300     61
2911301     32
2911302     15
2911303     93
2911304     27
2911305      0
2911306     84
2911307    585
2911308     77
2911309    127
2911310     37
2911311    113
2911312    184
2911313     61
2911314     11
2911315     11
2911316     10
2911317     18
Name: age_broker_interaction, dtype: int64


### Adding label column

In [27]:
TRAIN_ALL = pd.concat([ALL_FEATURES, ALL_LABELS['Converted']], axis = 1)
TRAIN_ALL.columns = list(TRAIN_ALL.columns)[:-1] + ['Label']

In [28]:
TRAIN_ALL

Unnamed: 0,CustomerMD5Key,SCID,SelectedPackage,FirstDriverMaritalStatus,CarAnnualMileage,CarFuelId,CarUsageId,FirstDriverAge,CarInsuredValue,CarAge,FirstDriverDrivingLicenseNumberY,VoluntaryExcess,CarParkingTypeId,PolicyHolderNoClaimDiscountYears,FirstDriverDrivingLicenceType,CoverIsNoClaimDiscountSelected,CarDrivingEntitlement,CarTransmissionId,SocioDemographicId,PolicyHolderResidencyArea,AllDriversNbConvictions,RatedDriverNumber,IsPolicyholderAHomeowner,CarMakeId,DaysSinceCarPurchase,NameOfPolicyProduct,AffinityCodeId,age_broker_interaction,Label
0,0x0af8dd0b86f6bdc5ecc29ee8a587e5a5,0,3.0,1.0,4001.0,1.0,1.0,58.0,3001.0,6.0,30.0,250.0,2.0,9.0,1.0,0.0,3.0,1.0,60.0,27.0,0.0,1.0,1.0,0,416.0,0,0.0,0,0.0
1,0xc2ab85a1e67787e05612f81b4d2d4b44,1,3.0,4.0,4001.0,1.0,4.0,55.0,0.0,14.0,30.0,200.0,3.0,9.0,1.0,1.0,1.0,1.0,99.0,24.0,0.0,1.0,1.0,1,435.0,0,63.0,1,0.0
2,0xd355bb50d71da053adb6aa0a2a4ff887,2,3.0,1.0,5001.0,1.0,0.0,31.0,1001.0,10.0,8.0,250.0,3.0,7.0,1.0,0.0,1.0,1.0,77.0,52.0,0.0,1.0,1.0,2,1098.0,1,39.0,2,0.0
3,0x7022e0ce1ea3c8a110ab6a9feba3cef0,3,3.0,2.0,5001.0,2.0,4.0,52.0,23001.0,1.0,8.0,250.0,2.0,7.0,1.0,0.0,1.0,1.0,42.0,40.0,0.0,1.0,1.0,3,4.0,2,0.0,3,0.0
4,0x3414ea05924ca2421f03d93a1f04af49,4,3.0,3.0,5001.0,1.0,1.0,29.0,2001.0,7.0,11.0,250.0,2.0,10.0,1.0,0.0,3.0,1.0,197.0,33.0,0.0,2.0,1.0,4,2107.0,1,31.0,4,0.0
5,0x4efd209f05292506dcc412e7a86c9a19,5,3.0,1.0,9001.0,1.0,4.0,33.0,0.0,11.0,11.0,50.0,3.0,9.0,1.0,0.0,3.0,1.0,81.0,36.0,0.0,1.0,0.0,5,1440.0,0,0.0,5,0.0
6,0x0f2783f526c08471ad86590b03813ff1,6,3.0,1.0,5001.0,2.0,4.0,65.0,0.0,14.0,30.0,250.0,3.0,8.0,1.0,0.0,1.0,1.0,72.0,33.0,0.0,1.0,1.0,4,1915.0,0,0.0,6,0.0
7,0xba88aedcef17d5791398b12af0395a85,7,3.0,2.0,9001.0,2.0,4.0,61.0,6001.0,3.0,30.0,200.0,2.0,13.0,1.0,1.0,2.0,1.0,22.0,35.0,0.0,1.0,1.0,4,761.0,3,0.0,7,0.0
8,0x7a318c242d262a78e520add9c27f7b4f,8,3.0,5.0,1001.0,1.0,4.0,76.0,9001.0,0.0,10.0,250.0,2.0,9.0,1.0,0.0,1.0,2.0,71.0,34.0,0.0,1.0,0.0,2,1.0,0,29.0,8,0.0
9,0x311a0075e7fa712cf0cfe9f4bb9652e0,9,3.0,2.0,11001.0,2.0,4.0,61.0,18001.0,1.0,20.0,250.0,2.0,9.0,1.0,1.0,1.0,2.0,1.0,25.0,0.0,1.0,1.0,6,428.0,0,57.0,9,0.0


### Split data

In [29]:
# Seed the random generator.
np.random.seed(1)
IDS = np.random.permutation(TRAIN_ALL['CustomerMD5Key'].unique())

TRAIN_IDS = IDS[                         :int(0.7 * len(IDS))]

TRAIN_IDS = set(TRAIN_IDS)
TRAIN_ALL['is_train_data'] = TRAIN_ALL['CustomerMD5Key'].apply(lambda x: x in TRAIN_IDS)

TRAIN = TRAIN_ALL[ TRAIN_ALL['is_train_data'] == True ].reset_index()
VALID = TRAIN_ALL[ TRAIN_ALL['is_train_data'] == False].reset_index()

### Count number of times presented.

In [30]:
for df in [TRAIN_ALL, TRAIN, VALID, TEST]:
    df['offer_count'] = df['CustomerMD5Key'].map(df['CustomerMD5Key'].value_counts())


### Categorical data to prior probabilities

In [32]:
start = time.time()
for category in ['SCID', 'SelectedPackage', 'FirstDriverMaritalStatus', 'CarFuelId',
                 'CarUsageId', 'FirstDriverDrivingLicenseNumberY', 'CarParkingTypeId', 'FirstDriverDrivingLicenceType',
                 'CoverIsNoClaimDiscountSelected', 'CarDrivingEntitlement', 'CarTransmissionId',
                 'SocioDemographicId', 'PolicyHolderResidencyArea',
                 'AllDriversNbConvictions', 'RatedDriverNumber', 'IsPolicyholderAHomeowner',
                 'CarMakeId', 'NameOfPolicyProduct', 'AffinityCodeId', 'age_broker_interaction']:
    
    groups = TRAIN.groupby(category)

    sizes = dict(groups.size())
    
    TRAIN[category] = TRAIN[category].apply(lambda x: x if sizes[x] > 100 else -1)
    
    groups = TRAIN.groupby(category)
    mapping = groups['Label'].sum() / groups.size()
    mapping = dict(zip(mapping.keys(), mapping.tolist()))
    
    
    TRAIN['%s_prior' % category] = TRAIN[category].map(mapping)
        
    VALID['%s_prior' % category] = VALID[category].map(mapping)
    
    TRAIN_ALL['%s_prior' % category] = TRAIN_ALL[category].map(mapping)
        
    TEST['%s_prior' % category] = TEST[category].map(mapping)
        
        
# Reset unknown priors to zero
for df in [TRAIN_ALL, TRAIN, VALID, TEST]:
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.fillna(0., inplace=True)

print 'Generating categorical prior features took %.2f seconds' % (time.time() - start)

Generating categorical prior features took 42.95 seconds


### Ranking features

In [33]:
start = time.time()

for df in [ALL, TRAIN, VALID, TEST]:
    groups = df.groupby('CustomerMD5Key')
    for feature in ['CarAnnualMileage', 'FirstDriverAge', 'CarInsuredValue', 'CarAge', 'VoluntaryExcess', 'CarParkingTypeId', 
                    'PolicyHolderNoClaimDiscountYears', 'FirstDriverDrivingLicenceType', 'CoverIsNoClaimDiscountSelected', 
                    'CarDrivingEntitlement', 'CarTransmissionId', 'DaysSinceCarPurchase'
                   ]:
        df['%s_ranked' % feature] = groups[feature].rank(pct=True)

print 'Generating ranking features took %.2f seconds' % (time.time() - start)

Generating ranking features took 893.49 seconds


### Drop is_train_data boolean

In [34]:
for df in [TRAIN_ALL, TRAIN, VALID]:
    df.drop('is_train_data', axis=1, inplace=True)

### Save the data

In [35]:
start = time.time()

if not os.path.exists('preprocessed'):
    os.mkdir('preprocessed')

for path, df in zip(['train_all.pkl', 'train.pkl', 'valid.pkl', 'test.pkl'],[TRAIN_ALL, TRAIN, VALID, TEST]):
    df.to_pickle(os.path.join('preprocessed', path))

print 'Writing took %.2f seconds' % (time.time() - start)

Writing took 28.60 seconds


# Part 2: Selection & Normalization

In [36]:
TRAIN_ALL = pd.read_pickle('preprocessed/train_all.pkl')
TRAIN = pd.read_pickle('preprocessed/train.pkl')
VALID = pd.read_pickle('preprocessed/valid.pkl')
TEST  = pd.read_pickle('preprocessed/test.pkl')

In [37]:
FEATURE_SELECTION = ['CarAnnualMileage', 'FirstDriverAge', 'CarInsuredValue', 'CarAge', 'VoluntaryExcess', 'CarParkingTypeId', 
            'PolicyHolderNoClaimDiscountYears', 'FirstDriverDrivingLicenceType', 'CoverIsNoClaimDiscountSelected', 
            'CarDrivingEntitlement', 'CarTransmissionId', 'DaysSinceCarPurchase', 
            
             'CarAnnualMileage_ranked', 'FirstDriverAge_ranked', 'CarInsuredValue_ranked', 'CarAge_ranked', 
             'VoluntaryExcess_ranked', 'CarParkingTypeId_ranked', 'PolicyHolderNoClaimDiscountYears_ranked', 
             'FirstDriverDrivingLicenceType_ranked', 'CoverIsNoClaimDiscountSelected_ranked', 
             'CarDrivingEntitlement_ranked', 'CarTransmissionId_ranked', 'DaysSinceCarPurchase_ranked',
            
            'offer_count',
                     
            'SCID_prior', 'SelectedPackage_prior', 'FirstDriverMaritalStatus_prior', 'CarFuelId_prior', 
            'CarUsageId_prior', 'FirstDriverDrivingLicenseNumberY_prior', 'CarParkingTypeId_prior', 
            'FirstDriverDrivingLicenceType_prior', 'CoverIsNoClaimDiscountSelected_prior', 'CarDrivingEntitlement_prior', 
            'CarTransmissionId_prior', 'SocioDemographicId_prior', 'PolicyHolderResidencyArea_prior', 
            'AllDriversNbConvictions_prior', 'RatedDriverNumber_prior', 'IsPolicyholderAHomeowner_prior', 
            'CarMakeId_prior', 'NameOfPolicyProduct_prior', 'AffinityCodeId_prior'
           ]

In [38]:
for feature in FEATURE_SELECTION:
    mean, sigma = TRAIN[feature].mean(), (TRAIN[feature].std() + 1e-6)
    TRAIN[feature] = (TRAIN[feature] - mean) / sigma
    VALID[feature] = (VALID[feature] - mean) / sigma
    TEST[feature]  = (TEST[feature] - mean) / sigma

In [40]:
X_train = TRAIN.as_matrix(FEATURE_SELECTION)
T_train = TRAIN.as_matrix(['Label']).flatten()

X_valid = VALID.as_matrix(FEATURE_SELECTION)
T_valid = VALID.as_matrix(['Label']).flatten()

X_test  = TEST.as_matrix(FEATURE_SELECTION)

np.savez('preprocessed/preprocess.npz', X_train=X_train, T_train=T_train, X_valid=X_valid, T_valid=T_valid, X_test=X_test)