In [None]:
import numpy as np
import pandas as pd
import os
# Fix the NumPy seed so that the random operations in this notebook are reproducible.
np.random.seed(0)


# Input files, in this order: training features, training labels, test features.
paths = ['data/X_train.csv', 'data/Y_train.csv', 'data/X_test.csv' ]

# Load the three comma-separated files in the order listed in `paths`.
ALL_FEATURES, ALL_LABELS, TEST_FEATURES = (pd.read_csv(path, sep=',') for path in paths)

# Show the per-column NaN counts of both feature frames, then replace every
# NaN with the sentinel value -1.
print('Filling NaN values:')
print(ALL_FEATURES.isnull().sum())
print(TEST_FEATURES.isnull().sum())
ALL_FEATURES = ALL_FEATURES.fillna(-1)
TEST_FEATURES = TEST_FEATURES.fillna(-1)
print('')

### Select features

In [None]:
# Columns kept from the raw frames; every other column is discarded.
FEATURES = [
    'CustomerMD5Key',
    'SCID',
    'SelectedPackage',
    'FirstDriverMaritalStatus',
    'CarAnnualMileage',
    'CarFuelId',
    'CarUsageId',
    'FirstDriverAge',
    'CarInsuredValue',
    'CarAge',
    'FirstDriverDrivingLicenseNumberY',
    'VoluntaryExcess',
    'CarParkingTypeId',
    'PolicyHolderNoClaimDiscountYears',
    'FirstDriverDrivingLicenceType',
    'CoverIsNoClaimDiscountSelected',
    'CarDrivingEntitlement',
    'CarTransmissionId',
    'SocioDemographicId',
    'PolicyHolderResidencyArea',
    'AllDriversNbConvictions',
    'RatedDriverNumber',
    'IsPolicyholderAHomeowner',
    'CarMakeId',
    'DaysSinceCarPurchase',
    'NameOfPolicyProduct',
    'AffinityCodeId',
]

# Restrict the training and test frames to the same columns, in FEATURES order.
ALL_FEATURES, TEST_FEATURES = ALL_FEATURES[FEATURES], TEST_FEATURES[FEATURES]

### Convert categorical features to integers

In [None]:
# Columns whose raw values are replaced by integer codes.
CATEGORICAL_FEATURES = ['SCID', 'NameOfPolicyProduct', 'CarMakeId']

for feature in CATEGORICAL_FEATURES:
    # Codes 0..n-1 are assigned to the distinct training values, in the order
    # in which `unique()` returns them.
    strings = ALL_FEATURES[feature].unique()
    mapping = {value: code for code, value in enumerate(strings)}

    ALL_FEATURES[ feature ] = ALL_FEATURES[ feature ].map(mapping)

    # A test value that never occurs in the training data is missing from
    # `mapping`, so `.map()` turns it into NaN.  Give such values the explicit
    # code -1 (real codes start at 0, so it cannot collide) and keep the
    # column integer-typed like its training counterpart.
    TEST_FEATURES[ feature ] = TEST_FEATURES[ feature ].map(mapping).fillna(-1).astype(int)
    


### Adding label column

In [None]:
# Append the 'Converted' column of the labels to the features (aligned on the
# row index) and expose it under the name 'Label'.  None of the feature
# columns is called 'Converted', so the rename only touches the appended column.
ALL = pd.concat([ALL_FEATURES, ALL_LABELS['Converted']], axis=1).rename(
    columns={'Converted': 'Label'})

In [None]:
# Preview only the first rows instead of rendering the whole frame.
ALL.head()

### Split data

In [None]:
# Split by customer rather than by row, so that all rows of one customer end
# up on the same side of the train/validation split.
customers = np.random.permutation(ALL['CustomerMD5Key'].unique())

# The first 80% of the shuffled customers form the training set.
TRAIN_IDS = set(customers[:int(0.8 * len(customers))])

# Flag every row whose customer belongs to the training set.
ALL['is_train_data'] = ALL['CustomerMD5Key'].apply(lambda key: key in TRAIN_IDS)

# NOTE(review): reset_index() is called without drop=True, so the previous row
# index is kept as an extra 'index' column in TRAIN and VALID (ALL has no such
# column) -- confirm this is intended.
TRAIN = ALL[ALL['is_train_data']].reset_index()
VALID = ALL[~ALL['is_train_data']].reset_index()

In [None]:
# Add one '<category>_prior' column per categorical feature to TRAIN, VALID and
# ALL: the mean 'Label' of the TRAIN rows that share the row's category value.

def _category_prior(frame, category, label_sum, label_count, in_train):
    """Return, for each row of `frame`, the mean TRAIN label of its category.

    frame       -- DataFrame holding the `category` column and a 'Label' column.
    category    -- name of the categorical column.
    label_sum   -- Series: category value -> sum of 'Label' over the TRAIN rows.
    label_count -- Series: category value -> number of TRAIN rows.
    in_train    -- 1 where the row itself is one of the TRAIN rows, else 0
                   (a scalar, or a Series aligned with `frame`).

    A row that is part of TRAIN is removed from the statistics of its own
    category (leave-one-out), so its prior never contains its own label.
    A row outside TRAIN never contributed to the statistics, so nothing is
    subtracted for it; subtracting anyway would leak the row's label into the
    feature and could produce negative or infinite values.

    The result is NaN for categories unknown to TRAIN and for TRAIN rows whose
    category has no other TRAIN row (0 / 0).
    """
    sums = frame[category].map(label_sum)
    counts = frame[category].map(label_count)
    return (sums - frame['Label'] * in_train) / (counts - in_train)


# 1 for the rows of ALL that belong to a training customer, 0 for the others.
all_in_train = ALL['CustomerMD5Key'].isin(TRAIN['CustomerMD5Key']).astype(int)

for category in ['SCID', 'SelectedPackage', 'FirstDriverMaritalStatus', 'CarFuelId',
                 'CarUsageId', 'FirstDriverDrivingLicenseNumberY', 'CarParkingTypeId', 'FirstDriverDrivingLicenceType',
                 'CoverIsNoClaimDiscountSelected', 'CarDrivingEntitlement', 'CarTransmissionId',
                 'SocioDemographicId', 'PolicyHolderResidencyArea',
                 'AllDriversNbConvictions', 'RatedDriverNumber', 'IsPolicyholderAHomeowner',
                 'CarMakeId', 'NameOfPolicyProduct', 'AffinityCodeId']:
    # Per-category statistics are always taken from the training rows only.
    groups = TRAIN.groupby(category)
    label_sum = groups['Label'].sum()
    label_count = groups.size()

    prior = '%s_prior' % category
    TRAIN[prior] = _category_prior(TRAIN, category, label_sum, label_count, 1)
    VALID[prior] = _category_prior(VALID, category, label_sum, label_count, 0)
    ALL[prior] = _category_prior(ALL, category, label_sum, label_count, all_in_train)


# Reset unknown priors to zero
ALL = ALL.fillna(0.)
TRAIN = TRAIN.fillna(0.)
VALID = VALID.fillna(0.)
# NOTE(review): no '_prior' columns are computed for TEST_FEATURES in this
# cell -- confirm whether the test frame needs them as well.

### Drop is_train_data boolean

In [None]:
# Remove the helper flag that was only needed to build the split.  The check
# keeps the cell re-runnable: without it a second execution raises because the
# column is already gone.
for df in [ALL, TRAIN, VALID]:
    if 'is_train_data' in df.columns:
        df.drop('is_train_data', axis=1, inplace=True)

### Save the data

In [None]:
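# Saving is currently disabled; uncomment the lines below to write the frames
# to the 'preprocessed' directory.
# if not os.path.exists('preprocessed'):
#     os.mkdir('preprocessed')

# for path, df in zip(['train_all.csv', 'train.csv', 'valid.csv'],[ALL, TRAIN, VALID]):
#     df.to_csv(os.path.join('preprocessed', path))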

In [None]:
# Sanity check: per-column NaN counts of the preprocessed frames.
# The test data lives in TEST_FEATURES; no variable named TEST is ever defined
# in this notebook, so referring to it raises a NameError.
print(ALL.isnull().sum())
print(TEST_FEATURES.isnull().sum())