In [12]:
import pandas as pd
import numpy as np

from datetime import datetime

from sklearn import decomposition
from sklearn.ensemble import *
from sklearn.feature_selection import *
from sklearn.model_selection import *
from sklearn.preprocessing import *

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras import layers
from tensorflow.keras.regularizers import L1L2
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model, load_model
from tensorflow import feature_column

import matplotlib.pyplot as plt

import csv
import time
import copy
import os
import tempfile

In [13]:
from azureml.core import Workspace, Dataset

subscription_id = '4bf9de72-bf24-4d91-a718-11b60032a45f'
resource_group = 'funghi'
workspace_name = 'funghi-ml'

workspace = Workspace(subscription_id, resource_group, workspace_name)

dataset = Dataset.get_by_name(workspace, name='5001_train_t')

df = dataset.to_pandas_dataframe()


In [14]:
df['Label'] = df['Claim_Amount'] > 0 
df['Label'] = df['Calendar_Year'] * 10 + df['Label'].astype(int)
df = df.groupby('Label').apply(lambda x: x.sample(frac=0.1))
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1318429 entries, (20050, 8610602) to (20071, 3778506)
Data columns (total 36 columns):
Row_ID            1318429 non-null int64
Household_ID      1318429 non-null int64
Vehicle           1318429 non-null int64
Calendar_Year     1318429 non-null int64
Model_Year        1318429 non-null int64
Blind_Make        1318429 non-null object
Blind_Model       1318429 non-null object
Blind_Submodel    1318429 non-null object
Cat1              1318429 non-null object
Cat2              1318429 non-null object
Cat3              1318429 non-null object
Cat4              1318429 non-null object
Cat5              1318429 non-null object
Cat6              1318429 non-null object
Cat7              1318429 non-null object
Cat8              1318429 non-null object
Cat9              1318429 non-null object
Cat10             1318429 non-null object
Cat11             1318429 non-null object
Cat12             1318429 non-null object
OrdCat            1317674 no

In [15]:
cats = ['Cat1','Cat2','Cat3','Cat4','Cat5','Cat6','Cat7','Cat8','Cat9','Cat10','Cat11','Cat12']
ncats = ['Vehicle','OrdCat','NVCat']
nums = ['Var1','Var2','Var3','Var4','Var5','Var6','Var7','Var8','NVVar1','NVVar2','NVVar3','NVVar4']
embedding = ['Blind_Make','Blind_Model','Blind_Submodel']

In [16]:
for c in cats:
    df[c] = df[c].replace('?', None)
    df[c] = df[c].replace('', None)
    
df['OrdCat'] = df['OrdCat'].fillna(value=0).apply(str).replace('0.0', None)

In [17]:
train, test = train_test_split(df, test_size=0.2, stratify=df['Label'])
train, val = train_test_split(train, test_size=0.2, stratify=train['Label'])

train = train.drop(['Label','Household_ID','Row_ID'], axis=1)
test = test.drop(['Label','Household_ID','Row_ID'], axis=1)
val = val.drop(['Label','Household_ID','Row_ID'], axis=1)

train_target = train.pop('Claim_Amount')
val_target = val.pop('Claim_Amount')
test_target = test.pop('Claim_Amount')

In [18]:
def get_feature_layer():
    feature_columns = []
    
    for c in cats:
        c_list = feature_column.categorical_column_with_vocabulary_list(c, df[c].unique())
        feature_columns.append(feature_column.indicator_column(c_list))

    for n in nums:
        feature_columns.append(feature_column.numeric_column(n))

    for n in ncats:
        n_list = feature_column.categorical_column_with_vocabulary_list(n, df[n].unique())
        feature_columns.append(feature_column.indicator_column(n_list))

    for e in embedding:
        v = df[e].unique()
        l = feature_column.categorical_column_with_vocabulary_list(e, v)
        feature_columns.append(feature_column.embedding_column(l, dimension=len(v)//10))
        
    feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
    
    return feature_layer


In [19]:
def get_model():
    model = tf.keras.Sequential([
        get_feature_layer(),
        layers.Dense(64, activation='relu', kernel_regularizer=L1L2(l1=0.0, l2=0.01)),
        layers.Dense(64, activation='relu', kernel_regularizer=L1L2(l1=0.0, l2=0.01)),
        layers.Dense(1, activation='linear')
    ])

    model.compile(optimizer='adam',
                  loss='mae',
                  metrics=['mae'])
    
    return model


In [20]:
def df_to_dataset(dataframe, labels, shuffle=True, batch_size=32):
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    return ds

train_ds = df_to_dataset(train, train_target)
val_ds = df_to_dataset(val, val_target)

In [21]:
model = get_model()

history = model.fit(train_ds,
          validation_data=val_ds,
          epochs=3)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



Epoch 1/3
Epoch 2/3
Epoch 3/3


In [64]:
def gini_normalized(actual, pred):
    n = tf.shape(actual)[1]
    indices = tf.nn.top_k(pred, k=n)[1][0]
    actual_sorted = tf.gather(actual[0], indices)
    cost = tf.reduce_sum(actual_sorted)
    loss_proportion = tf.cumsum(actual_sorted) / cost
    null_model = tf.compat.v1.to_double(tf.range(1, n + 1)) / tf.compat.v1.to_double(n)
    g = tf.subtract(loss_proportion, null_model)
    g = tf.reduce_sum(g) / tf.compat.v1.to_double(n)
    g /= (1.0 - tf.reduce_mean(actual)) / 2.0
    return g

In [30]:
model.evaluate(df_to_dataset(test, test_target, shuffle=False))



In [56]:
result = model.predict(df_to_dataset(test, test_target, shuffle=False))

In [69]:
y_pred = np.transpose(np.array([r if r[0] > 0 else [0] for r in result]))
y_true = np.transpose(np.array([[t] for t in test_target.values]))
score = gini_normalized(y_true,y_pred)

print(score.numpy())

0.07326340098358843


In [55]:
model.summary()
model.save('model.h5')

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_features_1 (DenseFeatu multiple                  801619    
_________________________________________________________________
dense_3 (Dense)              multiple                  31680     
_________________________________________________________________
dense_4 (Dense)              multiple                  4160      
_________________________________________________________________
dense_5 (Dense)              multiple                  65        
Total params: 837,524
Trainable params: 837,524
Non-trainable params: 0
_________________________________________________________________
