# Long-Term Customer Learning
For customers with greater or equal to 6 months records, apply LSTM sequence classification model.

## Load data
 For now, only testset(2000 customers) is used.

In [10]:
import sys
sys.path.append('./learn')

from learn_ann import get_learner_params_all
from tensorflow import keras
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold


DIR = './organized_dataset/'

seed = 7
np.random.seed(seed)

df = pd.read_csv(DIR+'feature_engineered_test.csv',header=0)
df_35 = pd.read_csv(DIR+'sorted_35_test.csv',header=0)

print('Data loading Done')
print(df.describe())

col_max_value = df.iloc[:,4:-1].max()
col_min_value = df.iloc[:,4:-1].min()

df.iloc[:,4:-1] = (df.iloc[:,4:-1] - col_min_value) / (col_max_value - col_min_value)
df.fillna(0, inplace=True)

df_35.iloc[:,4:-1] = (df_35.iloc[:,4:-1] - col_min_value) / (col_max_value - col_min_value)
df_35.fillna(0, inplace=True)

print('Done')

Data loading Done
        Unnamed: 0           CID          DATE   PREV_STATUS      N_CR_ABM  \
count  61579.00000  6.157900e+04  6.157900e+04  61579.000000  61579.000000   
mean   30789.00000  1.503712e+06  1.507144e+09      0.035207      0.193085   
std    17776.47045  6.052459e+05  2.600310e+07      0.184304      0.920835   
min        0.00000  1.315130e+05  1.461974e+09      0.000000      0.000000   
25%    15394.50000  1.050567e+06  1.485821e+09      0.000000      0.000000   
50%    30789.00000  1.835470e+06  1.506730e+09      0.000000      0.000000   
75%    46183.50000  1.970801e+06  1.530317e+09      0.000000      0.000000   
max    61578.00000  2.234805e+06  1.551312e+09      1.000000     47.000000   

          N_CR_BRCH  N_CR_CHQ      N_CR_EFT   N_CR_MOBILE   N_CR_ONLINE  \
count  61579.000000   61579.0  61579.000000  61579.000000  61579.000000   
mean       0.198542       0.0      0.910392      0.832881      0.381689   
std        0.777817       0.0      1.989308      2.607

### Retrieve Long-term customers' records

In [11]:
SHORT_TERM = 3

print('Term: ',SHORT_TERM)

temp = df.groupby('CID').count()
temp.rename(columns={temp.columns[0]:'length'}, inplace=True)

# get as Series of CID with contract-length less than SHORT_TERM
# temp = temp.loc[temp['length'] >= SHORT_TERM].iloc[:,0]
temp = temp.loc[temp['length'] < SHORT_TERM].iloc[:,0]
print('# of short-term CID: ',temp.shape[0])

short_cid_set = set(temp.index.tolist())

# remove short-term customers
for cid in short_cid_set:
    df.drop(df[df['CID']==cid].index, inplace=True)

long_df = df
print('dataset: ',long_df.shape)
temp = long_df.groupby('CID').count()
temp.rename(columns={temp.columns[0]:'length'}, inplace=True)
temp = temp.loc[temp['length'] >= SHORT_TERM].iloc[:,0]
print('# of long-term CID: ',temp.shape[0])
print('Done')


Term:  3
# of short-term CID:  20
dataset:  (61552, 101)
# of long-term CID:  1982
Done


## Organizing Data
 Separate customers into group of customers with same length of records. Put each group into dictionary.
 So, Returning dict looks like: `dict = {1:group_1_month, 2:group_2_month, ..., 35:group_35_month}`

In [12]:

# get df's dictionary = {length:df,...}
def get_dict_dfs(df,short_term=6):
    dict_of_df = {}
  
    length_df = df.groupby('CID').count()
    length_df.rename(columns={length_df.columns[0]:'length'}, inplace=True)

    # exclude 35-months customers.
    for length in range(short_term,35):
        print('Processing length:',length)
        temp = length_df[length_df['length'] == length]
        short_cid_list = set(temp.index.values)

        current_df = pd.DataFrame(columns=df.columns)
        for cid in short_cid_list:
            temp = df.loc[df['CID']==cid].sort_values(by='DATE')
            current_df = current_df.append(temp, ignore_index=True)

        dict_of_df[length] = current_df
    
    # For 35-months customers
    length = 35
    print('For length:',length)
    dict_of_df[length] = df_35

    return dict_of_df

dict_dfs = get_dict_dfs(df,short_term = SHORT_TERM)
print('Done')

Processing length: 3
Processing length: 4
Processing length: 5
Processing length: 6
Processing length: 7
Processing length: 8
Processing length: 9
Processing length: 10
Processing length: 11
Processing length: 12
Processing length: 13
Processing length: 14
Processing length: 15
Processing length: 16
Processing length: 17
Processing length: 18
Processing length: 19
Processing length: 20
Processing length: 21
Processing length: 22
Processing length: 23
Processing length: 24
Processing length: 25
Processing length: 26
Processing length: 27
Processing length: 28
Processing length: 29
Processing length: 30
Processing length: 31
Processing length: 32
Processing length: 33
Processing length: 34
For length: 35
Done


## (X,y) Batch Generator
 The dataset we have now is matrix. For LSTM to work, we must transform matrix into Tensor. (adding 3rd dimesion for time)
 Each tensor has `dimension = (batch_size, sequence_length, num_of_features_per_timestep)`. `sequence_length` means number of timesteps in other words.
 We set `sequence_length = 6`.

In [13]:
BATCH_SIZE = 100

NUM_OF_FEATURES = df.iloc[0,3:-1].count()
# NUM_OF_FEATURE = 40
INDEX_OF_LAST_FEATURE = NUM_OF_FEATURES + 3

print('# of features: ',NUM_OF_FEATURES)

#### get next training batch of customers with 35 months contract
def next_train_batch_35(dict_dfs,num_of_features,index_of_last_feature,sequence_length,k=10,test_start_idx=0):
    if test_start_idx<0 or test_start_idx>=k:
        raise Exception("test_start_idx (%d) should be in range(0,k=%d)"%(test_start_idx,k))
    length = 35
    
    end = dict_dfs[length].shape[0]
    temp = int(end*(test_start_idx/k))
    test_start = temp - (temp % length)
    temp = int(end*((test_start_idx+1)/k))
    test_end = temp - (temp % length)
    
    df = dict_dfs[length].iloc[:test_start,:].append(dict_dfs[length].iloc[test_end:,:])
    end = df.shape[0]
    
    temp_index_list = [list(range(i,i+sequence_length)) for i in range(0,end-sequence_length+1,length)]
    batch_size = len(temp_index_list)
    yield end, batch_size
    
    while True:
        for start in range(0,length-sequence_length+1):
            temp_index_list = [list(range(i,i+sequence_length)) for i in range(start,end-sequence_length+1,length)]
            batch_size = len(temp_index_list)

            X = np.zeros((batch_size,sequence_length,num_of_features))
            y = np.zeros((batch_size,1))

            for b in range(0,batch_size):
                X[b,:,:] = df.iloc[temp_index_list[b],3:index_of_last_feature].values
                y[b,0] = df.iloc[temp_index_list[b][-1],-1]

            yield X,y.astype(int)

#### get next test batch of customers with 35 months contract
def next_test_batch_35(dict_dfs,num_of_features,index_of_last_feature,sequence_length,k=10,test_start_idx=0,):
    if test_start_idx<0 or test_start_idx>=k:
        raise Exception("test_start_idx (%d) should be in range(0,k=%d)"%(test_start_idx,k))
    
    length = 35
    end = dict_dfs[length].shape[0]
    temp = int(end*(test_start_idx/k))
    test_start = temp - (temp % length)
    temp = int(end*((test_start_idx+1)/k))
    test_end = temp - (temp % length)
    
    df = dict_dfs[length].iloc[test_start:test_end,:]
    end = df.shape[0]
    
    temp_index_list = [list(range(i,i+sequence_length)) for i in range(0,end-sequence_length+1,length)]
    batch_size = len(temp_index_list)
    
    yield end, batch_size
    
    while True:
        for start in range(0,length-sequence_length+1):
            temp_index_list = [list(range(i,i+sequence_length)) for i in range(start,end-sequence_length+1,length)]
            batch_size = len(temp_index_list)

            X = np.zeros((batch_size,sequence_length,num_of_features))
            y = np.zeros((batch_size,1))

            for b in range(0,batch_size):
                X[b,:,:] = df.iloc[temp_index_list[b],3:index_of_last_feature].values
                y[b,0] = df.iloc[temp_index_list[b][-1],-1]

            yield X,y.astype(int)

def next_train_batch(dict_dfs,num_of_features,index_of_last_feature,sequence_length,k=10,test_start_idx=0):
    if test_start_idx<0 or test_start_idx>=k:
        raise Exception("test_start_idx (%d) should be in range(0,k=%d)"%(test_start_idx,k))
    
    while True:
        for length in range(sequence_length,36):
            end = dict_dfs[length].shape[0]
            temp = int(end*(test_start_idx/k))
            test_start = temp - (temp % length)
            temp = int(end*((test_start_idx+1)/k))
            test_end = temp - (temp % length)

            df = dict_dfs[length].iloc[:test_start,:].append(dict_dfs[length].iloc[test_end:,:])
            end = df.shape[0]
            
            MAX_BATCH_SIZE = 10000
            for start in range(0,length-sequence_length+1):
                temp_index_list = [list(range(i,i+sequence_length)) for i in range(start,end-sequence_length+1,length)]
                batch_size = len(temp_index_list)

                X = np.zeros((batch_size,sequence_length,num_of_features))
                y = np.zeros((batch_size,1))
            
                for b in range(0,batch_size):
                    X[b,:,:] = df.iloc[temp_index_list[b],3:index_of_last_feature].values
                    y[b,0] = df.iloc[temp_index_list[b][-1],-1]

                yield X,y.astype(int)

def next_test_batch(dict_dfs,num_of_features,index_of_last_feature,sequence_length,k=10,test_start_idx=0):
    if test_start_idx<0 or test_start_idx>=k:
        raise Exception("test_start_idx (%d) should be in range(0,k=%d)"%(test_start_idx,k))
    
    while True:
        for length in range(sequence_length,36):
            end = dict_dfs[length].shape[0]
            temp = int(end*(test_start_idx/k))
            test_start = temp - (temp % length)
            temp = int(end*((test_start_idx+1)/k))
            test_end = temp - (temp % length)

            df = dict_dfs[length].iloc[test_start:test_end,:]
            end = df.shape[0]
            
            for start in range(0,length-sequence_length+1):
                temp_index_list = [list(range(i,i+sequence_length)) for i in range(start,end-sequence_length+1,length)]
                batch_size = len(temp_index_list)

                X = np.zeros((batch_size,sequence_length,num_of_features))
                y = np.zeros((batch_size,1))
            
                for b in range(0,batch_size):
                    X[b,:,:] = df.iloc[temp_index_list[b],3:index_of_last_feature].values
                    y[b,0] = df.iloc[temp_index_list[b][-1],-1]

                yield X,y.astype(int)
        
        
        
    

SEQUENCE_LENGTH = SHORT_TERM

k = 10

train_batch_generator = next_train_batch_35(dict_dfs,
                                            num_of_features = NUM_OF_FEATURES,
                                            index_of_last_feature=INDEX_OF_LAST_FEATURE,
                                            sequence_length=SEQUENCE_LENGTH)
test_batch_generator = next_test_batch_35(dict_dfs,
                                          num_of_features = NUM_OF_FEATURES,
                                          index_of_last_feature=INDEX_OF_LAST_FEATURE,
                                          sequence_length=SEQUENCE_LENGTH)

train_size, train_batch_size = next(train_batch_generator)
test_size, test_batch_size = next(test_batch_generator)

print(df.shape)
print('X_train: ',train_size)
print('train batch: ',train_batch_size)
print('X_test: ',test_size)
print('test batch: ',test_batch_size)
print('Done')

# of features:  97
(61552, 101)
X_train:  39900
train batch:  1140
X_test:  4410
test batch:  126
Done


In [32]:
X,y = next(train_batch_generator)
print(X.shape)
print(y.shape)

# X,y = next(train_batch_generator)
# print(X.shape)
# print(y.shape)


(1140, 3, 97)
(1140, 1)


## Learn LSTM
 We use AUC and k-folds cross-validation for evaluating model.
 Parameters are not tuned. So, it still has room for optimization.

In [14]:
import tensorflow as tf
K = tf.keras.backend

# https://www.kaggle.com/c/invasive-species-monitoring/discussion/32762
# calculate AUC
def auc2(y_true, y_pred):
    # https://stackoverflow.com/questions/48174323/tensorflow-1-4-tf-metrics-auc-for-auc-calculation
    auc = tf.metrics.auc(y_true, y_pred)[1]
    K.get_session().run(tf.local_variables_initializer())
    return auc

print('Learning LSTM')
print('# of features: ',NUM_OF_FEATURES)  
print('Sequence length: ',SEQUENCE_LENGTH)

cv_list = []     ## store cross-validation scores ##
num_of_folds = 5 ## k of k-fold cv ##
for k_index in range(num_of_folds):
    print('================ %d/%d th Learning ================'%(k_index+1,num_of_folds))
    train_batch_generator = next_train_batch_35(dict_dfs,
                                            num_of_features = NUM_OF_FEATURES,
                                            index_of_last_feature=INDEX_OF_LAST_FEATURE,
                                            sequence_length=SEQUENCE_LENGTH,
                                            k=num_of_folds,
                                            test_start_idx=k_index)
    test_batch_generator = next_test_batch_35(dict_dfs,
                                            num_of_features = NUM_OF_FEATURES,
                                            index_of_last_feature=INDEX_OF_LAST_FEATURE,
                                            sequence_length=SEQUENCE_LENGTH,
                                            k=num_of_folds,
                                            test_start_idx=k_index)

    train_size, train_batch_size = next(train_batch_generator)
    test_size, test_batch_size = next(test_batch_generator)

    print(df.shape)
    print('X_train: ',train_size)
    print('train batch: ',train_batch_size)
    print('X_test: ',test_size)
    print('test batch: ',test_batch_size)
  
    model = keras.Sequential()
    model.add(keras.layers.LSTM(50, batch_input_shape=(None, SEQUENCE_LENGTH, NUM_OF_FEATURES),stateful=False,activation='relu'))
    model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.Dense(32,kernel_initializer='normal', activation='relu'))
    model.add(keras.layers.Dense(1,kernel_initializer='normal', activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[auc2])

    #### fit model with next train batch ####
    model.fit_generator(generator=train_batch_generator, 
                        steps_per_epoch=100, epochs=10, verbose=1, 
                        class_weight = {0: 1,1: 33})
    
    #### evaluate model by next test batch ####
    result = model.evaluate_generator(generator=test_batch_generator, steps=int(test_size/test_batch_size))
    print('%d/%d th TEST AUC: %.5f'%(k_index+1,num_of_folds,result[1]))
    cv_list.append(result[1])

print('================ Result ================')
print('OVERALL TEST AUC: %.5f (+/- %.4f)'%(np.mean(cv_list),np.std(cv_list)))


Learning LSTM
# of features:  97
Sequence length:  3
(61552, 101)
X_train:  35455
train batch:  1013
X_test:  8855
test batch:  253
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
1/5 th TEST AUC: 0.95574
(61552, 101)
X_train:  35455
train batch:  1013
X_test:  8855
test batch:  253
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
2/5 th TEST AUC: 0.95084
(61552, 101)
X_train:  35455
train batch:  1013
X_test:  8855
test batch:  253
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
3/5 th TEST AUC: 0.95791
(61552, 101)
X_train:  35455
train batch:  1013
X_test:  8855
test batch:  253
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
4/5 th TEST AUC: 0.95974
(61552, 101)
X_train:  35420
train batch:  1012
X_test:  8890
test batch:  254
Epoch 1/