In [67]:
import numpy as np
import math
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import Imputer
from sklearn.model_selection import train_test_split
from sklearn import metrics
import pandas as pd
import matplotlib.pyplot as plt
from skopt import gp_minimize
from skopt.utils import use_named_args
from skopt.space import Real, Integer
import datetime
%matplotlib inline

In [68]:
# Training tables: one CSV per source, read with index_col=False so that no
# column is promoted to the row index.
performance = pd.read_csv('train/performance_train.csv', index_col=False)
facturation = pd.read_csv('train/facturation_train.csv', index_col=False)
payments = pd.read_csv('train/paiements_train.csv', index_col=False)
transactions = pd.read_csv('train/transactions_train.csv', index_col=False)

# Test tables, read the same way from the test/ directory.
performance_test = pd.read_csv('test/performance_test.csv', index_col=False)
facturation_test = pd.read_csv('test/facturation_test.csv', index_col=False)
paiements_test = pd.read_csv('test/paiements_test.csv', index_col=False)
transactions_test = pd.read_csv('test/transactions_test.csv', index_col=False)



In [69]:
# Summary statistics (count, mean, std, quartiles, extremes) of the raw transactions table.
transactions.describe()

Unnamed: 0,ID_CPTE,MERCHANT_CITY_NAME,PRIOR_CREDIT_LIMIT_AMT,TRANSACTION_AMT
count,824358.0,824358.0,824358.0,824358.0
mean,55991090.0,1551937.0,2420.31434,51.255482
std,25991110.0,916794.6,3417.932717,194.838033
min,10034820.0,-1.0,-5709.0,0.0
25%,33825700.0,680536.0,466.0,9.0
50%,55956100.0,1647737.0,1169.0,21.2
75%,78551160.0,2271380.0,2921.0,50.88
max,99984770.0,2999458.0,39709.0,54536.44


In [70]:
# Account IDs of every performance row flagged with Default == 1.
default = performance[performance['Default'] == 1]['ID_CPTE']
# One-shot generator over those IDs; later cells pull accounts from it with next().
gen_def = (client for client in default)


In [71]:
# Pull the next defaulting account from the generator. Every re-run of this cell
# advances to another account, so the displayed rows depend on how often it was run.
next_default_client = next(gen_def)

# Performance row(s) recorded for that account.
performance[performance['ID_CPTE'] == next_default_client]


Unnamed: 0,ID_CPTE,PERIODID_MY,Default
18,75780289,2012-12-01,1


In [72]:
# Payments of the selected defaulting account in chronological order.
payments[payments['ID_CPTE'] == next_default_client].sort_values(by='TRANSACTION_DTTM')

Unnamed: 0,ID_CPTE,TRANSACTION_AMT,TRANSACTION_DTTM,PAYMENT_REVERSAL_XFLG
325,75780289,156.55,2012-01-02 00:00:00,Q
322,75780289,4110.75,2012-01-30 00:00:00,Q
327,75780289,491.4,2012-03-16 00:00:00,Q
323,75780289,308.0,2012-03-17 00:00:00,Q
326,75780289,187.86,2012-04-08 00:00:00,Q
335,75780289,408.0,2012-05-04 00:00:00,Q
332,75780289,328.65,2012-05-30 00:00:00,Q
333,75780289,78.0,2012-06-16 00:00:00,Q
321,75780289,325.52,2012-06-27 04:00:00,Q
329,75780289,314.15,2012-08-04 00:00:00,Q


In [73]:
# Collapse every payment timestamp to its month: keep the first 7 characters of its
# string form and re-parse that as a datetime. This overwrites the column in `payments`.
payments["TRANSACTION_DTTM"] = pd.to_datetime(payments["TRANSACTION_DTTM"].apply(lambda x: str(x)[0:7]))

In [74]:
# Rough month gap between the first two payment rows: the time difference is divided
# by 30 and its whole-day component is read off as the number of months.
((payments["TRANSACTION_DTTM"][0] - payments["TRANSACTION_DTTM"][1])/30).days

-1

In [75]:
# Same account as before, shown again now that the dates were truncated to month starts.
payments[payments['ID_CPTE'] == next_default_client].sort_values(by='TRANSACTION_DTTM')

Unnamed: 0,ID_CPTE,TRANSACTION_AMT,TRANSACTION_DTTM,PAYMENT_REVERSAL_XFLG
322,75780289,4110.75,2012-01-01,Q
325,75780289,156.55,2012-01-01,Q
323,75780289,308.0,2012-03-01,Q
327,75780289,491.4,2012-03-01,Q
326,75780289,187.86,2012-04-01,Q
332,75780289,328.65,2012-05-01,Q
335,75780289,408.0,2012-05-01,Q
321,75780289,325.52,2012-06-01,Q
333,75780289,78.0,2012-06-01,Q
329,75780289,314.15,2012-08-01,Q


In [76]:


def _category_shares(subframe, column, keys, prefix):
    """Return ``{prefix + str(key): share}`` for one categorical column of one account.

    ``share`` is the fraction of the account's rows holding ``key`` in ``column``;
    keys the account never used map to ``0``.
    """
    # Computed once per column instead of once per key.
    shares = subframe[column].value_counts(normalize=True)
    # Series.get only turns a *missing key* into the default; any other failure
    # surfaces instead of being silently recorded as 0.
    return {prefix + str(key): shares.get(key, 0) for key in keys}


def summarize_by_ID(dataframe):
    """Summarise categorical transaction columns as per-account frequency shares.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``ID_CPTE`` plus the columns ``DECISION_XCD``,
        ``TRANSACTION_CATEGORY_XCD``, ``TRANSACTION_TYPE_XCD``, ``SICGROUP``,
        ``cred_minus_transaction_net_positive``, ``MERCHANT_CATEGORY_XCD`` and
        ``MERCHANT_COUNTRY_XCD``.

    Returns
    -------
    dict
        Maps each account ID (in ``value_counts`` order of ``ID_CPTE``) to a list of
        seven dictionaries, in this order: decision, transaction category,
        transaction type, SIC group, net-positive flag, merchant category and
        merchant country. Each dictionary has one entry per value observed in the
        whole ``dataframe`` for that column.
    """
    # (source column, feature-name prefix) in the order callers receive them.
    # The "SCIGROUP_" spelling is kept on purpose: it is part of the feature names
    # this function has always produced.
    feature_columns = [
        ("DECISION_XCD", "DECISION_XCD_"),
        ("TRANSACTION_CATEGORY_XCD", "TRANSACTION_C_"),
        ("TRANSACTION_TYPE_XCD", "TRANSACTION_T_"),
        ("SICGROUP", "SCIGROUP_"),
        ("cred_minus_transaction_net_positive", "cred_minus_transaction_net_positive"),
        ("MERCHANT_CATEGORY_XCD", "MERCHANT_CATEGORY_XCD_"),
        ("MERCHANT_COUNTRY_XCD", "MERCHANT_COUNTRY_XCD_"),
    ]
    # Every account gets the same key set so the resulting rows line up column-wise.
    keys_by_column = {
        column: dataframe[column].value_counts().keys()
        for column, _ in feature_columns
    }

    # Group once instead of scanning the full frame for every account.
    accounts = dataframe.groupby("ID_CPTE")

    output = {}
    for account_id in dataframe["ID_CPTE"].value_counts().keys():
        subframe = accounts.get_group(account_id)
        output[account_id] = [
            _category_shares(subframe, column, keys_by_column[column], prefix)
            for column, prefix in feature_columns
        ]
    return output
def add_month_difference(dataframe, ID):
    """Return one account's rows with a month offset from its earliest transaction.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``ID_CPTE`` and ``TRANSACTION_DTTM`` (datetimes or strings
        parseable by ``pd.to_datetime``; rows are expected to have a valid date).
    ID
        Account identifier to select from ``ID_CPTE``.

    Returns
    -------
    pandas.DataFrame
        A copy of the account's rows sorted by ``TRANSACTION_DTTM`` with an extra
        integer column ``TRANSACTION_SINCE_FIRST``: the number of calendar months
        between each row and the account's earliest month (0 for that month).
        ``dataframe`` itself is not modified. An unknown ``ID`` yields an empty frame.
    """
    # .copy() so the new column is written to an independent frame, not a view.
    subframe = dataframe[dataframe["ID_CPTE"] == ID].sort_values(by="TRANSACTION_DTTM").copy()
    if subframe.empty:
        subframe["TRANSACTION_SINCE_FIRST"] = pd.Series(dtype="int64")
        return subframe

    stamps = pd.to_datetime(subframe["TRANSACTION_DTTM"])
    # Calendar-month index. Dividing a day count by 30 is wrong: Jan -> Mar in a
    # non-leap year is 59 days, which floors to 1 and merges February with March.
    month_index = stamps.dt.year * 12 + stamps.dt.month
    subframe["TRANSACTION_SINCE_FIRST"] = (month_index - month_index.min()).astype("int64")

    return subframe
    
def summarize_by_ID_2(dataframe):
    """Summarise the payments table per account.

    Rows containing any missing value are dropped first. For every remaining account
    two feature dictionaries are built:

    * ``TRANSACTION_AMT_MONTH_<k>``: total ``TRANSACTION_AMT`` paid in the k-th month
      after the account's first payment (only months that have payments appear);
    * ``PAYMENT_REVERSAL_XFLG_key_<flag>``: share of the account's payments carrying
      each reversal flag observed in the whole table (0 when the account never did).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``ID_CPTE``, ``TRANSACTION_DTTM``, ``TRANSACTION_AMT`` and
        ``PAYMENT_REVERSAL_XFLG``.

    Returns
    -------
    dict
        Maps each account ID to ``[monthly_sums, reversal_shares, monthly_sums]``.
        The first and third entries are the same dictionary; the three-element
        shape is kept because callers merge all entries of the list.
    """
    dataframe = dataframe.dropna()
    reversal_flags = dataframe["PAYMENT_REVERSAL_XFLG"].value_counts().keys()

    output = {}
    for account_id in dataframe["ID_CPTE"].value_counts().keys():
        subframe = add_month_difference(dataframe, account_id)

        # One grouped sum replaces a filter-and-sum pass per month.
        monthly_totals = subframe.groupby("TRANSACTION_SINCE_FIRST")["TRANSACTION_AMT"].sum()
        TRANSACTION_SUM_dict = {
            "TRANSACTION_AMT_MONTH_" + str(month): total
            for month, total in monthly_totals.items()
        }

        # Share table computed once; Series.get maps only a missing flag to 0
        # rather than swallowing every possible exception.
        flag_shares = subframe["PAYMENT_REVERSAL_XFLG"].value_counts(normalize=True)
        PAYMENT_REVERSAL_XFLG_dict = {
            "PAYMENT_REVERSAL_XFLG_key_" + str(flag): flag_shares.get(flag, 0)
            for flag in reversal_flags
        }

        output[account_id] = [TRANSACTION_SUM_dict, PAYMENT_REVERSAL_XFLG_dict, TRANSACTION_SUM_dict]

    return output

In [77]:
    # Headroom left on the card after each transaction: prior credit limit minus amount.
    cred_minus_transaction = transactions["PRIOR_CREDIT_LIMIT_AMT"].sub(transactions["TRANSACTION_AMT"])

    # Drop the timestamp and the two raw amounts (the merchant columns are kept).
    # `columns=` is used because pandas 2.0 no longer accepts the axis positionally.
    transaction_dropped = transactions.drop(
        columns=["TRANSACTION_DTTM", "PRIOR_CREDIT_LIMIT_AMT", "TRANSACTION_AMT"]
    )

    # Keep only whether that headroom was non-negative.
    transaction_dropped["cred_minus_transaction_net_positive"] = cred_minus_transaction.ge(0)
    

In [78]:
# Per-account category shares for the transactions table (account ID -> list of feature dicts).
output = summarize_by_ID(transaction_dropped)


In [79]:
# Display the payments table as it stands after the month truncation above.
payments

Unnamed: 0,ID_CPTE,TRANSACTION_AMT,TRANSACTION_DTTM,PAYMENT_REVERSAL_XFLG
0,99690111,208.00,2015-04-01,Q
1,99690111,176.80,2015-05-01,Q
2,99690111,200.00,2015-03-01,Q
3,99690111,80.80,2015-04-01,Q
4,99690111,250.00,2015-11-01,Q
5,99690111,273.00,2015-12-01,Q
6,99690111,267.50,2015-08-01,Q
7,99690111,618.00,2015-07-01,Q
8,99690111,226.60,2015-09-01,Q
9,99690111,244.80,2015-10-01,Q


In [80]:
# Flatten each account's list of feature dictionaries into one row of a feature table.
convert = {}
for account_id, feature_dicts in output.items():
    # Start from an empty mapping for every account so that no value can carry
    # over from the previously processed account.
    merged = {}
    for feature_dict in feature_dicts:
        merged.update(feature_dict)
    convert[account_id] = pd.Series(merged)
final = pd.DataFrame.from_dict(convert, orient='index')

In [89]:
def _summary_to_frame(summary):
    """Turn ``{account_id: [feature_dict, ...]}`` into a DataFrame with one row per account."""
    rows = {}
    for account_id, feature_dicts in summary.items():
        # A fresh mapping per account: reusing one accumulator across accounts
        # would copy the previous account's values into features this account lacks.
        merged = {}
        for feature_dict in feature_dicts:
            merged.update(feature_dict)
        rows[account_id] = pd.Series(merged)
    return pd.DataFrame.from_dict(rows, orient='index')


def feature_engineering(performance, paiements, transactions, test):
    """Build the per-account feature table from the raw source tables.

    Parameters
    ----------
    performance : pandas.DataFrame
        Indexed by ``ID_CPTE`` here; all of its other columns are carried into the result.
    paiements : pandas.DataFrame
        Payments table, summarised with ``summarize_by_ID_2``.
    transactions : pandas.DataFrame
        Transactions table, summarised with ``summarize_by_ID`` after the raw
        amounts are replaced by a "credit limit minus amount is non-negative" flag.
    test
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per account ID: payment features, transaction features and
        performance columns combined. Accounts missing from a source table have
        NaN in that table's columns; no imputation is done here.
    """
    # Flag whether the credit limit still covered each transaction, then drop the
    # timestamp and raw amounts. `columns=` is required by pandas 2.0+, which no
    # longer accepts the axis as a positional argument.
    net_positive = transactions["PRIOR_CREDIT_LIMIT_AMT"].sub(transactions["TRANSACTION_AMT"]).ge(0)
    transaction_dropped = transactions.drop(
        columns=["TRANSACTION_DTTM", "PRIOR_CREDIT_LIMIT_AMT", "TRANSACTION_AMT"]
    )
    transaction_dropped["cred_minus_transaction_net_positive"] = net_positive

    # One row of category shares per account for transactions and for payments.
    transaction_features = _summary_to_frame(summarize_by_ID(transaction_dropped))
    payment_features = _summary_to_frame(summarize_by_ID_2(paiements))

    # Index performance by account and clear the index name so that all three
    # frames share an unnamed account-ID index.
    performance_by_account = performance.set_index("ID_CPTE")
    performance_by_account.index.name = None

    combined = payment_features.combine_first(
        transaction_features.combine_first(performance_by_account)
    )

    # NOTE(review): an earlier version called
    # combined.dropna(subset=["TRANSACTION_AMT_MONTH_0"]).fillna(0) and discarded the
    # result, so it never changed the output. It is left out; callers handle NaN.
    return combined

In [90]:
def imputing(dataset_train_x, imputee):
    """Fill missing values in ``imputee`` with column means learned from ``dataset_train_x``.

    NOTE(review): ``sklearn.preprocessing.Imputer`` was removed in scikit-learn 0.22
    in favour of ``sklearn.impute.SimpleImputer`` -- confirm the installed version.
    """
    mean_imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
    # fit() returns the fitted imputer itself, so the two steps can be chained.
    return mean_imputer.fit(dataset_train_x).transform(imputee)

In [None]:

# Build the per-account feature table from the training CSVs.
dataset_train = feature_engineering(performance, payments, transactions, False)
# Features are NaN for accounts absent from a source table; treat those as zero.
dataset_train_imp = dataset_train.fillna(0)

# Whole training set split into features (x) and target (y).
# `columns=` is used because pandas 2.0 no longer accepts the axis positionally.
dataset_train_x = dataset_train_imp.drop(columns=["Default"])
dataset_train_y = dataset_train_imp["Default"]

##### Training dataset created #####
# Hold out 20% of the accounts for validation. The seed is fixed so that a
# Restart & Run All reproduces the same split.
train, valid = train_test_split(dataset_train_imp, test_size=0.2, random_state=0)
train_x, train_y = train.drop(columns=["Default"]), train["Default"]
valid_x, valid_y = valid.drop(columns=["Default"]), valid["Default"]

# Mean imputation through imputing() was tried as an alternative to fillna(0)
# and is currently not used.

In [None]:
def gradient_boosting_classifier(train, test):
    """Tune a gradient-boosting classifier with Bayesian optimisation and return it unfitted.

    Parameters
    ----------
    train : array-like of shape (n_samples, n_features)
        Feature matrix passed to ``cross_val_score`` as ``X``.
    test : array-like of shape (n_samples,)
        Despite its name, these are the target labels: they are passed to
        ``cross_val_score`` as ``y``.

    Returns
    -------
    GradientBoostingClassifier
        A new, unfitted classifier (50 estimators, ``random_state=0``) configured
        with the hyper-parameters that minimised the 5-fold cross-validated mean
        absolute error over 50 ``gp_minimize`` evaluations.
    """
    space = [Integer(2, 200, name='max_depth'),
             Real(10**-5, 10**0, "log-uniform", name='learning_rate'),
             # Bound by the width of the matrix being tuned on, not by a
             # notebook-level variable that may be stale or undefined.
             Integer(1, np.shape(train)[1], name='max_features'),
             Integer(2, 100, name='min_samples_split'),
             Integer(1, 100, name='min_samples_leaf')]

    # Defined before the objective so the closure's dependency is visible.
    reg = GradientBoostingClassifier(n_estimators=50, random_state=0)

    @use_named_args(space)
    def objective(**params):
        reg.set_params(**params)
        # cross_val_score returns negated MAE; negate again to get a loss to minimise.
        return -np.mean(cross_val_score(reg, train, test, cv=5, n_jobs=-1,
                                        scoring="neg_mean_absolute_error"))

    res_gp = gp_minimize(objective, space, n_calls=50, random_state=0)

    # res_gp.x lists the best values in the same order as `space`.
    best_params = dict(zip([dimension.name for dimension in space], res_gp.x))
    return GradientBoostingClassifier(n_estimators=50, random_state=0, **best_params)
    
    

In [None]:
def prediction(classifier, X):
    """Return ``classifier``'s predictions for the samples in ``X``."""
    predicted = classifier.predict(X)
    return predicted
    

In [None]:
# The tuner expects (features, labels). The labels are dataset_train_y, not the
# whole feature frame that was passed here before.
bestGBclassifier = gradient_boosting_classifier(dataset_train_x, dataset_train_y)
ID = pd.Series(dataset_test.index)
bestGBclassifier.fit(train_x, train_y)
GB_prediction = prediction(bestGBclassifier, valid_x)
# NOTE(review): ID comes from dataset_test while GB_prediction is made on valid_x,
# so the two probably differ in length and order -- confirm which one the
# submission should use. dataset_test and submission_creator are not defined in
# the cells shown above.
submission_GB = submission_creator(ID, GB_prediction)

In [93]:
# Preview the engineered training table with missing features shown as 0 (dataset_train itself is unchanged).
dataset_train.fillna(0)

Unnamed: 0,DECISION_XCD_A,DECISION_XCD_B,DECISION_XCD_C,Default,MERCHANT_CATEGORY_XCD_A,MERCHANT_CATEGORY_XCD_AA,MERCHANT_CATEGORY_XCD_AB,MERCHANT_CATEGORY_XCD_AC,MERCHANT_CATEGORY_XCD_AD,MERCHANT_CATEGORY_XCD_AE,...,TRANSACTION_C_E,TRANSACTION_T_A,TRANSACTION_T_B,TRANSACTION_T_C,TRANSACTION_T_D,TRANSACTION_T_E,TRANSACTION_T_F,TRANSACTION_T_G,cred_minus_transaction_net_positiveFalse,cred_minus_transaction_net_positiveTrue
10001822,0.000000,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
10007972,0.000000,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
10012520,0.000000,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
10025534,0.000000,0.000000,0.000000,1.0,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
10033579,0.000000,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
10034823,0.000000,0.005797,0.994203,0.0,0.000000,0.00000,0.002899,0.000000,0.000000,0.011594,...,0.918841,0.0,0.028986,0.063768,0.000000,0.000000,0.904348,0.002899,0.023188,0.976812
10036020,0.000000,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
10068805,0.000000,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
10069450,0.000000,0.000000,1.000000,0.0,0.000000,0.00885,0.000000,0.000000,0.008850,0.008850,...,0.725664,0.0,0.044248,0.017699,0.017699,0.000000,0.884956,0.035398,0.000000,1.000000
10081565,0.000000,0.120253,0.879747,1.0,0.000000,0.00000,0.000000,0.000000,0.006329,0.000000,...,0.715190,0.0,0.056962,0.006329,0.000000,0.000000,0.886076,0.050633,0.012658,0.987342
