In [1]:
# import packages
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.svm import LinearSVC, SVC  # Linear Support Vector Classification


RANDOM_STATE = 1234

In [2]:
# Load both CSV exports and key each of them by the "id" column.
customers = pd.read_csv("customer_data.csv").set_index("id")
payments = pd.read_csv("payment_data.csv").set_index("id")

# Index-on-index join: attach the payment columns to the customer columns.
customer_data = customers.join(payments)
customer_data

Unnamed: 0_level_0,label,fea_1,fea_2,fea_3,fea_4,fea_5,fea_6,fea_7,fea_8,fea_9,...,OVD_t2,OVD_t3,OVD_sum,pay_normal,prod_code,prod_limit,update_date,new_balance,highest_balance,report_date
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
54982353,0,1,1130.0,2,1000000,2,4,-1,100,5,...,0,0,0,1,10,55000.0,27/08/2014,0.0,2068.0,12/6/14
54982353,0,1,1130.0,2,1000000,2,4,-1,100,5,...,0,0,1,31,10,550000.0,3/9/13,326684.4,609683.0,18/12/2015
54982353,0,1,1130.0,2,1000000,2,4,-1,100,5,...,0,0,0,19,10,,16/07/2011,31677.6,204037.0,14/12/2015
54982353,0,1,1130.0,2,1000000,2,4,-1,100,5,...,0,35,31500,0,10,12100.0,27/12/2008,12142.8,10619.0,14/07/2009
54982353,0,1,1130.0,2,1000000,2,4,-1,100,5,...,0,0,0,26,10,660000.0,12/3/07,252998.4,775030.0,23/12/2015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59006219,0,4,,2,111000,2,8,5,110,4,...,0,0,28,35,10,33000.0,28/01/2013,9956.4,22591.0,4/2/16
59006219,0,4,,2,111000,2,8,5,110,4,...,0,0,0,36,10,,27/01/2013,1344.0,44655.0,13/03/2016
59006239,0,7,1322.0,3,68000,2,11,5,86,3,...,0,0,0,12,10,,23/03/2015,2492.4,15440.0,19/02/2016
59006239,0,7,1322.0,3,68000,2,11,5,86,3,...,0,0,0,1,6,,14/02/2015,0.0,21000.0,22/10/2015


In [3]:
# Count how many rows of the joined frame carry each label value.
label_column = customer_data["label"]
label_column.value_counts()

0    6863
1    1387
Name: label, dtype: int64

In [4]:
# Number of missing entries in every column (summed down the rows).
customer_data.isna().sum()

label                 0
fea_1                 0
fea_2              1028
fea_3                 0
fea_4                 0
fea_5                 0
fea_6                 0
fea_7                 0
fea_8                 0
fea_9                 0
fea_10                0
fea_11                0
OVD_t1                0
OVD_t2                0
OVD_t3                0
OVD_sum               0
pay_normal            0
prod_code             0
prod_limit         6118
update_date          26
new_balance           0
highest_balance     409
report_date        1114
dtype: int64

In [5]:
# Summarise how many values are missing per row, to spot rows that are mostly empty.
# The per-row count is computed once and reused for all three statistics.
missing_per_row = customer_data.isnull().sum(axis=1)
print("max missing = {}\n min missing = {}\n mean missing = {}".format(
    missing_per_row.max(),
    missing_per_row.min(),
    missing_per_row.mean(),
))

max missing = 4
 min missing = 0
 mean missing = 1.053939393939394


In [6]:
# One-hot encode prod_code: one boolean indicator column per distinct value.
for code in customer_data["prod_code"].unique():
    customer_data[f"prod_code_{code}"] = customer_data["prod_code"].eq(code)

# The categorical features are fea_1, fea_3, fea_5, fea_6, fea_7 and fea_9;
# each gets the same treatment, with columns named feature_<id>_<value>.
for feature_id in (1, 3, 5, 6, 7, 9):
    source_column = f"fea_{feature_id}"
    for level in customer_data[source_column].unique():
        customer_data[f"feature_{feature_id}_{level}"] = customer_data[source_column].eq(level)
customer_data

Unnamed: 0_level_0,label,fea_1,fea_2,fea_3,fea_4,fea_5,fea_6,fea_7,fea_8,fea_9,...,feature_7_10,feature_7_8,feature_7_3,feature_7_4,feature_7_1,feature_9_5,feature_9_3,feature_9_4,feature_9_1,feature_9_2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
54982353,0,1,1130.0,2,1000000,2,4,-1,100,5,...,False,False,False,False,False,True,False,False,False,False
54982353,0,1,1130.0,2,1000000,2,4,-1,100,5,...,False,False,False,False,False,True,False,False,False,False
54982353,0,1,1130.0,2,1000000,2,4,-1,100,5,...,False,False,False,False,False,True,False,False,False,False
54982353,0,1,1130.0,2,1000000,2,4,-1,100,5,...,False,False,False,False,False,True,False,False,False,False
54982353,0,1,1130.0,2,1000000,2,4,-1,100,5,...,False,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59006219,0,4,,2,111000,2,8,5,110,4,...,False,False,False,False,False,False,False,True,False,False
59006219,0,4,,2,111000,2,8,5,110,4,...,False,False,False,False,False,False,False,True,False,False
59006239,0,7,1322.0,3,68000,2,11,5,86,3,...,False,False,False,False,False,False,True,False,False,False
59006239,0,7,1322.0,3,68000,2,11,5,86,3,...,False,False,False,False,False,False,True,False,False,False


In [7]:
# Columns removed before modelling:
#   prod_limit            - too many missing values
#   report_date/update_date - dates: not expressive
#   prod_code, fea_1/3/5/6/7/9 - raw categorical columns, already expanded
#                                into indicator columns in the previous step
columns_to_drop = [
    "prod_limit",
    "report_date",
    "update_date",
    "prod_code",
    "fea_1",
    "fea_3",
    "fea_5",
    "fea_6",
    "fea_7",
    "fea_9",
]
customer_data = customer_data.drop(columns=columns_to_drop)

In [8]:
# Convert the frame to a float matrix for training; missing entries become NaN.
data = customer_data.to_numpy(na_value=np.nan).astype(float)
# Fill in missing values with the per-column mean (SimpleImputer's default
# strategy, spelled out here for clarity).
# The `verbose` argument is intentionally not passed: it was deprecated in
# scikit-learn 1.1 and removed in 1.3, where supplying it raises a TypeError.
imputer = SimpleImputer(strategy="mean")
data = imputer.fit_transform(data)
data

array([[      0.,    1130., 1000000., ...,       0.,       0.,       0.],
       [      0.,    1130., 1000000., ...,       0.,       0.,       0.],
       [      0.,    1130., 1000000., ...,       0.,       0.,       0.],
       ...,
       [      0.,    1322.,   68000., ...,       0.,       0.,       0.],
       [      0.,    1322.,   68000., ...,       0.,       0.,       0.],
       [      0.,    1322.,   68000., ...,       0.,       0.,       0.]])

In [9]:
# Check whether any missing value survived imputation.
# NaN never compares equal to anything (not even itself), so `data == np.nan`
# is always all-False; np.isnan is the correct element-wise test.
np.isnan(data).any()

False

In [10]:
# Rescale every column of the matrix into [0, 1] (MinMaxScaler's default range).
# The scaler is fitted on, and applied to, the whole matrix.
scaler = MinMaxScaler()
scaler.fit(data)
data = scaler.transform(data)

In [None]:
# # balance the dataset
# mask = data[:, 0] == 1
# risk_customers = data[mask]
# print("number of risk customers = {}".format(risk_customers.shape[0]))
#
# safe_customers = data[~mask]
# indices = np.random.choice(safe_customers.shape[0], risk_customers.shape[0]*2, replace=False)
# safe_customers = safe_customers[indices]
# print("number of safe customers = {}".format(safe_customers.shape[0]))
#
# data = np.concatenate([risk_customers, safe_customers], axis=0)
# print("Shape of selected data = ", data.shape)

In [14]:
# Separate the target from the features:
# column 0 is used as the target vector, every remaining column as a feature.
y, X = data[:, 0], data[:, 1:]

In [15]:
# Linear SVM baseline, evaluated with 3-fold cross-validated accuracy.
# For context when reading the scores: label 0 makes up 6863 of the 8250 rows
# (about 0.83), which is the accuracy of always predicting the majority class.
linear_model = LinearSVC(random_state=RANDOM_STATE)
cross_val_score(
    estimator=linear_model,
    X=X,
    y=y,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
)

array([0.82654545, 0.824     , 0.82872727])

In [16]:
# Collect the prediction made for each row during 3-fold cross-validation
# of the linear model.
pred_tags = cross_val_predict(
    estimator=linear_model,
    X=X,
    y=y,
    cv=3,
    method="predict",
    n_jobs=-1,
)

In [17]:
# Confusion matrix of the cross-validated predictions
# (rows: true label, columns: predicted label).
confusion_matrix(y_true=y, y_pred=pred_tags)

array([[6784,   79],
       [1353,   34]])

In [18]:
# SVM with a polynomial kernel: cross-validated predictions (3 folds),
# summarised as a confusion matrix.
poly_svc = SVC(kernel="poly", random_state=RANDOM_STATE)
pred_tags = cross_val_predict(
    estimator=poly_svc,
    X=X,
    y=y,
    cv=3,
    method="predict",
    n_jobs=-1,
)
confusion_matrix(y_true=y, y_pred=pred_tags)

array([[6476,  387],
       [1305,   82]])

In [19]:
# SVM with a Gaussian RBF kernel: cross-validated predictions (3 folds),
# summarised as a confusion matrix.
gaussian_svc = SVC(kernel="rbf", random_state=RANDOM_STATE)
pred_tags = cross_val_predict(
    estimator=gaussian_svc,
    X=X,
    y=y,
    cv=3,
    method="predict",
    n_jobs=-1,
)
confusion_matrix(y_true=y, y_pred=pred_tags)

array([[6556,  307],
       [1313,   74]])