# Churn

## 1 - Import useful modules

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import numpy as np

## 2 - Load and prepare dataset

In [2]:
x_train = pd.read_csv('data/bank_data_train.csv', index_col='ID')

y_train = x_train.pop('TARGET')

Lower case categorical columns to avoid duplicates.

In [3]:
categorical_train = x_train.select_dtypes(include=["object_"])
categorical_cols = categorical_train.columns
categorical_train = categorical_train.apply(lambda c: c.str.lower())

categorical_train.describe()

Unnamed: 0,CLNT_TRUST_RELATION,APP_MARITAL_STATUS,APP_KIND_OF_PROP_HABITATION,CLNT_JOB_POSITION_TYPE,CLNT_JOB_POSITION,APP_DRIVING_LICENSE,APP_EDUCATION,APP_TRAVEL_PASS,APP_CAR,APP_POSITION_TYPE,APP_EMP_TYPE,APP_COMP_TYPE,PACK
count,69421,68234,59361,44781,210811,57257,68104,57257,57256,60545,67362,67362,355190
unique,20,8,5,4,15111,2,12,2,2,4,4,4,12
top,friend,m,so,specialist,директор,n,h,n,n,specialist,private,private,102
freq,24896,32185,28056,25123,24974,36332,44370,52750,32843,36622,59087,59087,116986


Deal with duplicates in the `CLNT_TRUST_RELATION` column by translating Russian terms to their equivalent in English.

In [4]:
ru_to_en = {'друг': 'other', 'мать': 'mother', 'брат': 'brother', 'отец': 'father', 'сестра': 'sister', 'дочь': 'daughter', 'сын': 'son', 'мама': 'mother'}
categorical_train['CLNT_TRUST_RELATION'].replace(ru_to_en, inplace=True)

In [5]:
for c in categorical_cols:
    print("-" * 80)
    print(categorical_train[c].value_counts())

--------------------------------------------------------------------------------
friend        24896
mother        11172
relative       8207
other          6850
brother        5620
sister         5583
father         3056
daughter       2032
son            1947
близкий ро       44
дальний ро        6
муж               5
жена              3
Name: CLNT_TRUST_RELATION, dtype: int64
--------------------------------------------------------------------------------
m    32185
v    23075
d     7777
t     3392
n      936
w      697
c      155
        17
Name: APP_MARITAL_STATUS, dtype: int64
--------------------------------------------------------------------------------
so         28056
jo         14153
other      11316
rent        3223
nprivat     2613
Name: APP_KIND_OF_PROP_HABITATION, dtype: int64
--------------------------------------------------------------------------------
specialist     25123
manager         9396
top_manager     7262
self_empl       3000
Name: CLNT_JOB_POSITION_TYPE, dt

Convert categorical features into one-hot encodings.

In [6]:
encoder = OneHotEncoder(handle_unknown='infrequent_if_exist', sparse_output=False, max_categories=6000)
categorical_train_encoded = encoder.fit_transform(categorical_train)

In [7]:
x_train.drop(columns=categorical_cols, inplace=True)

In [8]:
del(categorical_train)

In [9]:
x_train.describe()

Unnamed: 0,CR_PROD_CNT_IL,AMOUNT_RUB_CLO_PRC,PRC_ACCEPTS_A_EMAIL_LINK,APP_REGISTR_RGN_CODE,PRC_ACCEPTS_A_POS,PRC_ACCEPTS_A_TK,TURNOVER_DYNAMIC_IL_1M,CNT_TRAN_AUT_TENDENCY1M,SUM_TRAN_AUT_TENDENCY1M,AMOUNT_RUB_SUP_PRC,...,LDEAL_ACT_DAYS_ACC_PCT_AVG,REST_DYNAMIC_CC_3M,MED_DEBT_PRC_YWZ,LDEAL_ACT_DAYS_PCT_TR3,LDEAL_ACT_DAYS_PCT_AAVG,LDEAL_DELINQ_PER_MAXYWZ,TURNOVER_DYNAMIC_CC_3M,LDEAL_ACT_DAYS_PCT_TR,LDEAL_ACT_DAYS_PCT_TR4,LDEAL_ACT_DAYS_PCT_CURR
count,355190.0,316867.0,155163.0,60550.0,155163.0,155163.0,355190.0,77112.0,77112.0,316867.0,...,93448.0,355190.0,95713.0,93448.0,98175.0,95713.0,355190.0,93448.0,93448.0,93448.0
mean,0.105225,0.044045,0.0,50.947498,0.0,0.0,0.001305,0.416896,0.414572,0.085249,...,0.051419,0.007309,0.055074,0.025707,0.049943,0.009252,0.004309,0.013938,0.013938,0.013938
std,0.431372,0.108449,0.0,21.777855,0.0,0.0,0.029118,0.316493,0.338612,0.14231,...,0.13496,0.066681,0.215909,0.115732,0.18583,0.092789,0.059852,0.097099,0.097099,0.097099
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006944,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,33.0,0.0,0.0,0.0,0.166667,0.139645,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,54.0,0.0,0.0,0.0,0.3,0.285714,0.027117,...,0.008822,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.036608,0.0,72.0,0.0,0.0,0.0,0.571429,0.661195,0.110005,...,0.033563,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,11.0,1.0,0.0,89.0,0.0,0.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Fill NA values.

In [10]:
x_train.isna().sum()

CR_PROD_CNT_IL                   0
AMOUNT_RUB_CLO_PRC           38323
PRC_ACCEPTS_A_EMAIL_LINK    200027
APP_REGISTR_RGN_CODE        294640
PRC_ACCEPTS_A_POS           200027
                             ...  
LDEAL_DELINQ_PER_MAXYWZ     259477
TURNOVER_DYNAMIC_CC_3M           0
LDEAL_ACT_DAYS_PCT_TR       261742
LDEAL_ACT_DAYS_PCT_TR4      261742
LDEAL_ACT_DAYS_PCT_CURR     261742
Length: 101, dtype: int64

In [11]:
x_train.fillna(0, inplace=True)

In [12]:
x_train = x_train.to_numpy()
y_train = y_train.to_numpy()

In [13]:
x_train = np.concatenate((x_train, categorical_train_encoded), axis=1)

In [14]:
del(categorical_train_encoded)

Split the training set into a train dataset (80%) and a dev set (20%) with stratification.

In [15]:
X_train, X_dev, Y_train, Y_dev = train_test_split(x_train, y_train, test_size=0.2, stratify=y_train, random_state=1337)

In [16]:
del(x_train)
del(y_train)

## 3 - Baseline

In [17]:
from sklearn.metrics import accuracy_score, roc_auc_score

In [18]:
class Baseline:
    def __init__(self) -> None:
        self.prediction = 0

    def fit(self, X, y):
        classes = set(y)
        frequencies = {cls: (Y_train == cls).sum() for cls in classes}
        self.prediction = max(frequencies, key=frequencies.get)

    def predict(self, X):
        return np.full(X.shape[0], self.prediction)

In [19]:
bsl = Baseline()
bsl.fit(X_train, Y_train)
dev_predictions = bsl.predict(X_dev)

In [20]:
accuracy_score(Y_dev, dev_predictions)

0.918564711844365

In [21]:
roc_auc_score(Y_dev, dev_predictions)

0.5

## n - Make predictions on the test set

In [22]:
del(X_train)
del(X_dev)
del(Y_train)
del(Y_dev)

In [23]:
x_test = pd.read_csv('data/bank_data_test.csv', index_col='ID')
x_test.drop(columns=['TARGET'], inplace=True)
idx = x_test.index

In [24]:
categorical_test = x_test.select_dtypes(include=["object_"])
categorical_test = categorical_test.apply(lambda c: c.str.lower())
categorical_test['CLNT_TRUST_RELATION'].replace(ru_to_en, inplace=True)

In [25]:
categorical_test_encoded = encoder.transform(categorical_test)
del(categorical_test)
x_test.drop(columns=categorical_cols, inplace=True)

In [26]:
x_test.fillna(0, inplace=True)
x_test = x_test.to_numpy()
x_test = np.concatenate((x_test, categorical_test_encoded), axis=1)
del(categorical_test_encoded)

Save the final predictions to a csv file.

In [27]:
test_predictions = bsl.predict(x_test)
test_predictions = pd.DataFrame(test_predictions, index=idx, columns=['TARGET'])
test_predictions.to_csv('final_predictions.csv')