In [10]:
import pandas as pd
import numpy as np

from sklearn import tree
from sklearn.model_selection import train_test_split

In [3]:
# Relative path of the project's data directory.
__DATA_PATH__ = "../data"
# "interim" sub-folder of the data directory; the input CSV is read from here.
__INTERIM_DATA_PATH__ = "/".join((__DATA_PATH__, "interim"))

In [12]:
# Load the dataset, discard the "Unnamed: 0" column (presumably an index written
# by an earlier to_csv — confirm) and parse the partition date as datetime.
df = (
    pd.read_csv(f"{__INTERIM_DATA_PATH__}/easy_money_prep_1.csv")
    .drop(columns="Unnamed: 0")
    .assign(pk_partition=lambda frame: pd.to_datetime(frame["pk_partition"]))
)
df.shape

(5962838, 75)

In [13]:
# Summary of the loaded frame: index range, column names and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5962838 entries, 0 to 5962837
Data columns (total 75 columns):
 #   Column                Dtype         
---  ------                -----         
 0   pk_cid                int64         
 1   pk_partition          datetime64[ns]
 2   entry_date            object        
 3   active_customer       int64         
 4   segment               int64         
 5   region_code           int64         
 6   gender                int64         
 7   age                   int64         
 8   deceased              int64         
 9   short_term_deposit    int64         
 10  loans                 int64         
 11  mortgage              int64         
 12  funds                 int64         
 13  securities            int64         
 14  long_term_deposit     int64         
 15  em_account_pp         int64         
 16  credit_card           int64         
 17  payroll               int64         
 18  pension_plan          int64         
 19  

In [14]:
# Preview the frame; pandas shortens the repr to the first and last rows/columns.
df

Unnamed: 0,pk_cid,pk_partition,entry_date,active_customer,segment,region_code,gender,age,deceased,short_term_deposit,...,country_id_PL,country_id_PT,country_id_QA,country_id_RO,country_id_RU,country_id_SA,country_id_SE,country_id_SN,country_id_US,country_id_VE
0,15891,2018-07-28,2018-07-28,1,0,28,1,59,0,0,...,0,0,0,0,0,0,0,0,0,0
1,15891,2018-08-28,2018-07-28,0,1,28,1,59,0,0,...,0,0,0,0,0,0,0,0,0,0
2,16063,2018-11-28,2018-11-19,1,0,28,1,62,0,0,...,0,0,0,0,0,0,0,0,0,0
3,16063,2018-12-28,2018-11-19,1,1,28,1,62,0,0,...,0,0,0,0,0,0,0,0,0,0
4,16063,2019-01-28,2018-11-19,1,1,28,1,62,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5962833,1553685,2019-05-28,2019-05-31,0,0,13,0,52,0,0,...,0,0,0,0,0,0,0,0,0,0
5962834,1553686,2019-05-28,2019-05-31,0,0,41,1,30,0,0,...,0,0,0,0,0,0,0,0,0,0
5962835,1553687,2019-05-28,2019-05-31,0,0,28,0,21,0,0,...,0,0,0,0,0,0,0,0,0,0
5962836,1553688,2019-05-28,2019-05-31,0,0,39,1,43,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Name of the label column to predict. The spelling "em_acount" (single "c") is
# the actual column name in the data, so it must not be "corrected".
TARGET = "em_acount"

In [17]:
# Row count per target value (class balance).
df[TARGET].value_counts()

1    4381586
0    1581252
Name: em_acount, dtype: int64

In [21]:
# Row count per pk_partition date.
df["pk_partition"].value_counts()

2019-05-28    442993
2019-04-28    439625
2019-03-28    436181
2019-02-28    431725
2019-01-28    426873
2018-12-28    422479
2018-11-28    416385
2018-10-28    402298
2018-09-28    375321
2018-08-28    352920
2018-07-28    339338
2018-06-28    252103
2018-05-28    249915
2018-04-28    247452
2018-03-28    245247
2018-02-28    242507
2018-01-28    239476
Name: pk_partition, dtype: int64

In [24]:
# Copy of the full frame without the label and the pk_cid / pk_partition / entry_date columns.
# NOTE(review): `train` is not referenced by any later cell visible here — confirm it is still needed.
train = df.drop(columns=[TARGET, "pk_cid", "pk_partition", "entry_date"])
def split_dataframe(dataframe: pd.DataFrame, split_point_condition, exclude_columns, target: str = TARGET):
    """Split ``dataframe`` into development and validation sets using a boolean mask.

    Rows where ``split_point_condition`` is True form the development set; all
    remaining rows form the validation set. ``exclude_columns`` are dropped from
    both sets before features and label are separated. The shapes of every
    intermediate and returned frame are printed.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Full dataset, containing ``target`` and every column in ``exclude_columns``.
    split_point_condition : boolean pd.Series (or array)
        Row mask aligned with ``dataframe``; True selects development rows.
    exclude_columns : list of str
        Columns removed from both sets.
    target : str, default TARGET
        Name of the label column.

    Returns
    -------
    tuple
        ``(dev_df_X, dev_df_y, val_df_X, val_df_y)``. The ``*_y`` items are
        single-column DataFrames, not Series.

    Raises
    ------
    KeyError
        If ``target`` or any of ``exclude_columns`` is not a column of ``dataframe``.
    """
    dev_df = dataframe[split_point_condition].drop(columns=exclude_columns)
    print("dev_df.shape: " + ",".join(map(str, dev_df.shape)))

    val_df = dataframe[~split_point_condition].drop(columns=exclude_columns)
    print("val_df.shape: " + ",".join(map(str, val_df.shape)))

    # Always use the `target` argument: mixing it with the global TARGET would
    # silently return mismatched X / y frames for any non-default target.
    dev_df_X = dev_df.drop(columns=target)
    dev_df_y = dev_df[[target]]

    val_df_X = val_df.drop(columns=target)
    val_df_y = val_df[[target]]

    print(f"""
    dev_df_X.shape: {dev_df_X.shape}
    dev_df_y.shape: {dev_df_y.shape}

    val_df_X.shape: {val_df_X.shape}
    val_df_y.shape: {val_df_y.shape}
    """)

    return dev_df_X, dev_df_y, val_df_X, val_df_y


# Time-based hold-out: partitions dated strictly before the split date form the
# development set, partitions on or after it are kept for validation.
VALIDATION_START_DATE = "2019-03-28"
is_dev_partition = df["pk_partition"] < VALIDATION_START_DATE

dev_df_X, dev_df_y, val_df_X, val_df_y = split_dataframe(
    dataframe=df,
    split_point_condition=is_dev_partition,
    exclude_columns=["pk_cid", "pk_partition", "entry_date"],
    target=TARGET,
)

dev_df.shape: 4644039,72
val_df.shape: 1318799,72

    dev_df_X.shape: (4644039, 71)
    dev_df_y.shape: (4644039, 1)

    val_df_X.shape: (1318799, 71)
    val_df_y.shape: (1318799, 1)
    


In [26]:
# Preview the development-set feature frame.
dev_df_X

Unnamed: 0,active_customer,segment,region_code,gender,age,deceased,short_term_deposit,loans,mortgage,funds,...,country_id_PL,country_id_PT,country_id_QA,country_id_RO,country_id_RU,country_id_SA,country_id_SE,country_id_SN,country_id_US,country_id_VE
0,1,0,28,1,59,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,28,1,59,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,28,1,62,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,28,1,62,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,28,1,62,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5935183,0,0,29,0,31,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5935187,0,0,8,0,26,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5935191,0,0,35,0,59,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5935195,0,0,46,0,45,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# Stratified 80/20 random split of the development set, with a fixed seed.
# NOTE(review): X_test / y_test are not used by the later cells visible here;
# the model is scored on val_df_X / val_df_y instead — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    dev_df_X, dev_df_y, test_size=0.2, random_state=13, stratify=dev_df_y
)
# Leading and trailing empty strings reproduce the blank lines around the report.
print(
    "",
    f"X_train.shape: {X_train.shape}",
    f"X_test.shape: {X_test.shape}",
    f"y_train.shape: {y_train.shape}",
    f"y_test.shape: {y_test.shape}",
    "",
    sep="\n",
)


X_train.shape: (3715231, 71)
X_test.shape: (928808, 71)
y_train.shape: (3715231, 1)
y_test.shape: (928808, 1)



In [28]:
# Baseline decision tree with library-default hyper-parameters.
# random_state is pinned (same seed as the train/test split) because the
# estimator permutes features at every split; without a fixed seed the fitted
# tree, and therefore the reported score, can change between runs.
tree_one = tree.DecisionTreeClassifier(random_state=13)
tree_one = tree_one.fit(X_train, y_train)

In [31]:
# Score the fitted tree on the validation frames produced by split_dataframe.
tree_one_accuracy = round(tree_one.score(val_df_X, val_df_y), 4)
print(f"Accuracy: {tree_one_accuracy:.4f}")

Accuracy: 0.9539


In [None]:
from io import StringIO
from IPython.display import Image, display
import pydotplus

# Only the top levels of the tree are drawn: tree_one is grown without a depth
# limit, so rendering every node through graphviz is impractical.
# Set to None to export the complete tree.
EXPORT_MAX_DEPTH = 3

out = StringIO()
tree.export_graphviz(
    tree_one,
    out_file=out,
    max_depth=EXPORT_MAX_DEPTH,
    # Label nodes with column names instead of positional X[i] indices.
    feature_names=list(X_train.columns),
)

graph = pydotplus.graph_from_dot_data(out.getvalue())
graph.write_png('classification_easy_money.png')