In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
import os
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn

# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries used below — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [4]:
# Directory holding the competition files. Both paths below are derived from it
# so the dataset location only has to be changed in one place.
INPUT_DIR = "../input/amex-default-prediction"

# CSV with the per-customer feature records.
TRAIN_DATA_PATH = INPUT_DIR + "/train_data.csv"
# CSV with the label table.
TRAIN_LABELS_PATH = INPUT_DIR + "/train_labels.csv"

In [5]:
# Number of CSV rows delivered per chunk.
chunksize = 13000

# With `chunksize` set, read_csv hands back a reader that yields DataFrames
# chunk by chunk instead of loading the whole file at once.
train_df_iter = pd.read_csv(
    filepath_or_buffer=TRAIN_DATA_PATH,
    chunksize=chunksize,
)

Since the dataset is too large to load into memory at once, I read it in chunks of 13,000 rows. 

In [6]:
# Take only the first chunk for exploration. A full pass over the data would
# instead loop over the reader itself:
#     for chunk in train_df_iter:
#         process(chunk)
# NOTE: this consumes a chunk from the reader, so re-running the cell without
# re-creating `train_df_iter` returns the *next* chunk, not the first one.
train_df_example = next(train_df_iter)

In [7]:
# Load the label table; later cells filter it by "customer_ID" and merge it
# onto the feature rows.
train_labels_df = pd.read_csv(TRAIN_LABELS_PATH)


In [12]:
# customer_ID used by the following cells to inspect one customer's records and label.
example_customer_id = "000f1c950ae4e388f44e9ba96dd6334dfa85d8be0416d9d0d30228301f2e4cc4"

In [14]:
# Boolean mask over the first chunk selecting the example customer's rows.
is_example_customer = train_df_example["customer_ID"] == example_customer_id
customer_data_ex = train_df_example.loc[is_example_customer]

I am going to look at the data pertaining to one customer. 

In [15]:
# Display the example customer's rows taken from the first chunk.
customer_data_ex

Features are anonymized and normalized, and fall into the following general categories:

D_* = Delinquency variables
S_* = Spend variables
P_* = Payment variables
B_* = Balance variables
R_* = Risk variables

In [16]:
# Column names of the example frame as a plain list, reused below for
# filtering and sorting.
all_cols = customer_data_ex.columns.tolist()
print(all_cols)

In [38]:
# Keep only the columns whose name carries the "B_" prefix.
b_cols = [name for name in all_cols if name.startswith("B_")]
print(b_cols)

In [17]:
# Show the label row(s) belonging to the example customer.
train_labels_df.loc[train_labels_df["customer_ID"] == example_customer_id]

According to the labels, this same customer will default. 

In [20]:
# IDs of the first ten customers in the label table ...
ex_customer_ids = train_labels_df["customer_ID"].iloc[:10].tolist()
# ... and every row of the first chunk that belongs to one of them.
in_sample = train_df_example["customer_ID"].isin(ex_customer_ids)
ex_customer_data = train_df_example.loc[in_sample]

In [21]:
# Attach the labels of the first ten customers and parse S_2 into datetimes.
# NOTE: this rebinds `ex_customer_data` from its own previous value, so running
# the cell twice merges the labels a second time.
ex_customer_data = (
    ex_customer_data
    .merge(train_labels_df.iloc[:10], on="customer_ID")
    .assign(S_2=lambda frame: pd.to_datetime(frame["S_2"]))
)

In [22]:
ex_customer_data.head()

In [23]:
# P_2 against S_2 for the sampled customers: one line per customer, each
# labelled with that customer's target value.
fig, ax = plt.subplots(figsize=(16, 5))
for _, customer_rows in ex_customer_data.groupby("customer_ID"):
    sn.lineplot(
        data=customer_rows,
        x="S_2",
        y="P_2",
        label=customer_rows["target"].max(),
        ax=ax,
    )
ax.set_title("P_2", fontsize=16)
ax.set_xlabel("S_2", fontsize=14)
ax.set_ylabel("P_2", fontsize=14);

In [24]:
# B_1 against S_2 for the sampled customers: one line per customer, each
# labelled with that customer's target value.
fig, ax = plt.subplots(figsize=(16, 5))
for _, customer_rows in ex_customer_data.groupby("customer_ID"):
    sn.lineplot(
        data=customer_rows,
        x="S_2",
        y="B_1",
        label=customer_rows["target"].max(),
        ax=ax,
    )
ax.set_title("B_1", fontsize=16)
ax.set_xlabel("S_2", fontsize=14)
ax.set_ylabel("B_1", fontsize=14);

In [25]:

# B_2 against S_2 for the sampled customers: one line per customer, each
# labelled with that customer's target value.
fig, ax = plt.subplots(figsize=(16, 5))
for _, customer_rows in ex_customer_data.groupby("customer_ID"):
    sn.lineplot(
        data=customer_rows,
        x="S_2",
        y="B_2",
        label=customer_rows["target"].max(),
        ax=ax,
    )
ax.set_title("B_2", fontsize=16)
ax.set_xlabel("S_2", fontsize=14)
ax.set_ylabel("B_2", fontsize=14);

In [26]:
# Rebuild the sample from the first 1000 customers of the label table:
# select their rows from the first chunk, attach the labels, parse S_2.
first_labels = train_labels_df.iloc[:1000]
ex_customer_ids = first_labels["customer_ID"].tolist()
ex_customer_data = (
    train_df_example[train_df_example["customer_ID"].isin(ex_customer_ids)]
    .merge(first_labels, on="customer_ID")
    .assign(S_2=lambda frame: pd.to_datetime(frame["S_2"]))
)

ex_customer_data.shape

In [27]:
# Collapse to one target value per customer (max over the customer's rows),
# then count customers per class.
target_per_customer = ex_customer_data.groupby("customer_ID")["target"].max()

fig, ax = plt.subplots(figsize=(16, 5))
sn.countplot(y=target_per_customer, ax=ax)
ax.set_title("Class distribution", fontsize=16)
ax.set_xlabel("count", fontsize=14)
ax.set_ylabel("target", fontsize=14);

Looking at 1000 customers, 735 have target 0 (no default) and 235 have target 1 (default). 

In [28]:
# Number of rows each customer has in the sample, then how many customers
# share each row count.
records_per_customer = ex_customer_data.groupby("customer_ID")["target"].count()

fig, ax = plt.subplots(figsize=(16, 5))
sn.countplot(y=records_per_customer, ax=ax)
ax.set_title("Distribution of the number of records for the client", fontsize=16)
ax.set_xlabel("count", fontsize=14)
ax.set_ylabel("n_records", fontsize=14);

In [29]:
# Histogram of record timestamps: the x axis is the S_2 value of each record,
# the y axis is the number of records falling into each of the 100 bins.
plt.figure(figsize=(16, 5))
sn.histplot(data=ex_customer_data, x="S_2", bins=100)
plt.title("Distribution of records by time", fontsize=16)
# The x axis carries S_2, not a count; the count of records is on the y axis.
plt.xlabel("S_2", fontsize=14)
plt.ylabel("n_records", fontsize=14);

In [32]:
def sort_f(x):
    """Sort key ordering column names by prefix, then by numeric suffix.

    Names of the form "<prefix>_<integer>" (e.g. "B_10") map to
    ("<prefix>", <integer>), so "B_2" sorts before "B_10". Anything else --
    a non-string, a name that does not split into exactly two parts on "_",
    or a non-numeric suffix such as "customer_ID" -- maps to the fallback
    key ("0", 0).

    Returns:
        A (str, int) tuple usable as a `key=` result for sorting.
    """
    try:
        prefix, number = x.split("_")
        return prefix, int(number)
    except (AttributeError, TypeError, ValueError):
        # AttributeError / TypeError: x is not a str-like object with .split("_").
        # ValueError: wrong number of parts, or the suffix is not an integer.
        # Only these are caught, so KeyboardInterrupt and real bugs still surface.
        return "0", 0

# Order the columns by (prefix, numeric suffix) so that e.g. B_2 comes before
# B_10; names sort_f cannot parse get the key ("0", 0) and sort to the front.
all_cols = sorted(all_cols, key=sort_f)

In [34]:
# Columns treated as categorical below: they are drawn as count plots and
# skipped by the histogram loop.
categorical_cols = [
    'B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 
    'D_126', 'D_63', 'D_64', 'D_66', 'D_68',
]

In [35]:
# One count plot per categorical column, split by target, laid out with four
# panels per figure.
panels_per_fig = 4
for ind, col in enumerate(categorical_cols):
    slot = ind % panels_per_fig
    if slot == 0:
        # Start a new figure every `panels_per_fig` columns.
        plt.figure(figsize=(16, 3))
    plt.subplot(1, panels_per_fig, slot + 1)

    sn.countplot(data=ex_customer_data, x=col, hue="target")
    plt.ylabel("")

    if slot == panels_per_fig - 1:
        # The figure is full: render it before starting the next one.
        plt.show()

# The loop only shows completely filled figures; flush a trailing partially
# filled one so the last columns are rendered explicitly as well.
if len(categorical_cols) % panels_per_fig != 0:
    plt.show()

In [36]:
# Columns that do not get a histogram: S_2, the identifier, the label, and the
# categorical columns (plotted separately as count plots). Built once as a set
# instead of re-creating the list on every loop iteration.
skipped_cols = {"S_2", "customer_ID", "target", *categorical_cols}
hist_cols = [col for col in all_cols if col not in skipped_cols]

# One histogram per remaining column, split by target, four panels per figure.
panels_per_fig = 4
for ind, col in enumerate(hist_cols):
    slot = ind % panels_per_fig
    if slot == 0:
        # Start a new figure every `panels_per_fig` columns.
        plt.figure(figsize=(16, 4))
    plt.subplot(1, panels_per_fig, slot + 1)

    sn.histplot(data=ex_customer_data, x=col, hue="target", bins=20)
    plt.ylabel("")

    if slot == panels_per_fig - 1:
        # The figure is full: render it before starting the next one.
        plt.show()

# The loop only shows completely filled figures; flush a trailing partially
# filled one so the last columns are rendered explicitly as well.
if len(hist_cols) % panels_per_fig != 0:
    plt.show()

In [39]:
# Summary statistics of the first ten B_* columns, restricted to rows whose
# target is 0.
is_target_zero = ex_customer_data["target"] == 0
ex_customer_data.loc[is_target_zero, b_cols[:10]].describe()