## EDA

In [None]:
import os
import json
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
%matplotlib inline
from plotly import tools
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go

from sklearn import model_selection, preprocessing, metrics
import lightgbm as lgb

pd.options.mode.chained_assignment = None
pd.options.display.max_columns = 999

In [None]:
# Load the train and test card tables; "first_active_month" is parsed as a
# datetime so the .dt accessor can be used on it later.
date_columns = ["first_active_month"]
train_df = pd.read_csv("../input/train.csv", parse_dates=date_columns)
test_df = pd.read_csv("../input/test.csv", parse_dates=date_columns)

# Report the (rows, columns) shape of each split.
print("Number of rows and columns in train set : ", train_df.shape)
print("Number of rows and columns in test set : ", test_df.shape)

In [None]:
# Display the first two rows of the training table as a quick sanity check.
train_df.head(n=2)

In [None]:
# Name of the column holding the value to be predicted.
target_col = "target"

# Plot every target value in ascending order against its rank, which makes
# extreme values at either end of the distribution easy to spot.
sorted_scores = np.sort(train_df[target_col].values)
rank_positions = range(train_df.shape[0])

plt.figure(figsize=(8, 6))
plt.scatter(rank_positions, sorted_scores)
plt.xlabel("index", fontsize=12)
plt.ylabel("Loyalty Score", fontsize=12)
plt.show()

In [None]:
# Histogram (50 bins, no KDE overlay) of the raw target values.
score_values = train_df[target_col].values

plt.figure(figsize=(12, 8))
sns.distplot(score_values, bins=50, kde=False, color="red")
plt.title("Histogram of Loyalty score")
plt.xlabel("Loyalty score", fontsize=12)
plt.show()

In [None]:
# Count the training rows whose target value is below -30.
is_below_cutoff = train_df[target_col] < -30
is_below_cutoff.sum()

In [None]:
# Bar chart of how many cards became active in each month, drawn once for the
# train set and once for the test set so the two distributions can be compared.
for split_df, plot_title in (
    (train_df, "First active month count in train set"),
    (test_df, "First active month count in test set"),
):
    # Number of cards per activation date, ordered chronologically.
    cnt_srs = split_df["first_active_month"].dt.date.value_counts().sort_index()

    plt.figure(figsize=(14, 6))
    # x/y are passed by keyword: seaborn >= 0.12 rejects positional data
    # arguments, while the keyword form works on older releases too.
    sns.barplot(x=cnt_srs.index, y=cnt_srs.values, alpha=0.8, color="green")
    plt.xticks(rotation="vertical")
    plt.xlabel("First active month", fontsize=12)
    plt.ylabel("Number of cards", fontsize=12)
    plt.title(plot_title)
    plt.show()

In [None]:
# Violin plot of the target against each of the three anonymised card
# features. The plots are generated in a loop so that every axis label and
# title is derived from the same feature number and cannot drift out of sync
# (the Feature 2 plot previously carried the Feature 1 title).
for feature_no in (1, 2, 3):
    plt.figure(figsize=(8, 4))
    sns.violinplot(x="feature_%d" % feature_no, y=target_col, data=train_df)
    plt.xticks(rotation="vertical")
    plt.xlabel("Feature %d" % feature_no, fontsize=12)
    plt.ylabel("Loyalty score", fontsize=12)
    plt.title("Feature %d distribution" % feature_no)
    plt.show()

In [None]:
# Load the historical transactions table and display its first five rows.
hist_df = pd.read_csv(filepath_or_buffer="../input/historical_transactions.csv")
hist_df.head(n=5)

In [None]:
# Count the historical transactions recorded for each card.
hist_counts = hist_df.groupby("card_id")["purchase_amount"].size()
gby = hist_counts.reset_index()
gby.columns = ["card_id", "num_hist_transactions"]

# Attach the count to both splits; a left join keeps every card row even
# when the card has no matching history.
train_df = pd.merge(left=train_df, right=gby, on="card_id", how="left")
test_df = pd.merge(left=test_df, right=gby, on="card_id", how="left")

In [None]:
# Mean target value for each distinct number of historical transactions,
# ordered by that number.
cnt_srs = train_df.groupby("num_hist_transactions")[target_col].mean().sort_index()
# Trim the tail of the ordered series with a [:-50] slice before plotting.
cnt_srs = cnt_srs[:-50]

def scatter_plot(cnt_srs, color):
    """Build a Plotly scatter trace from a pandas Series.

    The series index supplies the x values and the series values supply the
    y values; both are passed to Plotly in reversed order. The trace is
    excluded from the legend and its markers use ``color``.

    Args:
        cnt_srs: Series whose index/values are plotted.
        color: Marker colour handed to Plotly.

    Returns:
        A ``go.Scatter`` trace.
    """
    reversed_x = cnt_srs.index[::-1]
    reversed_y = cnt_srs.values[::-1]
    marker_style = {"color": color}
    return go.Scatter(
        x=reversed_x,
        y=reversed_y,
        showlegend=False,
        marker=marker_style,
    )

# Render the per-count mean target as an interactive Plotly figure.
trace = scatter_plot(cnt_srs, "orange")
data = [trace]
layout = {"title": "Loyalty score by Number of historical transactions"}
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename="Histtranscnt")

In [None]:
# Bucket the per-card historical transaction count into hand-picked ranges
# and compare the target distribution across buckets.
bins = [0, 10, 20, 30, 40, 50, 75, 100, 150, 200, 500, 10000]
train_df["binned_num_hist_transactions"] = pd.cut(train_df["num_hist_transactions"], bins)
# Mean target per bucket (kept for inspection; the plot below uses train_df).
cnt_srs = train_df.groupby("binned_num_hist_transactions")[target_col].mean()

plt.figure(figsize=(12, 8))
# showfliers=False hides outlier points so the boxes themselves stay readable.
sns.boxplot(x="binned_num_hist_transactions", y=target_col, data=train_df, showfliers=False)
plt.xticks(rotation="vertical")
plt.xlabel("binned_num_hist_transactions", fontsize=12)
plt.ylabel("Loyalty score", fontsize=12)
plt.title("binned_num_hist_transactions distribution")
# plt.show must be called; the bare attribute reference was a no-op.
plt.show()

In [None]:
# Per-card summary statistics of the historical purchase amounts.
hist_stat_names = ["sum", "mean", "std", "min", "max"]
gdf = hist_df.groupby("card_id")["purchase_amount"].agg(hist_stat_names).reset_index()
gdf.columns = ["card_id"] + [stat + "_hist_trans" for stat in hist_stat_names]

# Left-join the statistics onto both splits so every card row is retained.
train_df = pd.merge(left=train_df, right=gdf, on="card_id", how="left")
test_df = pd.merge(left=test_df, right=gdf, on="card_id", how="left")

In [None]:
# Decile edges of the per-card historical purchase sum. nanpercentile is used
# because the column comes from a left merge and may contain NaN, which would
# turn every edge returned by np.percentile into NaN.
bins = np.nanpercentile(train_df["sum_hist_trans"], range(0, 101, 10))
# include_lowest=True closes the first interval on the left so the card(s)
# sitting exactly on the minimum edge are binned instead of becoming NaN.
train_df["binned_sum_hist_trans"] = pd.cut(
    train_df["sum_hist_trans"], bins, include_lowest=True
)

plt.figure(figsize=(12, 8))
# showfliers=False hides outlier points so the boxes themselves stay readable.
sns.boxplot(x="binned_sum_hist_trans", y=target_col, data=train_df, showfliers=False)
plt.xticks(rotation="vertical")
plt.xlabel("binned_sum_hist_trans", fontsize=12)
plt.ylabel("Loyalty score", fontsize=12)
plt.title("Sum of historical transaction value (Binned) distribution")
plt.show()

In [None]:
# Decile edges of the per-card historical purchase mean. nanpercentile is used
# because the column comes from a left merge and may contain NaN, which would
# turn every edge returned by np.percentile into NaN.
bins = np.nanpercentile(train_df["mean_hist_trans"], range(0, 101, 10))
# include_lowest=True closes the first interval on the left so the card(s)
# sitting exactly on the minimum edge are binned instead of becoming NaN.
train_df["binned_mean_hist_trans"] = pd.cut(
    train_df["mean_hist_trans"], bins, include_lowest=True
)

plt.figure(figsize=(12, 8))
# showfliers=False hides outlier points so the boxes themselves stay readable.
sns.boxplot(x="binned_mean_hist_trans", y=target_col, data=train_df, showfliers=False)
plt.xticks(rotation="vertical")
plt.xlabel("Binned Mean Historical Transactions", fontsize=12)
plt.ylabel("Loyalty score", fontsize=12)
plt.title("Mean of historical transaction value (Binned) distribution")
plt.show()

In [None]:
# Load the new-merchant transactions table and display its first five rows.
new_trans_df = pd.read_csv(filepath_or_buffer="../input/new_merchant_transactions.csv")
new_trans_df.head(n=5)

In [None]:
# Count the new-merchant transactions recorded for each card.
merch_counts = new_trans_df.groupby("card_id")["purchase_amount"].size()
gby = merch_counts.reset_index()
gby.columns = ["card_id", "num_merch_transactions"]

# Attach the count to both splits; a left join keeps every card row even
# when the card has no new-merchant transactions.
train_df = pd.merge(left=train_df, right=gby, on="card_id", how="left")
test_df = pd.merge(left=test_df, right=gby, on="card_id", how="left")

In [None]:
# Bucket the per-card new-merchant transaction count into hand-picked ranges.
bins = [0, 10, 20, 30, 40, 50, 75, 10000]
merch_count_buckets = pd.cut(train_df["num_merch_transactions"], bins)
train_df["binned_num_merch_transactions"] = merch_count_buckets
# Mean target per bucket (not used by the plot below).
cnt_srs = train_df.groupby("binned_num_merch_transactions")[target_col].mean()

# Box plot of the target per bucket, with outlier points hidden.
plt.figure(figsize=(12, 8))
sns.boxplot(
    x="binned_num_merch_transactions",
    y=target_col,
    data=train_df,
    showfliers=False,
)
plt.xticks(rotation="vertical")
plt.xlabel("binned_num_merch_transactions", fontsize=12)
plt.ylabel("Loyalty score", fontsize=12)
plt.title("Number of new merchants transaction (Binned) distribution")
plt.show()

In [None]:
# Per-card summary statistics of the new-merchant purchase amounts.
merch_stat_names = ["sum", "mean", "std", "min", "max"]
gby = new_trans_df.groupby("card_id")["purchase_amount"].agg(merch_stat_names).reset_index()
gby.columns = ["card_id"] + [stat + "_merch_trans" for stat in merch_stat_names]

# Left-join the statistics onto both splits so every card row is retained.
train_df = pd.merge(left=train_df, right=gby, on="card_id", how="left")
test_df = pd.merge(left=test_df, right=gby, on="card_id", how="left")

In [None]:
# Decile edges of the per-card new-merchant purchase mean. nanpercentile
# ignores the NaN entries the left merge can leave for cards without
# new-merchant transactions.
bins = np.nanpercentile(train_df["mean_merch_trans"], range(0, 101, 10))
# include_lowest=True closes the first interval on the left so the card(s)
# sitting exactly on the minimum edge are binned instead of becoming NaN.
train_df["binned_mean_merch_trans"] = pd.cut(
    train_df["mean_merch_trans"], bins, include_lowest=True
)

plt.figure(figsize=(12, 8))
# showfliers=False hides outlier points so the boxes themselves stay readable.
sns.boxplot(x="binned_mean_merch_trans", y=target_col, data=train_df, showfliers=False)
plt.xticks(rotation="vertical")
plt.xlabel("binned mean of new merchant transactions", fontsize=12)
plt.ylabel("Loyalty score", fontsize=12)
plt.title("Mean of New merchants transaction value (Binned) distribution")
plt.show()