In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
%matplotlib inline

In [None]:
# Explicit dtypes handed to pd.read_csv so the frames are loaded with compact
# numeric types. Columns not listed here are left to pandas' inference.

# train.csv: the three feature columns plus the regression target.
train_dtypes = {
    "feature_1": np.int32,
    "feature_2": np.int16,
    "feature_3": np.int16,
    "target": np.float64,
}

# test.csv: same feature columns as train (no target). feature_1 uses the
# same width as in train_dtypes so the two frames agree on dtype.
test_dtypes = {
    "feature_1": np.int32,
    "feature_2": np.int16,
    "feature_3": np.int16,
}

# merchants.csv
merch_dtypes = {
    'merchant_group_id': np.int32,
    'merchant_category_id': np.int16,
    'subsector_id': np.int16,
    'active_months_lag3': np.int16,
    'active_months_lag6': np.int16,
    'avg_sales_lag3': np.float64,
    'avg_sales_lag6': np.float64,
    'avg_sales_lag12': np.float64,
    'active_months_lag12': np.int16,
    'city_id': np.int16,
    'state_id': np.int16,
    # float64 to match trans_dtypes['category_2']; float16 columns are poorly
    # supported by pandas (e.g. they cannot back an Index in pandas 2.x).
    'category_2': np.float64,
}

# historical_transactions.csv and new_merchant_transactions.csv share a schema.
trans_dtypes = {
    'city_id': np.int16,
    'installments': np.int16,
    'merchant_category_id': np.int16,
    'month_lag': np.int16,
    'category_2': np.float64,
    'state_id': np.int16,
    'subsector_id': np.int16,
}

In [None]:
# All raw CSVs live in one directory; keep it in a single place so the
# location can be changed without touching every read_csv call.
DATA_DIR = Path("../data")

# Load each table with its explicit dtype mapping. Both transaction files
# are read with the same trans_dtypes mapping.
train = pd.read_csv(DATA_DIR / "train.csv", dtype=train_dtypes)
test = pd.read_csv(DATA_DIR / "test.csv", dtype=test_dtypes)
merch = pd.read_csv(DATA_DIR / "merchants.csv", dtype=merch_dtypes)
hist_trans = pd.read_csv(DATA_DIR / "historical_transactions.csv", dtype=trans_dtypes)
new_trans = pd.read_csv(DATA_DIR / "new_merchant_transactions.csv", dtype=trans_dtypes)

In [None]:
# Convert these merchant columns to pandas' 'category' dtype, one at a time.
merch_category_columns = [
    'category_1',
    'category_4',
    'most_recent_purchases_range',
    'most_recent_sales_range',
]
for col in merch_category_columns:
    merch[col] = merch[col].astype('category')

In [None]:
def column_match(df1: pd.DataFrame, df2: pd.DataFrame) -> np.ndarray:
    """Return the column labels that are present in both frames.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames whose column indexes are compared.

    Returns
    -------
    numpy.ndarray
        The labels produced by ``df1.columns.intersection(df2.columns)``;
        empty when the frames share no column.
    """
    shared_columns = df1.columns.intersection(df2.columns)
    return shared_columns.values

In [None]:
# Print the (rows, columns) shape of every loaded table, in load order.
for frame in (train, test, merch, hist_trans, new_trans):
    print(frame.shape)

In [None]:
# Missing-value count for each column of train.
train.isnull().sum()

In [None]:
# Missing-value count for each column of test.
test.isnull().sum()

In [None]:
# card_id of the first test row whose first_active_month is missing.
# Raises IndexError when no such row exists.
missing_card_no = test.loc[test['first_active_month'].isna(), 'card_id'].iloc[0]

In [None]:
# Columns shared by test and hist_trans. Printed explicitly because a bare
# expression that is not the last statement of a cell is never displayed.
print(column_match(test, hist_trans))

# Earliest purchase_date recorded in hist_trans for the card whose
# first_active_month is missing in test (a one-row Series).
hist_trans.query('card_id == @missing_card_no').sort_values(by='purchase_date').head(1).purchase_date

In [None]:
# Impute the missing first_active_month only. A frame-wide fillna would also
# write this date string into any NaN of any other column.
# NOTE(review): '2017-03' is presumably the month of the earliest purchase
# looked up for the affected card in hist_trans — confirm.
test['first_active_month'] = test['first_active_month'].fillna('2017-03')

In [None]:
# Peek at the first two rows of train.
train.iloc[:2]

In [None]:
# Horizontal bars for the ten most frequent first_active_month values in train.
train['first_active_month'].value_counts().head(10).plot.barh()

In [None]:
# Line plot of the number of train cards per first_active_month, ordered by month label.
train['first_active_month'].value_counts().sort_index().plot.line(
    title='Number of Cards by First Active Month'
)

In [None]:
# (smallest, largest) first_active_month value in train.
(train['first_active_month'].min(), train['first_active_month'].max())

In [None]:
# (smallest, largest) first_active_month value in test.
(test['first_active_month'].min(), test['first_active_month'].max())

In [None]:
# True when the number of distinct card_id values equals the number of train rows.
train['card_id'].nunique() == len(train)

In [None]:
# Bar chart of how often each feature_1 value occurs in train.
train['feature_1'].value_counts().sort_index().plot.bar(title='Count of Feature 1')

In [None]:
# Bar chart of how often each feature_2 value occurs in train.
train['feature_2'].value_counts().sort_index().plot.bar(title='Count of Feature 2')

In [None]:
# Bar chart of how often each feature_3 value occurs in train.
train['feature_3'].value_counts().sort_index().plot.bar(title='Count of Feature 3')

In [None]:
# Non-null target count for every (feature_1, feature_2, feature_3)
# combination in train, drawn as horizontal bars.
(
    train.groupby(['feature_1', 'feature_2', 'feature_3'])['target']
    .count()
    .plot.barh(title='Count of Groups of Features')
)

In [None]:

# Mean target for each feature_1 value.
train.groupby('feature_1')['target'].mean().plot.bar()

In [None]:
# Mean target for each feature_2 value.
train.groupby('feature_2')['target'].mean().plot.bar()

In [None]:
# Mean target for each feature_3 value.
train.groupby('feature_3')['target'].mean().plot.bar()

In [None]:
# Do train and test contain exactly the same set of
# (feature_1, feature_2, feature_3) combinations?
feature_cols = ['feature_1', 'feature_2', 'feature_3']
train.groupby(feature_cols).groups.keys() == test.groupby(feature_cols).groups.keys()

In [None]:
# Distribution of the target over its full range.
# NOTE(review): seaborn documents distplot as deprecated in recent releases;
# confirm the installed version before relying on it.
g = sns.distplot(train['target'])

In [None]:
# Number of train rows whose target exceeds 10.
len(train[train['target'] > 10])

In [None]:
# Number of train rows whose target is below -30.
len(train[train['target'] < -30])

In [None]:
# Same target distribution, with the x-axis restricted to [-6, 6].
# NOTE(review): seaborn documents distplot as deprecated in recent releases;
# confirm the installed version before relying on it.
g = sns.distplot(train['target'])
g.set_xlim(left=-6, right=6)

In [None]:
# Average target value over all train rows.
train['target'].mean()

In [None]:
# Columns that train and hist_trans have in common.
column_match(df1=train, df2=hist_trans)

In [None]:
# Columns that merch and hist_trans have in common.
column_match(df1=merch, df2=hist_trans)

In [None]:
# First three transactions, transposed so each column becomes a row.
hist_trans.head(3).T

In [None]:
# Line plot of the number of hist_trans rows per card_id, most frequent first.
hist_trans['card_id'].value_counts().plot.line()

In [None]:
# Number of distinct cards appearing in hist_trans.
hist_trans['card_id'].nunique()

In [None]:
# Group historical transactions by card; trans_grp is reused by later cells.
trans_grp = hist_trans.groupby(by='card_id')
# Line plot of the mean purchase_amount per card.
trans_grp['purchase_amount'].mean().plot.line()

In [None]:
# Positional indices of the cards whose mean purchase_amount exceeds 100000.
np.where(trans_grp['purchase_amount'].mean().gt(100000))

In [None]:
# Per-card mean purchase_amount, keeping only the cards above 100000.
# The callable passed to .loc receives the Series of means being filtered.
trans_grp['purchase_amount'].mean().loc[lambda card_means: card_means > 100000]

In [None]:
# Total purchase_amount in hist_trans for one hard-coded card.
hist_trans.loc[hist_trans['card_id'] == "C_ID_3b6ac8e52d", 'purchase_amount'].sum()