Launch a Dask backend for parallel computing:

In [2]:
from dask.distributed import Client

# Keep a handle on the client so it can be inspected and shut down later with
# `client.close()`; the bare name on the last line still renders the
# scheduler/dashboard summary as the cell output.
client = Client()
client

0,1
Client  Scheduler: tcp://127.0.0.1:34609  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 8  Cores: 8  Memory: 16.66 GB


Change the current working directory (CWD) to the project root to avoid `..` in file paths:

In [3]:
import os

# Resolve the project root from the ELO_PROJECT_ROOT environment variable so the
# notebook can run on other machines; the original location remains the default.
PROJECT_ROOT = os.environ.get("ELO_PROJECT_ROOT",
                              "/home/data-scientist/elo-merchant-category-recommendation")
os.chdir(PROJECT_ROOT)

Import libraries required across the whole notebook:

In [4]:
import dask.dataframe as dd

Load the data:

In [None]:
# Empty placeholder mapping; a later cell rebinds `dtype` to the full
# column-type mapping passed to the CSV reader.
dtype = {}

In [9]:
import datetime
import numpy as np
import pandas as pd

In [14]:
%%time

# Explicit category sets for category_2 and category_3
# (-1 is presumably the missing-value marker written by the preprocessing
# step -- TODO confirm).
category_2_dtype = pd.api.types.CategoricalDtype([-1, 1, 2, 3, 4, 5])
category_3_dtype = pd.api.types.CategoricalDtype([-1, 1, 2, 3])

# Identifier-like columns are read as plain categoricals ...
dtype = {column: "category"
         for column in ("card_id", "city_id", "state_id", "merchant_id", "subsector_id")}
# ... while the two category columns use the explicit dtypes defined above.
dtype["category_2"] = category_2_dtype
dtype["category_3"] = category_3_dtype

# Reader options shared by both transaction sets.
kwargs = {"parse_dates": ["purchase_date"], "dtype": dtype}

csv_globs = {"old": "data/v0-preprocessed/old/chunk.*.csv",
             "new": "data/v0-preprocessed/new/chunk.*.csv"}

# Open each set of chunked CSV files as one Dask dataframe, keyed by "old" / "new".
transactions = {name: dd.read_csv(pattern, **kwargs)
                for name, pattern in csv_globs.items()}

CPU times: user 89 ms, sys: 1.31 ms, total: 90.3 ms
Wall time: 85.5 ms


In [15]:
%%time

# Take the reference timestamp once so the "old" and "new" frames are measured
# against the same instant (it was previously re-read on every iteration).
# NOTE(review): month_diff still depends on the day the notebook is run;
# consider pinning a fixed reference date for reproducible features.
reference_date = datetime.datetime.today()

# Iterate over a snapshot because the dict entries are rebound inside the loop.
for key, df in list(transactions.items()):
    # Whole 30-day periods elapsed since the purchase, shifted by month_lag.
    df["month_diff"] = df.month_lag + (reference_date - df.purchase_date).dt.days // 30
    df["purchase_month"] = df.purchase_date.dt.month  # TODO: add_datepart (?)

    # Expand category_2 / category_3 into dummy columns and store the result
    # back under the same key.
    # TODO: try to leave category_2 column instead of / along with dummies
    transactions[key] = dd.get_dummies(df, columns=["category_2", "category_3"])  # TODO: dummy_na=True (?)

CPU times: user 19.1 ms, sys: 10.3 ms, total: 29.4 ms
Wall time: 26.4 ms


In [7]:
%%time

# Flag every column of the "old" transactions that has at least one missing value.
old_null_flags = transactions["old"].isnull().any(axis=0).compute()

# Raise the row limit so the full per-column listing is rendered.
with pd.option_context("display.max_rows", 100):
    display(old_null_flags)

authorized_flag         False
card_id                 False
city_id                 False
category_1              False
installments            False
merchant_category_id    False
merchant_id              True
month_lag               False
purchase_amount         False
purchase_date           False
state_id                False
subsector_id            False
month_diff              False
purchase_month          False
category_2_-1           False
category_2_1            False
category_2_2            False
category_2_3            False
category_2_4            False
category_2_5            False
category_3_-1           False
category_3_1            False
category_3_2            False
category_3_3            False
dtype: bool

CPU times: user 4.98 s, sys: 370 ms, total: 5.35 s
Wall time: 37.4 s


In [18]:
%%time

# Per-card aggregation spec: transaction column -> list of aggregation functions.
agg = {
    # Count and share of authorized transactions
    "authorized_flag": ["sum", "mean"],

    # Count and share of transactions with positive category_1
    "category_1": ["sum", "mean"],
}

# Share of transactions per category_2 / category_3 dummy column
# (the "-1" dummy columns are not aggregated).
category_share_columns = ("category_2_1", "category_2_2", "category_2_3",
                          "category_2_4", "category_2_5",
                          "category_3_1", "category_3_2", "category_3_3")
agg.update((column, ["mean"]) for column in category_share_columns)

# Disabled aggregations, kept for reference:
#   city_id, state_id, subsector_id, merchant_id, merchant_category_id -> ["nunique"]
#   purchase_date -> [np.ptp, "max", "min"]
agg.update({
    "purchase_amount": ["mean", "max", "min", "std", "sum"],
    "purchase_month":  ["mean", "max", "min", "std"],
    "installments":    ["mean", "max", "min", "std", "sum"],
    "month_lag":       ["mean", "max", "min", "std"],
    "month_diff":      ["mean"],
})

# Preview only: aggregate the first transaction set per card, show it, and stop.
for df in transactions.values():
    tmp = df.groupby("card_id").agg(agg).compute()
    display(tmp)
    break

Unnamed: 0_level_0,authorized_flag,authorized_flag,category_1,category_1,category_2_1,category_2_2,category_2_3,category_2_4,category_2_5,category_3_1,...,installments,installments,installments,installments,installments,month_lag,month_lag,month_lag,month_lag,month_diff
Unnamed: 0_level_1,sum,mean,sum,mean,mean,mean,mean,mean,mean,mean,...,mean,max,min,std,sum,mean,max,min,std,mean
card_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
C_ID_0046db9f8a,142,0.946667,3,0.020000,0.846667,0.000000,0.000000,0.000000,0.133333,0.000000,...,1.700000,12,-1,1.748441,255,-4.880000,0,-13,3.563273,10.700000
C_ID_011b0d9794,27,1.000000,27,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,4.481481,12,1,3.030237,121,-4.148148,0,-13,2.983335,10.592593
C_ID_01904d743d,50,0.704225,0,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.985915,...,0.014085,1,0,0.118678,1,-4.056338,0,-7,2.020518,10.760563
C_ID_01b098ff01,128,0.703297,26,0.142857,0.851648,0.000000,0.000000,0.000000,0.005495,0.000000,...,1.346154,6,1,1.173351,245,-5.840659,0,-10,3.481902,11.906593
C_ID_0382b662f4,162,0.952941,7,0.041176,0.947059,0.000000,0.011765,0.000000,0.000000,0.017647,...,0.994118,2,0,0.171904,169,-5.852941,0,-10,4.211371,13.576471
C_ID_03d3f6c6f4,65,0.984848,47,0.712121,0.015152,0.000000,0.257576,0.000000,0.015152,0.000000,...,3.878788,12,-1,3.967232,256,-1.742424,0,-5,1.562154,11.681818
C_ID_058b254a5b,461,0.946612,57,0.117043,0.809035,0.000000,0.053388,0.002053,0.000000,0.000000,...,1.125257,12,-1,0.895070,548,-3.708419,0,-8,2.390222,10.544148
C_ID_05fce05831,105,0.867769,0,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.991736,...,0.008264,1,0,0.090909,1,-6.206612,0,-9,2.476279,14.876033
C_ID_062c6dda0f,48,0.923077,34,0.653846,0.096154,0.000000,0.250000,0.000000,0.000000,0.000000,...,1.403846,10,-1,1.912391,73,-6.288462,0,-13,5.438918,10.538462
C_ID_06b77830e2,292,0.979866,36,0.120805,0.855705,0.016779,0.003356,0.000000,0.003356,0.000000,...,1.597315,12,-1,1.915450,476,-3.828859,0,-12,2.513447,10.614094


CPU times: user 11.8 s, sys: 880 ms, total: 12.7 s
Wall time: 1min 47s


In [None]:
# Read feature_1..feature_3 as categoricals and parse first_active_month as a date.
data_dtype = dict(feature_1="category",
                  feature_2="category",
                  feature_3="category")

train_df = pd.read_csv("data/input/train.csv", dtype=data_dtype, parse_dates=["first_active_month"])
test_df  = pd.read_csv("data/input/test.csv",  dtype=data_dtype, parse_dates=["first_active_month"])

# Impute missing first_active_month values in the test set with the most
# frequent month across train and test combined.
train_test_first_active_month = pd.concat([train_df.first_active_month, test_df.first_active_month])
most_common_first_active_month = train_test_first_active_month.value_counts().index[0]

# Assign the filled column back explicitly: `fillna(..., inplace=True)` on the
# attribute-accessed column is chained assignment, which does not reliably
# update test_df and is a no-op under pandas copy-on-write.
test_df["first_active_month"] = test_df.first_active_month.fillna(most_common_first_active_month)

# NOTE(review): base_historical_transactions_df and
# purchase_amount_historical_transactions_df are not defined in the visible
# cells -- confirm where they are created before this cell runs.
train_df = (train_df
            .merge(base_historical_transactions_df,            how="left", on="card_id")
            .merge(purchase_amount_historical_transactions_df, how="left", on="card_id"))

test_df = (test_df
           .merge(base_historical_transactions_df,            how="left", on="card_id")
           .merge(purchase_amount_historical_transactions_df, how="left", on="card_id"))

# Every card must have matched both feature frames and carry no missing values.
assert not train_df.isnull().any().any(), "train_df contains missing values after the merges"
assert not test_df.isnull().any().any(), "test_df contains missing values after the merges"

display(train_df, test_df)

In [None]:
# Scratch check: cast feature_1 to int and aggregate it.
# NOTE(review): the selected frame only contains the column "feature_1", while
# the agg mapping is keyed by "feature_1_1" -- this looks like it raises a
# KeyError unless a dummy column of that name was intended; confirm the intent.
train_df[["feature_1"]].astype(int).agg({ "feature_1_1": "mean" })

In [None]:
# Elapsed time is measured from the latest first_active_month of the TRAINING
# set for both frames, so train and test share one reference date.
reference_month = train_df.first_active_month.max()

# Average Gregorian year / month lengths in days. These match the lengths NumPy
# uses for the "Y" and "M" timedelta units, which current pandas no longer
# accepts as divisors because they are not fixed durations.
DAYS_PER_YEAR = 365.2425
DAYS_PER_MONTH = DAYS_PER_YEAR / 12

for df in [train_df, test_df]:
    days_diff = (reference_month - df.first_active_month).dt.days
    # astype(int) truncates toward zero, i.e. counts completed years / months.
    df["years_since_first_active"]  = (days_diff / DAYS_PER_YEAR).astype(int)
    df["months_since_first_active"] = (days_diff / DAYS_PER_MONTH).astype(int)

display(train_df, test_df)

In [None]:
class AddYearAndMonthTransformer:
    """Replace ``first_active_month`` with separate year and month columns."""

    def call(self, df):
        """Return a new frame with the date column split into year and month.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame with a datetime-like ``first_active_month`` column.

        Returns
        -------
        pandas.DataFrame
            Copy of ``df`` with ``first_active_month_year`` and
            ``first_active_month_month`` added and ``first_active_month``
            removed. The input frame is left untouched, so the call can be
            repeated on the same object.
        """
        date_parts = {"first_active_month" + "_" + attr: getattr(df.first_active_month.dt, attr)
                      for attr in ["year", "month"]}

        # assign() builds a new frame instead of writing columns into the caller's one.
        return df.assign(**date_parts).drop("first_active_month", axis=1)
    
# Split first_active_month into year / month columns for both frames,
# train first and then test.
add_year_and_month_transformer = AddYearAndMonthTransformer()

train_df, test_df = [add_year_and_month_transformer.call(frame)
                     for frame in (train_df, test_df)]

display(train_df, test_df)

In [None]:
# Model inputs: every column except the card identifier (and, for train, the label).
non_feature_columns = ["card_id", "target"]
X_train = train_df.drop(columns=non_feature_columns)
y_train = train_df.target

X_test = test_df.drop(columns="card_id")

In [None]:
%%time

# NOTE(review): RandomForestRegressor, cross_val_score and SEED are not defined
# in the visible cells -- confirm they are imported / set earlier.
model = RandomForestRegressor(n_estimators=100, max_features="log2", max_depth=10, n_jobs=-1, random_state=SEED)

# The scorer yields negated MSE per fold; negate and take the root for per-fold RMSE.
scores = np.sqrt(-cross_val_score(model, X_train, y_train, scoring="neg_mean_squared_error", cv=5))

# Report the spread as the standard deviation, in the same units as the mean
# (it was previously squared, which printed the variance after "+/-").
print("CV: %.4f +/- %.4f" % (scores.mean(), scores.std()))

In [None]:
%%time

# Fit the model on the full training set.
model.fit(X_train, y_train)

In [None]:
# Actual vs. predicted target on the training set; labelled so the figure
# stands on its own when the notebook is skimmed.
train_predictions = model.predict(X_train)

plt.scatter(y_train, train_predictions, s=0.25)
plt.xlabel("actual target")
plt.ylabel("predicted target")
plt.title("Training set: actual vs. predicted");

In [None]:
%%time

# Predict the target for the test set. Despite its name, y_test holds model
# predictions, not ground-truth labels.
y_test = model.predict(X_test)

In [None]:
# Start from the sample submission and overwrite its target column with the
# predictions.
# NOTE(review): this relies on the test rows being in the same order as the
# sample submission rows -- confirm.
submission_df = pd.read_csv("data/input/sample_submission.csv")
submission_df["target"] = y_test
submission_df.head()

In [None]:
# True if any cell of the submission frame is missing.
submission_df.isnull().values.any()

In [None]:
# Write the submission to the current working directory, without the index column.
submission_df.to_csv("submission.csv", index=False)

In [None]:
# Show the first lines of the written file as a quick sanity check.
!head submission.csv

In [None]:
# Order features by ascending importance and draw them as horizontal bars.
idx = model.feature_importances_.argsort()
columns, importances = X_train.columns.values[idx], model.feature_importances_[idx]

plt.figure(figsize=(10, 10))
plt.barh(y=columns, width=importances);