In [1]:
import warnings
import time
import sys
import datetime
import joblib
import lightgbm as lgb
import os
import pickle
import seaborn as sns

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
# from sklearn.metrics import accuracy_score

warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option('display.max_columns', None)

In [2]:
df_train = pd.read_csv('../Kaggle-data/processed/train_processed_clv&churn&ltv.csv')
df_test = pd.read_csv('../Kaggle-data/processed/test_processed_clv&churn&ltv.csv')

In [4]:
# new = pd.read_csv('../Kaggle-data/new_merchant_transactions.csv', parse_dates=['purchase_date'])
hist = pd.read_csv('../Kaggle-data/historical_transactions.csv', parse_dates=['purchase_date'])
# merchants = pd.read_csv('../Kaggle-data/merchants.csv')

In [15]:
hist['purchase_amount_new'] = np.round(hist['purchase_amount'] / 0.00150265118 + 497.06)

In [16]:
hist = hist[['card_id', 'month_lag', 'purchase_amount_new']]
hist

Unnamed: 0,card_id,month_lag,purchase_amount_new
0,C_ID_4e6213e9bc,-8,29.0
1,C_ID_4e6213e9bc,-7,9.0
2,C_ID_4e6213e9bc,-6,18.0
3,C_ID_4e6213e9bc,-5,8.0
4,C_ID_4e6213e9bc,-11,16.0
...,...,...,...
29112356,C_ID_2863d2fa95,-1,76.0
29112357,C_ID_2863d2fa95,0,76.0
29112358,C_ID_5c240d6e3c,0,59.0
29112359,C_ID_5c240d6e3c,-1,1010.0


In [17]:
hist_sorted = hist.sort_values(by=['card_id', 'month_lag'])
hist_sorted

Unnamed: 0,card_id,month_lag,purchase_amount_new
19095775,C_ID_00007093c1,-12,186.0
19095845,C_ID_00007093c1,-12,116.0
19095866,C_ID_00007093c1,-12,276.0
19095896,C_ID_00007093c1,-12,522.0
19095780,C_ID_00007093c1,-11,230.0
...,...,...,...
22821219,C_ID_fffffd5772,0,22.0
22821227,C_ID_fffffd5772,0,65.0
22821232,C_ID_fffffd5772,0,12.0
22821235,C_ID_fffffd5772,0,10.0


In [19]:
hist_amount = hist_sorted.groupby(['card_id', 'month_lag']).purchase_amount_new.sum().reset_index()

In [21]:
hist_amount['purchase_amount_lag'] = hist_amount.purchase_amount_new.diff(1)

In [23]:
hist_amount['purchase_amount_lag_1'] = hist_amount.purchase_amount_new.shift(1)

In [24]:
hist_amount

Unnamed: 0,card_id,month_lag,purchase_amount_new,purchase_amount_lag,purchase_amount_lag_1
0,C_ID_00007093c1,-12,1100.0,,
1,C_ID_00007093c1,-11,1169.0,69.0,1100.0
2,C_ID_00007093c1,-10,1632.0,463.0,1169.0
3,C_ID_00007093c1,-9,2691.0,1059.0,1632.0
4,C_ID_00007093c1,-8,2877.0,186.0,2691.0
...,...,...,...,...,...
2563760,C_ID_fffffd5772,-4,359.0,-64.0,423.0
2563761,C_ID_fffffd5772,-3,421.0,62.0,359.0
2563762,C_ID_fffffd5772,-2,774.0,353.0,421.0
2563763,C_ID_fffffd5772,-1,359.0,-415.0,774.0


In [25]:
hist_amount['purchase_change_ratio'] = hist_amount['purchase_amount_lag'] / hist_amount['purchase_amount_lag_1']
hist_amount

Unnamed: 0,card_id,month_lag,purchase_amount_new,purchase_amount_lag,purchase_amount_lag_1,purchase_change_ratio
0,C_ID_00007093c1,-12,1100.0,,,
1,C_ID_00007093c1,-11,1169.0,69.0,1100.0,0.062727
2,C_ID_00007093c1,-10,1632.0,463.0,1169.0,0.396065
3,C_ID_00007093c1,-9,2691.0,1059.0,1632.0,0.648897
4,C_ID_00007093c1,-8,2877.0,186.0,2691.0,0.069119
...,...,...,...,...,...,...
2563760,C_ID_fffffd5772,-4,359.0,-64.0,423.0,-0.151300
2563761,C_ID_fffffd5772,-3,421.0,62.0,359.0,0.172702
2563762,C_ID_fffffd5772,-2,774.0,353.0,421.0,0.838480
2563763,C_ID_fffffd5772,-1,359.0,-415.0,774.0,-0.536176


In [30]:
hist_amount_change_ratio = hist_amount.groupby('card_id').agg(
    purchase_change_ratio_mean = ('purchase_change_ratio', 'mean'),
    purchase_change_ratio_max = ('purchase_change_ratio', 'max'),
    purchase_change_ratio_min = ('purchase_change_ratio', 'min'),
    purchase_change_ratio_var = ('purchase_change_ratio', 'std')
).reset_index()

hist_amount_change_ratio

Unnamed: 0,card_id,purchase_change_ratio_mean,purchase_change_ratio_max,purchase_change_ratio_min,purchase_change_ratio_var
0,C_ID_00007093c1,0.391058,2.202599,-0.874672,0.954641
1,C_ID_0001238066,1.190707,6.417323,-0.929562,2.691805
2,C_ID_0001506ef0,3.314126,41.375000,-0.974715,11.468376
3,C_ID_0001793786,0.508867,3.271517,-0.579462,1.245312
4,C_ID_000183fdda,0.110683,1.057501,-0.604350,0.693061
...,...,...,...,...,...
325535,C_ID_ffff1d9928,4.730634,21.304348,-0.933293,11.050481
325536,C_ID_ffff579d3a,0.529440,1.741117,-0.910290,0.993831
325537,C_ID_ffff756266,0.801350,6.360000,-0.873267,2.762465
325538,C_ID_ffff828181,2.900548,35.234375,-0.798620,9.420401


In [31]:
df_train_new = pd.merge(df_train, hist_amount_change_ratio, on='card_id')
df_test_new = pd.merge(df_test, hist_amount_change_ratio, on='card_id')

In [33]:
df_train_new.to_csv('../Kaggle-data/processed/train_processed_clv&churn&ltv&change_ratio.csv', index=False)
df_test_new.to_csv('../Kaggle-data/processed/test_processed_clv&churn&ltv&change_ratio.csv', index=False)