In [1]:
import numpy as np
import pandas as pd
import warnings
import gc
import matplotlib.pyplot as plt
import datetime
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
import xgboost as xgb
warnings.filterwarnings('ignore')
plt.style.use('seaborn')

In [2]:
# Reduce the memory usage - Inspired by Panchajanya Banerjee
def reduce_mem_usage(df, verbose=True, use_float16=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are inspected; every other column (object, datetime, int8, unsigned, ...)
    is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Columns are reassigned on this object, so the
        caller's frame is modified.
    verbose : bool, default True
        Print the final memory footprint and the percentage saved.
    use_float16 : bool, default True
        When True (the historical behaviour) float columns whose range fits
        are stored as float16. float16 keeps only about three significant
        decimal digits, so pass False to stop at float32 when precision matters.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024 ** 2

    # Candidates are ordered narrowest first; the first one that fits wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype limit still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing narrower fits (or min/max are NaN/inf): keep full width.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        # Guard the ratio so a frame with a zero starting footprint reports 0%
        # instead of nan.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df


In [12]:
# Load the train/test card tables, parsing the activation month as a datetime.
train = reduce_mem_usage(pd.read_csv('../data/train.csv', parse_dates=["first_active_month"]))
test = reduce_mem_usage(pd.read_csv('../data/test.csv', parse_dates=["first_active_month"]))
# Test rows get a sentinel target so both sets can share one frame.
test["target"] = -999
# The original row labels of both frames are kept (no ignore_index), so the
# index of `data` is not unique.
data = pd.concat([train, test])

# Calendar features derived from the activation month.
first_active = data["first_active_month"]
data["year"] = first_active.dt.year
data["month"] = first_active.dt.month
data["dayofyear"] = first_active.dt.dayofyear
# NOTE(review): `.dt.weekofyear` is deprecated in newer pandas in favour of
# `.dt.isocalendar().week` - confirm the target pandas version before upgrading.
data["weekofyear"] = first_active.dt.weekofyear
data['dayofweek'] = first_active.dt.dayofweek
# Whole days between activation and 2018-02-01. Subtracting datetimes directly
# stays vectorised and propagates missing dates as NaN, instead of doing
# element-wise arithmetic on an object column of `datetime.date` values.
data["days"] = (pd.Timestamp(2018, 2, 1) - first_active.dt.normalize()).dt.days

Mem. usage decreased to  4.04 Mb (56.2% reduction)
Mem. usage decreased to  2.24 Mb (52.5% reduction)


In [None]:
# Load the historical transactions and encode the Y/N flags as 1/0.
transactions = reduce_mem_usage(pd.read_csv('../data/historical_transactions.csv'))
transactions['authorized_flag'] = transactions['authorized_flag'].map({'Y': 1, 'N': 0})
transactions['category_1'] = transactions['category_1'].map({'Y': 1, 'N': 0})
# Feature Engineering - Adding new features inspired by Chau's first kernel
transactions["purchase_date"] = pd.to_datetime(transactions["purchase_date"])
transactions["year"] = transactions["purchase_date"].dt.year
transactions["month"] = transactions["purchase_date"].dt.month
# NOTE(review): `.dt.weekofyear` is deprecated in newer pandas in favour of
# `.dt.isocalendar().week` - confirm the target pandas version before upgrading.
transactions["weekofyear"] = transactions["purchase_date"].dt.weekofyear
transactions["dayofweek"] = transactions["purchase_date"].dt.dayofweek
transactions["weekend"] = (transactions["purchase_date"].dt.weekday >= 5).astype(int)
transactions["hour"] = transactions["purchase_date"].dt.hour
# NOTE(review): `datetime.today()` makes this feature depend on the day the
# cell is executed; consider pinning a reference date for reproducible runs.
transactions["month_diff"] = ((datetime.datetime.today() - transactions["purchase_date"]).dt.days) // 30
transactions["month_diff"] += transactions["month_lag"]

# Impute missing values. `fillna(..., inplace=True)` returns None, so assigning
# its result back replaced each column with None; use the returned Series instead.
transactions["category_2"] = transactions["category_2"].fillna(2.0)
transactions["category_3"] = transactions["category_3"].fillna("A")
transactions["merchant_id"] = transactions["merchant_id"].fillna("M_ID_00a6ca8a8a")
gc.collect()


In [None]:
# Taking Reference from Other Kernels
def aggregate_transaction_hist(trans, prefix):
    """Collapse a transaction table to one row of summary features per ``card_id``.

    Parameters
    ----------
    trans : pandas.DataFrame
        Transaction rows. Must contain ``card_id`` and every column named in
        the aggregation spec below, including the holiday flag columns
        (``Christmas_Day_2017`` ... ``Mothers_Day_2018``), which have to be
        added to ``trans`` before calling this function.
    prefix : str
        Text prepended to every generated feature name (e.g. ``'hist_'``).

    Returns
    -------
    pandas.DataFrame
        Columns are ``card_id``, ``<prefix>transactions_count`` and then one
        ``<prefix><column>_<statistic>`` column per requested aggregation, in
        the order of the spec.
    """
    basic_stats = ['sum', 'mean', 'count', 'max', 'min', 'std']
    # Each key appears once: a repeated key in a dict literal is silently
    # overwritten, which hid a second 'month_diff' entry here.
    agg_func = {
        'purchase_date': ['max', 'min'],
        'month_diff': ['mean'],
        'weekend': ['sum', 'mean'],
        'authorized_flag': ['sum', 'mean'],
        'category_1': ['sum', 'mean'],
        'purchase_amount': list(basic_stats),
        'installments': list(basic_stats),
        'merchant_id': ['nunique'],
        'month_lag': ['max', 'min', 'mean', 'var'],
        'card_id': ['size'],
        'month': ['nunique'],
        'hour': ['nunique'],
        'weekofyear': ['nunique'],
        'dayofweek': ['nunique'],
        'year': ['nunique'],
        'subsector_id': ['nunique'],
        'merchant_category_id': ['nunique'],
        'Christmas_Day_2017': ['mean'],
        'Mothers_Day_2017': ['mean'],
        'fathers_day_2017': ['mean'],
        'Children_day_2017': ['mean'],
        'Black_Friday_2017': ['mean'],
        'Valentine_day_2017': ['mean'],
        'Mothers_Day_2018': ['mean'],
    }

    agg_trans = trans.groupby(['card_id']).agg(agg_func)
    # Flatten the (column, statistic) MultiIndex into '<prefix>column_statistic'.
    agg_trans.columns = [prefix + '_'.join(col).strip() for col in agg_trans.columns]
    agg_trans = agg_trans.reset_index()

    # Per-card row count, placed as the first feature column after card_id.
    counts = trans.groupby('card_id').size().reset_index(name='{}transactions_count'.format(prefix))
    return pd.merge(counts, agg_trans, on='card_id', how='left')