In [1]:
import datetime
import gc
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import time
import warnings

from contextlib import contextmanager
from pandas.core.common import SettingWithCopyWarning
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold

warnings.simplefilter(action='ignore', category=SettingWithCopyWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)

FEATS_EXCLUDED = ['first_active_month', 'target', 'card_id', 'outliers',
                  'hist_purchase_date_max', 'hist_purchase_date_min', 'hist_card_id_size',
                  'new_purchase_date_max', 'new_purchase_date_min', 'new_card_id_size',
                  'OOF_PRED', 'month_0','observation_date_x','observation_date_y']

@contextmanager
def timer(title):
    t0 = time.time()
    yield
    print("{} - done in {:.0f}s".format(title, time.time() - t0))

# rmse
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# One-hot encoding for categorical columns with get_dummies
def one_hot_encoder(df, nan_as_category = True):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns= categorical_columns, dummy_na= nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns
    
# Display/plot feature importance
def display_importances(feature_importance_df_):
    cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(by="importance", ascending=False)[:40].index
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]

    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances.png')

# reduce memory
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df
    
# preprocessing train & test
def train_test(num_rows=None):

    # load csv
    train_df = pd.read_csv('D:\Ellunium\elo/train.csv', index_col=['card_id'], nrows=num_rows)
    test_df = pd.read_csv('D:\Ellunium\elo/test.csv', index_col=['card_id'], nrows=num_rows)

    print("Train samples: {}, test samples: {}".format(len(train_df), len(test_df)))

    # outlier
    train_df['outliers'] = 0
    train_df.loc[train_df['target'] < -30, 'outliers'] = 1

    # set target as nan
    test_df['target'] = np.nan

    # merge
    df = train_df.append(test_df)

    del train_df, test_df
    gc.collect()

    # to datetime
    df['first_active_month'] = pd.to_datetime(df['first_active_month'])

    # datetime features
    df['quarter'] = df['first_active_month'].dt.quarter
    df['elapsed_time'] = (datetime.datetime.today() - df['first_active_month']).dt.days

    df['days_feature1'] = df['elapsed_time'] * df['feature_1']
    df['days_feature2'] = df['elapsed_time'] * df['feature_2']
    df['days_feature3'] = df['elapsed_time'] * df['feature_3']

    df['days_feature1_ratio'] = df['feature_1'] / df['elapsed_time']
    df['days_feature2_ratio'] = df['feature_2'] / df['elapsed_time']
    df['days_feature3_ratio'] = df['feature_3'] / df['elapsed_time']

    # one hot encoding
    df, cols = one_hot_encoder(df, nan_as_category=False)

    for f in ['feature_1','feature_2','feature_3']:
        order_label = df.groupby([f])['outliers'].mean()
        df[f] = df[f].map(order_label)

    df['feature_sum'] = df['feature_1'] + df['feature_2'] + df['feature_3']
    df['feature_mean'] = df['feature_sum']/3
    df['feature_max'] = df[['feature_1', 'feature_2', 'feature_3']].max(axis=1)
    df['feature_min'] = df[['feature_1', 'feature_2', 'feature_3']].min(axis=1)
    df['feature_var'] = df[['feature_1', 'feature_2', 'feature_3']].std(axis=1)

    return df

In [2]:
def historical_transactions(num_rows=None):
    # load csv
    hist_df = pd.read_csv('D:\Ellunium\elo/historical_transactions.csv', nrows=num_rows)

    # fillna
    hist_df['category_2'].fillna(1.0,inplace=True)
    hist_df['category_3'].fillna('A',inplace=True)
    hist_df['merchant_id'].fillna('M_ID_00a6ca8a8a',inplace=True)
    hist_df['installments'].replace(-1, np.nan,inplace=True)
    hist_df['installments'].replace(999, np.nan,inplace=True)

    # trim
    #hist_df['purchase_amount'] = hist_df['purchase_amount'].apply(lambda x: min(x, 0.8))
    hist_df['purchase_amount'] = np.round(hist_df['purchase_amount'] / 0.00150265118 + 497.06,2)

    # Y/N to 1/0
    hist_df['authorized_flag'] = hist_df['authorized_flag'].map({'Y': 1, 'N': 0}).astype(int)
    hist_df['category_1'] = hist_df['category_1'].map({'Y': 1, 'N': 0}).astype(int)
    hist_df['category_3'] = hist_df['category_3'].map({'A':0, 'B':1, 'C':2})

    # datetime features
    hist_df['purchase_date'] = pd.to_datetime(hist_df['purchase_date'])
    hist_df['month'] = hist_df['purchase_date'].dt.month
    hist_df['day'] = hist_df['purchase_date'].dt.day
    hist_df['hour'] = hist_df['purchase_date'].dt.hour
    #hist_df['year'] = hist_df['purchase_date'].dt.year
    hist_df['weekofyear'] = hist_df['purchase_date'].dt.weekofyear
    hist_df['weekday'] = hist_df['purchase_date'].dt.weekday
    hist_df['weekend'] = (hist_df['purchase_date'].dt.weekday >=5).astype(int)
    

    # additional features
    hist_df['price'] = hist_df['purchase_amount'] / hist_df['installments']

    #Christmas : December 25 2017
    hist_df['Christmas_Day_2017']=(pd.to_datetime('2017-12-25')-hist_df['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
    #Mothers Day: May 14 2017
    hist_df['Mothers_Day_2017']=(pd.to_datetime('2017-06-04')-hist_df['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
    #fathers day: August 13 2017
    hist_df['fathers_day_2017']=(pd.to_datetime('2017-08-13')-hist_df['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
    #Childrens day: October 12 2017
    hist_df['Children_day_2017']=(pd.to_datetime('2017-10-12')-hist_df['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
    #Valentine's Day : 12th June, 2017
    hist_df['Valentine_Day_2017']=(pd.to_datetime('2017-06-12')-hist_df['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
    #Black Friday : 24th November 2017
    hist_df['Black_Friday_2017']=(pd.to_datetime('2017-11-24') - hist_df['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)

    #2018
    #Mothers Day: May 13 2018
    hist_df['Mothers_Day_2018']=(pd.to_datetime('2018-05-13')-hist_df['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)

    hist_df['month_diff'] = ((datetime.datetime.today() - hist_df['purchase_date']).dt.days)//30
    hist_df['month_diff'] += hist_df['month_lag']

    # additional features
    hist_df['duration'] = hist_df['purchase_amount']*hist_df['month_diff']
    hist_df['amount_month_ratio'] = hist_df['purchase_amount']/hist_df['month_diff']

    # reduce memory usage
    hist_df = reduce_mem_usage(hist_df)

    col_unique =['subsector_id', 'merchant_id', 'merchant_category_id']
    col_seas = ['month', 'hour', 'weekofyear', 'weekday', 'day']

    aggs = {}
    for col in col_unique:
        aggs[col] = ['nunique']

    for col in col_seas:
        aggs[col] = ['nunique', 'mean', 'min', 'max']

    aggs['purchase_amount'] = ['sum','max','min','mean','var','skew']
    aggs['installments'] = ['sum','max','mean','var','skew']
    aggs['purchase_date'] = ['max','min']
    aggs['month_lag'] = ['max','min','mean','var','skew']
    aggs['month_diff'] = ['max','min','mean','var','skew']
    aggs['authorized_flag'] = ['mean']
    aggs['weekend'] = ['mean'] # overwrite
    aggs['weekday'] = ['mean'] # overwrite
    aggs['day'] = ['nunique', 'mean', 'min'] # overwrite
    aggs['category_1'] = ['mean']
    aggs['category_2'] = ['mean']
    aggs['category_3'] = ['mean']
    aggs['card_id'] = ['size','count']
    aggs['price'] = ['sum','mean','max','min','var']
    aggs['Christmas_Day_2017'] = ['mean']
    aggs['Mothers_Day_2017'] = ['mean']
    aggs['fathers_day_2017'] = ['mean']
    aggs['Children_day_2017'] = ['mean']
    aggs['Valentine_Day_2017'] = ['mean']
    aggs['Black_Friday_2017'] = ['mean']
    aggs['Mothers_Day_2018'] = ['mean']
    aggs['duration']=['mean','min','max','var','skew']
    aggs['amount_month_ratio']=['mean','min','max','var','skew']

    for col in ['category_2','category_3']:
        hist_df[col+'_mean'] = hist_df.groupby([col])['purchase_amount'].transform('mean')
        hist_df[col+'_min'] = hist_df.groupby([col])['purchase_amount'].transform('min')
        hist_df[col+'_max'] = hist_df.groupby([col])['purchase_amount'].transform('max')
        hist_df[col+'_sum'] = hist_df.groupby([col])['purchase_amount'].transform('sum')
        aggs[col+'_mean'] = ['mean']

    hist_df = hist_df.reset_index().groupby('card_id').agg(aggs)

    # change column name
    hist_df.columns = pd.Index([e[0] + "_" + e[1] for e in hist_df.columns.tolist()])
    hist_df.columns = ['hist_'+ c for c in hist_df.columns]

    hist_df['hist_purchase_date_diff'] = (hist_df['hist_purchase_date_max']-hist_df['hist_purchase_date_min']).dt.days
    hist_df['hist_purchase_date_average'] = hist_df['hist_purchase_date_diff']/hist_df['hist_card_id_size']
    hist_df['hist_purchase_date_uptonow'] = (datetime.datetime.today()-hist_df['hist_purchase_date_max']).dt.days
    hist_df['hist_purchase_date_uptomin'] = (datetime.datetime.today()-hist_df['hist_purchase_date_min']).dt.days
    
    # reduce memory usage
    hist_df = reduce_mem_usage(hist_df)
    
    

    return hist_df

In [3]:
def rate_historical_features(num_rows=None):
    # load csv
    hist_df = pd.read_csv('D:\Ellunium\elo/historical_transactions.csv', nrows=num_rows,
                          usecols=['card_id','purchase_date','purchase_amount','merchant_category_id','month_lag'])
    
    #hist_df['merchant_id'].fillna('M_ID_00a6ca8a8a',inplace=True)
    hist_df['purchase_amount'] = hist_df['purchase_amount'].apply(lambda x: min(x, 0.8))
    hist_df['purchase_date'] = pd.to_datetime(hist_df['purchase_date'])
    hist_df['month'] = hist_df['purchase_date'].dt.month
    hist_df['year'] = hist_df['purchase_date'].dt.year
    
    
    # upd 16.02 Calculating rate of change purchases for each month
    
    hist_df['purchase_amount_new'] = np.round(hist_df['purchase_amount'] / 0.00150265118 + 497.06,2)
    
    ex_df = hist_df.groupby(['card_id','month_lag']).agg({'purchase_amount_new':'sum'})\
        .reset_index().rename(columns={'purchase_amount_new':'sum'})
    #ex_df['sum'] = np.round(ex_df['sum']/ex_df['n_year'],2)
    #ex_df.drop('n_year',axis=1,inplace=True)
    
    new_df = hist_df.groupby(['card_id']).size().reset_index(name="tmp").drop('tmp',axis=1)
    
    month_lags = list(set(ex_df['month_lag'].values))
    for i in month_lags:
        new_df = new_df.merge(ex_df[ex_df['month_lag']==i].rename(columns={'sum':'ml_sum_'+str(i)}).drop('month_lag',axis=1),
                          on='card_id',how="outer")
    
    #hist_df = hist_df.merge(new_df,on='merchant_category_id',how="outer")
    
    del ex_df
    gc.collect()
    
    #new_df = hist_df.groupby(['card_id']).agg({'hist_sum_'+str(i):'mean' for i in range(1,13)}).reset_index()
    
    for i in month_lags:
        new_df['ml_sum_'+str(i)].fillna(new_df['ml_sum_'+str(i)].mean(),inplace=True)
    
    # ПЕРЕНЕСТИ В РАЗДЕЛ ДОП ФЬЮЧЕРСОВ
    #x = np.arange(1,13)
    #y = np.roll(x,1)
    #for i in range(12):
    #    new_df['hist_ratio_'+str(x[i])] = np.round(new_df['hist_sum_'+str(x[i])] / new_df['hist_sum_'+str(y[i])],2)
    
    #hist_df.merge(new_df,on="card_id",how='outer',inplace=True)
    
    #del new_df
    #gc.collect()
    
    # upd 16.02 features with observation date
    
    #last_hist_transaction = hist_df.groupby('card_id').agg({'month_lag' : 'max', 'purchase_date' : 'max'}).reset_index()
    #last_hist_transaction.columns = ['card_id', 'hist_month_lag', 'hist_purchase_date']
    #last_hist_transaction['observation_date'] = \
    #last_hist_transaction.apply(lambda x: x['hist_purchase_date']  - pd.DateOffset(months=x['hist_month_lag']), axis=1)
    #last_hist_transaction['observation_date'] = last_hist_transaction['observation_date'].dt.to_period('M')\
    #    .dt.to_timestamp() + pd.DateOffset(months=1)
    #last_hist_transaction['observation_month'] = last_hist_transaction['observation_date'].dt.month
    #last_hist_transaction.drop(['hist_month_lag','hist_purchase_date'],axis=1,inplace=True)
    
    #new_df = new_df.merge(last_hist_transaction,on="card_id",how='outer')
    
    #del last_hist_transaction
    #gc.collect()
    new_df = reduce_mem_usage(new_df)
    
    return new_df

In [4]:
def new_merchant_transactions(num_rows=None):
    # load csv
    new_merchant_df = pd.read_csv('D:\Ellunium\elo/new_merchant_transactions.csv', nrows=num_rows)

    # fillna
    new_merchant_df['category_2'].fillna(1.0,inplace=True)
    new_merchant_df['category_3'].fillna('A',inplace=True)
    new_merchant_df['merchant_id'].fillna('M_ID_00a6ca8a8a',inplace=True)
    new_merchant_df['installments'].replace(-1, np.nan,inplace=True)
    new_merchant_df['installments'].replace(999, np.nan,inplace=True)

    # trim
    #new_merchant_df['purchase_amount'] = new_merchant_df['purchase_amount'].apply(lambda x: min(x, 0.8))
    new_merchant_df['purchase_amount'] = np.round(new_merchant_df['purchase_amount'] / 0.00150265118 + 497.06,2)

    # Y/N to 1/0
    new_merchant_df['authorized_flag'] = new_merchant_df['authorized_flag'].map({'Y': 1, 'N': 0}).astype(int)
    new_merchant_df['category_1'] = new_merchant_df['category_1'].map({'Y': 1, 'N': 0}).astype(int)
    new_merchant_df['category_3'] = new_merchant_df['category_3'].map({'A':0, 'B':1, 'C':2}).astype(int)

    # datetime features
    new_merchant_df['purchase_date'] = pd.to_datetime(new_merchant_df['purchase_date'])
    new_merchant_df['month'] = new_merchant_df['purchase_date'].dt.month
    new_merchant_df['day'] = new_merchant_df['purchase_date'].dt.day
    #new_merchant_df['year'] = new_merchant_df['purchase_date'].dt.year
    new_merchant_df['hour'] = new_merchant_df['purchase_date'].dt.hour
    new_merchant_df['weekofyear'] = new_merchant_df['purchase_date'].dt.weekofyear
    new_merchant_df['weekday'] = new_merchant_df['purchase_date'].dt.weekday
    new_merchant_df['weekend'] = (new_merchant_df['purchase_date'].dt.weekday >=5).astype(int)
    
    
    # additional features
    new_merchant_df['price'] = new_merchant_df['purchase_amount'] / new_merchant_df['installments']

    #Christmas : December 25 2017
    new_merchant_df['Christmas_Day_2017']=(pd.to_datetime('2017-12-25')-new_merchant_df['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
    #Childrens day: October 12 2017
    new_merchant_df['Children_day_2017']=(pd.to_datetime('2017-10-12')-new_merchant_df['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)
    #Black Friday : 24th November 2017
    new_merchant_df['Black_Friday_2017']=(pd.to_datetime('2017-11-24') - new_merchant_df['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)

    #Mothers Day: May 13 2018
    new_merchant_df['Mothers_Day_2018']=(pd.to_datetime('2018-05-13')-new_merchant_df['purchase_date']).dt.days.apply(lambda x: x if x > 0 and x < 100 else 0)

    new_merchant_df['month_diff'] = ((datetime.datetime.today() - new_merchant_df['purchase_date']).dt.days)//30
    new_merchant_df['month_diff'] += new_merchant_df['month_lag']

    # additional features
    new_merchant_df['duration'] = new_merchant_df['purchase_amount']*new_merchant_df['month_diff']
    new_merchant_df['amount_month_ratio'] = new_merchant_df['purchase_amount']/new_merchant_df['month_diff']

    # reduce memory usage
    new_merchant_df = reduce_mem_usage(new_merchant_df)

    col_unique =['subsector_id', 'merchant_id', 'merchant_category_id']
    col_seas = ['month', 'hour', 'weekofyear', 'weekday', 'day']

    aggs = {}
    for col in col_unique:
        aggs[col] = ['nunique']

    for col in col_seas:
        aggs[col] = ['nunique', 'mean', 'min', 'max']

    aggs['purchase_amount'] = ['sum','max','min','mean','var','skew']
    aggs['installments'] = ['sum','max','mean','var','skew']
    aggs['purchase_date'] = ['max','min']
    aggs['month_lag'] = ['max','min','mean','var','skew']
    aggs['month_diff'] = ['mean','var','skew']
    aggs['weekend'] = ['mean']
    aggs['month'] = ['mean', 'min', 'max']
    aggs['weekday'] = ['mean', 'min', 'max']
    aggs['category_1'] = ['mean']
    aggs['category_2'] = ['mean']
    aggs['category_3'] = ['mean']
    aggs['card_id'] = ['size','count']
    aggs['price'] = ['mean','max','min','var']
    aggs['Christmas_Day_2017'] = ['mean']
    aggs['Children_day_2017'] = ['mean']
    aggs['Black_Friday_2017'] = ['mean']
    aggs['Mothers_Day_2018'] = ['mean']
    aggs['duration']=['mean','min','max','var','skew']
    aggs['amount_month_ratio']=['mean','min','max','var','skew']

    for col in ['category_2','category_3']:
        new_merchant_df[col+'_mean'] = new_merchant_df.groupby([col])['purchase_amount'].transform('mean')
        new_merchant_df[col+'_min'] = new_merchant_df.groupby([col])['purchase_amount'].transform('min')
        new_merchant_df[col+'_max'] = new_merchant_df.groupby([col])['purchase_amount'].transform('max')
        new_merchant_df[col+'_sum'] = new_merchant_df.groupby([col])['purchase_amount'].transform('sum')
        aggs[col+'_mean'] = ['mean']

    new_merchant_df = new_merchant_df.reset_index().groupby('card_id').agg(aggs)

    # change column name
    new_merchant_df.columns = pd.Index([e[0] + "_" + e[1] for e in new_merchant_df.columns.tolist()])
    new_merchant_df.columns = ['new_'+ c for c in new_merchant_df.columns]

    new_merchant_df['new_purchase_date_diff'] = (new_merchant_df['new_purchase_date_max']-new_merchant_df['new_purchase_date_min']).dt.days
    new_merchant_df['new_purchase_date_average'] = new_merchant_df['new_purchase_date_diff']/new_merchant_df['new_card_id_size']
    new_merchant_df['new_purchase_date_uptonow'] = (datetime.datetime.today()-new_merchant_df['new_purchase_date_max']).dt.days
    new_merchant_df['new_purchase_date_uptomin'] = (datetime.datetime.today()-new_merchant_df['new_purchase_date_min']).dt.days

    # reduce memory usage
    new_merchant_df = reduce_mem_usage(new_merchant_df)

    return new_merchant_df

In [5]:
def rate_new_merch_features(num_rows=None):
    # load csv
    n_m_df = pd.read_csv('D:\Ellunium\elo/new_merchant_transactions.csv', nrows=num_rows,
                          usecols=['card_id','purchase_date','purchase_amount','merchant_category_id','month_lag'])
    
    #n_m_df['merchant_id'].fillna('M_ID_00a6ca8a8a',inplace=True)
    #n_m_df['purchase_amount'] = n_m_df['purchase_amount'].apply(lambda x: min(x, 0.8))
    n_m_df['purchase_date'] = pd.to_datetime(n_m_df['purchase_date'])
    n_m_df['month'] = n_m_df['purchase_date'].dt.month
    n_m_df['year'] = n_m_df['purchase_date'].dt.year
    
    
    # upd 16.02 Calculating rate of change purchases for each month
    
    n_m_df['purchase_amount_new'] = np.round(n_m_df['purchase_amount'] / 0.00150265118 + 497.06,2)
    
    ex_df = n_m_df.groupby(['card_id','month_lag']).agg({'purchase_amount_new':'sum'})\
        .reset_index().rename(columns={'purchase_amount_new':'sum'})
    #ex_df['sum'] = np.round(ex_df['sum']/ex_df['n_year'],2)
    #ex_df.drop('n_year',axis=1,inplace=True)
    
    new_df = n_m_df.groupby(['card_id']).size().reset_index(name="tmp").drop('tmp',axis=1)
    month_lags = list(set(ex_df['month_lag'].values))
    for i in month_lags:
        new_df = new_df.merge(ex_df[ex_df['month_lag']==i].rename(columns={'sum':'ml_sum_'+str(i)}).drop('month_lag',axis=1),
                          on='card_id',how="outer")
    
    #n_m_df = n_m_df.merge(new_df,on='merchant_category_id',how="outer")
    
    del ex_df
    gc.collect()
    
    #new_df = n_m_df.groupby(['card_id']).agg({'newm_sum_'+str(i):'mean' for i in range(1,13)}).reset_index()
    
    for i in month_lags:
        new_df['ml_sum_'+str(i)].fillna(new_df['ml_sum_'+str(i)].mean(),inplace=True)
    
    #x = np.arange(1,13)
    #y = np.roll(x,1)
    #for i in range(12):
    #    new_df['newm_ratio_'+str(x[i])] = np.round(new_df['newm_sum_'+str(x[i])] / new_df['newm_sum_'+str(y[i])],2)
    
    #hist_df.merge(new_df,on="card_id",how='outer',inplace=True)
    
    #del new_df
    #gc.collect()
    
    # upd 16.02 features with observation date
    
    #first_new_transaction = n_m_df.groupby('card_id').agg({'month_lag' : 'min', 'purchase_date' : 'min'}).reset_index()
    #first_new_transaction.columns = ['card_id', 'new_month_lag', 'new_purchase_date']
    #first_new_transaction['observation_date'] = \
    #    first_new_transaction.apply(lambda x: x['new_purchase_date']  - pd.DateOffset(months=x['new_month_lag']-1), axis=1)
    #first_new_transaction['observation_date'] = first_new_transaction['observation_date'].dt.to_period('M').dt.to_timestamp()
    
    #first_new_transaction['observation_month'] = first_new_transaction['observation_date'].dt.month
    #first_new_transaction.drop(['new_month_lag','new_purchase_date'],axis=1,inplace=True)
    
    #new_df = new_df.merge(first_new_transaction,on="card_id",how='outer')
    
    #del first_new_transaction
    #gc.collect()
    new_df = reduce_mem_usage(new_df)
    
    return new_df

In [17]:
def predict_lag1(df):
    df_train = df[df['ml_sum_1'].notnull()]
    df_test = df[df['ml_sum_1'].isnull()]  
    target = df_train['ml_sum_1']
    del(df)
    del(df_train['ml_sum_1'])
    gc.collect()
    
    features = [c for c in df_train.columns if c not in FEATS_EXCLUDED]
    categorical_feats = [c for c in features if 'feature_' in c]
    
    param = {'objective':'regression',
         'num_leaves': 31,
         'min_data_in_leaf': 25,
         'max_depth': 7,
         'learning_rate': 0.01,
         'lambda_l1':0.13,
         "boosting": "gbdt",
         "feature_fraction":0.85,
         'bagging_freq':8,
         "bagging_fraction": 0.9 ,
         "metric": 'rmse',
         "verbosity": -1,
         "random_state": 2333}

    folds = KFold(n_splits= 5, shuffle=True, random_state=326) 
    oof_lgb = np.zeros(len(df_train))
    predictions_lgb = np.zeros(len(df_test))
    feature_importance_df = pd.DataFrame()

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train)):
        print("fold {}".format(fold_))
        trn_data = lgb.Dataset(df_train.iloc[trn_idx][features], label=target.iloc[trn_idx])#, categorical_feature=categorical_feats)
        val_data = lgb.Dataset(df_train.iloc[val_idx][features], label=target.iloc[val_idx])#, categorical_feature=categorical_feats)

        num_round = 10000
        clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval= 100, early_stopping_rounds = 200)
        oof_lgb[val_idx] = clf.predict(df_train.iloc[val_idx][features], num_iteration=clf.best_iteration)

        fold_importance_df = pd.DataFrame()
        fold_importance_df["Feature"] = features
        fold_importance_df["importance"] = clf.feature_importance()
        fold_importance_df["fold"] = fold_ + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

        predictions_lgb += clf.predict(df_test[features], num_iteration=clf.best_iteration) / folds.n_splits

    print("CV score: {:<8.5f}".format(mean_squared_error(oof_lgb, target)**0.5))
    # simple model
    pred_ml1_df = pd.DataFrame({"card_id":df_test["card_id"].values})
    pred_ml1_df["ml_sum_1"] = predictions_lgb
    
    del df_train,df_test
    gc.collect()
    
    return pred_ml1_df
    

In [18]:
def predict_lag2(df):
    df_train = df[df['ml_sum_2'].notnull()]
    df_test = df[df['ml_sum_2'].isnull()]  
    target = df_train['ml_sum_2']
    del(df)
    del(df_train['ml_sum_2'])
    gc.collect()
    
    features = [c for c in df_train.columns if c not in FEATS_EXCLUDED]
    categorical_feats = [c for c in features if 'feature_' in c]
    
    param = {'objective':'regression',
         'num_leaves': 31,
         'min_data_in_leaf': 25,
         'max_depth': 7,
         'learning_rate': 0.01,
         'lambda_l1':0.13,
         "boosting": "gbdt",
         "feature_fraction":0.85,
         'bagging_freq':8,
         "bagging_fraction": 0.9 ,
         "metric": 'rmse',
         "verbosity": -1,
         "random_state": 2333}

    folds = KFold(n_splits= 5, shuffle=True, random_state=326) 
    oof_lgb = np.zeros(len(df_train))
    predictions_lgb = np.zeros(len(df_test))
    feature_importance_df = pd.DataFrame()

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train)):
        print("fold {}".format(fold_))
        trn_data = lgb.Dataset(df_train.iloc[trn_idx][features], label=target.iloc[trn_idx])#, categorical_feature=categorical_feats)
        val_data = lgb.Dataset(df_train.iloc[val_idx][features], label=target.iloc[val_idx])#, categorical_feature=categorical_feats)

        num_round = 10000
        clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval= 100, early_stopping_rounds = 200)
        oof_lgb[val_idx] = clf.predict(df_train.iloc[val_idx][features], num_iteration=clf.best_iteration)

        fold_importance_df = pd.DataFrame()
        fold_importance_df["Feature"] = features
        fold_importance_df["importance"] = clf.feature_importance()
        fold_importance_df["fold"] = fold_ + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

        predictions_lgb += clf.predict(df_test[features], num_iteration=clf.best_iteration) / folds.n_splits

    print("CV score: {:<8.5f}".format(mean_squared_error(oof_lgb, target)**0.5))
    # simple model
    pred_ml2_df = pd.DataFrame({"card_id":df_test["card_id"].values})
    pred_ml2_df["ml_sum_2"] = predictions_lgb
    
    del df_train,df_test
    gc.collect()
    
    return pred_ml2_df
    

In [8]:
def add_ml_lags(df):
    pred_ml1_df = predict_lag1(df)
    pred_ml2_df = predict_lag2(df)
    for card_id in pred_ml1_df['card_id']:
        df.loc[df['card_id']==card_id,'ml_sum_1'] = pred_ml1_df.loc[pred_ml1_df['card_id']==card_id,'ml_sum_1'].values
        df.loc[df['card_id']==card_id,'ml_sum_2'] = pred_ml2_df.loc[pred_ml2_df['card_id']==card_id,'ml_sum_2'].values
    return df

In [9]:
def additional_features(df):
    df['hist_first_buy'] = (df['hist_purchase_date_min'] - df['first_active_month']).dt.days
    df['hist_last_buy'] = (df['hist_purchase_date_max'] - df['first_active_month']).dt.days
    df['new_first_buy'] = (df['new_purchase_date_min'] - df['first_active_month']).dt.days
    df['new_last_buy'] = (df['new_purchase_date_max'] - df['first_active_month']).dt.days

    date_features=['hist_purchase_date_max','hist_purchase_date_min',
                   'new_purchase_date_max', 'new_purchase_date_min']

    for f in date_features:
        df[f] = df[f].astype(np.int64) * 1e-9
    
    
    # ПЕРЕНЕСТИ В РАЗДЕЛ ДОП ФЬЮЧЕРСОВ
    x = np.arange(-13,3)
    y = np.roll(x,1)
    for i in range(16):
        df['ml_ratio_'+str(x[i])] = np.round(df['ml_sum_'+str(x[i])] / df['ml_sum_'+str(y[i])],2)
    
    # upd 16.02 Возможно стоит считать только для месяцев 2,3,4
    #for i in range(1,13):
    #    df['new_hist_ratio_'+str(i)] = np.round(df['newm_sum_'+str(i)] / df['hist_sum_'+str(i)],2)
        
    # by option
    #df.drop(['_'.join(['newm_sum',str(j)]) for j in range(1,13)],axis=1,inplace=True)
    #df.drop(['_'.join(['hist_sum',str(j)]) for j in range(1,13)],axis=1,inplace=True)
    #gc.collect()
    
    df['card_id_total'] = df['new_card_id_size']+df['hist_card_id_size']
    df['card_id_cnt_total'] = df['new_card_id_count']+df['hist_card_id_count']
    df['card_id_cnt_ratio'] = df['new_card_id_count']/df['hist_card_id_count']
    df['purchase_amount_total'] = df['new_purchase_amount_sum']+df['hist_purchase_amount_sum']
    df['purchase_amount_mean'] = df['new_purchase_amount_mean']+df['hist_purchase_amount_mean']
    df['purchase_amount_max'] = df['new_purchase_amount_max']+df['hist_purchase_amount_max']
    df['purchase_amount_min'] = df['new_purchase_amount_min']+df['hist_purchase_amount_min']
    df['purchase_amount_ratio'] = df['new_purchase_amount_sum']/df['hist_purchase_amount_sum']
    df['month_diff_mean'] = df['new_month_diff_mean']+df['hist_month_diff_mean']
    df['month_diff_ratio'] = df['new_month_diff_mean']/df['hist_month_diff_mean']
    df['month_lag_mean'] = df['new_month_lag_mean']+df['hist_month_lag_mean']
    df['month_lag_max'] = df['new_month_lag_max']+df['hist_month_lag_max']
    df['month_lag_min'] = df['new_month_lag_min']+df['hist_month_lag_min']
    df['category_1_mean'] = df['new_category_1_mean']+df['hist_category_1_mean']
    df['installments_total'] = df['new_installments_sum']+df['hist_installments_sum']
    df['installments_mean'] = df['new_installments_mean']+df['hist_installments_mean']
    df['installments_max'] = df['new_installments_max']+df['hist_installments_max']
    df['installments_ratio'] = df['new_installments_sum']/df['hist_installments_sum']
    df['price_total'] = df['purchase_amount_total'] / df['installments_total']
    df['price_mean'] = df['purchase_amount_mean'] / df['installments_mean']
    df['price_max'] = df['purchase_amount_max'] / df['installments_max']
    df['duration_mean'] = df['new_duration_mean']+df['hist_duration_mean']
    df['duration_min'] = df['new_duration_min']+df['hist_duration_min']
    df['duration_max'] = df['new_duration_max']+df['hist_duration_max']
    df['amount_month_ratio_mean']=df['new_amount_month_ratio_mean']+df['hist_amount_month_ratio_mean']
    df['amount_month_ratio_min']=df['new_amount_month_ratio_min']+df['hist_amount_month_ratio_min']
    df['amount_month_ratio_max']=df['new_amount_month_ratio_max']+df['hist_amount_month_ratio_max']
    df['new_CLV'] = df['new_card_id_count'] * df['new_purchase_amount_sum'] / df['new_month_diff_mean']
    df['hist_CLV'] = df['hist_card_id_count'] * df['hist_purchase_amount_sum'] / df['hist_month_diff_mean']
    df['CLV_ratio'] = df['new_CLV'] / df['hist_CLV']

    return df

In [10]:
debug = False
num_rows = 10000 if debug else None
df = train_test(num_rows)

Train samples: 201917, test samples: 123623


In [11]:
df = pd.merge(df, rate_historical_features(num_rows), on='card_id', how='outer')

Memory usage after optimization is: 19.25 MB
Decreased by 51.6%


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 325540 entries, 0 to 325539
Data columns (total 34 columns):
card_id                325540 non-null object
feature_1              325540 non-null float64
feature_2              325540 non-null float64
feature_3              325540 non-null float64
first_active_month     325539 non-null datetime64[ns]
outliers               201917 non-null float64
target                 201917 non-null float64
quarter                325539 non-null float64
elapsed_time           325539 non-null float64
days_feature1          325539 non-null float64
days_feature2          325539 non-null float64
days_feature3          325539 non-null float64
days_feature1_ratio    325539 non-null float64
days_feature2_ratio    325539 non-null float64
days_feature3_ratio    325539 non-null float64
feature_sum            325540 non-null float64
feature_mean           325540 non-null float64
feature_max            325540 non-null float64
feature_min            325540 non-nul

In [13]:
df = pd.merge(df, rate_new_merch_features(num_rows), on='card_id', how='outer')

Memory usage after optimization is: 6.64 MB
Decreased by 25.0%


In [14]:
df = pd.merge(df, historical_transactions(num_rows), on='card_id', how='outer')

Memory usage after optimization is: 1998.99 MB
Decreased by 69.5%
Memory usage after optimization is: 63.64 MB
Decreased by 55.1%


In [15]:
df = pd.merge(df, new_merchant_transactions(num_rows), on='card_id', how='outer')

Memory usage after optimization is: 125.43 MB
Decreased by 67.8%
Memory usage after optimization is: 53.38 MB
Decreased by 52.0%


In [19]:
df = add_ml_lags(df)

fold 0
Training until validation scores don't improve for 200 rounds.
[100]	training's rmse: 544.481	valid_1's rmse: 509.625
[200]	training's rmse: 356.934	valid_1's rmse: 336.532
[300]	training's rmse: 287.366	valid_1's rmse: 281.972
[400]	training's rmse: 251.071	valid_1's rmse: 260.425
[500]	training's rmse: 226.925	valid_1's rmse: 251.398
[600]	training's rmse: 205.259	valid_1's rmse: 244.061
[700]	training's rmse: 189.352	valid_1's rmse: 240.38
[800]	training's rmse: 174.589	valid_1's rmse: 236.684
[900]	training's rmse: 163.703	valid_1's rmse: 234.298
[1000]	training's rmse: 153.61	valid_1's rmse: 231.589
[1100]	training's rmse: 144.741	valid_1's rmse: 229.708
[1200]	training's rmse: 136.822	valid_1's rmse: 228.563
[1300]	training's rmse: 130.3	valid_1's rmse: 227.152
[1400]	training's rmse: 123.243	valid_1's rmse: 226.545
[1500]	training's rmse: 117.604	valid_1's rmse: 226.377
[1600]	training's rmse: 111.936	valid_1's rmse: 226.036
[1700]	training's rmse: 107.954	valid_1's rmse:

[200]	training's rmse: 494.686	valid_1's rmse: 366.972
[300]	training's rmse: 440.889	valid_1's rmse: 328.031
[400]	training's rmse: 402.998	valid_1's rmse: 309.559
[500]	training's rmse: 378.09	valid_1's rmse: 302.956
[600]	training's rmse: 357.927	valid_1's rmse: 300.987
[700]	training's rmse: 340.101	valid_1's rmse: 299.111
[800]	training's rmse: 324.127	valid_1's rmse: 296.911
[900]	training's rmse: 309.361	valid_1's rmse: 294.144
[1000]	training's rmse: 296.167	valid_1's rmse: 291.153
[1100]	training's rmse: 283.724	valid_1's rmse: 289.694
[1200]	training's rmse: 273.216	valid_1's rmse: 289.351
[1300]	training's rmse: 262.672	valid_1's rmse: 288.302
[1400]	training's rmse: 253.381	valid_1's rmse: 286.781
[1500]	training's rmse: 244.448	valid_1's rmse: 284.958
[1600]	training's rmse: 236.877	valid_1's rmse: 285.143
[1700]	training's rmse: 229.121	valid_1's rmse: 284.877
Early stopping, best iteration is:
[1535]	training's rmse: 242.062	valid_1's rmse: 284.609
fold 1
Training until 

In [20]:
df = additional_features(df)

  result = com._values_from_object(self).round(decimals)


In [21]:
train_df = df[df['target'].notnull()]
test_df = df[df['target'].isnull()]
del df
gc.collect()

28

In [22]:
num_folds=11
stratified = True
if stratified:
    folds = StratifiedKFold(n_splits= num_folds, shuffle=True, random_state=326)
else:
    folds = KFold(n_splits= num_folds, shuffle=True, random_state=326)    

In [18]:
train_df['target'] = 2**train_df['target']

In [None]:
# Create arrays and dataframes to store results
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in FEATS_EXCLUDED]

    # k-fold
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['outliers'])):
        train_x, train_y = train_df[feats].iloc[train_idx], train_df['target'].iloc[train_idx]
        valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['target'].iloc[valid_idx]

        # set data structure
        lgb_train = lgb.Dataset(train_x,
                                label=train_y,
                                free_raw_data=False)
        lgb_test = lgb.Dataset(valid_x,
                               label=valid_y,
                               free_raw_data=False)

        # params optimized by optuna
        params ={
                'task': 'train',
                'boosting': 'goss',
                'objective': 'regression',
                'metric': 'rmse',
                'learning_rate': 0.01,
                'subsample': 0.9855232997390695,
                'max_depth': 7,
                'top_rate': 0.9064148448434349,
                'num_leaves': 63,
                'min_child_weight': 41.9612869171337,
                'other_rate': 0.0721768246018207,
                'reg_alpha': 9.677537745007898,
                'colsample_bytree': 0.5665320670155495,
                'min_split_gain': 9.820197773625843,
                'reg_lambda': 8.2532317400459,
                'min_data_in_leaf': 21,
                'verbose': -1,
                'seed':int(2**n_fold),
                'bagging_seed':int(2**n_fold),
                'drop_seed':int(2**n_fold)
                }

        reg = lgb.train(
                        params,
                        lgb_train,
                        valid_sets=[lgb_train, lgb_test],
                        valid_names=['train', 'test'],
                        num_boost_round=10000,
                        early_stopping_rounds= 200,
                        verbose_eval=100
                        )

        oof_preds[valid_idx] = reg.predict(valid_x, num_iteration=reg.best_iteration)
        sub_preds += reg.predict(test_df[feats], num_iteration=reg.best_iteration) / folds.n_splits

        fold_importance_df = pd.DataFrame()
        fold_importance_df["feature"] = feats
        fold_importance_df["importance"] = np.log1p(reg.feature_importance(importance_type='gain', iteration=reg.best_iteration))
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d RMSE : %.6f' % (n_fold + 1, rmse(valid_y, oof_preds[valid_idx])))
        del reg, train_x, train_y, valid_x, valid_y
        gc.collect()
    
    print('LGBM RMSE: {:<8.5f}'.format(rmse(oof_preds, train_df['target'])))
    #sub_preds = np.log2(sub_preds)

Training until validation scores don't improve for 200 rounds.
[100]	train's rmse: 3.65977	test's rmse: 3.68858
[200]	train's rmse: 3.58028	test's rmse: 3.643
[300]	train's rmse: 3.53415	test's rmse: 3.62494
[400]	train's rmse: 3.49906	test's rmse: 3.61695
[500]	train's rmse: 3.47301	test's rmse: 3.61321
[600]	train's rmse: 3.45097	test's rmse: 3.61173
[700]	train's rmse: 3.43114	test's rmse: 3.61004
[800]	train's rmse: 3.41471	test's rmse: 3.60952
[900]	train's rmse: 3.39886	test's rmse: 3.6089
[1000]	train's rmse: 3.38305	test's rmse: 3.60866
[1100]	train's rmse: 3.36873	test's rmse: 3.60828
[1200]	train's rmse: 3.35424	test's rmse: 3.60803
[1300]	train's rmse: 3.33914	test's rmse: 3.60756
[1400]	train's rmse: 3.32486	test's rmse: 3.60687
[1500]	train's rmse: 3.3093	test's rmse: 3.60573
[1600]	train's rmse: 3.29518	test's rmse: 3.60466
[1700]	train's rmse: 3.2805	test's rmse: 3.60415
[1800]	train's rmse: 3.26702	test's rmse: 3.60437
[1900]	train's rmse: 3.25343	test's rmse: 3.60386
[

In [19]:
#Cat boost
import catboost as cb

oof_cb = np.zeros(train_df.shape[0])
predictions_cb = np.zeros(test_df.shape[0])

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['outliers'])):
    train_x, train_y = train_df[feats].iloc[train_idx], train_df['target'].iloc[train_idx]
    valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['target'].iloc[valid_idx]
    
    # CatBoost Regressor estimator
    model = cb.CatBoostRegressor(
        learning_rate = 0.03,
        iterations = 1000,
        eval_metric = 'RMSE',
        allow_writing_files = False,
        od_type = 'Iter',
        bagging_temperature = 0.2,
        depth = 10,
        od_wait = 20,
        silent = True
    )
    
    # Fit
    model.fit(
        train_x, train_y,
        eval_set=[(train_x, train_y), (valid_x, valid_y)],
        verbose=None,
        early_stopping_rounds=100
    )
    
    print("CB " + str(n_fold) + "-" * 50)
    
    oof_cb[valid_idx] = model.predict(valid_x)
    test_preds = model.predict(test_df[feats])
    predictions_cb += test_preds / folds.n_splits
    print('Fold %2d RMSE : %.6f' % (n_fold + 1, rmse(valid_y, oof_cb[valid_idx])))

print('Cat Boost RMSE: {:<8.5f}'.format(rmse(oof_cb, train_df['target'])))

CB 0--------------------------------------------------
Fold  1 RMSE : 3.620180
CB 1--------------------------------------------------
Fold  2 RMSE : 3.643692
CB 2--------------------------------------------------
Fold  3 RMSE : 3.655465
CB 3--------------------------------------------------
Fold  4 RMSE : 3.670002
CB 4--------------------------------------------------
Fold  5 RMSE : 3.650339
CB 5--------------------------------------------------
Fold  6 RMSE : 3.653679
CB 6--------------------------------------------------
Fold  7 RMSE : 3.621149
CB 7--------------------------------------------------
Fold  8 RMSE : 3.701807
CB 8--------------------------------------------------
Fold  9 RMSE : 3.661767
CB 9--------------------------------------------------
Fold 10 RMSE : 3.624535
CB 10--------------------------------------------------
Fold 11 RMSE : 3.686924
Cat Boost RMSE: 3.65368 


In [20]:
from scipy.optimize import minimize

def find_best_weight(preds, target):
    def _validate_func(weights):
        ''' scipy minimize will pass the weights as a numpy array '''
        final_prediction = 0
        for weight, prediction in zip(weights, preds):
                final_prediction += weight * prediction
        return np.sqrt(mean_squared_error(final_prediction, target))

    #the algorithms need a starting value, right not we chose 0.5 for all weights
    #its better to choose many random starting points and run minimize a few times
    starting_values = [0.5]*len(preds)

    #adding constraints and a different solver as suggested by user 16universe
    #https://kaggle2.blob.core.windows.net/forum-message-attachments/75655/2393/otto%20model%20weights.pdf?sv=2012-02-12&se=2015-05-03T21%3A22%3A17Z&sr=b&sp=r&sig=rkeA7EJC%2BiQ%2FJ%2BcMpcA4lYQLFh6ubNqs2XAkGtFsAv0%3D
    cons = ({'type':'eq','fun':lambda w: 1-sum(w)})
    #our weights are bound between 0 and 1
    bounds = [(0, 1)] * len(preds)
    
    res = minimize(_validate_func, starting_values, method='Nelder-Mead', bounds=bounds, constraints=cons)
    
    print('Ensemble Score: {best_score}'.format(best_score=(1-res['fun'])))
    print('Best Weights: {weights}'.format(weights=res['x']))
    
    return res

In [21]:
res = find_best_weight([oof_preds, oof_cb], train_df['target'])



Ensemble Score: -2.6483428403868827
Best Weights: [0.62306617 0.42865847]


In [22]:
#Mix submission
sub_df = pd.read_csv('D:\Ellunium\elo/sample_submission.csv')
sub_df["target"] = 0.62306617*sub_preds + 0.42865847*predictions_cb
sub_df.to_csv("D:\Ellunium\elo/submission_elo_strat_lgb_cat.csv", index=False)

In [20]:
# simple submission
sub_df = pd.read_csv('D:\Ellunium\elo/sample_submission.csv')
sub_df["target"] = sub_preds
sub_df.to_csv("D:\Ellunium\elo/submission_elo_strat_lgb_3_64489.csv", index=False)


In [22]:
#train_df = train_df.reset_index()
#test_df = test_df.reset_index()
train_df.reset_index().to_csv("D:\Ellunium\elo/train_data_clean_17_02.csv", index=False)
test_df.reset_index().to_csv("D:\Ellunium\elo/test_data_clean_17_02.csv", index=False)

In [21]:
train_df.head()

Unnamed: 0,card_id,feature_1,feature_2,feature_3,first_active_month,outliers,target,quarter,elapsed_time,days_feature1,...,price_max,duration_mean,duration_min,duration_max,amount_month_ratio_mean,amount_month_ratio_min,amount_month_ratio_max,new_CLV,hist_CLV,CLV_ratio
0,C_ID_92a2005557,0.013145,0.008752,0.011428,2017-06-01,0.0,-0.820283,2.0,626.0,3130.0,...,2300.0,2184.511719,225.0,27600.0,15.875774,1.780273,193.932297,5149.522065,414840.40625,0.012413
1,C_ID_3d0044924f,0.010712,0.011385,0.010283,2017-01-01,0.0,0.392913,1.0,777.0,3108.0,...,328.041809,1432.088135,103.870003,46880.0,8.72001,0.614258,277.765747,39.105829,936333.375,4.2e-05
2,C_ID_d639edf6cd,0.01061,0.008752,0.010283,2016-08-01,0.0,0.688056,3.0,930.0,1860.0,...,inf,870.480957,474.919983,4741.0,6.761347,3.748047,39.181995,2.818182,7159.795898,0.000394
3,C_ID_186d6a6901,0.010712,0.014166,0.010283,2017-09-01,0.0,0.142495,3.0,534.0,2136.0,...,394.747498,1441.479248,150.0,17488.791016,10.667797,1.041992,142.644547,225.463827,35165.296875,0.006412
4,C_ID_cdbd2c0db2,0.008058,0.014166,0.010283,2017-11-01,0.0,-0.159749,4.0,473.0,473.0,...,434.362854,4386.531738,66.0,67689.0,33.357674,0.458496,546.797485,14130.305261,383071.90625,0.036887
