# Libraries

In [1]:
!pip install lightgbm
import numpy as np
import pandas as pd
import gc
import time
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV


# Widen the pandas display limits so wide/long frames are shown without truncation,
# and print floats with three decimals.
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns',700)
pd.set_option('display.float_format', lambda x: '%.3f' % x)


# Silence deprecation, future and user warnings for the rest of the notebook.
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.filterwarnings("ignore", category=FutureWarning) 
warnings.filterwarnings("ignore", category=UserWarning) 

Collecting lightgbm
  Downloading https://files.pythonhosted.org/packages/1f/cb/a8ec24334c35a7d0c87b4e4e056bd2137573c7c1bd81c760b79a2f370254/lightgbm-2.3.1-py2.py3-none-win_amd64.whl (544kB)
Installing collected packages: lightgbm
Successfully installed lightgbm-2.3.1


# Helper Functions

In [2]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to smaller dtypes, in place.

    Integer columns are cast to the narrowest of int8/int16/int32/int64 whose
    range contains the column's min and max (bounds inclusive). Float columns
    are cast to float32 when min and max fit the float32 range, otherwise to
    float64; float16 is deliberately not used as a target. Columns whose dtype
    is not listed in ``numerics`` are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are reassigned in place.
    verbose : bool, default True
        When true, print the resulting size and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient reassignment.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Ordered narrowest first so the first match is the smallest fitting dtype.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float32_info = np.finfo(np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                bounds = np.iinfo(candidate)
                # Inclusive comparison: a value equal to the dtype's own
                # min/max still fits (e.g. 127 is a valid int8).
                if c_min >= bounds.min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NaN or infinite extremes fail both comparisons and fall through
            # to float64.
            if c_min >= float32_info.min and c_max <= float32_info.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Avoid dividing by zero for a frame that occupies no memory.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df


def display_importances(feature_importance_df_):
    """Plot the 100 features with the highest mean importance and save the figure.

    ``feature_importance_df_`` must have ``feature`` and ``importance``
    columns (one row per feature per fold). The features are ranked by their
    mean importance, every row belonging to a top-100 feature is drawn as a
    horizontal bar plot, and the figure is written to
    ``lgbm_importances01.png``.
    """
    mean_importance = (
        feature_importance_df_[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)
    )
    top_features = mean_importance.head(100).index

    is_top_feature = feature_importance_df_.feature.isin(top_features)
    best_features = feature_importance_df_.loc[is_top_feature]
    ranked_rows = best_features.sort_values(by="importance", ascending=False)

    plt.figure(figsize=(15, 20))
    sns.barplot(x="importance", y="feature", data=ranked_rows)
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances01.png')
    
    

# Loading Data

In [3]:
# Read the raw transaction and identity tables for both splits.
train_transaction = pd.read_csv("data/train_transaction.csv")
test_transaction = pd.read_csv("data/test_transaction.csv")
train_identity = pd.read_csv("data/train_identity.csv")
test_identity = pd.read_csv("data/test_identity.csv")

# Give the test identity columns the train identity column names, matched by position.
fix_col_name = dict(zip(test_identity.columns, train_identity.columns))
test_identity = test_identity.rename(columns=fix_col_name)

# Downcast numeric columns before merging to keep memory usage down.
train_transaction = reduce_mem_usage(train_transaction)
train_identity = reduce_mem_usage(train_identity)
test_transaction = reduce_mem_usage(test_transaction)
test_identity = reduce_mem_usage(test_identity)

# Left-join identity onto transaction so every transaction row is kept.
train_temp = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')
test_temp = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')

# Stack train and test into one frame so later feature engineering is applied to both.
train_test_temp = pd.concat((train_temp, test_temp), ignore_index=True)

print(f'train dataset has {train_temp.shape[0]} rows and {train_temp.shape[1]} columns.')
print(f'test dataset has {test_temp.shape[0]} rows and {test_temp.shape[1]} columns.')

# Release the per-table frames now that the merged copies exist.
del train_transaction, train_identity, test_transaction, test_identity
x = gc.collect()

FileNotFoundError: [Errno 2] File b'data/train_transaction.csv' does not exist: b'data/train_transaction.csv'

In [None]:
# Work on a copy so the merged frame loaded above stays available unchanged.
train_test = train_test_temp.copy()

# Processing

In [None]:
# Registries of column names; the processing cells append to them.
drop_col_list, LE_col_list, OHE_col_list = [], [], []
# To be used for categorical variables with many classes.
LGBM_cat_col_list = list()
cat_cols = (
    ['ProductCD']
    + [f'card{i}' for i in range(1, 7)]
    + ['addr1', 'addr2', 'P_emaildomain', 'R_emaildomain']
    + [f'M{i}' for i in range(1, 10)]
    + ['DeviceType', 'DeviceInfo']
    + [f'id_{i}' for i in range(12, 39)]
)

In [None]:
# Process TransactionDT: derive calendar features from it.
# Encoding strategy still to be decided: one-hot encoding could be applied
# to the newly derived variables.
import datetime
START_DATE = '2019-04-22'
startdate = datetime.datetime.strptime(START_DATE, "%Y-%m-%d")
# TransactionDT is treated as a number of seconds added to START_DATE.
# The offset is computed for the whole column at once instead of calling a
# Python lambda per row.
train_test['NewDate'] = pd.to_timedelta(train_test['TransactionDT'], unit='s') + startdate
# Year/month/day joined with '-' (components are not zero-padded).
train_test['NewDate_YMD'] = train_test['NewDate'].dt.year.astype(str) + '-' + train_test['NewDate'].dt.month.astype(str) + '-' + train_test['NewDate'].dt.day.astype(str)
train_test['NewDate_YearMonth'] = train_test['NewDate'].dt.year.astype(str) + '-' + train_test['NewDate'].dt.month.astype(str)
train_test['NewDate_Weekday'] = train_test['NewDate'].dt.dayofweek
train_test['NewDate_Hour'] = train_test['NewDate'].dt.hour
train_test['NewDate_Day'] = train_test['NewDate'].dt.day

# Guarded so that re-running this cell does not register the column twice.
if "TransactionDT" not in drop_col_list:
    drop_col_list.append("TransactionDT")

In [None]:
# TransactionAmt: keep the fractional part of the amount as its own feature
# and bucket the amount into 10 quantile bins.
train_test['New_Cents'] = (
    train_test['TransactionAmt']
    .sub(np.floor(train_test['TransactionAmt']))
    .astype('float32')
)
train_test['New_TransactionAmt_Bin'] = pd.qcut(train_test['TransactionAmt'], q=10)

LE_col_list += ["New_TransactionAmt_Bin"]

In [None]:
# ProductCD --> 5 distinct values and no missing values; it will be one-hot
# encoded (frequency encoding could also be applied).
OHE_col_list.append('ProductCD')

In [None]:
# cardX
# ENCODING STRATEGY:
# The card values are categorical --> frequency encoding can be applied,
#                                     they can be declared as categorical to LightGBM,
#                                     label encoding can be applied,
#                                     one-hot encoding can be applied,
#                                     numeric ones can be left as they are.
# Columns with more than 100 categories will get either label encoding or
# LightGBM categorical handling.
# New variables can be built from card1 - card2 - addr1 - addr2 combinations
# and the encodings above tried on them.
# Number of distinct values per column:
# card1 - 13553
# card2 - 500
# card3 - 114
# card4 - 4
# card5 - 119
# card6 - 4
# NaN STRATEGY: !!! TO BE DECIDED.


LGBM_cat_col_list.extend(['card1', 'card2', 'card3', 'card5'])
OHE_col_list.extend(['card4', 'card6'])

In [None]:
# addr1 & addr2 --> categorical variables; addr1 has 332 and addr2 has 74 distinct classes.
# ENCODING STRATEGY: label encoding, or leaving them as they are, can both be tried.
# NaN STRATEGY: !!! TO BE DECIDED.
# FE STRATEGY: new variables can be derived from combinations of card1, card2, addr1 and addr2.


LGBM_cat_col_list.extend(['addr1', 'addr2'])

In [None]:
# dist1 & dist2 --> numeric variables, no encoding needed.
# NaN STRATEGY: !!! TO BE DECIDED.


In [None]:
# P_emaildomain & R_emaildomain --> categorical variables.
#  NaN STRATEGY: missing values are labelled "Unknown".
#  CLASS REDUCTION: domains are grouped by provider and the rare ones are
#  labelled "Others".

# Provider label -> raw domain values that are collapsed into it.
EMAIL_PROVIDER_DOMAINS = (
    ('Google', ['gmail.com', 'gmail']),
    ('Yahoo', ['yahoo.com', 'yahoo.com.mx',  'yahoo.co.uk','yahoo.co.jp', 'yahoo.de', 'yahoo.fr','yahoo.es']),
    ('Microsoft', ['hotmail.com','outlook.com','msn.com', 'live.com.mx', 'hotmail.es','hotmail.co.uk', 'hotmail.de','outlook.es', 'live.com', 'live.fr','hotmail.fr']),
)

# (column, maximum count at which a value is still considered rare)
for email_col, rare_max_count in (('P_emaildomain', 500), ('R_emaildomain', 300)):
    for provider, domains in EMAIL_PROVIDER_DOMAINS:
        train_test.loc[train_test[email_col].isin(domains), email_col] = provider

    # Counts are taken after provider grouping, so the grouped labels are counted too.
    domain_counts = train_test[email_col].value_counts()
    rare_domains = domain_counts[domain_counts <= rare_max_count].index
    train_test.loc[train_test[email_col].isin(rare_domains), email_col] = "Others"

    # Assign the result back instead of calling fillna(inplace=True) on a
    # column selection, which does not update the frame under pandas
    # copy-on-write.
    train_test[email_col] = train_test[email_col].fillna("Unknown")

    # Guarded so that re-running this cell does not register the column twice.
    if email_col not in OHE_col_list:
        OHE_col_list.append(email_col)


In [None]:
# C1 - C14 columns --> only 3 NaN values in each.
# They are all numeric, so they can be fed to the model directly.
# Show the rows where C14 is missing.
train_test[train_test['C14'].isnull()]

In [None]:
# D1 - D15 columns --> time deltas; there are negative values.
# Chris's notebook normalises them (see the commented-out code below);
# for now they are left as they are.

#for i in range(1,16):
#    if i in [1,2,3,5,9]: continue
#    X_train['D'+str(i)] =  X_train['D'+str(i)] - X_train.TransactionDT/np.float32(24*60*60)
#    X_test['D'+str(i)] = X_test['D'+str(i)] - X_test.TransactionDT/np.float32(24*60*60) 

In [None]:
# M1 - M9: register each column for label encoding, then show the registry.
for i in range(1, 10):
    LE_col_list += [f'M{i}']

LE_col_list


In [None]:
# V1 - V339
# PCA will be tried in a separate kernel; for now they are left as they are.

In [None]:
# id1 - id38
# There is a lot of missing data ??? more than 800 thousand.
# id12-id38 are categorical: the plan is LightGBM categorical handling for those
# with more than 100 classes, one-hot encoding for those with fewer than 10 and
# label encoding for the rest; this may change later.
# Of course LightGBM can only be applied to numeric ones.

id_cols = [name for name in train_test if name.startswith('id')]
# train_test[id_cols].nunique()
# train_test[id_cols].isnull().sum()

#OHE_col_list.append()
#LE_col_list.append(id_cols)
#LGBM_cat_col_list.append()

# For now every column whose name starts with 'id' is registered for label encoding.
for col in id_cols:
    LE_col_list += [col]

In [None]:
# DeviceType --> 2 distinct values; label encoding will be applied.
# Show how often each value occurs.
train_test.DeviceType.value_counts()

In [None]:
# DeviceInfo --> 2799 distinct values, categorical; label encoding will be applied.
# The class count could be reduced by grouping (samsung, ios, ... others).
LE_col_list.append('DeviceInfo')

In [None]:
# Show the columns registered for label encoding so far.
LE_col_list

# Encoding

In [None]:
# Show every column currently in the combined frame.
train_test.columns

In [None]:
# Show the columns registered for dropping.
drop_col_list

In [None]:
# Replace NewDate with the float cast of its underlying array so the column is numeric.
# NOTE(review): the resulting unit depends on the datetime resolution of the column - confirm.
train_test['NewDate']=train_test['NewDate'].values.astype(float)

In [None]:
# Label-encode every object-dtype column; values are cast to str first and a
# fresh encoder is fitted per column on the combined train+test values.
for col in train_test.columns:
    if train_test[col].dtype != 'object':
        continue
    le = LabelEncoder()
    train_test[col] = le.fit_transform(list(train_test[col].astype(str).values))

In [None]:
# Check the column dtypes after label encoding.
train_test.dtypes

In [None]:
# Show the columns registered as LightGBM categorical candidates.
LGBM_cat_col_list

# Modelling 

In [None]:
def modeling(train_test,target):
    """Train LightGBM with 10-fold CV, write a submission file and plot importances.

    Rows of ``train_test`` where ``target`` is not null form the training set;
    rows where it is null form the test set. For every fold a classifier is
    fitted, out-of-fold probabilities are stored for the validation rows, and
    the test-set probabilities are averaged over the folds. The averaged test
    predictions are written to ``submission_lightgbm.csv`` together with
    ``TransactionID``.

    Parameters
    ----------
    train_test : pandas.DataFrame
        Combined train and test rows; must contain ``TransactionID`` and ``target``.
    target : str
        Name of the label column.

    Returns
    -------
    pandas.DataFrame
        One row per feature per fold with columns ``feature``, ``importance``
        and ``fold``.
    """
    train = train_test[train_test[target].notnull()]
    test = train_test[train_test[target].isnull()]

    folds = KFold(n_splits = 10, shuffle = True, random_state = 1001)

    oof_preds = np.zeros(train.shape[0])
    sub_preds = np.zeros(test.shape[0])

    # Identifier, label and the two helper columns are not used as model inputs.
    features = [f for f in train.columns if f not in [target,'TransactionID','New_TransactionAmt_Bin','NewDate']]

    # Slice the feature matrices once instead of re-selecting columns in every fold.
    X_all, y_all = train[features], train[target]
    X_test = test[features]

    fold_importances = []

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X_all, y_all)):

        print('Training on fold {}'.format(n_fold + 1))

        X_train, y_train = X_all.iloc[train_idx], y_all.iloc[train_idx]
        X_valid, y_valid = X_all.iloc[valid_idx], y_all.iloc[valid_idx]

        # NOTE(review): n_estimators is not set, so the library default applies
        # (100 for LightGBM's sklearn API - confirm). If so, early stopping
        # with a patience of 200 rounds can never trigger.
        clf = LGBMClassifier(num_leaves =  256,
                             min_child_samples= 79,
                             objective = 'binary',
                             max_depth = 13,
                             learning_rate= 0.03,
                             boosting_type= "gbdt",
                             subsample_freq= 3,
                             subsample= 0.9,
                             bagging_seed= 11,
                             metric='auc',
                             verbosity= -1,
                             reg_alpha= 0.3,
                             reg_lambda= 0.3,
                             colsample_bytree= 0.9)
                             #categorical_feature = LGBM_cat_col_list)

        clf.fit(X_train, y_train, eval_set = [(X_train, y_train), (X_valid, y_valid)],
                eval_metric = 'auc', verbose = 200, early_stopping_rounds = 200)

        # Out-of-fold predictions for this fold's validation rows.
        oof_preds[valid_idx] = clf.predict_proba(X_valid, num_iteration=clf.best_iteration_)[:, 1]
        # Each fold contributes an equal share to the averaged test prediction.
        sub_preds += clf.predict_proba(X_test, num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

        fold_importances.append(pd.DataFrame({
            "feature": features,
            "importance": clf.feature_importances_,
            "fold": n_fold + 1,
        }))

        print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(y_valid, oof_preds[valid_idx])))

    print('Full AUC score %.6f' % roc_auc_score(y_all, oof_preds))

    # Build the submission as its own frame rather than assigning into ``test``,
    # which is a filtered slice of the caller's frame.
    submission = test[['TransactionID']].assign(**{target: sub_preds})
    submission.to_csv("submission_lightgbm.csv", index= False)

    # Concatenate once after the loop instead of growing a frame fold by fold.
    feature_importance_df = pd.concat(fold_importances, axis=0)

    display_importances(feature_importance_df)

    return feature_importance_df


In [None]:
# Downcast numeric dtypes again now that the engineered columns have been added.
train_test = reduce_mem_usage(train_test)

In [None]:
# Train and evaluate on the 'isFraud' target; rows where it is null are treated as the test set.
modeling(train_test,'isFraud')

In [None]:
# FREQUENCY ENCODE TOGETHER
def encode_FE(df1, df2, cols):
    """Add frequency-encoded ``<col>_FE`` columns to both frames, in place.

    For each column in ``cols`` the relative frequency of every value is
    computed over ``df1`` and ``df2`` together (missing values are not
    counted). The value ``-1`` is always mapped to ``-1`` rather than to a
    frequency. The new columns are stored as float32. Nothing is returned.
    """
    for col in cols:
        combined = pd.concat([df1[col], df2[col]])
        frequency_of = combined.value_counts(dropna=True, normalize=True).to_dict()
        frequency_of[-1] = -1
        encoded_name = col + '_FE'
        for frame in (df1, df2):
            frame[encoded_name] = frame[col].map(frequency_of)
            frame[encoded_name] = frame[encoded_name].astype('float32')

In [None]:
# FREQUENCY ENCODE TOGETHER
def encode_FE(df, cols):
    """Add frequency-encoded ``<col>_FE`` columns to ``df``, in place.

    For each column in ``cols`` the relative frequency of every value within
    that column is computed (missing values are not counted and map to NaN).
    The value ``-1`` is always mapped to ``-1`` rather than to a frequency.
    The new columns are stored as float32. Nothing is returned.

    NOTE(review): this reuses the name of the two-frame
    ``encode_FE(df1, df2, cols)`` defined in the previous cell and shadows it
    once this cell has run.
    """
    for col in cols:
        # Count values of this column only, not rows of the whole frame.
        vc = df[col].value_counts(dropna=True, normalize=True).to_dict()
        vc[-1] = -1
        nm = col+'_FE'
        df[nm] = df[col].map(vc).astype('float32')