In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm,tqdm_notebook 
import lightgbm as lgb

from sklearn.model_selection import KFold
from scipy import sparse
import warnings
import time
import sys
import os
import gc
import datetime

from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss

# Display settings: show every column of a frame and up to 100 characters
# of each cell value.
pd.set_option('display.max_columns', None)
pd.set_option('max_colwidth', 100)

# Warning filters. The blanket "ignore" registered second hides every warning
# category for the rest of the session, FutureWarning included.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")

def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    Integer columns are narrowed to the first of int8/int16/int32/int64 whose
    limits contain the column's observed min and max; float columns to the
    first of float16/float32 that does, falling back to float64. The range
    bounds are inclusive, so a column whose extremes sit exactly on a dtype's
    limits (e.g. -128..127) still receives that dtype.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are considered; everything else is left untouched.

    Note: the float check looks at magnitude only, so casting to
    float16/float32 can lose precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place (columns are reassigned).
    verbose : bool, default True
        When true, print the resulting memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: the limits themselves are representable.
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No candidate matches only when min/max are NaN (empty column);
            # the column is then left as it is.
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, infinite, or NaN extremes.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [2]:
train = pd.read_csv("train_cleaned_1.csv")
test = pd.read_csv("test_cleaned_1.csv")

# Groups of engineered columns that are excluded from the feature set.
# Each of the first three groups requires BOTH substrings to be present in the
# column name. (The first group previously tested the bare string 'new_cardf',
# which is always truthy, so it matched every 'subsector_id_cnt_' column.)
del_cols = [col for col in train.columns
            if 'subsector_id_cnt_' in col and 'new_cardf' in col]
del_cols1 = [col for col in train.columns
             if 'subsector_id_cnt_' in col and 'hist_last2_' in col]
del_cols2 = [col for col in train.columns
             if 'subsector_id_cnt_' in col and 'auth_cardf' in col]
del_cols3 = [col for col in train.columns
             if ('merchant_category_id_month_lag_nunique_' in col and '_pivot_supp' in col)
             or ('city_id' in col and '_pivot_supp' in col)
             or ('month_diff' in col and 'hist_last2_' in col)
             or 'month_diff_std' in col
             or 'month_diff_gap' in col]

# One set for O(1) membership tests instead of scanning four lists per column.
excluded_cols = set(del_cols) | set(del_cols1) | set(del_cols2) | set(del_cols3)
excluded_cols |= {'min_num', 'card_id_cnt_ht_pivot_supp'}

# Feature columns, in the frame's column order: everything that is not
# object/datetime typed, is not in an excluded group, and does not contain
# 'target' in its name (this covers 'target' and 'target1', and matches the
# effective behaviour of the earlier chained comparison
# `col != 'target' not in col`).
fea_cols = [col for col in train.columns
            if train[col].dtypes != 'object'
            and train[col].dtypes != '<M8[ns]'
            and 'target' not in col
            and col not in excluded_cols]

# Keep the selected features plus the label and id columns. 'outliers' is
# still part of fea_cols here, so it stays in train; it is removed from the
# feature list before test is subset.
train = train[fea_cols + ['target', 'card_id']]
fea_cols.remove('outliers')
test = test[fea_cols + ['card_id']]

inf_cols = [
    'new_cardf_card_id_cnt_divide_installments_nunique',
    'hist_last2_card_id_cnt_divide_installments_nunique',
]

# Replace +inf in these columns with a cap: the largest value left across both
# columns once +inf has been temporarily swapped for -99. Only +inf is handled.
# NOTE(review): the cap is computed separately for train and test, so the two
# frames may end up with different replacement values - confirm this is intended.
train_inf_cap = train[inf_cols].replace(np.inf, -99).max().max()
train[inf_cols] = train[inf_cols].replace(np.inf, train_inf_cap)

test_inf_cap = test[inf_cols].replace(np.inf, -99).max().max()
test[inf_cols] = test[inf_cols].replace(np.inf, test_inf_cap)

# ## load sparse
# train_tags = sparse.load_npz('train_tags.npz')
# test_tags  = sparse.load_npz('test_tags.npz')

## Rows flagged outliers == 0: their index labels, and the frame restricted to them
is_normal = train['outliers'] == 0
normal_index = train[is_normal].index.tolist()
ntrain = train[is_normal]

# Targets: regression on all rows, regression on non-outlier rows, and the
# binary outlier flag used as the classification label.
target = train['target'].values
ntarget = ntrain['target'].values
target_binary = train['outliers'].values
y_train, y_ntrain, y_train_binary = target, ntarget, target_binary

# Set the card ids aside and remove the id column from each frame.
id_train = train['card_id'].copy()
train = train.drop(columns='card_id')
id_ntrain = ntrain['card_id'].copy()
ntrain = ntrain.drop(columns='card_id')
id_test = test['card_id'].copy()
test = test.drop(columns='card_id')

print('train:',train.shape)
print('ntrain:',ntrain.shape)

train: (1000, 647)
ntrain: (982, 647)


In [3]:
def train_model(X, X_test, y, params, folds, model_type='lgb', eval_type='regression'):
    """Fit one LightGBM model per CV fold; return out-of-fold and averaged test predictions.

    Parameters
    ----------
    X : numpy.ndarray
        Training features; rows are selected with the integer index arrays
        produced by ``folds.split``.
    X_test : array-like
        Test features. Every fold's model predicts on it and the results are
        averaged over ``folds.n_splits``.
    y : numpy.ndarray
        Training targets aligned with the rows of ``X``.
    params : dict
        Parameters passed to ``lgb.train``.
    folds : cross-validation splitter
        Object exposing ``split(X, y)`` and ``n_splits`` (e.g. ``KFold``).
    model_type : str, default 'lgb'
        Only 'lgb' is supported.
    eval_type : str, default 'regression'
        'regression' scores each fold with RMSE, 'binary' with log loss.

    Returns
    -------
    oof : numpy.ndarray
        Out-of-fold predictions, one per row of ``X``.
    predictions : numpy.ndarray
        Fold-averaged predictions for ``X_test``.
    scores : list of float
        Validation score of each fold.

    Raises
    ------
    ValueError
        If ``model_type`` or ``eval_type`` is not supported. Previously an
        unknown ``model_type`` silently returned all-zero predictions and an
        unknown ``eval_type`` produced an empty score list.
    """
    if model_type != 'lgb':
        raise ValueError("Unsupported model_type {!r}; only 'lgb' is implemented.".format(model_type))
    if eval_type not in ('regression', 'binary'):
        raise ValueError("Unsupported eval_type {!r}; expected 'regression' or 'binary'.".format(eval_type))

    oof = np.zeros(X.shape[0])
    predictions = np.zeros(X_test.shape[0])
    scores = []
    # Passing the full label set keeps log_loss usable when a validation fold
    # happens to contain a single class (possible with rare positives).
    class_labels = np.unique(y) if eval_type == 'binary' else None

    for fold_n, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
        print('Fold', fold_n, 'started at', time.ctime())

        trn_data = lgb.Dataset(X[trn_idx], y[trn_idx])
        val_data = lgb.Dataset(X[val_idx], y[val_idx])
        clf = lgb.train(params, trn_data, num_boost_round=20000,
                        valid_sets=[trn_data, val_data],
                        callbacks=[lgb.early_stopping(stopping_rounds=500), lgb.log_evaluation(200)])
        oof[val_idx] = clf.predict(X[val_idx], num_iteration=clf.best_iteration)
        predictions += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits
        # Running sum of the per-fold contributions; it is the final average
        # only after the last fold.
        print(predictions)

        if eval_type == 'regression':
            scores.append(mean_squared_error(y[val_idx], oof[val_idx])**0.5)
        else:
            scores.append(log_loss(y[val_idx], oof[val_idx], labels=class_labels))

    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

    return oof, predictions, scores

In [4]:
#### lgb
# Hyper-parameters for the LightGBM regression runs.
# 'min_child_samples' is a LightGBM alias of 'min_data_in_leaf'. The dict used
# to carry both with conflicting values (32 vs 20); only the primary name is
# kept. (LightGBM is expected to prefer the primary name when both are given,
# so the effective value should be unchanged - verify against the LightGBM docs.)
lgb_params = {'num_leaves': 64,
              'min_data_in_leaf': 32,
              'objective': 'regression',
              'max_depth': -1,
              'learning_rate': 0.01,
              "boosting": "gbdt",
              "feature_fraction": 0.5,
              "bagging_freq": 1,
              "bagging_fraction": 0.9,
              "bagging_seed": 11,
              "metric": 'rmse',
              "lambda_l1": 0.1,
              "verbosity": -1}
folds = KFold(n_splits=2, shuffle=True, random_state=42)

# Plain numpy feature matrices: train_model indexes rows with integer arrays.
X_ntrain = ntrain[fea_cols].values
X_train  = train[fea_cols].values
X_test   = test[fea_cols].values

print('='*10,'Regression Models','='*10)
oof_lgb , predictions_lgb , scores_lgb  = train_model(X_train , X_test, y_train, params=lgb_params, folds=folds, model_type='lgb', eval_type='regression')
predictions_lgb_df = pd.DataFrame({'card_id': id_test.to_list(), 'prediction': predictions_lgb})


Fold 0 started at Thu Apr 25 17:58:36 2024
Training until validation scores don't improve for 500 rounds
[200]	training's rmse: 3.53707	valid_1's rmse: 5.06596
[400]	training's rmse: 3.053	valid_1's rmse: 5.07098
[600]	training's rmse: 2.67511	valid_1's rmse: 5.08738
Early stopping, best iteration is:
[148]	training's rmse: 3.68434	valid_1's rmse: 5.06317
[-1.00648293e+00 -3.31311079e-01  1.17471678e-02  1.67028132e-02
 -2.88913344e-01 -7.00643197e-02 -9.09981424e-02  5.76093891e-01
 -7.54969455e-01 -5.21876898e-01 -1.43836819e-01 -1.28786904e-01
  1.39444239e-02 -2.64709293e-01 -1.09650296e+00 -1.56743597e-01
  4.45197180e-01 -4.42590060e-02 -3.08173509e-01 -3.13255437e-01
  4.38603936e-02 -6.54442980e-01  2.25936739e-01  7.01421532e-02
 -5.53454755e-02 -1.63785172e-01 -2.62259206e-01 -8.05899156e-04
 -8.06936809e-02 -5.18703275e-02 -2.15456128e-01  4.15727971e-01
 -1.57552871e-01  2.81490131e-01 -2.45114300e-01 -3.07908254e-01
  3.56100189e-01 -6.53656367e-02 -4.71923888e-01  1.07870

In [5]:
print('='*10,'without outliers Regression Models','='*10)
# Same parameters and folds as the regression run, fitted on X_ntrain / y_ntrain.
oof_nlgb, predictions_nlgb, scores_nlgb = train_model(
    X_ntrain,
    X_test,
    y_ntrain,
    params=lgb_params,
    folds=folds,
    model_type='lgb',
    eval_type='regression',
)
# Pair every test card id with its prediction.
predictions_nlgb_df = pd.DataFrame({
    'card_id': id_test.to_list(),
    'prediction': predictions_nlgb,
})


Fold 0 started at Thu Apr 25 17:58:45 2024
Training until validation scores don't improve for 500 rounds
[200]	training's rmse: 1.19635	valid_1's rmse: 1.6322
[400]	training's rmse: 0.919992	valid_1's rmse: 1.65253
[600]	training's rmse: 0.725478	valid_1's rmse: 1.67494
Early stopping, best iteration is:
[125]	training's rmse: 1.33324	valid_1's rmse: 1.62615
[-5.80095239e-02 -2.31322521e-01 -1.24231516e-01 -1.55348380e-01
 -2.70813469e-01 -8.10236633e-03  5.42099549e-02  4.01496412e-01
 -2.20329762e-01 -3.57817383e-02 -6.04508110e-02  1.54500985e-01
 -3.23265215e-02 -4.46612583e-02  1.48719668e-01 -2.70974041e-02
  1.95173280e-01 -6.49418097e-02  1.57006054e-01  6.13775249e-04
  1.59439077e-01 -3.21142535e-01  3.56793422e-01 -1.00910267e-01
 -9.03822679e-02 -3.63741340e-02 -3.25846665e-01 -7.35224888e-02
  5.44205017e-02 -7.23257153e-02 -7.76996367e-02  2.12777927e-01
 -5.16532046e-02  2.53946906e-01 -6.67308567e-02 -3.45234811e-02
  3.81367728e-01  4.49435856e-02 -1.22170274e-01  1.31

In [6]:
print('='*10,'Classification Model','='*10)
# Derive the classifier settings as a separate dict instead of overwriting
# lgb_params in place. Mutating the shared dict meant that re-running an
# earlier regression cell after this one would silently train with the
# binary objective.
lgb_params_binary = {**lgb_params, 'objective': 'binary', 'metric': 'binary_logloss'}
oof_blgb, predictions_blgb, scores_blgb = train_model(X_train , X_test, y_train_binary, params=lgb_params_binary, folds=folds, model_type='lgb', eval_type='binary')
predictions_blgb_df = pd.DataFrame({'card_id': id_test.to_list(), 'prediction': predictions_blgb})


Fold 0 started at Thu Apr 25 17:58:51 2024
Training until validation scores don't improve for 500 rounds
[200]	training's binary_logloss: 0.0125824	valid_1's binary_logloss: 0.108492
[400]	training's binary_logloss: 0.00410864	valid_1's binary_logloss: 0.127385
Early stopping, best iteration is:
[88]	training's binary_logloss: 0.0311897	valid_1's binary_logloss: 0.100848
[0.01329832 0.0065501  0.00432725 0.00314209 0.00429091 0.00487395
 0.00354097 0.00467285 0.00867151 0.0084037  0.00455166 0.00709618
 0.00361172 0.0056171  0.03959124 0.00548294 0.00362278 0.00403977
 0.00753188 0.00553793 0.00360213 0.00900958 0.00412616 0.00354715
 0.0051403  0.00704122 0.00380112 0.00360584 0.00499038 0.00308034
 0.00368735 0.00307877 0.00334041 0.00412493 0.00473059 0.00553956
 0.00359783 0.00385113 0.00705008 0.00403639 0.00375511 0.0037936
 0.00520942 0.00352753 0.00356114 0.00516521 0.00357669 0.00467027
 0.00346668 0.00403889 0.00654752 0.0047077  0.00693192 0.00329085
 0.00945703 0.00337994 0

In [12]:
# Join the three per-card prediction tables on card_id.
# The first merge renames the two clashing 'prediction' columns to
# 'prediction_blgb' and 'prediction_nlgb'. In the second merge no non-key
# column names overlap, so its suffixes are not applied and the column from
# predictions_lgb_df keeps the plain name 'prediction'.
merged_df = predictions_blgb_df.merge(
    predictions_nlgb_df, on='card_id', suffixes=('_blgb', '_nlgb'), how='left'
).merge(
    predictions_lgb_df, on='card_id', suffixes=('_nlgb', '_lgb'), how='left'
)
print(merged_df.head(5))

# Pick the final value per card: where the classifier score is at least 0.8
# take 'prediction', otherwise take 'prediction_nlgb'.
use_full_model = merged_df['prediction_blgb'] >= 0.8
merged_df['final_prediction'] = np.where(
    use_full_model,
    merged_df['prediction'],
    merged_df['prediction_nlgb'],
)

# Only card_id and final_prediction are kept.
merged_df = merged_df.drop(columns=['prediction_blgb', 'prediction_nlgb', 'prediction'])

           card_id  prediction_blgb  prediction_nlgb  prediction
0  C_ID_0ab67a22ab         0.018093        -0.224010   -1.283473
1  C_ID_130fd0cbdd         0.009651        -0.345745   -0.660612
2  C_ID_b709037bc5         0.009048        -0.483164   -0.704222
3  C_ID_d27d835a9f         0.006722        -0.109869   -0.205514
4  C_ID_2b5e3df5c2         0.011024        -0.622492   -0.964870


In [13]:
sub_df = pd.read_csv('./elo-merchant-category-recommendation/sample_submission.csv')

# Attach predictions by card_id rather than by row position. A plain column
# assignment aligns on the integer index, which is only correct if the
# submission template lists the cards in exactly the same order as the test
# frame.
# NOTE(review): assumes sample_submission.csv has a 'card_id' column and that
# card ids in merged_df are unique - confirm against the data.
prediction_by_card = merged_df.set_index('card_id')['final_prediction']
sub_df["target"] = sub_df['card_id'].map(prediction_by_card)

sub_df.to_csv('predictions_lgb_cleaned.csv', index=False)