In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm,tqdm_notebook 
import lightgbm as lgb

from sklearn.model_selection import KFold
from scipy import sparse
import warnings
import time
import sys
import os
import gc
import datetime

from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns',None)
pd.set_option('max_colwidth',100)

def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [2]:
train = pd.read_csv("train_cleaned.csv")
test =pd.read_csv("test_cleaned.csv")

del_cols = []
for col in train.columns:
    if 'subsector_id_cnt_' in col and 'new_cardf': 
        del_cols.append(col)
del_cols1 = []
for col in train.columns:
    if 'subsector_id_cnt_' in col and 'hist_last2_' in col:
        del_cols1.append(col)
del_cols2 = []
for col in train.columns:
    if 'subsector_id_cnt_' in col and 'auth_cardf' in col:
        del_cols2.append(col)
del_cols3 = []
for col in train.columns:
    if 'merchant_category_id_month_lag_nunique_' in col and '_pivot_supp' in col:
        del_cols3.append(col)
    if 'city_id' in col and '_pivot_supp' in col:
        del_cols3.append(col)
    if 'month_diff' in col and 'hist_last2_' in col:
        del_cols3.append(col)
    if 'month_diff_std' in col or 'month_diff_gap' in col:
        del_cols3.append(col) 
fea_cols = [col for col in train.columns if train[col].dtypes!='object' and train[col].dtypes != '<M8[ns]' and col!='target' not in col and col!='min_num'\
            and col not in del_cols and col not in del_cols1 and col not in del_cols2 and col!='target1' and col!='card_id_cnt_ht_pivot_supp'  and col not in del_cols3] 

train = train[fea_cols+['target']+['card_id']]
fea_cols.remove('outliers')
test = test[fea_cols+['card_id']]

inf_cols = ['new_cardf_card_id_cnt_divide_installments_nunique', 'hist_last2_card_id_cnt_divide_installments_nunique']
train[inf_cols] = train[inf_cols].replace(np.inf, train[inf_cols].replace(np.inf, -99).max().max())

test[inf_cols] = test[inf_cols].replace(np.inf, test[inf_cols].replace(np.inf, -99).max().max())

# ## load sparse
# train_tags = sparse.load_npz('train_tags.npz')
# test_tags  = sparse.load_npz('test_tags.npz')

## Get the index of non-exceptional values
normal_index = train[train['outliers']==0].index.tolist()
## without outliers
ntrain = train[train['outliers'] == 0]

target        = train['target'].values
ntarget       = ntrain['target'].values
target_binary = train['outliers'].values
###
y_train        = target
y_ntrain       = ntarget
y_train_binary = target_binary

id_train = train['card_id'].copy(); train.drop('card_id', axis = 1, inplace = True)
id_ntrain = ntrain['card_id'].copy(); ntrain.drop('card_id', axis = 1, inplace = True)
id_test = test['card_id'].copy(); test.drop('card_id', axis = 1, inplace = True)

print('train:',train.shape)
print('ntrain:',ntrain.shape)

train: (201917, 647)
ntrain: (199710, 647)


In [3]:
def train_model(X, X_test, y, params, folds, model_type='lgb', eval_type='regression'):
    oof = np.zeros(X.shape[0])
    predictions = np.zeros(X_test.shape[0])
    scores = []
    for fold_n, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
        print('Fold', fold_n, 'started at', time.ctime())
        
        if model_type == 'lgb':
            trn_data = lgb.Dataset(X[trn_idx], y[trn_idx])
            val_data = lgb.Dataset(X[val_idx], y[val_idx])
            clf = lgb.train(params, trn_data, num_boost_round=20000, 
                            valid_sets=[trn_data, val_data], 
                            callbacks=[lgb.early_stopping(stopping_rounds=500), lgb.log_evaluation(200)])
            oof[val_idx] = clf.predict(X[val_idx], num_iteration=clf.best_iteration)
            predictions += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits
        print(predictions)

        if eval_type == 'regression':
            scores.append(mean_squared_error(oof[val_idx], y[val_idx])**0.5)
        if eval_type == 'binary':
            scores.append(log_loss(y[val_idx], oof[val_idx]))
        
    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))
    
    return oof, predictions, scores

In [4]:
#### lgb
lgb_params = {'num_leaves': 64,
             'min_data_in_leaf': 32, 
             'objective':'regression',
             'max_depth': -1,
             'learning_rate': 0.01,
             "min_child_samples": 20,
             "boosting": "gbdt",
             "feature_fraction": 0.5,
             "bagging_freq": 1,
             "bagging_fraction": 0.9,
             "bagging_seed": 11,
             "metric": 'rmse',
             "lambda_l1": 0.1,
             "verbosity": -1}
folds = KFold(n_splits=2, shuffle=True, random_state=42)
X_ntrain = ntrain[fea_cols].values
X_train  = train[fea_cols].values
X_test   = test[fea_cols].values
print('='*10,'Regression Models','='*10)
oof_lgb , predictions_lgb , scores_lgb  = train_model(X_train , X_test, y_train, params=lgb_params, folds=folds, model_type='lgb', eval_type='regression')
predictions_lgb_df = pd.DataFrame({'card_id': id_test.to_list(), 'prediction': predictions_lgb})


Fold 0 started at Thu Apr 25 18:13:53 2024
Training until validation scores don't improve for 500 rounds
[200]	training's rmse: 3.32844	valid_1's rmse: 3.6968
[400]	training's rmse: 3.13264	valid_1's rmse: 3.68059
[600]	training's rmse: 3.00495	valid_1's rmse: 3.67797
[800]	training's rmse: 2.90065	valid_1's rmse: 3.67806
[1000]	training's rmse: 2.81152	valid_1's rmse: 3.67871
[1200]	training's rmse: 2.72918	valid_1's rmse: 3.67991
Early stopping, best iteration is:
[721]	training's rmse: 2.93927	valid_1's rmse: 3.67768
[-0.7612198  -0.22311361 -0.405374   ...  0.333631   -1.44950372
  0.08461484]
Fold 1 started at Thu Apr 25 18:14:55 2024
Training until validation scores don't improve for 500 rounds
[200]	training's rmse: 3.38587	valid_1's rmse: 3.63928
[400]	training's rmse: 3.19065	valid_1's rmse: 3.62474
[600]	training's rmse: 3.05876	valid_1's rmse: 3.62366
[800]	training's rmse: 2.95027	valid_1's rmse: 3.62402
[1000]	training's rmse: 2.85919	valid_1's rmse: 3.62448
Early stopping

In [5]:
print('='*10,'without outliers Regression Models','='*10)
oof_nlgb, predictions_nlgb, scores_nlgb = train_model(X_ntrain, X_test, y_ntrain, params=lgb_params, folds=folds, model_type='lgb', eval_type='regression')
predictions_nlgb_df = pd.DataFrame({'card_id': id_test.to_list(), 'prediction': predictions_nlgb})


Fold 0 started at Thu Apr 25 18:15:55 2024
Training until validation scores don't improve for 500 rounds
[200]	training's rmse: 1.5358	valid_1's rmse: 1.5576
[400]	training's rmse: 1.4879	valid_1's rmse: 1.5459
[600]	training's rmse: 1.45363	valid_1's rmse: 1.54362
[800]	training's rmse: 1.42462	valid_1's rmse: 1.54304
[1000]	training's rmse: 1.39871	valid_1's rmse: 1.54297
[1200]	training's rmse: 1.37448	valid_1's rmse: 1.54293
[1400]	training's rmse: 1.35129	valid_1's rmse: 1.54313
[1600]	training's rmse: 1.32902	valid_1's rmse: 1.54324
Early stopping, best iteration is:
[1136]	training's rmse: 1.3822	valid_1's rmse: 1.5429
[-0.09294421 -0.12536726 -0.18594042 ...  0.42438142 -0.28133206
  0.08904922]
Fold 1 started at Thu Apr 25 18:17:28 2024
Training until validation scores don't improve for 500 rounds
[200]	training's rmse: 1.52048	valid_1's rmse: 1.57509
[400]	training's rmse: 1.47417	valid_1's rmse: 1.56356
[600]	training's rmse: 1.44067	valid_1's rmse: 1.56088
[800]	training's 

In [None]:
print('='*10,'Classification Model','='*10)
lgb_params['objective'] = 'binary'
lgb_params['metric']    = 'binary_logloss'
oof_blgb, predictions_blgb, scores_blgb = train_model(X_train , X_test, y_train_binary, params=lgb_params, folds=folds, model_type='lgb', eval_type='binary')
predictions_blgb_df = pd.DataFrame({'card_id': id_test.to_list(), 'prediction': predictions_blgb})


Fold 0 started at Thu Apr 25 18:19:04 2024
Training until validation scores don't improve for 500 rounds
[200]	training's binary_logloss: 0.022064	valid_1's binary_logloss: 0.0457366
[400]	training's binary_logloss: 0.014054	valid_1's binary_logloss: 0.0454491


In [None]:
merged_df = pd.merge(predictions_blgb_df, predictions_nlgb_df, on='card_id', suffixes=('_blgb', '_nlgb'), how='left')
merged_df = pd.merge(merged_df, predictions_lgb_df, on='card_id', suffixes=('_nlgb', '_lgb'), how='left')
print(merged_df.head(5))

# Create a new column 'final_prediction' based on the outliers classification
merged_df['final_prediction'] = np.where(merged_df['prediction_blgb'] >= 0.8, 
                                         merged_df['prediction'], 
                                         merged_df['prediction_nlgb'])

# Drop unnecessary columns
merged_df.drop(['prediction_blgb', 'prediction_nlgb', 'prediction'], axis=1, inplace=True)

In [None]:
sub_df = pd.read_csv('./data/sample_submission.csv')
sub_df["target"] = merged_df['final_prediction']
sub_df.to_csv('predictions_lgb_cleaned.csv', index=False)