In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm,tqdm_notebook 
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor

from catboost import CatBoostClassifier
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.linear_model import BayesianRidge
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold, StratifiedKFold
from scipy import sparse
import warnings
import time
import sys
import os
import gc
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns',None)
pd.set_option('max_colwidth',100)

def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [2]:
train = pd.read_csv("train.csv")
test =pd.read_csv("test.csv")

del_cols = []
for col in train.columns:
    if 'subsector_id_cnt_' in col and 'new_cardf': 
        del_cols.append(col)
del_cols1 = []
for col in train.columns:
    if 'subsector_id_cnt_' in col and 'hist_last2_' in col:
        del_cols1.append(col)
del_cols2 = []
for col in train.columns:
    if 'subsector_id_cnt_' in col and 'auth_cardf' in col:
        del_cols2.append(col)
del_cols3 = []
for col in train.columns:
    if 'merchant_category_id_month_lag_nunique_' in col and '_pivot_supp' in col:
        del_cols3.append(col)
    if 'city_id' in col and '_pivot_supp' in col:
        del_cols3.append(col)
    if 'month_diff' in col and 'hist_last2_' in col:
        del_cols3.append(col)
    if 'month_diff_std' in col or 'month_diff_gap' in col:
        del_cols3.append(col) 
fea_cols = [col for col in train.columns if train[col].dtypes!='object' and train[col].dtypes != '<M8[ns]' and col!='target' not in col and col!='min_num'\
            and col not in del_cols and col not in del_cols1 and col not in del_cols2 and col!='target1' and col!='card_id_cnt_ht_pivot_supp'  and col not in del_cols3] 

train = train[fea_cols+['target']]
fea_cols.remove('outliers')
test = test[fea_cols]

inf_cols = ['new_cardf_card_id_cnt_divide_installments_nunique', 'hist_last2_card_id_cnt_divide_installments_nunique']
train[inf_cols] = train[inf_cols].replace(np.inf, train[inf_cols].replace(np.inf, -99).max().max())

test[inf_cols] = test[inf_cols].replace(np.inf, test[inf_cols].replace(np.inf, -99).max().max())

# ## load sparse
# train_tags = sparse.load_npz('train_tags.npz')
# test_tags  = sparse.load_npz('test_tags.npz')

## Get the index of non-exceptional values
normal_index = train[train['outliers']==0].index.tolist()
## without outliers
ntrain = train[train['outliers'] == 0]

target        = train['target'].values
ntarget       = ntrain['target'].values
target_binary = train['outliers'].values
###
y_train        = target
y_ntrain       = ntarget
y_train_binary = target_binary

print('train:',train.shape)
print('ntrain:',ntrain.shape)

train: (201917, 800)
ntrain: (199710, 800)


In [3]:
def train_model(X, X_test, y, params, folds, model_type='lgb', eval_type='regression'):
    oof = np.zeros(X.shape[0])
    predictions = np.zeros(X_test.shape[0])
    scores = []
    for fold_n, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
        print('Fold', fold_n, 'started at', time.ctime())
        
        if model_type == 'lgb':
            trn_data = lgb.Dataset(X[trn_idx], y[trn_idx])
            val_data = lgb.Dataset(X[val_idx], y[val_idx])
            clf = lgb.train(params, trn_data, num_boost_round=20000, 
                            valid_sets=[trn_data, val_data], 
                            callbacks=[lgb.early_stopping(stopping_rounds=500), lgb.log_evaluation(200)])
            oof[val_idx] = clf.predict(X[val_idx], num_iteration=clf.best_iteration)
            predictions += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits
        print(predictions)

        if eval_type == 'regression':
            scores.append(mean_squared_error(oof[val_idx], y[val_idx])**0.5)
        if eval_type == 'binary':
            scores.append(log_loss(y[val_idx], oof[val_idx]))
        
    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))
    
    return oof, predictions, scores

In [4]:
#### lgb
lgb_params = {'num_leaves': 63,
             'min_data_in_leaf': 32, 
             'objective':'regression',
             'max_depth': -1,
             'learning_rate': 0.01,
             "min_child_samples": 20,
             "boosting": "gbdt",
             "feature_fraction": 0.9,
             "bagging_freq": 1,
             "bagging_fraction": 0.9 ,
             "bagging_seed": 11,
             "metric": 'rmse',
             "lambda_l1": 0.1,
             "verbosity": -1}
folds = KFold(n_splits=5, shuffle=True, random_state=42)
X_ntrain = ntrain[fea_cols].values
X_train  = train[fea_cols].values
X_test   = test[fea_cols].values
print('='*10,'Regression Models','='*10)
oof_lgb , predictions_lgb , scores_lgb  = train_model(X_train , X_test, y_train, params=lgb_params, folds=folds, model_type='lgb', eval_type='regression')

Fold 0 started at Wed Apr 24 20:57:03 2024
Training until validation scores don't improve for 500 rounds
[200]	training's rmse: 3.4098	valid_1's rmse: 3.6906
[400]	training's rmse: 3.25191	valid_1's rmse: 3.67378
[600]	training's rmse: 3.14948	valid_1's rmse: 3.67001
[800]	training's rmse: 3.06603	valid_1's rmse: 3.66907
[1000]	training's rmse: 2.99467	valid_1's rmse: 3.66962
[1200]	training's rmse: 2.9313	valid_1's rmse: 3.67026
Early stopping, best iteration is:
[835]	training's rmse: 3.05215	valid_1's rmse: 3.66867
[-0.45922686 -0.04079365 -0.17871709 ...  0.13035791 -0.62400649
  0.05880008]
Fold 1 started at Wed Apr 24 21:02:46 2024
Training until validation scores don't improve for 500 rounds
[200]	training's rmse: 3.41275	valid_1's rmse: 3.67655
[400]	training's rmse: 3.25374	valid_1's rmse: 3.65629
[600]	training's rmse: 3.15048	valid_1's rmse: 3.65202
[800]	training's rmse: 3.06729	valid_1's rmse: 3.65123
[1000]	training's rmse: 2.99635	valid_1's rmse: 3.65235
[1200]	training'

In [5]:
print('='*10,'without outliers Regression Models','='*10)
oof_nlgb, predictions_nlgb, scores_nlgb = train_model(X_ntrain, X_test, y_ntrain, params=lgb_params, folds=folds, model_type='lgb', eval_type='regression')

Fold 0 started at Wed Apr 24 21:25:16 2024
Training until validation scores don't improve for 500 rounds
[200]	training's rmse: 1.5443	valid_1's rmse: 1.54885
[400]	training's rmse: 1.50687	valid_1's rmse: 1.53699
[600]	training's rmse: 1.48112	valid_1's rmse: 1.53442
[800]	training's rmse: 1.45974	valid_1's rmse: 1.53371
[1000]	training's rmse: 1.44088	valid_1's rmse: 1.53344
[1200]	training's rmse: 1.42318	valid_1's rmse: 1.53329
[1400]	training's rmse: 1.40634	valid_1's rmse: 1.53315
[1600]	training's rmse: 1.39007	valid_1's rmse: 1.53317
[1800]	training's rmse: 1.37425	valid_1's rmse: 1.53339
Early stopping, best iteration is:
[1438]	training's rmse: 1.40327	valid_1's rmse: 1.53309
[-0.04584995 -0.07422663 -0.12030693 ...  0.15917514 -0.11935037
  0.02582678]
Fold 1 started at Wed Apr 24 21:32:27 2024
Training until validation scores don't improve for 500 rounds
[200]	training's rmse: 1.53882	valid_1's rmse: 1.57476
[400]	training's rmse: 1.50219	valid_1's rmse: 1.5619
[600]	traini

In [6]:
print('='*10,'Classification Model','='*10)
lgb_params['objective'] = 'binary'
lgb_params['metric']    = 'binary_logloss'
oof_blgb, predictions_blgb, scores_blgb = train_model(X_train , X_test, y_train_binary, params=lgb_params, folds=folds, model_type='lgb', eval_type='binary')

Fold 0 started at Wed Apr 24 22:06:35 2024
Training until validation scores don't improve for 500 rounds
[200]	training's binary_logloss: 0.0271464	valid_1's binary_logloss: 0.0453615
[400]	training's binary_logloss: 0.0194736	valid_1's binary_logloss: 0.0447733
[600]	training's binary_logloss: 0.0149399	valid_1's binary_logloss: 0.045285
[800]	training's binary_logloss: 0.0116922	valid_1's binary_logloss: 0.0460295
Early stopping, best iteration is:
[387]	training's binary_logloss: 0.0198397	valid_1's binary_logloss: 0.044763
[0.00587306 0.00033902 0.0017156  ... 0.00175233 0.00801306 0.00052394]
Fold 1 started at Wed Apr 24 22:10:53 2024
Training until validation scores don't improve for 500 rounds
[200]	training's binary_logloss: 0.0274312	valid_1's binary_logloss: 0.0448709
[400]	training's binary_logloss: 0.0196409	valid_1's binary_logloss: 0.0440392
[600]	training's binary_logloss: 0.0151192	valid_1's binary_logloss: 0.0441638
[800]	training's binary_logloss: 0.0118634	valid_1's 

In [8]:
sub_df = pd.read_csv('./elo-merchant-category-recommendation/sample_submission.csv')
sub_df["target"] = predictions_lgb
sub_df.to_csv('predictions_lgb_1.csv', index=False)