In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm,tqdm_notebook 
import lightgbm as lgb

from sklearn.model_selection import KFold
from scipy import sparse
import warnings
import time
import sys
import os
import gc
import datetime

from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns',None)
pd.set_option('max_colwidth',100)

def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [2]:
train = pd.read_csv("train.csv")
test =pd.read_csv("test.csv")

del_cols = []
for col in train.columns:
    if 'subsector_id_cnt_' in col and 'new_cardf': 
        del_cols.append(col)
del_cols1 = []
for col in train.columns:
    if 'subsector_id_cnt_' in col and 'hist_last2_' in col:
        del_cols1.append(col)
del_cols2 = []
for col in train.columns:
    if 'subsector_id_cnt_' in col and 'auth_cardf' in col:
        del_cols2.append(col)
del_cols3 = []
for col in train.columns:
    if 'merchant_category_id_month_lag_nunique_' in col and '_pivot_supp' in col:
        del_cols3.append(col)
    if 'city_id' in col and '_pivot_supp' in col:
        del_cols3.append(col)
    if 'month_diff' in col and 'hist_last2_' in col:
        del_cols3.append(col)
    if 'month_diff_std' in col or 'month_diff_gap' in col:
        del_cols3.append(col) 
fea_cols = [col for col in train.columns if train[col].dtypes!='object' and train[col].dtypes != '<M8[ns]' and col!='target' not in col and col!='min_num'\
            and col not in del_cols and col not in del_cols1 and col not in del_cols2 and col!='target1' and col!='card_id_cnt_ht_pivot_supp'  and col not in del_cols3] 

train = train.iloc[:1000]
test = test.iloc[:1000]
train = train[fea_cols+['target']+['card_id']]
fea_cols.remove('outliers'+['card_id'])
test = test[fea_cols]

inf_cols = ['new_cardf_card_id_cnt_divide_installments_nunique', 'hist_last2_card_id_cnt_divide_installments_nunique']
train[inf_cols] = train[inf_cols].replace(np.inf, train[inf_cols].replace(np.inf, -99).max().max())

test[inf_cols] = test[inf_cols].replace(np.inf, test[inf_cols].replace(np.inf, -99).max().max())

# ## load sparse
# train_tags = sparse.load_npz('train_tags.npz')
# test_tags  = sparse.load_npz('test_tags.npz')

## Get the index of non-exceptional values
normal_index = train[train['outliers']==0].index.tolist()
## without outliers
ntrain = train[train['outliers'] == 0]

target        = train['target'].values
ntarget       = ntrain['target'].values
target_binary = train['outliers'].values
###
y_train        = target
y_ntrain       = ntarget
y_train_binary = target_binary

id_train = train['card_id'].copy(); train.drop('card_id', axis = 1, inplace = True)
id_ntrain = ntrain['card_id'].copy(); ntrain.drop('card_id', axis = 1, inplace = True)
id_test = test['card_id'].copy(); test.drop('card_id', axis = 1, inplace = True)

print('train:',train.shape)
print('ntrain:',ntrain.shape)

train: (1000, 800)
ntrain: (982, 800)


In [3]:
def train_model(X, X_test, y, params, folds, model_type='lgb', eval_type='regression'):
    oof = np.zeros(X.shape[0])
    predictions = np.zeros(X_test.shape[0])
    scores = []
    for fold_n, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
        print('Fold', fold_n, 'started at', time.ctime())
        
        if model_type == 'lgb':
            trn_data = lgb.Dataset(X[trn_idx], y[trn_idx])
            val_data = lgb.Dataset(X[val_idx], y[val_idx])
            clf = lgb.train(params, trn_data, num_boost_round=20000, 
                            valid_sets=[trn_data, val_data], 
                            callbacks=[lgb.early_stopping(stopping_rounds=500), lgb.log_evaluation(200)])
            oof[val_idx] = clf.predict(X[val_idx], num_iteration=clf.best_iteration)
            predictions += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits
        print(predictions)

        if eval_type == 'regression':
            scores.append(mean_squared_error(oof[val_idx], y[val_idx])**0.5)
        if eval_type == 'binary':
            scores.append(log_loss(y[val_idx], oof[val_idx]))
        
    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))
    
    return oof, predictions, scores

In [4]:
#### lgb
lgb_params = {'num_leaves': 64,
             'min_data_in_leaf': 32, 
             'objective':'regression',
             'max_depth': -1,
             'learning_rate': 0.01,
             "min_child_samples": 20,
             "boosting": "gbdt",
             "feature_fraction": 0.5,
             "bagging_freq": 1,
             "bagging_fraction": 0.9,
             "bagging_seed": 11,
             "metric": 'rmse',
             "lambda_l1": 0.1,
             "verbosity": -1}
folds = KFold(n_splits=2, shuffle=True, random_state=42)
X_ntrain = ntrain[fea_cols].values
X_train  = train[fea_cols].values
X_test   = test[fea_cols].values
print('='*10,'Regression Models','='*10)
oof_lgb , predictions_lgb , scores_lgb  = train_model(X_train , X_test, y_train, params=lgb_params, folds=folds, model_type='lgb', eval_type='regression')
predictions_lgb_df = pd.DataFrame({'card_id': id_test.to_list(), 'prediction': predictions_lgb})


Fold 0 started at Thu Apr 25 15:54:34 2024
Training until validation scores don't improve for 500 rounds
[200]	training's rmse: 3.51504	valid_1's rmse: 5.08911
[400]	training's rmse: 3.02884	valid_1's rmse: 5.10161
[600]	training's rmse: 2.64305	valid_1's rmse: 5.12114
Early stopping, best iteration is:
[141]	training's rmse: 3.68985	valid_1's rmse: 5.07801
[-7.33991099e-01 -4.59204595e-01 -1.51456233e-02 -8.14574837e-02
 -2.53628770e-01  3.85803880e-03 -4.49244642e-02  5.79587901e-01
 -6.63561719e-01 -5.36064933e-01 -1.46544586e-01 -9.03015306e-02
  4.08843708e-02 -2.39870598e-01 -1.10562234e+00 -1.07069302e-01
  4.04794260e-01 -6.50078152e-02 -3.48546805e-01 -4.20641320e-01
  3.06538048e-02 -7.65989065e-01  1.47223559e-01  7.15483680e-02
  1.23835642e-01  2.13970083e-03 -1.62056733e-01 -1.78923703e-01
 -8.02752427e-02 -8.96555274e-02 -3.08051777e-01  4.39678722e-01
 -1.40586592e-01  2.80715065e-01 -1.85192036e-01 -3.12287167e-01
  3.36399837e-01 -6.96559295e-02 -3.96001401e-01  1.421

In [5]:
print('='*10,'without outliers Regression Models','='*10)
oof_nlgb, predictions_nlgb, scores_nlgb = train_model(X_ntrain, X_test, y_ntrain, params=lgb_params, folds=folds, model_type='lgb', eval_type='regression')
predictions_nlgb_df = pd.DataFrame({'card_id': id_test.to_list(), 'prediction': predictions_nlgb})


Fold 0 started at Thu Apr 25 15:54:42 2024
Training until validation scores don't improve for 500 rounds
[200]	training's rmse: 1.17597	valid_1's rmse: 1.63789
[400]	training's rmse: 0.895148	valid_1's rmse: 1.66218
[600]	training's rmse: 0.702008	valid_1's rmse: 1.68119
Early stopping, best iteration is:
[117]	training's rmse: 1.33791	valid_1's rmse: 1.63047
[-5.63178449e-02 -2.49598453e-01 -8.69004537e-02 -1.57676057e-01
 -2.87739038e-01  6.13040231e-03  2.08032339e-02  3.54910891e-01
 -1.53499579e-01  1.89955560e-02 -1.47896189e-02  1.85854546e-01
 -7.29283634e-04 -4.68769195e-02  1.98007982e-01  2.33950744e-02
  2.39566273e-01 -2.54637945e-02  2.06872402e-01  7.91511335e-02
  1.61660075e-01 -3.04777275e-01  4.17153689e-01 -7.38573749e-02
 -5.53152143e-02 -8.07156108e-02 -3.10377335e-01 -3.87926755e-02
  9.00257659e-03 -7.75057295e-02 -5.42993757e-02  1.65262492e-01
 -4.24238280e-02  2.07971657e-01 -9.89670432e-02 -9.72656291e-03
  3.58743037e-01  9.60355344e-03 -6.92699089e-02  1.3

In [6]:
print('='*10,'Classification Model','='*10)
lgb_params['objective'] = 'binary'
lgb_params['metric']    = 'binary_logloss'
oof_blgb, predictions_blgb, scores_blgb = train_model(X_train , X_test, y_train_binary, params=lgb_params, folds=folds, model_type='lgb', eval_type='binary')
predictions_blgb_df = pd.DataFrame({'card_id': id_test.to_list(), 'prediction': predictions_blgb})


Fold 0 started at Thu Apr 25 15:54:48 2024
Training until validation scores don't improve for 500 rounds
[200]	training's binary_logloss: 0.0121629	valid_1's binary_logloss: 0.10896
[400]	training's binary_logloss: 0.00388789	valid_1's binary_logloss: 0.127458
Early stopping, best iteration is:
[77]	training's binary_logloss: 0.0340047	valid_1's binary_logloss: 0.101231
[0.01460363 0.00597793 0.00463612 0.00335599 0.00425795 0.00444464
 0.00383425 0.00504624 0.00959244 0.0076023  0.00468558 0.00979089
 0.0040345  0.00629043 0.03693796 0.00538465 0.00391585 0.00383088
 0.0064     0.00483779 0.00367558 0.00815529 0.00440314 0.00367085
 0.00466861 0.00736133 0.00391093 0.00497794 0.00554249 0.00342858
 0.00380333 0.00335599 0.00355449 0.00421938 0.00451146 0.00498049
 0.0037381  0.00382571 0.00781972 0.00385273 0.00369768 0.00476697
 0.00650982 0.00423902 0.00419887 0.00564991 0.00446474 0.00437003
 0.00402319 0.00391986 0.00607828 0.00510843 0.00586761 0.003512
 0.00991636 0.00394791 0.0

In [11]:
predictions_lgb["target"]

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [7]:
merged_df = pd.merge(predictions_blgb_df, predictions_nlgb_df, on='card_id', suffixes=('_blgb', '_nlgb'), how='left')
merged_df = pd.merge(merged_df, predictions_lgb_df, on='card_id', suffixes=('_nlgb', '_lgb'), how='left')

# Create a new column 'final_prediction' based on the outliers classification
merged_df['final_prediction'] = merged_df['predictions_nlgb'].where(merged_df['outliers'] >= 0.8, merged_df['predictions_lgb'])

# Drop unnecessary columns
merged_df.drop(['predictions_nlgb', 'predictions_lgb'], axis=1, inplace=True)

TypeError: Can only merge Series or DataFrame objects, a <class 'numpy.ndarray'> was passed

In [None]:
sub_df = pd.read_csv('./elo-merchant-category-recommendation/sample_submission.csv')
sub_df["target"] = merged_df['final_prediction']
sub_df.to_csv('predictions_lgb_cleaned.csv', index=False)