In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm,tqdm_notebook 
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor

from catboost import CatBoostClassifier
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.linear_model import BayesianRidge
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold, StratifiedKFold
from scipy import sparse
import warnings
import time
import sys
import os
import gc
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss
# Notebook-wide configuration.
# Warnings are silenced globally to keep training logs readable; the blanket
# "ignore" filter on the second line also covers the FutureWarning filter.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore")
# Show every column and up to 100 characters per cell when displaying frames.
# Full option names are used: short names rely on pattern matching, which can
# break if pandas adds another option with a similar name.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 100)

def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that fits their range.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64 are
    examined; every other column is left untouched. The frame is modified in
    place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink (mutated in place).
    verbose : bool, default True
        If true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.

    Notes
    -----
    Float columns may be reduced to float16, which keeps only about three
    significant decimal digits, so values can be rounded.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate target dtypes, narrowest first.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype's min/max still
                # fits (the previous strict comparison rejected e.g. 127 for int8).
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range not representable in a narrower float (or min/max is NaN,
                # for which every comparison is false): fall back to float64.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero starting footprint to avoid ZeroDivisionError.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# --- Columns excluded from the feature set -------------------------------
# FIX: this condition used to read `'subsector_id_cnt_' in col and 'new_cardf'`.
# A non-empty string literal is always truthy, so *every* 'subsector_id_cnt_'
# column was dropped. It now tests the substring like the two groups below.
# NOTE(review): this changes the resulting feature set; re-check CV scores.
del_cols = [col for col in train.columns
            if 'subsector_id_cnt_' in col and 'new_cardf' in col]
del_cols1 = [col for col in train.columns
             if 'subsector_id_cnt_' in col and 'hist_last2_' in col]
del_cols2 = [col for col in train.columns
             if 'subsector_id_cnt_' in col and 'auth_cardf' in col]
del_cols3 = [
    col for col in train.columns
    if ('merchant_category_id_month_lag_nunique_' in col and '_pivot_supp' in col)
    or ('city_id' in col and '_pivot_supp' in col)
    or ('month_diff' in col and 'hist_last2_' in col)
    or 'month_diff_std' in col
    or 'month_diff_gap' in col
]

# Everything that must never be used as a feature, gathered for one lookup.
excluded_cols = set(del_cols) | set(del_cols1) | set(del_cols2) | set(del_cols3)
excluded_cols |= {'min_num', 'card_id_cnt_ht_pivot_supp'}

# Keep non-object, non-datetime columns whose name does not contain 'target'.
# (The original `col!='target' not in col` was a chained comparison that
# evaluates to exactly this substring test, which also covers 'target1'.)
fea_cols = [
    col for col in train.columns
    if train[col].dtypes != 'object'
    and train[col].dtypes != '<M8[ns]'
    and 'target' not in col
    and col not in excluded_cols
]

# `.copy()` makes the subsets independent frames so the column assignments
# below write to them directly instead of to a slice of the original frame.
train = train[fea_cols + ['target']].copy()
# 'outliers' stays in `train` (used as a label below) but is not a feature
# and is not selected from `test`.
fea_cols.remove('outliers')
test = test[fea_cols].copy()

# Cap +inf in the ratio features at the largest finite value of those columns,
# computed separately for train and test. -99 is only a temporary stand-in for
# inf while taking the max.
inf_cols = ['new_cardf_card_id_cnt_divide_installments_nunique', 'hist_last2_card_id_cnt_divide_installments_nunique']
for frame in (train, test):
    finite_max = frame[inf_cols].replace(np.inf, -99).max().max()
    frame[inf_cols] = frame[inf_cols].replace(np.inf, finite_max)

# ## load sparse
# train_tags = sparse.load_npz('train_tags.npz')
# test_tags  = sparse.load_npz('test_tags.npz')

## Index labels of the rows that are not flagged as outliers
normal_index = train[train['outliers'] == 0].index.tolist()
## Training frame without the outlier rows
ntrain = train[train['outliers'] == 0]

target        = train['target'].values
ntarget       = ntrain['target'].values
target_binary = train['outliers'].values
### Aliases used by the modelling cells
y_train        = target
y_ntrain       = ntarget
y_train_binary = target_binary

print('train:', train.shape)
print('ntrain:', ntrain.shape)

In [None]:
def train_model(X, X_test, y, params, folds, model_type='lgb', eval_type='regression'):
    """Cross-validate a LightGBM model and return out-of-fold and test predictions.

    For each split produced by ``folds.split(X, y)`` a booster is trained with
    ``lgb.train`` (up to 20000 rounds, early stopping after 500 rounds without
    improvement), the validation rows are predicted into ``oof`` and the test
    predictions are averaged over ``folds.n_splits``.

    Parameters
    ----------
    X : array
        Training features; indexed with the integer index arrays from ``folds``.
    X_test : array
        Test features; only ``shape[0]`` and prediction are used.
    y : array
        Training targets, indexed the same way as ``X``.
    params : dict
        Parameters passed to ``lgb.train``.
    folds : splitter
        Object exposing ``split(X, y)`` and ``n_splits`` (e.g. ``KFold``).
    model_type : str, default 'lgb'
        Only ``'lgb'`` is supported.
    eval_type : {'regression', 'binary'}, default 'regression'
        ``'regression'`` scores each fold with RMSE, ``'binary'`` with log loss.

    Returns
    -------
    oof : numpy.ndarray
        Out-of-fold predictions, one per training row.
    predictions : numpy.ndarray
        Fold-averaged predictions for ``X_test``.
    scores : list of float
        One score per fold.

    Raises
    ------
    ValueError
        If ``model_type`` or ``eval_type`` is not supported. Previously such a
        call ran to completion with all-zero predictions or no scores.
    """
    if model_type != 'lgb':
        raise ValueError("Unsupported model_type {!r}; only 'lgb' is implemented.".format(model_type))
    if eval_type not in ('regression', 'binary'):
        raise ValueError("Unsupported eval_type {!r}; expected 'regression' or 'binary'.".format(eval_type))

    oof = np.zeros(X.shape[0])
    predictions = np.zeros(X_test.shape[0])
    scores = []
    for fold_n, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
        print('Fold', fold_n, 'started at', time.ctime())

        trn_data = lgb.Dataset(X[trn_idx], y[trn_idx])
        val_data = lgb.Dataset(X[val_idx], y[val_idx])
        clf = lgb.train(params, trn_data, num_boost_round=20000,
                        valid_sets=[trn_data, val_data],
                        callbacks=[lgb.early_stopping(stopping_rounds=500), lgb.log_evaluation(200)])
        oof[val_idx] = clf.predict(X[val_idx], num_iteration=clf.best_iteration)
        # Each fold contributes an equal share to the averaged test prediction.
        predictions += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits

        # Metrics are called as (y_true, y_pred).
        if eval_type == 'regression':
            fold_score = mean_squared_error(y[val_idx], oof[val_idx]) ** 0.5
        else:
            fold_score = log_loss(y[val_idx], oof[val_idx])
        scores.append(fold_score)
        # Report the fold metric rather than dumping the whole prediction array.
        print('Fold {0} score: {1:.4f}'.format(fold_n, fold_score))

    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

    return oof, predictions, scores

In [None]:

# Sweep `num_leaves` and record the mean 5-fold CV RMSE for each value.
num_leaves = [16, 32, 64, 128]
mean_scores = []

# Inputs that do not depend on the swept value are built once, not per iteration.
folds = KFold(n_splits=5, shuffle=True, random_state=42)
X_ntrain = ntrain[fea_cols].values
X_train  = train[fea_cols].values
X_test   = test[fea_cols].values

for num_leaf in num_leaves:
    # "min_child_samples" was dropped: it is a LightGBM alias of
    # 'min_data_in_leaf', and when both are given the primary name (32) wins,
    # so the extra entry (20) was ignored anyway.
    lgb_params = {'num_leaves': num_leaf,
                  'min_data_in_leaf': 32,
                  'objective': 'regression',
                  'max_depth': -1,
                  'learning_rate': 0.01,
                  "boosting": "gbdt",
                  "feature_fraction": 0.9,
                  "bagging_freq": 1,
                  "bagging_fraction": 0.9,
                  "bagging_seed": 11,
                  "metric": 'rmse',
                  "lambda_l1": 0.1,
                  "verbosity": -1}
    print('='*10,'Regression Models','='*10)
    oof_lgb, predictions_lgb, scores_lgb = train_model(X_train, X_test, y_train, params=lgb_params, folds=folds, model_type='lgb', eval_type='regression')
    mean_score = np.mean(scores_lgb)

    # Append the mean score to the list
    mean_scores.append(mean_score)

In [None]:
# Mean CV RMSE as a function of num_leaves, drawn via the explicit Axes API.
fig, ax = plt.subplots()
ax.plot(num_leaves, mean_scores, marker='o')
ax.set_xlabel('num_leaves')
ax.set_ylabel('Mean RMSE')
ax.set_title('Mean RMSE vs. num_leaves')
ax.grid(True)
plt.show()

In [None]:

# Sweep `feature_fraction` and record the mean 5-fold CV RMSE for each value.
feature_fractions = [0.2, 0.5, 0.7, 0.9]
mean_scores = []

# Inputs that do not depend on the swept value are built once, not per iteration.
folds = KFold(n_splits=5, shuffle=True, random_state=42)
X_ntrain = ntrain[fea_cols].values
X_train  = train[fea_cols].values
X_test   = test[fea_cols].values

for feature_fraction in feature_fractions:
    # "min_child_samples" was dropped: it is a LightGBM alias of
    # 'min_data_in_leaf', and when both are given the primary name (32) wins,
    # so the extra entry (20) was ignored anyway.
    lgb_params = {'num_leaves': 64,
                  'min_data_in_leaf': 32,
                  'objective': 'regression',
                  'max_depth': -1,
                  'learning_rate': 0.01,
                  "boosting": "gbdt",
                  "feature_fraction": feature_fraction,
                  "bagging_freq": 1,
                  "bagging_fraction": 0.9,
                  "bagging_seed": 11,
                  "metric": 'rmse',
                  "lambda_l1": 0.1,
                  "verbosity": -1}
    print('='*10,'Regression Models','='*10)
    oof_lgb, predictions_lgb, scores_lgb = train_model(X_train, X_test, y_train, params=lgb_params, folds=folds, model_type='lgb', eval_type='regression')
    mean_score = np.mean(scores_lgb)

    # Append the mean score to the list
    mean_scores.append(mean_score)

In [None]:
# Plot the mean scores for each feature_fraction.
# The plt.plot call was missing (the figure rendered empty) and the labels
# named num_leaves although this cell reports the feature_fraction sweep.
plt.plot(feature_fractions, mean_scores, marker='o')
plt.xlabel('feature_fraction')
plt.ylabel('Mean RMSE')
plt.title('Mean RMSE vs. feature_fraction')
plt.grid(True)
plt.show()

In [None]:

# Sweep `bagging_fraction` and record the mean 5-fold CV RMSE for each value.
bagging_fractions = [0.2, 0.5, 0.7, 0.9]
mean_scores = []

# Inputs that do not depend on the swept value are built once, not per iteration.
folds = KFold(n_splits=5, shuffle=True, random_state=42)
X_ntrain = ntrain[fea_cols].values
X_train  = train[fea_cols].values
X_test   = test[fea_cols].values

for bagging_fraction in bagging_fractions:
    # "min_child_samples" was dropped: it is a LightGBM alias of
    # 'min_data_in_leaf', and when both are given the primary name (32) wins,
    # so the extra entry (20) was ignored anyway.
    lgb_params = {'num_leaves': 64,
                  'min_data_in_leaf': 32,
                  'objective': 'regression',
                  'max_depth': -1,
                  'learning_rate': 0.01,
                  "boosting": "gbdt",
                  "feature_fraction": 0.9,
                  "bagging_freq": 1,
                  "bagging_fraction": bagging_fraction,
                  "bagging_seed": 11,
                  "metric": 'rmse',
                  "lambda_l1": 0.1,
                  "verbosity": -1}
    print('='*10,'Regression Models','='*10)
    oof_lgb, predictions_lgb, scores_lgb = train_model(X_train, X_test, y_train, params=lgb_params, folds=folds, model_type='lgb', eval_type='regression')
    mean_score = np.mean(scores_lgb)

    # Append the mean score to the list
    mean_scores.append(mean_score)

In [None]:

# Plot the mean scores for each bagging_fraction.
# The axis label and title previously said num_leaves, which mislabelled the x-axis.
plt.plot(bagging_fractions, mean_scores, marker='o')
plt.xlabel('bagging_fraction')
plt.ylabel('Mean RMSE')
plt.title('Mean RMSE vs. bagging_fraction')
plt.grid(True)
plt.show()