### LR 模型

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [None]:
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")
X = train.drop(['first_active_month', 'card_id', 'target'], axis=1)
y = train.target
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.35, random_state=11)
lr = LinearRegression(normalize=True)
lr.fit(X_tr, y_tr)
val_preds = lr.predict(X_val)
test_preds = lr.predict(test.drop(['first_active_month', 'card_id'], axis=1))
base_sub_df = pd.DataFrame(np.array([test.card_id, test_preds]).T, columns=['card_id', 'target'])
base_sub_df.to_csv('../submission/sub_0_baseline.csv', index=False)
# online=3.930

## xgb+lgb+catboost

In [1]:
import os
import json
import time
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from pandas.io.json import json_normalize
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
%matplotlib inline
import datetime



In [2]:
from sklearn.model_selection import KFold
from sklearn import model_selection, preprocessing, metrics
from sklearn import model_selection
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge, RidgeCV, Lasso
from sklearn.ensemble import RandomForestRegressor

import lightgbm as lgb
import xgboost as xgb
import catboost as cb

In [3]:
df_train = pd.read_csv('../data/train.csv', parse_dates=['first_active_month'])
df_test = pd.read_csv('../data/test.csv', parse_dates=['first_active_month'])

In [4]:
df_train["month"] = df_train["first_active_month"].dt.month
df_test["month"] = df_test["first_active_month"].dt.month

df_train["year"] = df_train["first_active_month"].dt.year
df_test["year"] = df_test["first_active_month"].dt.year

df_train['elapsed_time'] = (datetime.date(2018, 2, 1) - df_train['first_active_month'].dt.date).dt.days
df_test['elapsed_time'] = (datetime.date(2018, 2, 1) - df_test['first_active_month'].dt.date).dt.days

In [5]:
df_train = pd.get_dummies(df_train, columns=['feature_1', 'feature_2'])
df_test = pd.get_dummies(df_test, columns=['feature_1', 'feature_2'])

In [7]:
df_hist = pd.read_csv('../data/historical_transactions.csv')
df_hist = pd.get_dummies(df_hist, columns=['category_2', 'category_3'])
df_hist['authorized_flag'] = df_hist['authorized_flag'].map({'Y': 1, 'N': 0})
df_hist['category_1'] = df_hist['category_1'].map({'Y': 1, 'N': 0})

In [8]:
def aggregate_transactions(df, prefix):  
    df.loc[:, 'purchase_date'] = pd.DatetimeIndex(df['purchase_date']).\
                                      astype(np.int64) * 1e-9
    
    agg_func = {'authorized_flag': ['sum', 'mean'],
                'category_1': ['mean'],
                'category_2_1.0': ['mean'],
                'category_2_2.0': ['mean'],
                'category_2_3.0': ['mean'],
                'category_2_4.0': ['mean'],
                'category_2_5.0': ['mean'],
                'category_3_A': ['mean'],
                'category_3_B': ['mean'],
                'category_3_C': ['mean'],
                'merchant_id': ['nunique'],
                'purchase_amount': ['sum', 'mean', 'max', 'min', 'std'],
                'installments': ['sum', 'mean', 'max', 'min', 'std'],
                'purchase_date': [np.ptp],
                'month_lag': ['min', 'max']
                }
    agg_df = df.groupby(['card_id']).agg(agg_func)
    agg_df.columns = [prefix + '_'.join(col).strip() for col in agg_df.columns.values]
    agg_df.reset_index(inplace=True)
    
    df = (df.groupby('card_id').size().reset_index(name='{}transactions_count'.format(prefix)))
    
    agg_df = pd.merge(df, agg_df, on='card_id', how='left')
    
    return agg_df

In [9]:
df_hist = aggregate_transactions(df_hist, prefix='hist_')
df_train = pd.merge(df_train, df_hist, on='card_id',how='left')
df_test = pd.merge(df_test, df_hist, on='card_id',how='left')
print(df_train.shape, df_test.shape)

(201917, 41) (123623, 40)


In [10]:
df_new = pd.read_csv('../data/new_merchant_transactions.csv')
df_new = pd.get_dummies(df_new, columns=['category_2', 'category_3'])
df_new['authorized_flag'] = df_new['authorized_flag'].map({'Y': 1, 'N': 0})
df_new['category_1'] = df_new['category_1'].map({'Y': 1, 'N': 0})

In [11]:
df_new = aggregate_transactions(df_new, prefix='new_')
df_train = pd.merge(df_train, df_new, on='card_id',how='left')
df_test = pd.merge(df_test, df_new, on='card_id',how='left')

print(df_train.shape, df_test.shape)

(201917, 67) (123623, 66)


In [12]:
target = df_train['target']
cols_to_drop = ['card_id', 'first_active_month', 'target']
use_cols = [c for c in df_train.columns if c not in cols_to_drop]
features = list(df_train[use_cols].columns)

### 1、Lightgbm

In [13]:
lgb_params = {'num_leaves': 50,
         'min_data_in_leaf': 30, 
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.005,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "verbosity": -1}

folds = KFold(n_splits=5, shuffle=True, random_state=15)
oof_lgb = np.zeros(len(df_train))
predictions_lgb = np.zeros(len(df_test))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train.values, target.values)):
    print('-')
    print("Fold {}".format(fold_ + 1))
    trn_data = lgb.Dataset(df_train.iloc[trn_idx][features], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(df_train.iloc[val_idx][features], label=target.iloc[val_idx])

    num_round = 10000
    clf = lgb.train(lgb_params, trn_data, num_round, valid_sets = [trn_data, val_data], 
                    verbose_eval=100, early_stopping_rounds=100)
    oof_lgb[val_idx] = clf.predict(df_train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    predictions_lgb += clf.predict(df_test[features], num_iteration=clf.best_iteration) / folds.n_splits

-
Fold 1
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 3.75147	valid_1's rmse: 3.81757
[200]	training's rmse: 3.69502	valid_1's rmse: 3.7841
[300]	training's rmse: 3.6531	valid_1's rmse: 3.76498
[400]	training's rmse: 3.61719	valid_1's rmse: 3.75078
[500]	training's rmse: 3.5871	valid_1's rmse: 3.74076
[600]	training's rmse: 3.56154	valid_1's rmse: 3.73426
[700]	training's rmse: 3.53944	valid_1's rmse: 3.72883
[800]	training's rmse: 3.51884	valid_1's rmse: 3.7248
[900]	training's rmse: 3.50013	valid_1's rmse: 3.72151
[1000]	training's rmse: 3.48358	valid_1's rmse: 3.71906
[1100]	training's rmse: 3.46722	valid_1's rmse: 3.71656
[1200]	training's rmse: 3.45352	valid_1's rmse: 3.71511
[1300]	training's rmse: 3.44004	valid_1's rmse: 3.71374
[1400]	training's rmse: 3.42796	valid_1's rmse: 3.71275
[1500]	training's rmse: 3.41636	valid_1's rmse: 3.7116
[1600]	training's rmse: 3.40514	valid_1's rmse: 3.711
[1700]	training's rmse: 3.39419	valid_1's rmse: 

In [14]:
validation_score = np.sqrt(mean_squared_error(target, oof_lgb))
validation_score

3.690564482025014

In [15]:
df_submission = pd.DataFrame({"card_id": df_test["card_id"].values})
df_submission["target"] = predictions_lgb
df_submission.to_csv("../submission/lgb12131605.csv", index=False)

# online=3.738

### 2、xgboost

In [16]:
xgb_params = {'eta': 0.005, 
              'max_depth': 10, 
              'subsample': 0.8, 
              'colsample_bytree': 0.8, 
              'objective': 'reg:linear', 
              'eval_metric': 'rmse', 
              'silent': True}

folds = KFold(n_splits=5, shuffle=True, random_state=15)
oof_xgb = np.zeros(len(df_train))
predictions_xgb = np.zeros(len(df_test))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train.values, target.values)):
    print('-')
    print("Fold {}".format(fold_ + 1))
    trn_data = xgb.DMatrix(data=df_train.iloc[trn_idx][features], label=target.iloc[trn_idx])
    val_data = xgb.DMatrix(data=df_train.iloc[val_idx][features], label=target.iloc[val_idx])
    watchlist = [(trn_data, 'train'), (val_data, 'valid')]
    print("xgb " + str(fold_) + "-" * 50)
    num_round = 10000
    xgb_model = xgb.train(xgb_params, trn_data, num_round, watchlist, 
                          early_stopping_rounds=50, verbose_eval=200)
    oof_xgb[val_idx] = xgb_model.predict(xgb.DMatrix(df_train.iloc[val_idx][features]), 
                                         ntree_limit=xgb_model.best_ntree_limit+50)

    predictions_xgb += xgb_model.predict(xgb.DMatrix(df_test[features]), 
                                         ntree_limit=xgb_model.best_ntree_limit+50) / folds.n_splits

-
Fold 1
xgb 0--------------------------------------------------
[0]	train-rmse:3.94008	valid-rmse:3.98681
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[200]	train-rmse:3.49157	valid-rmse:3.78322
[400]	train-rmse:3.2684	valid-rmse:3.73671
[600]	train-rmse:3.12562	valid-rmse:3.72128
[800]	train-rmse:3.03269	valid-rmse:3.71554
[1000]	train-rmse:2.96113	valid-rmse:3.71264
[1200]	train-rmse:2.89615	valid-rmse:3.71126
Stopping. Best iteration:
[1288]	train-rmse:2.86982	valid-rmse:3.71085

-
Fold 2
xgb 1--------------------------------------------------
[0]	train-rmse:3.96013	valid-rmse:3.90615
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 50 rounds.
[200]	train-rmse:3.49784	valid-rmse:3.71616
[400]	train-rmse:3.26637	valid-rmse:3.67674
[600]	train-rmse:3.11642	valid-rmse:3.66414
[800]	train-rmse:3.01862	va

In [17]:
validation_score = np.sqrt(mean_squared_error(target, oof_xgb))
validation_score

3.693962942857355

In [18]:
df_submission = pd.DataFrame({"card_id": df_test["card_id"].values})
df_submission["target"] = predictions_xgb
df_submission.to_csv("../submission/xgb12131646.csv", index=False)

# online=3.745

### 3、catboost

In [19]:
kfolds = KFold(n_splits=5, shuffle=True, random_state=15)
oof_cb = np.zeros(len(df_train))
predictions_cb = np.zeros(len(df_test))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train.values, target.values)):
    print("_")
    print("Fold {}".format(fold_ + 1))
    X_train, y_train = df_train[features].iloc[trn_idx], target.iloc[trn_idx]
    X_valid, y_valid = df_train[features].iloc[val_idx], target.iloc[val_idx]
    print("cb " + str(fold_) + "_" * 50)
    
    # CatBoost Regressor estimator
    model = cb.CatBoostRegressor(learning_rate = 0.005,
        iterations = 10000,
        eval_metric = 'RMSE',
        allow_writing_files = False,
        od_type = 'Iter',
        bagging_temperature = 0.2,
        depth = 10,
        od_wait = 20,
        silent = True)
    
            
    # Fit
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              early_stopping_rounds=50,
              verbose_eval=400)
    
    oof_cb[val_idx] = model.predict(X_valid)
    predictions_cb += model.predict(df_test[features]) / kfolds.n_splits

_
Fold 1
cb 0__________________________________________________
0:	learn: 3.8605352	test: 3.8605352	test1: 3.9041308	best: 3.9041308 (0)	total: 192ms	remaining: 32m 2s
400:	learn: 3.6699645	test: 3.6699645	test1: 3.7536176	best: 3.7536176 (400)	total: 34.7s	remaining: 13m 49s
800:	learn: 3.6133553	test: 3.6133553	test1: 3.7328984	best: 3.7328984 (800)	total: 1m 6s	remaining: 12m 43s
1200:	learn: 3.5730400	test: 3.5730400	test1: 3.7231023	best: 3.7231023 (1200)	total: 1m 37s	remaining: 11m 54s
1600:	learn: 3.5371259	test: 3.5371259	test1: 3.7176282	best: 3.7176282 (1600)	total: 2m 8s	remaining: 11m 16s
2000:	learn: 3.5054497	test: 3.5054497	test1: 3.7141400	best: 3.7141400 (2000)	total: 2m 39s	remaining: 10m 37s
2400:	learn: 3.4755359	test: 3.4755359	test1: 3.7118845	best: 3.7118845 (2400)	total: 3m 10s	remaining: 10m 2s
2800:	learn: 3.4473954	test: 3.4473954	test1: 3.7103322	best: 3.7103322 (2800)	total: 3m 40s	remaining: 9m 27s
3200:	learn: 3.4197028	test: 3.4197028	test1: 3.7090015	b

In [20]:
validation_score = np.sqrt(mean_squared_error(target, oof_cb))
validation_score

3.6926237436123257

In [21]:
df_submission = pd.DataFrame({"card_id": df_test["card_id"].values})
df_submission["target"] = predictions_cb
df_submission.to_csv("../submission/cb12131722.csv", index=False)

# online=3.751

### 4、Ridge模型

In [22]:
folds = KFold(n_splits=5, shuffle=True, random_state=15)
oof_ridge = np.zeros(len(df_train))
predictions_ridge = np.zeros(len(df_test))

tst_data = df_test.copy()
tst_data.fillna((tst_data.mean()), inplace=True)

tst_data = tst_data[features].values

for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train, target)):
    print("fold no.{}".format(fold_+1))
    trn_data, trn_y = df_train.iloc[trn_idx][features], target.iloc[trn_idx].values
    val_data, val_y = df_train.iloc[val_idx][features], target.iloc[val_idx].values
    
    trn_data.fillna((trn_data.mean()), inplace=True)
    val_data.fillna((val_data.mean()), inplace=True)
    
    trn_data = trn_data.values
    val_data = val_data.values

    clf = Ridge(alpha=100)
    clf.fit(trn_data, trn_y)
    
    oof_ridge[val_idx] = clf.predict(val_data)
    predictions_ridge += clf.predict(tst_data) / folds.n_splits

fold no.1
fold no.2
fold no.3
fold no.4
fold no.5


In [23]:
validation_score = np.sqrt(mean_squared_error(target, oof_ridge))
validation_score

3.8285953928376797

In [24]:
df_submission = pd.DataFrame({"card_id": df_test["card_id"].values})
df_submission["target"] = predictions_ridge
df_submission.to_csv("../submission/ridge12131724.csv", index=False)

### 5、Lasso模型

In [25]:
folds = KFold(n_splits=5, shuffle=True, random_state=15)
oof_lasso = np.zeros(len(df_train))
predictions_lasso = np.zeros(len(df_test))

tst_data = df_test.copy()
tst_data.fillna((tst_data.mean()), inplace=True)

tst_data = tst_data[features].values

for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train, target)):
    print("fold no.{}".format(fold_+1))
    trn_data, trn_y = df_train.iloc[trn_idx][features], target.iloc[trn_idx].values
    val_data, val_y = df_train.iloc[val_idx][features], target.iloc[val_idx].values
    
    trn_data.fillna((trn_data.mean()), inplace=True)
    val_data.fillna((val_data.mean()), inplace=True)
    
    trn_data = trn_data.values
    val_data = val_data.values

    clf = Lasso(alpha=100)
    clf.fit(trn_data, trn_y)
    
    oof_ridge[val_idx] = clf.predict(val_data)
    predictions_lasso += clf.predict(tst_data) / folds.n_splits

fold no.1
fold no.2
fold no.3
fold no.4
fold no.5


In [26]:
validation_score = np.sqrt(mean_squared_error(target, oof_lasso))
validation_score

3.8705589161316296

In [27]:
df_submission = pd.DataFrame({"card_id": df_test["card_id"].values})
df_submission["target"] = predictions_lasso
df_submission.to_csv("../submission/lasso12131726.csv", index=False)

### 6、stacking融合

In [28]:
train_stack = np.vstack([oof_cb, oof_lgb, oof_xgb, oof_ridge, oof_lasso]).transpose()
test_stack = np.vstack([predictions_cb, predictions_lgb, predictions_xgb, predictions_ridge, predictions_lasso]).transpose()

folds = KFold(n_splits=5, shuffle=True, random_state=15)
oof = np.zeros(train_stack.shape[0])
predictions = np.zeros(test_stack.shape[0])

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_stack, target)):
    print("fold no. {}".format(fold_ + 1))
    trn_data, trn_y = train_stack[trn_idx], target.iloc[trn_idx].values
    val_data, val_y = train_stack[val_idx], target.iloc[val_idx].values

    clf = Ridge(alpha=500)
    clf.fit(trn_data, trn_y)
    
    oof[val_idx] = clf.predict(val_data)
    predictions += clf.predict(test_stack) / folds.n_splits


np.sqrt(mean_squared_error(target, oof))

fold no. 1
fold no. 2
fold no. 3
fold no. 4
fold no. 5


3.686560466875316

In [29]:
df_submission = pd.DataFrame({"card_id": df_test["card_id"].values})
df_submission["target"] = predictions
df_submission.to_csv("../submission/stacking12131728.csv", index=False)
# online=3.739