In [1]:
import numpy as np 
import pandas as pd 
import scipy as sc
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# 忽视警告
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

from tqdm import tqdm_notebook
import datetime
import time
import random
from joblib import Parallel, delayed

# 要用的模型，以及一些预处理的方法
from tensorflow import keras
from catboost import Pool, CatBoostRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV, KFold, RandomizedSearchCV

In [2]:
# Load the two engineered training feature sets and their target files.
train_X_0 = pd.read_csv("../input/feature1/train_X_features_865.csv")
train_X_1 = pd.read_csv("../input/feature2/train_X_features_865_1.csv")
y_0 = pd.read_csv("../input/feature1/train_y.csv", index_col=False,  header=None)
y_1 = pd.read_csv("../input/feature2/train_y1.csv", index_col=False,  header=None)

# Stack both feature sets row-wise; ignore_index renumbers the rows 0..n-1 so
# positional fold indices can be used against the combined frame.
train_X = pd.concat([train_X_0, train_X_1], axis=0, ignore_index=True)

# pandas names a header-less first CSV column "Unnamed: 0"; here it holds
# 0, 1, 2, ... and looks like a saved row index rather than an engineered
# feature. Drop it so neither the scaler nor the model is fitted on row position.
# errors="ignore" keeps the cell working if the files are re-exported without it.
train_X = train_X.drop(columns=["Unnamed: 0"], errors="ignore")
train_X.head()

Unnamed: 0,FFT_Mag_01q0,FFT_Mag_10q0,FFT_Mag_90q0,FFT_Mag_99q0,FFT_Mag_mean0,FFT_Mag_std0,FFT_Mag_max0,FFT_Phz_mean0,FFT_Phz_std0,FFT_Mag_01q2500,FFT_Mag_10q2500,FFT_Mag_90q2500,FFT_Mag_99q2500,FFT_Mag_mean2500,FFT_Mag_std2500,FFT_Mag_max2500,FFT_Phz_mean2500,FFT_Phz_std2500,FFT_Mag_01q5000,FFT_Mag_10q5000,FFT_Mag_90q5000,FFT_Mag_99q5000,FFT_Mag_mean5000,FFT_Mag_std5000,FFT_Mag_max5000,FFT_Phz_mean5000,FFT_Phz_std5000,FFT_Mag_01q7500,FFT_Mag_10q7500,FFT_Mag_90q7500,FFT_Mag_99q7500,FFT_Mag_mean7500,FFT_Mag_std7500,FFT_Mag_max7500,FFT_Phz_mean7500,FFT_Phz_std7500,FFT_Mag_01q10000,FFT_Mag_10q10000,FFT_Mag_90q10000,FFT_Mag_99q10000,...,q01_roll_std_100,q05_roll_std_100,q95_roll_std_100,q99_roll_std_100,av_change_abs_roll_std_100,av_change_rate_roll_std_100,abs_max_roll_std_100,ave_roll_mean_100,std_roll_mean_100,max_roll_mean_100,min_roll_mean_100,q01_roll_mean_100,q05_roll_mean_100,q95_roll_mean_100,q99_roll_mean_100,av_change_abs_roll_mean_100,av_change_rate_roll_mean_100,abs_max_roll_mean_100,ave_roll_std_1000,std_roll_std_1000,max_roll_std_1000,min_roll_std_1000,q01_roll_std_1000,q05_roll_std_1000,q95_roll_std_1000,q99_roll_std_1000,av_change_abs_roll_std_1000,av_change_rate_roll_std_1000,abs_max_roll_std_1000,ave_roll_mean_1000,std_roll_mean_1000,max_roll_mean_1000,min_roll_mean_1000,q01_roll_mean_1000,q05_roll_mean_1000,q95_roll_mean_1000,q99_roll_mean_1000,av_change_abs_roll_mean_1000,av_change_rate_roll_mean_1000,abs_max_roll_mean_1000
0,139.522396,409.718152,2599.097192,4061.567699,1345.706663,938.17566,11969.918774,0.022472,0.913523,237.008292,731.742174,4249.831881,6874.568414,2317.422748,1425.766416,8646.302978,-0.016085,0.907828,448.85426,1338.451624,6620.832265,9452.082006,3818.343685,2036.886799,13104.251576,0.006786,0.897903,566.136528,1828.685542,11398.359936,18595.373516,5886.087254,3894.189563,23532.881664,-0.036087,0.909921,256.880152,933.99809,5958.916634,10835.791291,...,2.275451,2.44278,8.526104,18.892797,2e-05,74941.769634,50.90974,4.618451,0.436482,12.19,-2.4,3.62,3.93,5.29,5.66,5.203469e-06,74938.678297,12.19,4.20959,2.974309,28.705276,2.50263,2.60783,2.672366,10.317476,14.10714,1.35217e-05,74563.128945,28.705276,4.618982,0.250223,5.43,3.926,4.034,4.201,5.028,5.195,4.080537e-06,74563.065284,5.43
1,103.006025,432.256164,3894.194205,8899.077054,1778.081264,1753.736076,13008.467215,-0.016746,0.913844,265.294292,1096.751852,7355.981733,11124.480251,3806.170646,2546.431723,17024.790802,-0.003393,0.920536,647.666936,2056.42754,11181.052161,16426.98266,6207.629093,3586.244884,23566.293654,-0.001122,0.908535,670.834218,2226.225751,15177.013868,25823.792387,7711.334054,5419.906778,35981.079671,-0.006243,0.922031,332.465194,1068.575688,8027.668012,14524.229995,...,2.322834,2.510906,13.634641,30.138576,3.3e-05,74709.627147,52.708284,4.32541,0.513053,11.78,-2.7,2.99,3.62,5.0,5.57,3.802535e-06,74705.618281,11.78,5.414893,4.472418,29.594507,2.548829,2.664467,2.760705,16.239974,23.913015,-9.04588e-05,74208.764935,29.594507,4.325288,0.207712,5.216,3.612,3.812,3.99,4.663,4.827,-7.516779e-07,74208.713592,5.216
2,121.840541,354.13224,1812.76244,2953.824113,1030.634121,684.285591,10665.922219,-0.033928,0.928496,235.951723,714.632635,3486.537639,5040.008929,1975.605682,1095.447506,7816.698187,-0.005391,0.917317,491.740705,1354.334138,6334.734457,9033.2772,3670.042914,1931.795674,12545.320475,-0.007111,0.903919,435.187536,1456.769253,9188.303499,14115.878775,4780.926331,3099.43309,19756.115049,-0.006067,0.904687,246.821665,796.153576,4675.630875,7543.95228,...,2.256304,2.409472,8.253215,15.107173,-3.3e-05,74872.665159,34.318383,4.310323,0.401061,7.06,1.85,3.37,3.68,4.96,5.27,-8.672448e-07,74863.629626,7.06,3.922947,2.161912,16.474262,2.456007,2.581889,2.650754,8.401994,14.176844,-7.340644e-06,74532.874909,16.474262,4.310783,0.238572,5.049,3.676,3.811,3.921,4.707,4.835,-1.174497e-06,74532.22223,5.049
3,101.571556,334.674233,1639.884609,2581.813048,964.089147,587.65103,9512.216925,0.014567,0.903097,176.859785,481.129483,2268.498888,3233.275867,1309.797183,697.2699,4148.957142,-0.00628,0.906151,250.144239,662.097318,3122.971376,4407.508271,1817.541485,952.18254,5884.476832,-0.010407,0.907498,262.344445,706.090616,4182.270206,6690.867135,2270.692048,1414.974192,8266.686137,0.027824,0.913188,187.227021,542.287912,2838.605178,4923.677013,...,2.207677,2.335497,3.704693,7.41549,-3e-06,74975.714895,23.088543,4.543683,0.349356,7.31,2.31,3.73,3.97,5.11,5.37,6.004003e-07,74969.72507,7.31,2.942335,0.820362,9.268846,2.437511,2.525215,2.580794,4.529544,7.517345,-2.720343e-07,74586.537529,9.268846,4.544816,0.207969,5.076,3.878,4.009,4.205,4.89,4.96,1.57047e-06,74586.537529,5.076
4,193.779369,602.762412,9999.530411,17349.812757,3346.487848,4028.266506,19602.021689,-0.01852,0.897478,544.992831,2009.314064,19216.443087,28079.956253,9516.446876,6616.195101,33997.69598,0.005338,0.907971,1811.03613,5807.624888,33631.964685,47794.755493,17470.873596,10592.78526,56225.168772,-0.013342,0.914594,1481.614653,4331.207985,26813.695795,42300.444512,13985.6972,9145.840345,59767.643908,-0.028152,0.904733,374.326102,1286.247534,12241.92251,25798.892511,...,2.419011,2.599048,14.802593,45.381436,0.00049,74780.41765,299.865819,4.60795,0.97532,34.59,-25.12,3.04,3.86,5.32,6.22,3.135424e-05,74778.225834,34.59,6.863442,12.795296,152.640688,2.619829,2.723817,2.816206,17.081704,50.443731,0.0001401113,74401.529958,152.640688,4.608671,0.230372,6.991,2.321,4.121,4.265,4.966,5.106,4.006711e-06,74401.059981,6.991


In [3]:
# Stack the two target frames and renumber the rows 0..n-1 in one step.
y = pd.concat([y_0, y_1], axis=0, ignore_index=True)
y[0].shape

(33000,)

In [4]:
# Target vector: column 0 of `y` as a plain Series with a fresh default index.
train_y = pd.Series(data=y[0].to_numpy())

In [5]:
# Load the engineered test-set features and show their dimensions.
test_X = pd.read_csv(filepath_or_buffer="../input/feature2/test_X_features_10.csv")
test_X.shape

(2624, 866)

In [6]:
# Standardise the features: learn mean/std on the training frame only, then
# apply the same transform to both the training and the test frame.
train_columns = train_X.columns
scaler = StandardScaler()
scaler.fit(train_X[train_columns])

train_X[train_columns] = scaler.transform(train_X[train_columns])
test_X[train_columns] = scaler.transform(test_X[train_columns])

In [7]:
# Number of cross-validation folds.
n_fold = 5
# Columns used as model inputs: every column of the training frame.
train_columns = train_X.columns

In [None]:
%%time
# 5折交叉验证
folds = KFold(n_splits=n_fold, shuffle=True, random_state=42)

# oof，stacking时需要用到的训练集
oof = np.zeros(len(train_X))

# predictions放预测的结果
predictions = np.zeros(len(test_X))

#run model
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_X,train_y.values)):
    strLog = "fold {}".format(fold_)
    print(strLog)

    X_tr, X_val = train_X[train_columns].iloc[trn_idx], train_X[train_columns].iloc[val_idx]
    y_tr, y_val = train_y.iloc[trn_idx], train_y.iloc[val_idx]
    
    # 使用catboostregressor进行回归，estimaters个数为25000
    model = CatBoostRegressor(n_estimators=25000, verbose=-1, objective="MAE", loss_function="MAE", boosting_type="Ordered", task_type="GPU")
   # 对数据集进行拟合
    model.fit(X_tr, 
              y_tr, 
              eval_set=[(X_val, y_val)], 
              verbose=2500, 
              early_stopping_rounds=500)
    oof[val_idx] = model.predict(X_val)
    predictions += model.predict(test_X[train_columns]) / folds.n_splits

# 交叉验证的分数
cv_score = mean_absolute_error(train_y, oof)
print(cv_score)

fold 0
0:	learn: 5.6665578	test: 5.6577379	best: 5.6577379 (0)	total: 67.4ms	remaining: 28m 4s


In [None]:
# Persist the results: out-of-fold predictions and the submission file.
today = str(datetime.date.today())

# Out-of-fold predictions as a single-column table.
cat_saved = pd.DataFrame({'oof': oof})
cat_saved.to_csv('cat_oof.csv', index=False)

# Fill the sample submission with the fold-averaged test predictions and
# write it out under a name that carries the CV score.
submission = pd.read_csv('../input/LANL-Earthquake-Prediction/sample_submission.csv')
submission["time_to_failure"] = predictions
submission_path = f'CatBoost_submission_{cv_score:.3f}.csv'
submission.to_csv(submission_path, index=False)
submission.head()