In [1]:
import pandas as pd
import joblib
import os

In [2]:
# Test-set feature table; the bare `.shape` below is echoed as the cell output.
data = pd.read_parquet(path='Data/test_all_slopes.parquet')
data.shape

(924621, 1280)

In [3]:
def predict(data, model_dirs=('models_DART_slope/',), model_names=None):
    """Run every saved fold model over ``data`` and collect the predictions.

    Each directory in ``model_dirs`` is listed, and every file whose name
    contains at least one of the ``model_names`` substrings is loaded with
    ``joblib.load`` and asked for ``model.predict(data)``.  The path and the
    on-disk size (in MB, 10**6 bytes) of each model are printed as it is
    loaded.

    Parameters
    ----------
    data
        Feature table handed unchanged to each model's ``predict`` method.
    model_dirs : iterable of str, optional
        Directories to scan for model files.  Defaults to the single
        directory ``'models_DART_slope/'``.
    model_names : iterable of str, optional
        Substrings identifying the model files to use.  Defaults to
        ``'fold_0_iter'`` ... ``'fold_4_iter'``.

    Returns
    -------
    list
        One entry per matching model file, each being whatever that model's
        ``predict`` returned.  The list is empty when nothing matches.

    Notes
    -----
    The result order follows ``os.listdir``, which returns entries in
    arbitrary order; it is NOT guaranteed to be fold 0, 1, 2, ...  Callers
    that label the results must use the order of the printed paths.
    """
    if model_names is None:
        model_names = [
            'fold_0_iter',
            'fold_1_iter',
            'fold_2_iter',
            'fold_3_iter',
            'fold_4_iter',
        ]
    else:
        # Materialise once so a generator can be reused for every file name.
        model_names = list(model_names)

    model_list = []
    for path in model_dirs:
        for fname in os.listdir(path):
            # any(): a file is queued exactly once even if several patterns
            # match its name (a per-pattern loop would load it repeatedly).
            if any(model_name in fname for model_name in model_names):
                # join() also copes with directories given without a
                # trailing separator.
                model_list.append(os.path.join(path, fname))

    pred_list = []
    for model_path in model_list:
        print(model_path)
        print('{:.02f} MB'.format(os.path.getsize(model_path)/1000000))
        # NOTE(security): joblib.load unpickles the file, which can execute
        # arbitrary code -- only point this at model files you trust.
        model = joblib.load(model_path)
        pred_list.append(model.predict(data))

    return pred_list

In [4]:
# Score the test features with every saved fold model. `predict` prints each
# model's path and file size as it goes and returns one prediction per model.
pred_list = predict(data)

models_DART_slope/fold_4_iter_7999_score_0.79699.pkl
92.22 MB
models_DART_slope/fold_2_iter_7090_score_0.79742.pkl
81.84 MB
models_DART_slope/fold_1_iter_6585_score_0.79487.pkl
76.07 MB
models_DART_slope/fold_3_iter_7256_score_0.79238.pkl
83.73 MB
models_DART_slope/fold_0_iter_5239_score_0.80084.pkl
60.41 MB


In [5]:
# Fold labels, hard-coded in the same order as the model paths printed by the
# predict() run above. They must be updated by hand if that listing order
# ever changes, otherwise the columns end up mislabelled.
column_name = ['fold_4', 'fold_2', 'fold_1', 'fold_3', 'fold_0']

# pred_list holds one prediction vector per model (rows); transpose so each
# model becomes a column, then align rows with the input table's index.
pred_df = pd.DataFrame(pred_list).T
pred_df.index = data.index
pred_df.columns = column_name

In [7]:
# Average the fold predictions row-wise and save them as a single
# 'prediction' column.
(
    pred_df
    .mean(axis=1)
    .to_csv('Output/p_DART_AggPcaDifRouSlo_8000_L1.csv', header=['prediction'])
)

In [8]:
# Display the per-fold predictions (last expression, so the notebook renders it).
pred_df

Unnamed: 0_level_0,fold_4,fold_2,fold_1,fold_3,fold_0
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
00000469ba478561f23a92a868bd366de6f6527a684c9a2e78fb826dcac3b9b7,0.030125,0.025676,0.025289,0.040868,0.040344
00001bf2e77ff879fab36aa4fac689b9ba411dae63ae397d4263dafa1daedef5,0.000772,0.001000,0.001126,0.000888,0.001700
0000210045da4f81e5f122c6bde5c2a617d03eef67f82c5e400fc98e7bd43ce8,0.041117,0.042246,0.046200,0.041770,0.043245
00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976cf6e56734528702d694,0.240972,0.264930,0.295015,0.267878,0.257434
00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9a4693dd914fca22557,0.829729,0.858370,0.868450,0.828026,0.838376
...,...,...,...,...,...
ffff952c631f2c911b8a2a8ca56ea6e656309a83d2f64c5d60460dba6dedc41e,0.013971,0.014393,0.010561,0.007580,0.012712
ffffcf5df59e5e0bba2a5ac4578a34e2b5aa64a1546cd3a4f0ca3de613b0b2ad,0.827007,0.791064,0.765546,0.786081,0.781135
ffffd61f098cc056dbd7d2a21380c4804bbfe60856f475cb095d2443a68030f1,0.448092,0.460661,0.447329,0.379277,0.512443
ffffddef1fc3643ea179c93245b68dca0f36941cd83977822e8b356988ca4d07,0.287224,0.253734,0.276857,0.289288,0.243263


In [35]:
# Row-wise mean of the fold predictions (an unnamed Series).
p1 = pred_df.mean(axis=1)

# Saved blend read from disk, re-ordered to the same rows as pred_df.
# .loc raises KeyError if any customer_ID from pred_df is missing in the file.
p2 = (
    pd.read_csv('Output/p_blend_kaggle.csv', index_col='customer_ID')
    .loc[pred_df.index]
)

In [37]:
# Side-by-side frame: the unnamed Series p1 becomes column 0 and p2
# contributes its 'prediction' column.
p_df = pd.concat([p1, p2], axis='columns')
p_df

Unnamed: 0_level_0,0,prediction
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
00000469ba478561f23a92a868bd366de6f6527a684c9a2e78fb826dcac3b9b7,0.032460,0.026103
00001bf2e77ff879fab36aa4fac689b9ba411dae63ae397d4263dafa1daedef5,0.001097,0.000873
0000210045da4f81e5f122c6bde5c2a617d03eef67f82c5e400fc98e7bd43ce8,0.042915,0.042910
00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976cf6e56734528702d694,0.265246,0.245682
00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9a4693dd914fca22557,0.844590,0.854114
...,...,...
ffff952c631f2c911b8a2a8ca56ea6e656309a83d2f64c5d60460dba6dedc41e,0.011843,0.010781
ffffcf5df59e5e0bba2a5ac4578a34e2b5aa64a1546cd3a4f0ca3de613b0b2ad,0.790167,0.774996
ffffd61f098cc056dbd7d2a21380c4804bbfe60856f475cb095d2443a68030f1,0.449560,0.399533
ffffddef1fc3643ea179c93245b68dca0f36941cd83977822e8b356988ca4d07,0.270073,0.273294


In [38]:
# Weighted blend: column 0 is the fold-mean prediction from this notebook,
# 'prediction' is the blend loaded from disk.
DART_WEIGHT, BLEND_WEIGHT = 0.05, 0.95
ensemble = DART_WEIGHT*p_df[0] + BLEND_WEIGHT*p_df['prediction']

In [39]:
# Write the blended scores under a single 'prediction' column.
ensemble.to_csv(
    'Output/p_ensemble_DART_AggPcaDifRouSlo_8000_L1__MybestKaggle.csv',
    header=['prediction'],
)
