## Import

In [15]:
# for read data
import os
import numpy as np
import pandas as pd
import warnings; warnings.filterwarnings("ignore")

# for choose models
import matplotlib.pyplot as plt
import seaborn as sns
plt.rc('font', family='Malgun Gothic')
plt.rc('axes', unicode_minus=False)

# averaging
from scipy.stats import gmean, hmean

## Read Data

In [72]:
sub_path = (os.path.abspath("./submission"))

# 1차 1~3등 submission files
first = pd.read_csv(sub_path +'/first_LGBM.csv', encoding='cp949')
second = pd.read_csv(sub_path +'/second_CAT.csv', encoding='cp949')
# third = pd.read_csv(sub_path +'/third_CAT.csv', encoding='cp949')

cat_1 = pd.read_csv(sub_path +'/MCAT_FLGBM.csv', encoding='cp949')
# cat_2 = pd.read_csv(sub_path +'/MCAT_FCAT.csv', encoding='cp949')
# cat_3 = pd.read_csv(sub_path +'/MCAT_FW2V.csv', encoding='cp949')

# lgbm_1 = pd.read_csv(sub_path +'/MLGBM_FCAT.csv', encoding='cp949')
# lgbm_2 = pd.read_csv(sub_path +'/MLGBM_FLGBM.csv', encoding='cp949')

dnn_1 = pd.read_csv(sub_path +'/MDNN_FW2V.csv', encoding='cp949')
# dnn_2 = pd.read_csv(sub_path +'/MDNN_FCAT.csv', encoding='cp949')

## Model List

In [73]:
pred = [first, second] + [cat_1] + [dnn_1]
name = ['LGBM', 'CAT'] +['CAT'] + ['DNN']

# public score를 기준으로 한다.
logloss = [1.46109, 1.47846] + [1.48775] + [1.5182]

# 추후에 합칠 ID 저장
test_ID = pred[0]['ID']

In [74]:
ensemble_results = []
for i in range(len(pred)):
    del pred[i]['ID']
    ensemble_results.append(pred[i]['F20'])

## Visualize Correlation

In [75]:
corr = pd.DataFrame(ensemble_results).T.corr().mean(axis=0).values
df = pd.DataFrame({'model': name, 'logloss': [-1*i for i in logloss], 'cor': corr})

In [None]:
plt.figure(figsize=(10,8))
g = sns.scatterplot(x="cor", y="logloss", data=df, s=40, color='red')

for line in range(0, df.shape[0]):
     g.text(df.cor[line]+0.0004, df.logloss[line]-0.00015, 
            f'{df.model[line]}({df.logloss[line]})', horizontalalignment='left', 
            size='medium', color='black', weight='semibold')
        
plt.xlim((df.cor.min()-0.001,df.cor.max()+0.001))
plt.ylim((df.logloss.min()-0.1,df.logloss.max()+0.1))
plt.xlabel('Mean agreement (Corr.)')
plt.ylabel('Public score (Neg Logloss)')

plt.grid()
plt.show()

## Averaging

In [59]:
# Logloss가 큰 순서대로 정렬한다.
# thrid+cat_1+cat_2
# thrid+second+first
averaging = [DNN, third, cat_1, second, first]

- **[Simultaneously]**

In [60]:
# 기하평균
# Logloss는 기하평균을 권장한다.
GMEAN = pd.DataFrame(gmean(averaging))
GMEAN

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.101489,0.133709,0.070646,0.141738,0.088715,0.233243,0.050666,0.154522
1,0.044380,0.174494,0.306773,0.034579,0.007985,0.058436,0.285844,0.054394
2,0.748139,0.155759,0.021696,0.011342,0.015880,0.019343,0.005257,0.008933
3,0.629535,0.111282,0.038583,0.034832,0.034939,0.096939,0.015296,0.024203
4,0.859743,0.066927,0.014557,0.007105,0.022857,0.013829,0.003970,0.005039
...,...,...,...,...,...,...,...,...
14375,0.071223,0.503117,0.041418,0.017506,0.014460,0.256474,0.073794,0.014246
14376,0.404570,0.233996,0.114014,0.056691,0.032308,0.072168,0.044685,0.036119
14377,0.342238,0.179387,0.101253,0.101026,0.063998,0.096907,0.037926,0.065670
14378,0.073948,0.510954,0.071172,0.014032,0.014308,0.227797,0.056137,0.006724


In [61]:
# 조화평균
HMEAN = pd.DataFrame(hmean(averaging))
HMEAN

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.098327,0.131688,0.066451,0.140380,0.084575,0.231852,0.045392,0.148390
1,0.041213,0.157239,0.304487,0.033847,0.007579,0.055857,0.277589,0.053581
2,0.745736,0.145770,0.020383,0.011117,0.015533,0.018140,0.004934,0.008747
3,0.627103,0.108575,0.036923,0.031756,0.033356,0.091205,0.014416,0.022838
4,0.859102,0.063824,0.013728,0.006518,0.022102,0.013578,0.003836,0.004794
...,...,...,...,...,...,...,...,...
14375,0.069676,0.502410,0.040910,0.014487,0.014233,0.254584,0.070889,0.012904
14376,0.402660,0.233727,0.111897,0.055695,0.032027,0.071892,0.043342,0.035570
14377,0.340048,0.176667,0.099543,0.100262,0.061628,0.093878,0.036551,0.065429
14378,0.073096,0.508403,0.060544,0.013715,0.013503,0.218404,0.052518,0.006273


In [62]:
# 산술평균
MEAN = pd.DataFrame(np.mean(averaging, axis=0))
MEAN

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.104781,0.135787,0.075302,0.143055,0.093159,0.234755,0.057039,0.160707
1,0.048066,0.190038,0.308901,0.035358,0.008529,0.061386,0.295034,0.055228
2,0.750606,0.164474,0.023277,0.011561,0.016225,0.020529,0.005628,0.009128
3,0.632168,0.113903,0.040201,0.038832,0.036753,0.102322,0.016425,0.025441
4,0.860383,0.070453,0.015497,0.007848,0.023683,0.014072,0.004117,0.005314
...,...,...,...,...,...,...,...,...
14375,0.072596,0.503848,0.041984,0.019755,0.014704,0.258379,0.077597,0.015502
14376,0.406420,0.234256,0.115987,0.057765,0.032589,0.072439,0.046237,0.036712
14377,0.344352,0.182058,0.102917,0.101811,0.066610,0.100403,0.039081,0.065912
14378,0.074940,0.513538,0.087341,0.014327,0.014933,0.235764,0.059972,0.007131


In [63]:
# 멱평균(p > 1)
p = 2.5    
p_mean = pd.DataFrame(0, index=averaging[0].index, columns=averaging[0].columns)
for i in averaging:
    p_mean = p_mean + i**p
    
POWER = (p_mean / len(averaging))**(1/p)
POWER

Unnamed: 0,F20,F30,F40,F50,M20,M30,M40,M50
0,0.109611,0.138965,0.082147,0.144933,0.099797,0.237239,0.066135,0.169656
1,0.053526,0.207217,0.311793,0.036573,0.009546,0.065967,0.309540,0.056507
2,0.754399,0.174340,0.025854,0.011865,0.016715,0.022058,0.006201,0.009429
3,0.636491,0.117524,0.042307,0.045963,0.039568,0.108885,0.018389,0.026924
4,0.861338,0.076262,0.016909,0.009069,0.024966,0.014407,0.004345,0.005737
...,...,...,...,...,...,...,...,...
14375,0.074316,0.504987,0.042931,0.021680,0.015095,0.261239,0.084744,0.017133
14376,0.409063,0.234630,0.118639,0.059446,0.033006,0.072836,0.048808,0.037657
14377,0.347366,0.185853,0.105234,0.103018,0.070725,0.106371,0.040407,0.066272
14378,0.076683,0.517452,0.114321,0.014725,0.015555,0.244946,0.065300,0.007623


In [64]:
# 이차평균
p = 2 
p_mean = pd.DataFrame(0, index=averaging[0].index, columns=averaging[0].columns)
for i in averaging:
    p_mean = p_mean + i**p
    
QUADRATIC = (p_mean / len(averaging))**(1/p)
QUADRATIC

Unnamed: 0,F20,F30,F40,F50,M20,M30,M40,M50
0,0.108041,0.137901,0.079958,0.144321,0.097631,0.236384,0.063341,0.166744
1,0.051795,0.202311,0.310869,0.036166,0.009191,0.064466,0.304701,0.056079
2,0.753124,0.171468,0.024998,0.011768,0.016557,0.021591,0.006014,0.009328
3,0.635001,0.116366,0.041659,0.043539,0.038646,0.106925,0.017723,0.026482
4,0.861020,0.074290,0.016455,0.008667,0.024541,0.014300,0.004270,0.005599
...,...,...,...,...,...,...,...,...
14375,0.073786,0.504602,0.042604,0.021174,0.014962,0.260287,0.082225,0.016622
14376,0.408200,0.234508,0.117796,0.058883,0.032867,0.072706,0.047934,0.037337
14377,0.346383,0.184624,0.104490,0.102613,0.069350,0.104310,0.040014,0.066153
14378,0.076070,0.516144,0.105783,0.014599,0.015383,0.242224,0.063630,0.007475


- **[Repeatedly]**

In [65]:
# 기하평균
# 추천!!
R_GMEAN = gmean(averaging[:2])
for i in averaging[2:]:
    R_GMEAN = gmean([R_GMEAN, i])
    
R_GMEAN = pd.DataFrame(R_GMEAN)
R_GMEAN

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.110376,0.125294,0.062565,0.129030,0.108790,0.254712,0.044012,0.137415
1,0.048793,0.196095,0.322067,0.030908,0.007187,0.051412,0.267893,0.051345
2,0.731741,0.177909,0.021977,0.012569,0.013742,0.016141,0.004801,0.009333
3,0.610843,0.093577,0.043606,0.040320,0.040961,0.115921,0.015660,0.027264
4,0.860158,0.062894,0.015721,0.009073,0.023391,0.013672,0.004215,0.005292
...,...,...,...,...,...,...,...,...
14375,0.079120,0.488276,0.043810,0.020791,0.014258,0.253442,0.077850,0.014995
14376,0.376054,0.235091,0.126091,0.064229,0.031053,0.072416,0.054856,0.033786
14377,0.358235,0.162447,0.103549,0.105671,0.066430,0.094407,0.035643,0.066119
14378,0.082832,0.498300,0.062370,0.015248,0.016052,0.252224,0.050230,0.006598


In [66]:
# 조화평균
R_HMEAN = hmean(averaging[:2])
for i in averaging[2:]:
    R_HMEAN = hmean([R_HMEAN, i])
    
R_HMEAN = pd.DataFrame(R_HMEAN)
R_HMEAN

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.106136,0.124086,0.060336,0.127744,0.103370,0.252705,0.041805,0.129256
1,0.044940,0.183780,0.320915,0.030418,0.007048,0.050008,0.261852,0.050839
2,0.730136,0.168450,0.020777,0.012374,0.013508,0.015033,0.004610,0.009186
3,0.609863,0.091748,0.042337,0.037463,0.038658,0.112377,0.015084,0.026486
4,0.859587,0.060892,0.014708,0.008080,0.022653,0.013436,0.004074,0.005043
...,...,...,...,...,...,...,...,...
14375,0.078191,0.487949,0.043340,0.019138,0.014136,0.251851,0.074658,0.014384
14376,0.374223,0.234986,0.123776,0.063296,0.030864,0.072185,0.053061,0.033538
14377,0.356988,0.160793,0.102428,0.105120,0.065166,0.093133,0.034395,0.065940
14378,0.081256,0.496606,0.056864,0.015061,0.015676,0.247828,0.047602,0.006411


In [67]:
# 산술평균
R_MEAN = np.mean(averaging[:2], axis=0)
for i in averaging[2:]:
    R_MEAN = np.mean([R_MEAN, i], axis=0)
    
R_MEAN = pd.DataFrame(R_MEAN)
R_MEAN

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.114496,0.126542,0.065472,0.130377,0.113587,0.256688,0.047231,0.147040
1,0.052608,0.206176,0.323102,0.031459,0.007391,0.053304,0.274897,0.051859
2,0.733433,0.184733,0.023525,0.012737,0.014002,0.017493,0.005033,0.009491
3,0.611924,0.095660,0.044644,0.043873,0.043149,0.118494,0.016286,0.027860
4,0.860735,0.064963,0.016719,0.010115,0.024113,0.013896,0.004353,0.005535
...,...,...,...,...,...,...,...,...
14375,0.079879,0.488624,0.044311,0.021690,0.014396,0.254985,0.081920,0.015559
14376,0.377978,0.235193,0.128051,0.065074,0.031242,0.072640,0.056510,0.034064
14377,0.359415,0.164265,0.104600,0.106230,0.067638,0.095802,0.036730,0.066302
14378,0.084430,0.499989,0.070728,0.015416,0.016296,0.255431,0.053419,0.006774


In [68]:
# # 멱평균(p > 1)
# p = 2.5    
# p_mean = pd.DataFrame(0, index=averaging[0].index, columns=averaging[0].columns)
# for i in averaging:
#     p_mean = p_mean + i**p
    
# POWER = (p_mean / len(averaging))**(1/p)

In [55]:
# # 이차평균
# p = 2 
# p_mean = pd.DataFrame(0, index=averaging[0].index, columns=averaging[0].columns)
# for i in averaging:
#     p_mean = p_mean + i**p
    
# POWER = (p_mean / len(averaging))**(1/p)

### Make submission file

In [69]:
sub = GMEAN

In [70]:
sub = pd.concat([test_ID, sub], axis=1)
sub.columns = ['ID', 'F20','F30','F40','F50','M20','M30','M40','M50']

In [71]:
t = pd.Timestamp.now()
fname = f'/ensemble_{t.month:02}{t.day:02}_{t.hour:02}{t.minute:02}.csv'
sub.to_csv(sub_path+fname, index=False, encoding='cp949')
print(f"'{fname}' is ready to submit.")

'/ensemble_0616_0250.csv' is ready to submit.
