## 模型融合

In [None]:
# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize
import h5py
import os

# Blend the per-model class-probability matrices into one weighted average
# and write the five highest-scoring columns per row as a submission file.
submission = pd.read_csv('../input/sample_submission.csv')

# (HDF5 file, dataset name, blend weight) for each model.
blend_sources = [
    ('./output/probs/allpreds.h5', 'preds_latest', 0.31),  # RF results
    ('./output/probs/allpreds_xgb.h5', 'preds', 0.39),     # XGB results
    ('./output/probs/allpreds_sgd.h5', 'preds', 0.27),     # SGD results
    ('./output/probs/allpreds_bnb.h5', 'preds', 0.03),     # Bernoulli NB results
]

preds = None
for path, dataset, weight in blend_sources:
    with h5py.File(path, 'r') as hf:
        predshf = hf[dataset]
        # `Dataset.value` was removed in h5py 3.0; `dataset[()]` reads the
        # whole array into memory on both old and new h5py versions.
        # Each row is L1-normalised before weighting.
        weighted = weight * normalize(predshf[()], norm='l1', axis=1)
    preds = weighted if preds is None else preds + weighted

print('generating submission')
# Negate so that argsort's ascending order yields the largest scores first,
# then keep the first five column indices of every row.
col_ind = np.argsort(-preds, axis=1)[:, :5]
hc = [' '.join(row.astype(str)) for row in col_ind]

# Reuse the ids and the column names of the sample submission.
sub = pd.DataFrame(data=hc, index=submission.id)
sub.reset_index(inplace=True)
sub.columns = submission.columns
sub.to_csv('./output/pred_sub.csv', index=False)

## 提交结果格式处理

In [None]:
# Load both prediction files and turn each row's space-separated
# 'hotel_cluster' string into a list of tokens.
#
# The column is read as text explicitly: if every row happened to hold a
# single cluster id (or nothing), pandas would infer a numeric dtype and the
# .split() calls below would fail on float values.
match_pred = pd.read_csv('./output/match_pred.csv',
                         dtype={'hotel_cluster': str})
# Rows without any entry come back as NaN; make them empty strings so that
# .split() works (it then yields [''], which f5 drops later on).
match_pred = match_pred['hotel_cluster'].fillna('').tolist()
match_pred = [s.split(' ') for s in match_pred]

pred_sub = pd.read_csv('./output/pred_sub.csv',
                       dtype={'hotel_cluster': str})
ids = pred_sub.id
pred_sub = pred_sub['hotel_cluster'].fillna('').tolist()
pred_sub = [s.split(' ') for s in pred_sub]
# NOTE(review): the merge further down pairs the two lists by position, so
# both files are assumed to list the same ids in the same order - confirm.

# Take the top 5: f5 drops repeats and blank entries while keeping order;
# the caller slices the result down to five items.
def f5(seq, idfun=None):
    """Return the items of *seq* in first-seen order, without repeats.

    Args:
        seq: Iterable of items to filter.
        idfun: Optional callable mapping an item to the key used to detect
            repeats. Defaults to the item itself. Keys must be hashable.

    Returns:
        A new list holding the first item seen for each distinct key.
        Items whose key equals the empty string are left out.
    """
    key_of = idfun if idfun is not None else (lambda x: x)
    seen_keys = set()
    unique = []
    for element in seq:
        key = key_of(element)
        is_new = not (key in seen_keys or key == '')
        if is_new:
            seen_keys.add(key)
            unique.append(element)
    return unique
    
# Merge the two ranked lists row by row: entries from match_pred come first,
# pred_sub fills the remaining slots, repeats are dropped by f5 and the
# result is cut to five entries.
full_preds = []
for row, model_ranked in enumerate(pred_sub):
    merged = f5(match_pred[row] + model_ranked)
    full_preds.append(merged[:5])

# Serialise by hand as "id,cluster cluster ..." lines below a header line.
write_p = [" ".join(map(str, clusters)) for clusters in full_preds]
write_frame = ["id,hotel_cluster"]
for row, line in enumerate(write_p):
    write_frame.append("{0},{1}".format(ids[row], line))
with open("./output/predictions.csv", "w+") as f:
    f.write("\n".join(write_frame))