In [None]:
import os

import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_columns', 500)

from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.figure_factory as ff
%matplotlib inline

In [None]:
# Reference submission; its `rlabel` column is the target used by every
# log-loss computation further down.
SubmissionWithCalibration = pd.read_csv('data/SubmissionWithCalibration.csv')
RealLabels = SubmissionWithCalibration['rlabel']

# One predictions file per network.
XCeptionPredictions = pd.read_csv('data/submission_xception.csv')
ResNetPredictions = pd.read_csv('data/submission_resnext.csv')

# Both networks' `label` columns side by side. The column order
# (ResNet first, XCeption second) matters: the alpha search multiplies this
# frame by [Alpha, 1 - Alpha].
# NOTE(review): rows are assumed to line up across the three CSVs - confirm.
NNPredictions = pd.DataFrame({
    "ResNetPredictions": ResNetPredictions['label'],
    "XCeptionPredictions": XCeptionPredictions['label'],
})

NNPredictions

In [None]:
# Report each network's log loss against the reference labels.
for Caption, Column in (("XCeption loss", "XCeptionPredictions"),
                        ("ResNet loss", "ResNetPredictions")):
    print(Caption, log_loss(RealLabels, NNPredictions[Column]))

In [None]:
# Overlay the reference labels with each network's predictions.
# Every series carries its own label, so the legend entries are tied to the
# artists they describe instead of relying on the order in which they were drawn.
sns.distplot(RealLabels, label='Real')

sns.distplot(NNPredictions.XCeptionPredictions, label='XCeption')

sns.distplot(NNPredictions.ResNetPredictions, label='ResNet')

plt.legend()

In [None]:
# Candidate blend weights: 1001 evenly spaced values covering [0, 1].
Alphas = np.linspace(0, 1, 1001)


def _blend_log_loss(weight):
    """Log loss of the blend that applies `weight` to the first column of
    NNPredictions and `1 - weight` to the second."""
    blended = np.dot(NNPredictions, [weight, 1 - weight])
    return log_loss(RealLabels, blended)


LogLoss = np.array([_blend_log_loss(weight) for weight in Alphas])

# Position of the smallest loss; used for both the weight and the loss itself.
BestIndex = LogLoss.argmin()

BestAlpha = Alphas[BestIndex]

BestLogLoss = LogLoss[BestIndex]

print(BestAlpha)

print(BestLogLoss)

In [None]:
# Blend the two networks with the selected weight: BestAlpha on the ResNet
# predictions, the remainder on the XCeption predictions.
BestEnsembleByAlphaMethod = (
    ResNetPredictions.label.mul(BestAlpha)
    .add(NNPredictions.XCeptionPredictions.mul(1 - BestAlpha))
)

In [None]:
# Compare the reference labels with the alpha-weighted blend.
# Labels are attached to each plot so the legend does not depend on draw order.
sns.distplot(RealLabels, label='Real')

sns.distplot(BestEnsembleByAlphaMethod, label='AlphaMethod')

plt.legend()

In [None]:
from sklearn.linear_model import LogisticRegression as LR

# Calibrate the blended score with a single-feature logistic regression.
# The dual formulation (dual=True) is only implemented by the liblinear solver,
# so the solver is named explicitly. Newer scikit-learn releases default to
# lbfgs, which rejects dual=True with a ValueError.
Model = LR(penalty = 'l2', dual = True, tol = 0.0001, C = 1, fit_intercept = True, intercept_scaling = 1.0,
           class_weight = None, random_state = None, solver = 'liblinear')

# scikit-learn expects a 2-D feature matrix: one column, one row per sample.
EnsembleColumn = np.array(BestEnsembleByAlphaMethod).reshape(-1, 1)

Model.fit(EnsembleColumn, RealLabels)

# Probability of the positive class (second column of predict_proba).
CalibratedPrediction = Model.predict_proba(EnsembleColumn)[:, 1]

# In-sample log loss: the model is scored on the same rows it was fitted on.
print(log_loss(RealLabels, CalibratedPrediction))

In [None]:
# Compare the reference labels with the calibrated predictions.
# Labelled like the other overlay plots so the two distributions can be told apart.
sns.distplot(RealLabels, label='Real')

sns.distplot(CalibratedPrediction, label='Calibrated')

plt.legend()

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space: 66 log-spaced values of C between 1e-3 and 1e3, for both penalties.
grid = {"C": np.logspace(-3,3,66), "penalty": ["l1", "l2"]}

# liblinear supports both "l1" and "l2". The default lbfgs solver of newer
# scikit-learn releases cannot fit the "l1" candidates: those fits fail and are
# scored NaN, so half of the grid would be silently dropped.
logreg = LR(solver = 'liblinear', random_state = 666)

# cv=None uses GridSearchCV's default cross-validation; with no `scoring`
# argument the estimator's own score (mean accuracy) ranks the candidates.
logreg_cv = GridSearchCV(logreg, grid, cv = None)

# One column, one row per sample.
EnsembleColumn = np.array(BestEnsembleByAlphaMethod).reshape(-1, 1)

logreg_cv.fit(EnsembleColumn, RealLabels)

# Positive-class probabilities from the refitted best estimator.
bests = logreg_cv.predict_proba(EnsembleColumn)[:,1]

print("accuracy:",logreg_cv.best_score_)

print("tuned hpyerparameters:",logreg_cv.best_params_)

# In-sample log loss of the best estimator (same rows used for fitting).
print("log loss:", log_loss(RealLabels, bests))

In [None]:
# Final calibration model with an L1 penalty.
# The solver is pinned to liblinear because the default lbfgs solver of newer
# scikit-learn releases does not support penalty='l1' and raises on fit.
# NOTE(review): C = 1.7 is presumably taken from the grid search - confirm.
BestModel = LR(penalty = 'l1', C = 1.7, solver = 'liblinear', fit_intercept = True,
               intercept_scaling = 1.0, class_weight = None, random_state = None)

# One column, one row per sample.
x = np.array(BestEnsembleByAlphaMethod).reshape(-1, 1)

BestModel.fit(x, RealLabels)

# Positive-class probabilities.
BestResult = BestModel.predict_proba(x)[:,1]

# In-sample log loss (scored on the rows used for fitting).
print("log loss:", log_loss(RealLabels, BestResult))

In [None]:
# Show the fitted slope and intercept of the calibration model, one per line.
print(BestModel.coef_, BestModel.intercept_, sep="\n")

In [None]:
# Average calibrated positive-class probability over all rows.
np.mean(BestResult)