In [1]:
import numpy as np
import pandas as pd

import os
import tqdm

from collections import Counter
from collections import defaultdict

from sklearn.metrics import roc_auc_score

from utils.data import Data, TestData
from utils.validation import MultiClassValidator
from utils.models import CVModel
from utils.text import Tokenizer

# Locations of the two splits; both are read through the project's data wrappers.
TRAIN_DIR = './data/train/'
TEST_DIR = './data/test/'

data = Data(TRAIN_DIR)
test_data = TestData(TEST_DIR)

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

def _as_document(tokens):
    """Join one tokenised record into a single space-separated string."""
    return ' '.join(str(token) for token in tokens)


# Tokenise the X2 field of both splits with the same Tokenizer instance.
tok = Tokenizer(min_df=3)
data.X2_corpus = tok.transform(data.X2)
test_data.X2_corpus = tok.transform(test_data.X2)

# TF-IDF is fitted on the train documents only and then applied to the test documents.
# NOTE(review): TfidfVectorizer's default token_pattern keeps only tokens made of two or
# more word characters. If Tokenizer emits one-character tokens (e.g. single-digit ids)
# they are silently dropped here -- confirm this is intended.
vect = TfidfVectorizer()
train_documents = data.X2_corpus.map(_as_document)
test_documents = test_data.X2_corpus.map(_as_document)
train_corpus = vect.fit_transform(train_documents)
test_corpus = vect.transform(test_documents)

In [3]:
def train(data, get_model, fit_model, n_splits=10, test_data=None, sparse=False, seed=19, verbose=False,
          n_targets=5):
    """Cross-validate one ``CVModel`` per target column and collect its predictions.

    For every target ``t`` in ``1..n_targets`` a new ``CVModel(get_model, fit_model)``
    is fitted on ``data.X2_corpus`` against ``data.Y[str(t)]``. All targets share one
    ``MultiClassValidator`` built from ``data.Y.values``.

    Parameters
    ----------
    data : object
        Must expose ``X2_corpus`` (features) and ``Y`` (supports ``.values`` and
        lookup by the column names ``'1'``, ``'2'``, ...).
    get_model, fit_model : callable
        Forwarded unchanged to ``CVModel``.
    n_splits, seed, sparse
        Forwarded unchanged to ``CVModel.fit``.
    test_data : object, optional
        Must expose ``X2_corpus``. When given, each fitted model also predicts on it.
    verbose : bool
        Print every per-target score and their mean. ``CVModel.fit`` itself is
        always called with ``verbose=False``.
    n_targets : int
        Number of target columns to model. Defaults to 5, the value that used to
        be hard-coded.

    Returns
    -------
    tuple
        ``(mean_score, val_preds)`` when ``test_data`` is None, otherwise
        ``(scores, val_preds, test_preds)``. Note the asymmetry: the first form
        carries the mean score, the second the list of per-target scores (callers
        index it per target). The prediction arrays are the per-target outputs
        stacked with ``np.vstack`` and transposed, i.e. one column per target
        assuming ``CVModel`` returns 1-D vectors.
    """
    val = MultiClassValidator(data.Y.values)
    predict_test = test_data is not None
    scores, val_preds, test_preds = [], [], []

    for target in range(1, n_targets + 1):
        model = CVModel(get_model, fit_model)
        score, preds = model.fit(
            data.X2_corpus, data.Y[str(target)],
            n_splits=n_splits, seed=seed, validator=val,
            sparse=sparse, cache=True, verbose=False
        )
        scores.append(score)
        val_preds.append(preds)
        if verbose:
            print('target {}, score: {:.4f}'.format(target, score))
        if predict_test:
            test_preds.append(model.predict(test_data.X2_corpus))

    mean_score = np.mean(scores)
    if verbose:
        print('macro, score: {:.4f}'.format(mean_score))
    val_preds = np.vstack(val_preds).T

    if not predict_test:
        return mean_score, val_preds
    return scores, val_preds, np.vstack(test_preds).T

In [4]:
from sklearn.decomposition import TruncatedSVD

# Reduce the TF-IDF matrices with a truncated SVD that is fitted on the train matrix only.
# The reduced matrices overwrite X2_corpus on both data objects.
svd_options = dict(n_components=250, n_iter=5, random_state=20)
svd = TruncatedSVD(**svd_options)
train_reduced = svd.fit_transform(train_corpus)
test_reduced = svd.transform(test_corpus)
data.X2_corpus = train_reduced
test_data.X2_corpus = test_reduced

In [5]:
from sklearn.linear_model import LogisticRegression

def get_logreg(C=1., solver='lbfgs'):
    """Return a zero-argument factory for ``LogisticRegression``.

    Each call of the returned factory builds a new estimator with the given
    ``C`` and ``solver`` and with ``fit_intercept=True``.
    """
    def make_logreg():
        settings = dict(C=C, solver=solver, fit_intercept=True)
        return LogisticRegression(**settings)
    return make_logreg

def fit_logreg():
    """Return a fit-and-score callback for the logistic-regression model.

    The callback fits ``model`` on ``(train_X, train_y)`` and returns the ROC AUC
    of the column-1 probabilities predicted for ``test_X`` against ``test_y``.
    """
    def fit_and_score(model, train_X, train_y, test_X, test_y):
        model.fit(train_X, train_y)
        positive_proba = model.predict_proba(test_X)[:, 1]
        return roc_auc_score(test_y, positive_proba)
    return fit_and_score

from sklearn.neighbors import KNeighborsClassifier

def get_knn(n_neighbors=5, metric='minkowski'):
    """Return a zero-argument factory for ``KNeighborsClassifier``.

    Each call of the returned factory builds a new classifier configured with
    the given ``n_neighbors`` and ``metric``.
    """
    def make_knn():
        settings = dict(n_neighbors=n_neighbors, metric=metric)
        return KNeighborsClassifier(**settings)
    return make_knn

def fit_knn():
    """Return a fit-and-score callback for the k-nearest-neighbours model.

    The callback fits ``model`` on ``(train_X, train_y)`` and returns the ROC AUC
    of the column-1 probabilities predicted for ``test_X`` against ``test_y``.
    """
    def fit_and_score(model, train_X, train_y, test_X, test_y):
        model.fit(train_X, train_y)
        positive_proba = model.predict_proba(test_X)[:, 1]
        auc = roc_auc_score(test_y, positive_proba)
        return auc
    return fit_and_score

from lightgbm import LGBMClassifier

def get_lgb(model_params):
    """Return a zero-argument factory for ``LGBMClassifier``.

    ``model_params`` is unpacked as keyword arguments every time the factory is
    called; the mapping is not copied, so later changes to it are picked up.
    """
    def make_lgb():
        classifier = LGBMClassifier(**model_params)
        return classifier
    return make_lgb

def fit_lgb(train_params):
    """Return a fit-and-score callback for the LightGBM model.

    The callback fits ``model`` on ``(train_X, train_y)`` with ``(test_X, test_y)``
    as the evaluation set and ``'auc'`` as the evaluation metric, forwarding
    ``train_params`` as extra keyword arguments to ``fit``. It returns
    ``model.best_score_['valid_0']['auc']``.
    """
    def fit_and_score(model, train_X, train_y, test_X, test_y):
        # Fixed arguments first; a clash with a key in train_params raises TypeError.
        fixed_kwargs = dict(eval_set=(test_X, test_y), eval_metric='auc', verbose=False)
        model.fit(train_X, train_y, **fixed_kwargs, **train_params)
        best = model.best_score_
        return best['valid_0']['auc']
    return fit_and_score

# Constructor keyword arguments for the LightGBM classifier.
lgb_params = dict(
    n_estimators=10000,
    num_leaves=11,
    learning_rate=0.05,
    metrics=None,
)

# Extra keyword arguments forwarded to the model's fit() call.
train_params = dict(early_stopping_rounds=100)

In [6]:
# (name, model factory, fit-and-score callback) for every model trained on X2_corpus.
models = [
    ('X2_logreg', get_logreg(C=0.1, solver='lbfgs'), fit_logreg()),
    ('X2_knn', get_knn(n_neighbors=300, metric='cosine'), fit_knn()),
    ('X2_gbm', get_lgb(lgb_params), fit_lgb(train_params)),
]

# Results keyed by model name; later cells add to and read from these dicts.
val_preds, test_preds, scores = {}, {}, {}

for model_name, get_model, fit_model in models:
    print(model_name)
    model_scores, model_val_preds, model_test_preds = train(
        data, get_model, fit_model,
        n_splits=10, seed=19, sparse=True,
        test_data=test_data, verbose=True,
    )
    scores[model_name] = model_scores
    val_preds[model_name] = model_val_preds
    test_preds[model_name] = model_test_preds
    print()

X2_logreg
target 1, score: 0.5687
target 2, score: 0.6148
target 3, score: 0.6107
target 4, score: 0.6116
target 5, score: 0.6311
macro, score: 0.6074

X2_knn
target 1, score: 0.5800
target 2, score: 0.6156
target 3, score: 0.6123
target 4, score: 0.6069
target 5, score: 0.6229
macro, score: 0.6075

X2_gbm
target 1, score: 0.5724
target 2, score: 0.6213
target 3, score: 0.6063
target 4, score: 0.6071
target 5, score: 0.6319
macro, score: 0.6078



In [7]:
def _attach_x1_features(x2_features, x1):
    """Put ``x2_features`` in a frame indexed like ``x1`` and left-join ``x1`` onto it.

    The columns of the new frame are renamed to ``'X2_<original label>'`` before
    ``x1`` is merged in on the index.
    """
    x2_frame = pd.DataFrame(x2_features, index=x1.index).rename(
        columns=lambda label: 'X2_' + str(label)
    )
    return x2_frame.merge(x1, left_index=True, right_index=True, how='left')


# One helper replaces the two copy-pasted stanzas, with no in-place rename and no
# leftover `tmp` global.
# NOTE(review): this cell reads and overwrites X2_corpus, so it is not idempotent;
# re-running it without re-running the cell that produces X2_corpus merges X1 a
# second time.
data.X2_corpus = _attach_x1_features(data.X2_corpus, data.X1)
test_data.X2_corpus = _attach_x1_features(test_data.X2_corpus, test_data.X1)

In [9]:
from catboost import CatBoostClassifier

def get_cat(model_params):
    """Return a zero-argument factory for ``CatBoostClassifier``.

    ``model_params`` is unpacked as keyword arguments every time the factory is
    called; the mapping is not copied.
    """
    def make_cat():
        classifier = CatBoostClassifier(**model_params)
        return classifier
    return make_cat

def fit_cat(train_params):
    """Return a fit-and-score callback for the CatBoost model.

    The callback fits ``model`` on ``(train_X, train_y)`` with ``(test_X, test_y)``
    as the only evaluation set, forwarding ``train_params`` as extra keyword
    arguments to ``fit``. It returns the best ``'AUC'`` recorded for that
    evaluation set in ``model.best_score_``.

    Raises
    ------
    KeyError
        If ``best_score_`` has no validation entry or no ``'AUC'`` metric.
    """
    def f(model, train_X, train_y, test_X, test_y):
        model.fit(train_X, train_y, eval_set=[(test_X, test_y)], **train_params)
        best = model.best_score_
        # The label of a single evaluation set differs between CatBoost releases:
        # 'validation_0' (what this notebook originally relied on) or 'validation'.
        # Accept either so the callback does not fail with KeyError after an upgrade.
        validation_key = 'validation_0' if 'validation_0' in best else 'validation'
        return best[validation_key]['AUC']
    return f

# Constructor keyword arguments for the CatBoost classifier.
cat_params = dict(
    iterations=10000,
    learning_rate=0.05,
    depth=5,
    eval_metric='AUC',
)

# Extra keyword arguments forwarded to the CatBoost model's fit() call.
cat_train_params = dict(
    early_stopping_rounds=50,
    verbose=False,
)

from xgboost import XGBClassifier

def get_xgb(model_params):
    """Return a zero-argument factory for ``XGBClassifier``.

    ``model_params`` is unpacked as keyword arguments every time the factory is
    called; the mapping is not copied.
    """
    def make_xgb():
        classifier = XGBClassifier(**model_params)
        return classifier
    return make_xgb

def fit_xgb(train_params):
    """Return a fit-and-score callback for the XGBoost model.

    The callback fits ``model`` on ``(train_X, train_y)`` with ``(test_X, test_y)``
    as the only evaluation set and ``verbose=False``, forwarding ``train_params``
    as extra keyword arguments to ``fit``. It returns the ROC AUC of the column-1
    probabilities predicted for ``test_X`` against ``test_y``.
    """
    def fit_and_score(model, train_X, train_y, test_X, test_y):
        # NOTE(review): no eval_metric is passed here, so any early stopping in
        # train_params uses the model's own metric -- confirm it should not be AUC.
        eval_pairs = [(test_X, test_y)]
        model.fit(train_X, train_y, eval_set=eval_pairs, verbose=False, **train_params)
        positive_proba = model.predict_proba(test_X)[:, 1]
        return roc_auc_score(test_y, positive_proba)
    return fit_and_score

# Constructor keyword arguments for the XGBoost classifier.
xgb_params = dict(
    max_depth=4,
    n_estimators=10000,
    learning_rate=0.05,
)

# (name, model factory, fit-and-score callback) for the models trained after X1 has
# been merged into X2_corpus. The XGBoost entry reuses the LightGBM `train_params`.
new_models = [
    ('X1_X2_gbm', get_lgb(lgb_params), fit_lgb(train_params)),
    ('X1_X2_cat', get_cat(cat_params), fit_cat(cat_train_params)),
    ('X1_X2_xgb', get_xgb(xgb_params), fit_xgb(train_params)),
]

for model_name, get_model, fit_model in new_models:
    print(model_name)
    model_scores, model_val_preds, model_test_preds = train(
        data, get_model, fit_model,
        n_splits=10, seed=19, sparse=False,
        test_data=test_data, verbose=True,
    )
    # Results go into the same dicts that hold the X2-only models.
    scores[model_name] = model_scores
    val_preds[model_name] = model_val_preds
    test_preds[model_name] = model_test_preds
    print()

X1_X2_gbm
target 1, score: 0.6162
target 2, score: 0.6278
target 3, score: 0.6352
target 4, score: 0.6154
target 5, score: 0.6305
macro, score: 0.6250

X1_X2_cat
target 1, score: 0.6062
target 2, score: 0.6291
target 3, score: 0.6323
target 4, score: 0.6199
target 5, score: 0.6263
macro, score: 0.6227

X1_X2_xgb
target 1, score: 0.5819
target 2, score: 0.6197
target 3, score: 0.6134
target 4, score: 0.5941
target 5, score: 0.6133
macro, score: 0.6045



In [27]:
# Baseline submission: the unweighted mean of every model's test predictions.
target_columns = ['1', '2', '3', '4', '5']

sub = pd.read_csv('./data/digital_reputation_challenge_sample_submit.csv')
all_test_preds = list(test_preds.values())
sub[target_columns] = np.mean(all_test_preds, axis=0)
sub.to_csv('./data/mean_sub.csv', index=False)

In [61]:
from scipy.special import softmax

# Temperature for the softmax over per-model scores; larger values put more
# weight on the best-scoring model.
tau = 30

# Fix the model order once so weights and predictions always line up.
model_names = list(scores)


def _weighted_column(preds_by_model, weights, column):
    """Sum one target column over all models, scaling each model by its weight."""
    weighted = [
        weight * preds_by_model[name][:, column]
        for weight, name in zip(weights, model_names)
    ]
    return np.sum(weighted, axis=0)


blended_columns = []
for i in range(5):
    # Per-target weights: softmax of tau * (that model's score on target i + 1).
    weights = softmax([tau * scores[name][i] for name in model_names])
    # NOTE(review): the weights come from the same validation scores that this
    # AUC is measured on, so the printed value may be slightly optimistic.
    print(roc_auc_score(data.Y[str(i + 1)], _weighted_column(val_preds, weights, i)))
    blended_columns.append(_weighted_column(test_preds, weights, i))

# One row per test sample, one column per target.
final_test_preds = np.array(blended_columns).T

0.6219113268464954
0.6341430030603811
0.6254590733471446
0.6158166720607764
0.6417339679944092


In [63]:
# Write the softmax-weighted blend into the submission frame loaded earlier and save it.
blend_columns = ['1', '2', '3', '4', '5']
sub[blend_columns] = final_test_preds
sub.to_csv('./data/tau_30_sub.csv', index=False)