## Импорт библиотек

In [1]:
import time

import numpy as np
import pandas as pd

from operator import mul
from sklearn import metrics
from collections import defaultdict
from scipy.stats import itemfreq
from sklearn.datasets import load_svmlight_file
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import PassiveAggressiveClassifier

## Загрузка данных

In [2]:
# Load the training split (svmlight/libsvm text format). With multilabel=True
# every sample carries a tuple of labels instead of a single target value.
x_train, y_train = load_svmlight_file("/var/local/aostapets/lshtc/wise2014-train.libsvm", multilabel=True)

In [3]:
# Load the test split. The number of columns is pinned to the training matrix:
# load_svmlight_file otherwise infers it from the highest feature index seen in
# this file alone, which can leave the test matrix narrower than x_train and
# break predict_proba later on.
x_test, y_test = load_svmlight_file(
    "/var/local/aostapets/lshtc/wise2014-test.libsvm",
    n_features=x_train.shape[1],
    multilabel=True,
)

# Logistic regression

## Получение матрицы оценок

In [5]:
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack
from scipy.sparse import csc_matrix

# One independent binary logistic regression per label (labels 1..203).
# Each model is fitted on the first n_fit rows and scored on the remaining
# (validation) rows.

models = []

start_time = time.time()

x_train = csc_matrix(x_train)

n_fit = 50000                # rows used for fitting
x_fit = x_train[:n_fit]      # sliced once instead of once per label
x_valid = x_train[n_fit:]    # validation rows (14857 for this data set)

# Probabilities are dense, so they are gathered in a dense buffer and turned
# into a CSC matrix once at the end; assigning columns to a CSC matrix one by
# one rebuilds its structure on every assignment. Column 0 stays empty because
# label numbering starts at 1.
scores = np.zeros((x_valid.shape[0], 204))

for item in range(1, 204):

    if item % 20 == 0:
        print(item)

    # Binary target: does the sample carry the current label?
    new_y_train = [1 if item in labels else 0 for labels in y_train[:n_fit]]

    # A label without positives in the fitting rows cannot be learned; its
    # score column stays at zero.
    if not any(new_y_train):
        continue

    # The L1 penalty needs a solver that supports it. liblinear was the default
    # in older scikit-learn releases; it is named explicitly because the newer
    # default (lbfgs) rejects penalty='l1'. n_jobs is dropped: liblinear
    # ignores it.
    model = LogisticRegression(penalty='l1', C=6.0, tol=0.001, solver='liblinear')
    model.fit(x_fit, new_y_train)

    # Only the validation rows are scored; predictions for the fitting rows
    # were never used.
    scores[:, item] = model.predict_proba(x_valid)[:, 1]
    models.append(model)

ans = csc_matrix(scores)  # scores on the validation rows

20
40
60
80
100
120
140
160
180
200




## Перевод оценок в ответы

In [6]:
# Turn the score matrix into label sets: a label is kept when its score is
# strictly above 55% of the best score in the same row.

# Work on a dense copy: indexing a sparse matrix element by element is very slow.
scores = ans.toarray() if hasattr(ans, "toarray") else np.asarray(ans)

answer = defaultdict(str)
for j, row in enumerate(scores):
    threshold = 0.55 * row.max()  # decision rule
    # Columns 1..203 are labels; column 0 is unused.
    picked = [str(item) for item, score in enumerate(row[1:204], start=1) if score > threshold]
    # Space-separated label string; empty when nothing clears the threshold.
    answer[j] = " ".join(picked)
    if j % 5000 == 0:
        print(j)  # progress: row index (the original printed the constant last label)

203
203
203


In [7]:
# Flatten the per-row answers into a list ordered by row index; rows that are
# missing from `answer` yield the defaultdict's empty string.
list_answer = [answer[row_idx] for row_idx in range(14857)]

## Вычисление качества

In [8]:
from sklearn import metrics
from sklearn.preprocessing import MultiLabelBinarizer

# Fit the binarizer on all training label sets so that the indicator columns of
# y_true and y_pred line up.
binarizer = MultiLabelBinarizer().fit(y_train)
y_true = binarizer.transform(y_train)

# Parse the space-separated label strings back into float labels; an empty
# string (no label predicted) becomes the sentinel list [-1].
tmp = [[float(token) if token else -1 for token in labels.split(" ")] for labels in list_answer]

# Rows without any prediction fall back to the fixed label pair 103, 200
# (presumably the most frequent labels -- TODO confirm).
y_pred = binarizer.transform([[103, 200] if row == [-1] else row for row in tmp])

# Sample-averaged F1 on the validation rows (everything after the first 50000).
f_report = metrics.f1_score(y_true[50000:], y_pred, average='samples')

In [9]:
# Display the F1 score computed in the previous cell.
f_report

0.78211413193239943

In [10]:
# Sample-averaged precision on the validation rows; note that the f_report
# name is reused and no longer holds the F1 score afterwards.
f_report = metrics.precision_score(y_true[50000:], y_pred, average = 'samples')
f_report

0.81416343746895004

In [11]:
# Sample-averaged recall on the validation rows; f_report is overwritten again.
f_report = metrics.recall_score(y_true[50000:], y_pred, average = 'samples')
f_report

0.79905258170217441

# Ensembled Probabilistic Classifier Chain

## Первый алгоритм (Probabilistic Classifier Chain, PCC)

In [14]:
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack
from scipy.sparse import csc_matrix

# Classifier chain over labels 1 -> 203: after each label is fitted, its
# predicted probability is appended to the feature matrix as a new column, so
# later labels can condition on earlier ones.

models = []

start_time = time.time()

x_train_cascade = csc_matrix(x_train)

n_fit = 50000                                  # rows used for fitting
n_valid = x_train_cascade.shape[0] - n_fit     # validation rows (14857 here)

# Dense buffer for the validation scores; converted to CSC once at the end
# because column-wise assignment into a CSC matrix is expensive. Column 0 stays
# empty since labels start at 1.
scores = np.zeros((n_valid, 204))

for item in range(1, 204):

    if item % 20 == 0:
        print(item)

    # Binary target for the current label on the fitting rows.
    new_y_train = [1 if item in labels else 0 for labels in y_train[:n_fit]]

    # Labels without positives are skipped: no model, no extra feature column.
    if not any(new_y_train):
        continue

    # Same base classifier as the independent models. liblinear is named
    # explicitly because the default solver of newer scikit-learn releases
    # (lbfgs) rejects penalty='l1'; liblinear ignores n_jobs, so it is dropped.
    model = LogisticRegression(penalty='l1', C=6.0, tol=0.001, solver='liblinear')
    model.fit(x_train_cascade[:n_fit], new_y_train)

    pred_train = model.predict_proba(x_train_cascade)[:, 1]
    scores[:, item] = pred_train[n_fit:]

    # Extend the feature matrix with the probability of the label just processed.
    x_train_cascade = hstack([x_train_cascade, pred_train.reshape(-1, 1)], format="csc")
    models.append(model)

ans_cascade = csc_matrix(scores)  # scores on the validation rows

20
40
60
80
100
120
140
160
180
200




## Второй алгоритм

In [15]:
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack
from scipy.sparse import csc_matrix

# Second chain: same construction as the first one, but the labels are visited
# in the opposite order (203 -> 1).

models = []

start_time = time.time()

x_train_cascade = csc_matrix(x_train)

n_fit = 50000                                  # rows used for fitting
n_valid = x_train_cascade.shape[0] - n_fit     # validation rows (14857 here)

# Dense buffer for the validation scores; converted to CSC once at the end
# because column-wise assignment into a CSC matrix is expensive. Column 0 stays
# empty since labels start at 1.
scores = np.zeros((n_valid, 204))

for item in range(203, 0, -1):

    if item % 20 == 0:
        print(item)

    # Binary target for the current label on the fitting rows.
    new_y_train = [1 if item in labels else 0 for labels in y_train[:n_fit]]

    # Labels without positives are skipped: no model, no extra feature column.
    if not any(new_y_train):
        continue

    # Same base classifier as before. liblinear is named explicitly because the
    # default solver of newer scikit-learn releases (lbfgs) rejects
    # penalty='l1'; liblinear ignores n_jobs, so it is dropped.
    model = LogisticRegression(penalty='l1', C=6.0, tol=0.001, solver='liblinear')
    model.fit(x_train_cascade[:n_fit], new_y_train)

    pred_train = model.predict_proba(x_train_cascade)[:, 1]
    scores[:, item] = pred_train[n_fit:]

    # Extend the feature matrix with the probability of the label just processed.
    x_train_cascade = hstack([x_train_cascade, pred_train.reshape(-1, 1)], format="csc")
    models.append(model)

ans_cascade2 = csc_matrix(scores)  # scores on the validation rows

200
180
160
140
120
100
80
60
40
20




## Проверяем качество работы первого алгоритма PCC

In [16]:
# Turn the first chain's score matrix into label sets: a label is kept when its
# score is strictly above 55% of the best score in the same row.

# Work on a dense copy: indexing a sparse matrix element by element is very slow.
scores = ans_cascade.toarray() if hasattr(ans_cascade, "toarray") else np.asarray(ans_cascade)

answer = defaultdict(str)
for j, row in enumerate(scores):
    threshold = 0.55 * row.max()
    # Columns 1..203 are labels; column 0 is unused.
    picked = [str(item) for item, score in enumerate(row[1:204], start=1) if score > threshold]
    # Space-separated label string; empty when nothing clears the threshold.
    answer[j] = " ".join(picked)
    if j % 5000 == 0:
        print(j)  # progress: row index (the original printed the constant last label)

203
203
203


In [17]:
# Flatten the per-row answers into a list ordered by row index; rows that are
# missing from `answer` yield the defaultdict's empty string.
list_answer = [answer[row_idx] for row_idx in range(14857)]

In [18]:
from sklearn import metrics
from sklearn.preprocessing import MultiLabelBinarizer

# Fit the binarizer on all training label sets so that the indicator columns of
# y_true and y_pred line up.
binarizer = MultiLabelBinarizer().fit(y_train)
y_true = binarizer.transform(y_train)

# Parse the space-separated label strings back into float labels; an empty
# string (no label predicted) becomes the sentinel list [-1].
tmp = [[float(token) if token else -1 for token in labels.split(" ")] for labels in list_answer]

# Rows without any prediction fall back to the fixed label pair 103, 200
# (presumably the most frequent labels -- TODO confirm).
y_pred = binarizer.transform([[103, 200] if row == [-1] else row for row in tmp])

# Sample-averaged F1 on the validation rows (everything after the first 50000).
f_report = metrics.f1_score(y_true[50000:], y_pred, average='samples')
f_report

0.77625428463080748

In [19]:
# Sample-averaged precision of the first chain on the validation rows; the
# f_report name is reused and no longer holds the F1 score afterwards.
f_report = metrics.precision_score(y_true[50000:], y_pred, average = 'samples')
f_report

0.80886931605111589

In [20]:
# Sample-averaged recall of the first chain on the validation rows; f_report is
# overwritten again.
f_report = metrics.recall_score(y_true[50000:], y_pred, average = 'samples')
f_report

0.79244143373173459

## Объединяем алгоритмы

In [37]:
# joblib used to be vendored as sklearn.externals.joblib; that alias has been
# removed from scikit-learn, so the standalone package is tried first and the
# old location is kept as a fallback for legacy installations.
try:
    from joblib import Parallel, delayed
except ImportError:
    from sklearn.externals.joblib import Parallel, delayed
import multiprocessing

def process(ans_lin_comb, thresh):
    """Return the sample-averaged F1 obtained with one decision threshold.

    A label is predicted for a row when its score is strictly greater than
    ``thresh`` times the largest score in that row.

    Parameters
    ----------
    ans_lin_comb : sparse or dense matrix
        Scores with one row per validation sample; columns 1..203 hold the
        label scores (column 0 is not a label).
    thresh : float
        Fraction of the row maximum a score has to exceed.

    Returns
    -------
    float
        ``metrics.f1_score(..., average='samples')`` of the predictions against
        the trailing rows of the module-level ``y_train``.

    Notes
    -----
    Reads the module-level names ``y_train``, ``metrics`` and
    ``MultiLabelBinarizer``. The number of scored rows is taken from the matrix
    instead of being hard-coded, and the reference rows are the last
    ``n_rows`` entries of ``y_train`` (rows 50000 onward for the 14857-row
    validation matrix used in this notebook).
    """
    # Densify once: element-wise indexing into a sparse matrix is very slow.
    if hasattr(ans_lin_comb, "toarray"):
        scores = ans_lin_comb.toarray()
    else:
        scores = np.asarray(ans_lin_comb)

    predicted = []
    for row in scores:
        threshold = thresh * row.max()
        # Labels are kept as floats, matching the string -> float parsing the
        # rest of the notebook applies before binarizing.
        labels = [float(item) for item, score in enumerate(row[1:204], start=1) if score > threshold]
        # Rows where nothing clears the threshold fall back to the fixed label
        # pair 103, 200 (presumably the most frequent labels -- TODO confirm).
        predicted.append(labels if labels else [103, 200])

    binarizer = MultiLabelBinarizer().fit(y_train)
    y_true = binarizer.transform(y_train)
    y_pred = binarizer.transform(predicted)

    first_valid_row = len(y_train) - len(predicted)
    return metrics.f1_score(y_true[first_valid_row:], y_pred, average='samples')
                
# Average the scores of the two chains, then evaluate the thresholds
# 5/20 ... 15/20 with one joblib task per threshold.
num_cores = 11
ans_lin_comb = (ans_cascade + ans_cascade2) * 0.5
answers = Parallel(n_jobs=num_cores)(
    delayed(process)(ans_lin_comb, step / 20) for step in range(5, 16)
)

## Качество для различных порогов в решающем правиле: $a_j = 1 \Leftrightarrow g_j > p \cdot \max(g_1, \ldots, g_l)$

In [43]:
# Pair every evaluated threshold with the F1 score it achieved.
[(step / 20, score) for step, score in zip(range(5, 16), answers)]

[(0.25, 0.77384467670864643),
 (0.3, 0.77858134794090905),
 (0.35, 0.78127088559881186),
 (0.4, 0.7824279532363263),
 (0.45, 0.78320921261622611),
 (0.5, 0.78304714700016576),
 (0.55, 0.78249180351556347),
 (0.6, 0.78169763384476987),
 (0.65, 0.78082893862324443),
 (0.7, 0.77829515357616585),
 (0.75, 0.77596243883852423)]

# Submission

In [46]:
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack
from scipy.sparse import csc_matrix

# Forward chain (labels 1 -> 203) refitted on the whole training set and
# applied to the test set. Train and test feature matrices are extended in
# lock-step with the probability of every processed label.

models = []

start_time = time.time()

x_train_cascade = csc_matrix(x_train)
x_test_cascade = csc_matrix(x_test)

# Dense buffer for the test scores (34923 rows for this data set); converted to
# CSC once at the end because column-wise assignment into a CSC matrix is
# expensive. Column 0 stays empty since labels start at 1.
scores = np.zeros((x_test_cascade.shape[0], 204))

for item in range(1, 204):

    if item % 20 == 0:
        print(item)

    # Binary target for the current label over all training rows.
    new_y_train = [1 if item in labels else 0 for labels in y_train]

    # Labels without positives are skipped: no model, no extra feature column.
    if not any(new_y_train):
        continue

    # liblinear is named explicitly because the default solver of newer
    # scikit-learn releases (lbfgs) rejects penalty='l1'; liblinear ignores
    # n_jobs, so it is dropped.
    model = LogisticRegression(penalty='l1', C=6.0, tol=0.001, solver='liblinear')
    model.fit(x_train_cascade, new_y_train)

    pred_train = model.predict_proba(x_train_cascade)[:, 1]
    pred_test = model.predict_proba(x_test_cascade)[:, 1]
    scores[:, item] = pred_test

    # Append the new probability column to both matrices so their widths match.
    x_train_cascade = hstack([x_train_cascade, pred_train.reshape(-1, 1)], format="csc")
    x_test_cascade = hstack([x_test_cascade, pred_test.reshape(-1, 1)], format="csc")
    models.append(model)

ans_cascade = csc_matrix(scores)  # scores on the test rows

20
40
60
80
100
120
140
160
180
200




In [47]:
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack
from scipy.sparse import csc_matrix

# Backward chain (labels 203 -> 1) refitted on the whole training set and
# applied to the test set. Train and test feature matrices are extended in
# lock-step with the probability of every processed label.

models = []

start_time = time.time()

x_train_cascade = csc_matrix(x_train)
x_test_cascade = csc_matrix(x_test)

# Dense buffer for the test scores (34923 rows for this data set); converted to
# CSC once at the end because column-wise assignment into a CSC matrix is
# expensive. Column 0 stays empty since labels start at 1.
scores = np.zeros((x_test_cascade.shape[0], 204))

for item in range(203, 0, -1):

    if item % 20 == 0:
        print(item)

    # Binary target for the current label over all training rows.
    new_y_train = [1 if item in labels else 0 for labels in y_train]

    # Labels without positives are skipped: no model, no extra feature column.
    if not any(new_y_train):
        continue

    # liblinear is named explicitly because the default solver of newer
    # scikit-learn releases (lbfgs) rejects penalty='l1'; liblinear ignores
    # n_jobs, so it is dropped.
    model = LogisticRegression(penalty='l1', C=6.0, tol=0.001, solver='liblinear')
    model.fit(x_train_cascade, new_y_train)

    pred_train = model.predict_proba(x_train_cascade)[:, 1]
    pred_test = model.predict_proba(x_test_cascade)[:, 1]
    scores[:, item] = pred_test

    # Append the new probability column to both matrices so their widths match.
    x_train_cascade = hstack([x_train_cascade, pred_train.reshape(-1, 1)], format="csc")
    x_test_cascade = hstack([x_test_cascade, pred_test.reshape(-1, 1)], format="csc")
    models.append(model)

ans_cascade2 = csc_matrix(scores)  # scores on the test rows

200
180
160
140
120
100
80
60
40
20




In [48]:
# Average both chains and keep, for every test row, the labels whose score is
# strictly above 40% of the row maximum.
ans_lin_comb = 0.5*(ans_cascade + ans_cascade2)

# Work on a dense copy: indexing a sparse matrix element by element is very slow.
scores = ans_lin_comb.toarray() if hasattr(ans_lin_comb, "toarray") else np.asarray(ans_lin_comb)

answer = defaultdict(str)

for j, row in enumerate(scores):
    threshold = 0.4 * row.max()
    # Columns 1..203 are labels; column 0 is unused.
    picked = [str(item) for item, score in enumerate(row[1:204], start=1) if score > threshold]
    # Space-separated label string; empty when nothing clears the threshold.
    answer[j] = " ".join(picked)
    if j % 5000 == 0:
        print(j)  # progress: row index (the original printed the constant last label)

203
203
203
203
203
203
203


In [54]:
# Flatten the per-row answers into a list ordered by test row index; rows that
# are missing from `answer` yield the defaultdict's empty string.
list_answer = [answer[row_idx] for row_idx in range(34923)]

In [55]:
# Read the sample submission; it is used as the template whose Labels column
# gets replaced by the predictions.
sub = pd.read_csv("/var/local/aostapets/lshtc/sampleSubmission.csv")

In [56]:
# Store the predicted label strings in the Labels column. Item assignment is
# used because attribute-style assignment only updates a column that already
# exists and would otherwise just set a plain Python attribute on the frame.
sub["Labels"] = list(list_answer)

In [57]:
# Write the submission file without the pandas row index.
sub.to_csv("wise-submission-epcc.csv", index = False) # Public Score : 0.77510, Private Score: 0.77714