In [1]:
import numpy as np
import pandas as pd
import json

import torch
from torch.utils import data
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
%matplotlib inline
import time, os
from sklearn import decomposition
from sklearn.datasets import make_classification

# from .autonotebook import tqdm as notebook_tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [22]:
f_train = open("../data/train.json", 'r')
train_data = json.load(f_train)

sample = train_data[0]
print(sample.keys())
print(sample)

dict_keys(['authors', 'year', 'abstract', 'venue', 'title'])
{'authors': [42, 13720, 36], 'year': 9, 'abstract': [2455, 1858, 2335, 1543, 1800, 1860, 2000, 2867, 1546, 1874, 2059, 1525, 2590, 4196, 12, 2634, 1543, 1800, 1586, 2866, 3595, 1866, 1670, 2000, 3743, 1542, 1650, 1527, 33, 4407, 1543, 1535, 1962, 1961, 1543, 33, 1700, 1543, 1535, 1647, 1546, 1580, 4720, 12, 1731, 4231, 2601, 1553, 1704, 1605, 2456, 1543, 3281, 1594, 4407, 2168, 1542, 1586, 3781, 2471, 1525, 1859, 1669, 2512, 4572, 1546, 1609, 3781, 2471, 1525, 3393, 12, 37, 1712, 1586, 4196, 1650, 1527, 3281, 1594, 4407, 1800, 4708, 1904, 2059, 2411, 12], 'venue': 20, 'title': [41, 1550, 1563, 1594, 1544, 1919, 1644, 37, 1539, 1715, 1553, 1541, 1536, 1532, 1872, 1538]}


In [23]:
f_test = open("../data/test.json", 'r')
test_data = json.load(f_test)

sample = test_data[0]
print(sample.keys())
print(sample)

dict_keys(['identifier', 'coauthors', 'year', 'abstract', 'venue', 'title'])
{'identifier': 0, 'coauthors': [16336, 1762, 4357, 12564], 'year': 19, 'abstract': [37, 1662, 3207, 10, 33, 2037, 1738, 1642, 1553, 4917, 11, 1910, 3270, 11, 1650, 2156, 1993, 12, 1731, 1642, 1542, 2764, 1738, 1549, 2229, 38, 1657, 1719, 1568, 1549, 2860, 1670, 2921, 2474, 1870, 1659, 10, 4657, 1720, 1580, 4466, 1568, 1549, 3123, 11, 3434, 1857, 2466, 1858, 2609, 1525, 2578, 1777, 1996, 51, 12, 1731, 1642, 4631, 3679, 2256, 1548, 1821, 24, 1660, 1548, 10, 1563, 4022, 1837, 11, 2086, 3270, 1549, 2629, 3534, 1650, 1527, 1529, 3399, 2284, 25, 2453, 10, 1563, 3763, 1535, 2229, 1546, 2247, 1549, 3701, 2769, 3534, 6, 3587, 3530, 7, 25, 1549, 2916, 10, 1563, 3763, 1535, 2229, 1546, 3730, 1529, 3399, 2186, 2284, 1655, 1535, 2403, 1837, 10, 3332, 1563, 10, 1549, 2758, 1529, 3270, 11, 1650, 2156, 2493, 12, 3207, 1866, 2000, 3848, 1857, 1525, 33, 2326, 1543, 2337, 11, 4826, 1793, 3272, 10, 2722, 2307, 1571, 2196, 2726, 1

In [31]:
# read train data and test data
f_train = open("../data/train.json", 'r')
train_data = json.load(f_train)

f_test = open("../data/test.json", 'r')
test_data = json.load(f_test)

def get_attr_matrix(data):
    n_samples = len(data)
    n_features = 5000 -1 

    # get abstract & title feature
    # wmatrix = torch.zeros([n_samples, n_features])
    wmatrix = np.ndarray([n_samples, n_features])

    for i in range(n_samples):
        instance = data[i]
        for title in instance['title']:
            wmatrix[i, title-1] += 1
        for abstract in instance['abstract']:
            wmatrix[i, abstract-1] += 1

    # get venue feature
    # vmatrix = torch.zeros([n_samples, 1])
    vmatrix = np.ndarray([n_samples, 1])

    for i in range(n_samples):
        venue = data[i]['venue']
        
        if venue:
            vmatrix[i, ] = venue
        else:
            vmatrix[i, ] = -1

    # get year feature
    # ymatrix = torch.zeros([n_samples, 1])
    ymatrix = np.ndarray([n_samples, 1])

    for i in range(n_samples):
        year = data[i]['year']
        
        if year:
            ymatrix[i, ] = year
        else:
            ymatrix[i, ] = -1
            
    # return torch.cat((wmatrix, vmatrix, ymatrix), 1)
    return np.concatenate((wmatrix, vmatrix, ymatrix), 1)

attr_matrix = get_attr_matrix(train_data)
attr_matrix_test = get_attr_matrix(test_data)

def handle_authors(data, key="author"):

    n_samples = len(data)

    # prolific authors 
    # y = torch.zeros([n_samples, 100])
    y = np.ndarray([n_samples, 100])

    # get co-author matrix
    amatrix = torch.zeros([n_samples, 21245 - 100 + 1])

    for i in range(n_samples):
        authors = data[i][key]
        
        for au in authors:
            if au < 100:
                
                y[i, au] += 1
            else:
                amatrix[i, au - 100] += 1

    return amatrix, y

amatrix, y = handle_authors(train_data, key="authors")

# amatrix_test, _ = handle_authors(test_data, key="coauthors")

# X = torch.cat((attr_matrix, amatrix), 1)
# X_kaggle = torch.cat((attr_matrix_test, amatrix_test), 1)

X = attr_matrix
X_kaggle = attr_matrix_test

print("Train:")
print("     X : ", X.shape)
print("     y : ", y.shape)
print("Test_Kaggle:")
print("     X : ", X_kaggle.shape)

Train:
     X :  (25793, 5001)
     y :  (25793, 100)
Test_Kaggle:
     X :  (800, 5001)


### Model

In [32]:
train_size = 18000
X_train = X[:train_size]
X_test = X[train_size:]

y_train = y[:train_size]
y_test = y[train_size:]

In [33]:
import numpy as np
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression

clf = MultiOutputClassifier(estimator= LogisticRegression()).fit(X_train, y_train)
# clf.predict(X[-2:])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [None]:
def multi_label_column(matrix):
    
    n_samples, n_class = matrix.shape
    # print(n_samples, n_class)

    output =[]

    for i in range(n_samples):
        pred = ""
        for j in range(n_class):
            if matrix[i][j]:
                pred += str(j) + " "
        if pred:
            output.append(pred[:-1])
        else:
            output.append("-1")
    
    return output


y_pred_train = clf.predict(X_train)
y_pred = clf.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, recall_score, f1_score

y_train_list = multi_label_column(y_train)
y_pred_train_list = multi_label_column(y_pred_train)

y_test_list = multi_label_column(y_test)
y_pred_list = multi_label_column(y_pred)

print('='*25 + 'Evaluation results' + '='*25)
print('The accuracy score of prediction is: {}'.format(accuracy_score(y_test_list, y_pred_list)))
print('The racall score of prediction is: {}'.format(recall_score(y_test_list, y_pred_list, average='weighted')))
print('The f1 score of prediction is: {}'.format(f1_score(y_test_list, y_pred_list, average='weighted'))) 

The accuracy score of prediction is: 0.7205122180451128
The racall score of prediction is: 0.7205122180451128
The f1 score of prediction is: 0.6391698408198209


  _warn_prf(average, modifier, msg_start, len(result))


### Multi-lable Logistic Regression Model

In [6]:
import torch.nn as nn
import torch.nn.functional as F

class LogisticRegressionModel(nn.Module):
    
    def __init__(self, n_features, n_classes):
        super(LogisticRegressionModel, self).__init__()
        
        # Register weight matrix and bias term as model parameters - automatically tracks operations for gradient computation.
        self.W = torch.nn.Parameter(torch.nn.init.xavier_uniform_(torch.empty([n_features, n_classes])))  # Weights
        self.b = torch.nn.Parameter(torch.zeros([n_classes]))  # Biases
        
    def forward(self, x):
        """
        Forward pass for logistic regression.
        Input: Tensor x of shape [N,C,H,W] ([batch size, channels, height, width])
        Output: Logits W @ x + b
        """
        # batch_size = x.shape[0]
        
        # x = x.view(batch_size, -1)  # Flatten image into vector, retaining batch dimension

        out = torch.matmul(x,self.W) + self.b  # Compute scores
        return out

In [7]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, recall_score, f1_score

def multi_label_list(matrix):
    
    n_samples, n_class = matrix.shape
    # print(n_samples, n_class)

    output =[]

    for i in range(n_samples):
        pred = ""
        for j in range(n_class):
            if matrix[i][j] > 0.99:
                pred += str(j) + " "
        if pred:
            output.append(pred[:-1])
        else:
            output.append("-1")
    
    return output

def multi_label_f1_score(a, b):
    return f1_score(multi_label_list(a), multi_label_list(b), average='weighted')

In [11]:
def test(model, criterion, X_test, y_test):
    test_loss = 0.
    test_preds, test_labels = list(), list()
    n_samples = X_test.shape[0]
    for i in range(n_samples):
        # x, labels = data
        x = X_test[i]
        labels = y_test[i]

        with torch.no_grad():
            logits = model(x)  # Compute scores
            predictions = torch.argmax(logits, dim=1)
            test_loss += criterion(input=logits, target=labels).item()
            test_preds.append(predictions)
            test_labels.append(labels)

    test_preds = torch.cat(test_preds)
    test_labels = torch.cat(test_labels)

    test_accuracy = torch.eq(test_preds, test_labels).float().mean().item()

    print('[TEST] Mean loss {:.4f} | Accuracy {:.4f}'.format(test_loss/n_samples, test_accuracy))

def train(model, X_train, y_train, X_test, y_test, optimizer, n_epochs=10):
    """
    Generic training loop for supervised multiclass learning
    """
    LOG_INTERVAL = 250
    running_loss, running_accuracy = list(), list()
    start_time = time.time()
    criterion = torch.nn.CrossEntropyLoss()

    n_samples = X_train.shape[0]

    # for epoch in range(n_epochs):  # Loop over training dataset `n_epochs` times
    for epoch in range(1):

        epoch_loss = 0.

        for i in range(n_samples):  # Loop over elements in training set

            # x, labels = data
            x = X_train[i]
            labels = y_train[i]

            logits = model(x)
            return logits

            predictions = torch.argmax(logits, dim=1)
            train_acc = torch.mean(torch.eq(predictions, labels).float()).item()

            loss = criterion(input=logits, target=labels)

            loss.backward()               # Backward pass (compute parameter gradients)
            optimizer.step()              # Update weight parameter using SGD
            optimizer.zero_grad()         # Reset gradients to zero for next iteration


            # ============================================================================
            # You can safely ignore the boilerplate code below - just reports metrics over
            # training and test sets

            running_loss.append(loss.item())
            running_accuracy.append(train_acc)

            epoch_loss += loss.item()

            if i % LOG_INTERVAL == 0:  # Log training stats
                deltaT = time.time() - start_time
                mean_loss = epoch_loss / (i+1)
                print('[TRAIN] Epoch {} [{}/{}]| Mean loss {:.4f} | Train accuracy {:.5f} | Time {:.2f} s'.format(epoch, 
                    i, n_samples, mean_loss, train_acc, deltaT))

        print('Epoch complete! Mean loss: {:.4f}'.format(epoch_loss/n_samples))

        test(model, criterion, X_test, y_test)
        # break
        
    return running_loss, running_accuracy

In [12]:
print("Train:")
print("     X_train : ", X_train.shape)
print("     y_train : ", y_train.shape)
print("Test:")
print("     X_test  : ", X_test.shape)
print("     y_test  : ", y_test.shape)

Train:
     X_train :  torch.Size([18000, 26147])
     y_train :  torch.Size([18000, 100])
Test:
     X_test  :  torch.Size([7793, 26147])
     y_test  :  torch.Size([7793, 100])


In [13]:
n_features, n_classes = X.shape[1], y.shape[1]  
logistic_regression_model = LogisticRegressionModel(n_features, n_classes)

for p in logistic_regression_model.parameters():
    # print(p)
    print(p.shape)

optimizer = torch.optim.SGD(logistic_regression_model.parameters(), lr=1e-2, momentum=0.9)
# lr_loss, lr_acc = train(logistic_regression_model, X_train, y_train, X_test, y_test, optimizer)

logits = train(logistic_regression_model, X_train, y_train, X_test, y_test, optimizer)

torch.Size([26147, 100])
torch.Size([100])


In [16]:
torch.sum(logits)

tensor(0.6575, grad_fn=<SumBackward0>)

In [165]:
# torch.matmul(xx, w) + b
a = torch.zeros([10, 5])
b = torch.zeros([10, 5])

In [177]:
a[0, 4] = 1
a[4, 2] = 1
a[9, 1] = 1

b[0, 4] = 1
b[4, 2] = 3
b[9, 1] = 1

In [252]:
a = torch.asarray([1, 8, 99, 99, 2])
# torch.topk(a, 2)
torch.sort(a, descending=True)

torch.return_types.sort(
values=tensor([99, 99,  8,  2,  1]),
indices=tensor([2, 3, 1, 4, 0]))