In [1]:
import numpy as np
import pandas as pd
import json

import torch
from torch.utils import data
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
%matplotlib inline
import time, os
from sklearn import decomposition
from sklearn.datasets import make_classification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# read train data and test data
f_train = open("../data/train.json", 'r')
train_data = json.load(f_train)

f_test = open("../data/test.json", 'r')
test_data = json.load(f_test)

def get_attr_matrix(data):
    n_samples = len(data)
    n_features = 5000 -1 

    # get abstract & title feature
    wmatrix = np.ndarray([n_samples, n_features])
    wmatrix.fill(0)

    for i in range(n_samples):
        instance = data[i]
        for title in instance['title']:
            wmatrix[i, title-1] += 1
        for abstract in instance['abstract']:
            wmatrix[i, abstract-1] += 1

    # get venue feature
    vmatrix = np.ndarray([n_samples, 1])

    for i in range(n_samples):
        venue = data[i]['venue']
        
        if venue:
            vmatrix[i, ] = venue
        else:
            vmatrix[i, ] = -1

    # get year feature
    ymatrix = np.ndarray([n_samples, 1])

    for i in range(n_samples):
        year = data[i]['year']
        
        if year:
            ymatrix[i, ] = year
        else:
            ymatrix[i, ] = -1
            
    return np.concatenate((wmatrix, vmatrix, ymatrix), axis=1)

attr_matrix = get_attr_matrix(train_data)
attr_matrix_test = get_attr_matrix(test_data)

def handle_authors(data, key="author"):

    n_samples = len(data)

    # prolific authors 
    y = np.ndarray([n_samples, 100])
    y.fill(0)

    # get co-author matrix
    amatrix = np.ndarray([n_samples, 21245 - 100 + 1])
    amatrix.fill(0)

    for i in range(n_samples):
        authors = data[i][key]
        
        for au in authors:
            if au < 100:
                
                y[i, au] += 1
            else:
                amatrix[i, au - 100] += 1

    return amatrix, y

amatrix, y = handle_authors(train_data, key="authors")

amatrix_test, _ = handle_authors(test_data, key="coauthors")

X = np.concatenate((attr_matrix, amatrix), axis=1)
X_kaggle = np.concatenate((attr_matrix_test, amatrix_test), axis=1)

print("Train:")
print("     X : ", X.shape)
print("     y : ", y.shape)
print("Test:")
print("     X : ", X_kaggle.shape)

Train:
     X :  (25793, 26147)
     y :  (25793, 100)
Test:
     X :  (800, 26147)


In [4]:
from scipy import sparse
X = sparse.csr_matrix(X)
X_kaggle = sparse.csr_matrix(X_kaggle)
X

<25793x26147 sparse matrix of type '<class 'numpy.float64'>'
	with 3009689 stored elements in Compressed Sparse Row format>

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) 

In [6]:
import numpy as np
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression

clf = MultiOutputClassifier(estimator= LogisticRegression()).fit(X_train, y_train)
# clf.predict(X[-2:])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [9]:
def multi_label_column(matrix):
    
    n_samples, n_class = matrix.shape
    # print(n_samples, n_class)

    output =[]

    for i in range(n_samples):
        pred = ""
        for j in range(n_class):
            if matrix[i][j]:
                pred += str(j) + " "
        if pred:
            output.append(pred[:-1])
        else:
            output.append("-1")
    
    return output


y_pred_train = clf.predict(X_train)
y_pred = clf.predict(X_test)

In [11]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, recall_score, f1_score

y_train_list = multi_label_column(y_train)
y_pred_train_list = multi_label_column(y_pred_train)

y_test_list = multi_label_column(y_test)
y_pred_list = multi_label_column(y_pred)

print('='*25 + 'Evaluation results' + '='*25)
print('The accuracy score of prediction is: {}'.format(accuracy_score(y_test_list, y_pred_list)))
print('The racall score of prediction is: {}'.format(recall_score(y_test_list, y_pred_list, average='weighted')))
print('The f1 score of prediction is: {}'.format(f1_score(y_test_list, y_pred_list, average='weighted'))) 

The accuracy score of prediction is: 0.7205122180451128
The racall score of prediction is: 0.7205122180451128
The f1 score of prediction is: 0.6391698408198209


  _warn_prf(average, modifier, msg_start, len(result))


In [3]:
import torch.nn as nn
import torch.nn.functional as F

class LogisticRegressionModel(nn.Module):
    
    def __init__(self, n_features, n_classes):
        super(LogisticRegressionModel, self).__init__()
        
        # Register weight matrix and bias term as model parameters - automatically tracks operations for gradient computation.
        self.W = torch.nn.Parameter(torch.nn.init.xavier_uniform_(torch.empty([n_features, n_classes])))  # Weights
        self.b = torch.nn.Parameter(torch.zeros([n_classes]))  # Biases
        
    def forward(self, x):
        """
        Forward pass for logistic regression.
        Input: Tensor x of shape [N,C,H,W] ([batch size, channels, height, width])
        Output: Logits W @ x + b
        """
        batch_size = x.shape[0]
        
        x = x.view(batch_size, -1)  # Flatten image into vector, retaining batch dimension
        out = torch.matmul(x,self.W) + self.b  # Compute scores
        return out

In [None]:
def test(model, criterion, test_loader):
    test_loss = 0.
    test_preds, test_labels = list(), list()
    for i, data in enumerate(test_loader):
        x, labels = data

        with torch.no_grad():
            logits = model(x)  # Compute scores
            predictions = torch.argmax(logits, dim=1)
            test_loss += criterion(input=logits, target=labels).item()
            test_preds.append(predictions)
            test_labels.append(labels)

    test_preds = torch.cat(test_preds)
    test_labels = torch.cat(test_labels)

    test_accuracy = torch.eq(test_preds, test_labels).float().mean().item()

    print('[TEST] Mean loss {:.4f} | Accuracy {:.4f}'.format(test_loss/len(test_loader), test_accuracy))

def train(model, train_loader, test_loader, optimizer, n_epochs=10):
    """
    Generic training loop for supervised multiclass learning
    """
    LOG_INTERVAL = 250
    running_loss, running_accuracy = list(), list()
    start_time = time.time()
    criterion = torch.nn.CrossEntropyLoss()

    for epoch in range(n_epochs):  # Loop over training dataset `n_epochs` times

        epoch_loss = 0.

        for i, data in enumerate(train_loader):  # Loop over elements in training set

            x, labels = data

            logits = model(x)

            predictions = torch.argmax(logits, dim=1)
            train_acc = torch.mean(torch.eq(predictions, labels).float()).item()

            loss = criterion(input=logits, target=labels)

            loss.backward()               # Backward pass (compute parameter gradients)
            optimizer.step()              # Update weight parameter using SGD
            optimizer.zero_grad()         # Reset gradients to zero for next iteration


            # ============================================================================
            # You can safely ignore the boilerplate code below - just reports metrics over
            # training and test sets

            running_loss.append(loss.item())
            running_accuracy.append(train_acc)

            epoch_loss += loss.item()

            if i % LOG_INTERVAL == 0:  # Log training stats
                deltaT = time.time() - start_time
                mean_loss = epoch_loss / (i+1)
                print('[TRAIN] Epoch {} [{}/{}]| Mean loss {:.4f} | Train accuracy {:.5f} | Time {:.2f} s'.format(epoch, 
                    i, len(train_loader), mean_loss, train_acc, deltaT))

        print('Epoch complete! Mean loss: {:.4f}'.format(epoch_loss/len(train_loader)))

        test(model, criterion, test_loader)
        
    return running_loss, running_accuracy