In [6]:
import numpy as np

from numpy import mean
import pandas as pd
import json
from numpy import std
from sklearn.model_selection import RepeatedKFold
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import accuracy_score
import tensorflow as tf

In [3]:
# Read the train and test records from JSON.
# `with` closes each file handle as soon as it has been parsed; the original
# bare open() calls left both handles open for the life of the kernel.
with open("../data/train.json", 'r') as f_train:
    train_data = json.load(f_train)

with open("../data/test.json", 'r') as f_test:
    test_data = json.load(f_test)

def _value_or_missing(value, missing=-1):
    """Return ``value`` unless the field is empty, in which case return ``missing``.

    ``0`` is kept as a real value. Every other falsy value (``""``, ``None``,
    ...) is treated as "not recorded".
    """
    if value or value == 0:
        return value
    return missing


def get_attr_matrix(data, n_features=5000 - 1):
    """Build the dense attribute matrix (word counts, venue, year) for ``data``.

    Args:
        data: Sequence of records. Each record is indexed with the keys
            ``'title'`` and ``'abstract'`` (iterables of integer word ids)
            and ``'venue'`` and ``'year'`` (a number, or an empty value when
            the field is not recorded).
        n_features: Number of word-count columns. Word id ``w`` is counted in
            column ``w - 1``.
            NOTE(review): this assumes ids run from 1 to ``n_features``;
            an id of 0 wraps around to the last word column - confirm against
            the dataset description.

    Returns:
        A float array of shape ``(len(data), n_features + 2)``: the word-count
        columns, then the venue column, then the year column. Missing venue
        or year values are encoded as ``-1``.

    Note:
        A venue or year equal to ``0`` is kept as ``0``. The previous
        truthiness test (``if venue:``) turned it into ``-1``, which made it
        indistinguishable from a missing value.
    """
    n_samples = len(data)

    # Allocate the final matrix once and fill it in place, instead of building
    # three arrays and copying them together with concatenate.
    features = np.zeros((n_samples, n_features + 2))

    # View onto the word-count columns only, so that indexing (including any
    # negative index) can never touch the venue/year columns.
    wmatrix = features[:, :n_features]
    venue_col = n_features
    year_col = n_features + 1

    for i in range(n_samples):
        instance = data[i]

        # Title and abstract share one bag-of-words count vector.
        for title in instance['title']:
            wmatrix[i, title - 1] += 1
        for abstract in instance['abstract']:
            wmatrix[i, abstract - 1] += 1

        features[i, venue_col] = _value_or_missing(instance['venue'])
        features[i, year_col] = _value_or_missing(instance['year'])

    return features

# Build the attribute features for the train and test records, in that order.
attr_matrix, attr_matrix_test = map(get_attr_matrix, (train_data, test_data))

def handle_authors(data, key="author", n_prolific=100, n_authors=21245 + 1):
    """Split each record's author ids into label counts and co-author counts.

    Author ids below ``n_prolific`` are counted in the label matrix ``y``;
    all other ids are counted in the co-author feature matrix, shifted down
    by ``n_prolific`` so its first column corresponds to id ``n_prolific``.

    Args:
        data: Sequence of records; ``record[key]`` must be an iterable of
            integer author ids.
        key: Name of the field holding the author ids.
        n_prolific: Number of ids (``0 .. n_prolific - 1``) treated as labels.
        n_authors: Total number of distinct author ids, i.e. largest id + 1.

    Returns:
        Tuple ``(amatrix, y)`` of float arrays with shapes
        ``(len(data), n_authors - n_prolific)`` and ``(len(data), n_prolific)``.
        Each cell holds how many times that id occurs in the record.
    """
    n_samples = len(data)

    # np.zeros replaces the uninitialised np.ndarray + fill(0) pair.
    y = np.zeros((n_samples, n_prolific))
    amatrix = np.zeros((n_samples, n_authors - n_prolific))

    for row, record in enumerate(data):
        for au in record[key]:
            if au < n_prolific:
                y[row, au] += 1
            else:
                amatrix[row, au - n_prolific] += 1

    return amatrix, y

# Training records: ids under "authors" give both the co-author features and
# the label matrix y.
amatrix, y = handle_authors(train_data, key="authors")

# Test records are read through the "coauthors" field; the label matrix
# returned for them is discarded.
amatrix_test, _ = handle_authors(test_data, key="coauthors")

# Place the attribute columns and the co-author columns side by side.
X = np.hstack((attr_matrix, amatrix))
X_kaggle = np.hstack((attr_matrix_test, amatrix_test))

# Report the resulting shapes.
print("Train:")
print("     X : ", X.shape)
print("     y : ", y.shape)
print("Test:")
print("     X : ", X_kaggle.shape)

Train:
     X :  (25793, 26147)
     y :  (25793, 100)
Test:
     X :  (800, 26147)


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Report the shapes of both partitions.
print("Train:")
print("     X_train : ", X_train.shape)
print("     y_train : ", y_train.shape)
print("Test:")
print("     X_test  : ", X_test.shape)
print("     y_test  : ", y_test.shape)

Train:
     X_train :  (17281, 26147)
     y_train :  (17281, 100)
Test:
     X_test  :  (8512, 26147)
     y_test  :  (8512, 100)


In [44]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

In [47]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

Using cpu device


In [45]:
class NeuralNetwork(nn.Module):
    """Fully connected network with two hidden ReLU layers.

    The input is flattened from dimension 1 onward, so each sample may have
    any shape as long as it holds ``in_features`` values in total.

    Args:
        in_features: Number of values per flattened sample.
        hidden_features: Width of both hidden layers.
        out_features: Number of output logits.

    The defaults reproduce the original fixed 784 -> 512 -> 512 -> 10 layout.
    Pass other sizes to fit different inputs and label counts.
    """

    def __init__(self, in_features=28*28, hidden_features=512, out_features=10):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, out_features),
        )

    def forward(self, x):
        """Return raw logits of shape ``(batch, out_features)``; no softmax is applied."""
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

In [48]:
# Instantiate the network, then move its parameters to the selected device.
model = NeuralNetwork()
model = model.to(device)
print(model)

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


In [50]:
# Smoke-test the forward pass on one random 28x28 input.
# The input gets its own name: binding it to `X` would overwrite the feature
# matrix built earlier in the notebook and break a re-run of the split cell.
sample_input = torch.rand(1, 28, 28, device=device)
logits = model(sample_input)
# Softmax over dim=1 turns the logits into per-class probabilities.
pred_probab = nn.Softmax(dim=1)(logits)
y_pred = pred_probab.argmax(1)
print(f"Predicted class: {y_pred}")

Predicted class: tensor([6])
