In [16]:
import torch
from torch.utils import data
import matplotlib.pyplot as plt
import numpy as np

from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, recall_score, f1_score

import pandas as pd
import json

import tqdm

In [2]:
sample = pd.read_csv('../data/sample.csv')
sample.head()

Unnamed: 0,ID,Predict
0,0,28
1,1,62 71
2,2,59
3,3,10
4,4,86


In [4]:
# read train data and test data
f_train = open("../data/train.json", 'r')
raw_train_data = json.load(f_train)

f_test = open("../data/test.json", 'r')
raw_test_data = json.load(f_test)

print(len(raw_train_data))
print(len(raw_test_data))

25793
800


In [6]:
def prolific_authors(authors):
    count = 0
    for i in authors:
        if i < 100:
            count += 1
    return count

train_data = raw_train_data.copy()
test_data = raw_test_data.copy()

In [7]:
def get_attr_matrix(data):
    n_samples = len(data)
    n_features = 5000 -1 

    # get abstract & title feature
    wmatrix = np.ndarray([n_samples, n_features])
    wmatrix.fill(0)

    for i in range(n_samples):
        instance = data[i]
        for title in instance['title']:
            wmatrix[i, title-1] += 1
        for abstract in instance['abstract']:
            wmatrix[i, abstract-1] += 1

    # get venue feature
    vmatrix = np.ndarray([n_samples, 1])

    for i in range(n_samples):
        venue = data[i]['venue']
        
        if venue:
            vmatrix[i, ] = venue
        else:
            vmatrix[i, ] = -1

    # get year feature
    ymatrix = np.ndarray([n_samples, 1])

    for i in range(n_samples):
        year = data[i]['year']
        
        if year:
            ymatrix[i, ] = year
        else:
            ymatrix[i, ] = -1
            
    return np.concatenate((wmatrix, vmatrix, ymatrix), axis=1)

attr_matrix = get_attr_matrix(train_data)
print("train :", attr_matrix.shape)

attr_matrix_test = get_attr_matrix(test_data)
print("test  : ", attr_matrix.shape)

train : (25793, 5001)
test  :  (25793, 5001)


In [8]:
def handle_authors(data, key="author"):

    n_samples = len(data)

    # prolific authors 
    y = np.ndarray([n_samples, 100])
    y.fill(0)

    # get co-author matrix
    amatrix = np.ndarray([n_samples, 21245 - 100 + 1])
    amatrix.fill(0)

    for i in range(n_samples):
        authors = data[i][key]
        
        for au in authors:
            if au < 100:
                
                y[i, au] += 1
            else:
                amatrix[i, au - 100] += 1

    return amatrix, y

amatrix, y = handle_authors(train_data, key="authors")
print("Train:")
print("          amatrix : ", amatrix.shape)
print("                y : ", y.shape)
amatrix_test, _ = handle_authors(test_data, key="coauthors")
print("Test:")
print("     amatrix_test : ", amatrix_test.shape)

Train:
          amatrix :  (25793, 21146)
                y :  (25793, 100)
Test:
     amatrix_test :  (800, 21146)


In [9]:
X = np.concatenate((attr_matrix, amatrix), axis=1)
X_kaggle = np.concatenate((attr_matrix_test, amatrix_test), axis=1)

In [10]:
print("Train:")
print("     X : ", X.shape)
print("     y : ", y.shape)
print("Test:")
print("     X : ", X_kaggle.shape)

Train:
     X :  (25793, 26147)
     y :  (25793, 100)
Test:
     X :  (800, 26147)


In [11]:
from scipy import sparse
X = sparse.csr_matrix(X)
X_kaggle = sparse.csr_matrix(X_kaggle)
X

<25793x26147 sparse matrix of type '<class 'numpy.float64'>'
	with 3009689 stored elements in Compressed Sparse Row format>

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) 

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

model = MLPClassifier()
model.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
y_pred = model.predict(X_test)

print('='*25 + 'Training results' + '='*25)
print('The accuracy score of prediction is: {}'.format(accuracy_score(y_train, y_pred_train)))
print('The racall score of prediction is: {}'.format(recall_score(y_train, y_pred_train, average='weighted')))
print('The f1 score of prediction is: {}'.format(f1_score(y_train, y_pred_train, average='weighted'))) 

print('='*25 + 'Evaluation results' + '='*25)
print('The accuracy score of prediction is: {}'.format(accuracy_score(y_test, y_pred)))
print('The racall score of prediction is: {}'.format(recall_score(y_test, y_pred, average='weighted')))
print('The f1 score of prediction is: {}'.format(f1_score(y_test, y_pred, average='weighted'))) 

The accuracy score of prediction is: 1.0
The racall score of prediction is: 1.0
The f1 score of prediction is: 1.0
The accuracy score of prediction is: 0.7410714285714286
The racall score of prediction is: 0.19356955380577429
The f1 score of prediction is: 0.2967069318764589


In [14]:
def kaggle_output(model, X=X_kaggle):
    y_pred = model.predict(X)
    output_df = pd.DataFrame(columns=["ID", "Predict"])

    for i in range(y_pred.shape[0]):
        pred = ""
        for j in range(y_pred.shape[1]):
            if y_pred[i][j] > 0:
                pred += str(j) + " "
        if pred:
            output_df.loc[i, 'Predict'] = pred[:-1]
        else:
            output_df.loc[i, 'Predict'] = "-1"

    output_df['ID'] = output_df.index
    output_df = output_df.set_index('ID')
    return output_df

In [15]:
# count / length
kaggle = kaggle_output(model)
kaggle.to_csv("../kaggle/predict1.csv")