In [38]:
import numpy as np

from numpy import mean
import pandas as pd
import json
from numpy import std
from sklearn.model_selection import RepeatedKFold
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import accuracy_score

In [6]:
# Read the train and test records. The context managers close each file
# handle as soon as its JSON payload has been parsed, instead of leaving
# both descriptors open for the lifetime of the kernel.
with open("../data/train.json", 'r') as f_train:
    train_data = json.load(f_train)

with open("../data/test.json", 'r') as f_test:
    test_data = json.load(f_test)

def _value_or_missing(value, missing=-1):
    """Return `value`, or `missing` when the field is empty (None, "", ...).

    A numeric 0 is returned unchanged so that an id/year of 0 stays
    distinguishable from "no value".
    """
    if value or value == 0:
        return value
    return missing


def get_attr_matrix(data, n_words=4999):
    """Build the dense word-count / venue / year feature matrix.

    Parameters
    ----------
    data : sequence of dict
        Records with 'title' and 'abstract' (iterables of integer word ids,
        written to column ``id - 1``) plus scalar 'venue' and 'year' fields.
    n_words : int, optional
        Number of word-count columns. Defaults to 4999 (the original
        ``5000 - 1``).

    Returns
    -------
    numpy.ndarray
        float64 array of shape ``(len(data), n_words + 2)``: combined
        title/abstract word counts, then one venue column, then one year
        column. An empty venue/year is encoded as -1.

    Raises
    ------
    KeyError
        If a record lacks one of the four fields.
    IndexError
        If a word id falls outside the word-count columns.
    """
    n_samples = len(data)

    # Allocate the final matrix once and fill it in place; this avoids the
    # extra full-size copy that concatenating three separate blocks makes.
    features = np.zeros((n_samples, n_words + 2))

    # View over the word columns only, so word indexing (including a
    # negative index produced by id 0) can never touch venue/year.
    word_counts = features[:, :n_words]
    venue_col, year_col = n_words, n_words + 1

    for row, instance in enumerate(data):
        for title in instance['title']:
            word_counts[row, title - 1] += 1
        for abstract in instance['abstract']:
            word_counts[row, abstract - 1] += 1

        features[row, venue_col] = _value_or_missing(instance['venue'])
        features[row, year_col] = _value_or_missing(instance['year'])

    return features

# Word/venue/year features for each split, computed train first, then test.
attr_matrix, attr_matrix_test = (
    get_attr_matrix(records) for records in (train_data, test_data)
)

def handle_authors(data, key="author", n_prolific=100, n_authors=21246):
    """Split each record's author ids into a label matrix and a co-author matrix.

    Parameters
    ----------
    data : sequence of dict
        Records whose ``key`` entry is an iterable of integer author ids.
    key : str, optional
        Name of the field holding the author ids.
    n_prolific : int, optional
        Ids strictly below this value are counted in the label matrix.
        Defaults to 100.
    n_authors : int, optional
        Total size of the author-id space; ids from ``n_prolific`` up to
        ``n_authors - 1`` are counted in the co-author matrix. Defaults to
        21246, which gives the original ``21245 - 100 + 1`` columns.

    Returns
    -------
    (amatrix, y) : tuple of numpy.ndarray
        ``amatrix`` has shape ``(len(data), n_authors - n_prolific)`` and
        counts id ``a`` in column ``a - n_prolific``; ``y`` has shape
        ``(len(data), n_prolific)`` and counts id ``a`` in column ``a``.

    Raises
    ------
    KeyError
        If a record has no ``key`` field.
    IndexError
        If an id is ``>= n_authors``.
    """
    n_samples = len(data)

    y = np.zeros((n_samples, n_prolific))
    amatrix = np.zeros((n_samples, n_authors - n_prolific))

    for row, record in enumerate(data):
        for au in record[key]:
            if au < n_prolific:
                y[row, au] += 1
            else:
                # Shift so the first non-prolific id lands in column 0.
                amatrix[row, au - n_prolific] += 1

    return amatrix, y

# Training records are read through their "authors" field and yield both the
# co-author features and the label matrix.
amatrix, y = handle_authors(train_data, key="authors")

# Test records are read through "coauthors"; the label half is discarded.
amatrix_test, _ = handle_authors(test_data, key="coauthors")

# Column-wise join of the attribute block and the co-author block per split.
X, X_kaggle = (
    np.concatenate(blocks, axis=1)
    for blocks in ((attr_matrix, amatrix), (attr_matrix_test, amatrix_test))
)

# Shape summary of the assembled matrices.
print("Train:")
print("     X : ", X.shape)
print("     y : ", y.shape)
print("Test:")
print("     X : ", X_kaggle.shape)

Train:
     X :  (25793, 26147)
     y :  (25793, 100)
Test:
     X :  (800, 26147)


In [7]:
from scipy import sparse
# Re-store both design matrices in CSR form (train first, then test). The
# bare `X` on the last line makes the cell display the sparse-matrix summary.
X, X_kaggle = sparse.csr_matrix(X), sparse.csr_matrix(X_kaggle)
X

<25793x26147 sparse matrix of type '<class 'numpy.float64'>'
	with 3009689 stored elements in Compressed Sparse Row format>

In [3]:
def get_model(n_inputs, n_outputs):
    """Return a compiled two-layer dense network.

    The network is a 20-unit ReLU hidden layer (he_uniform initialiser)
    taking `n_inputs` features, followed by an `n_outputs`-unit sigmoid
    layer. It is compiled with binary cross-entropy loss and the Adam
    optimizer.
    """
    hidden_layer = Dense(
        20,
        input_dim=n_inputs,
        kernel_initializer='he_uniform',
        activation='relu',
    )
    output_layer = Dense(n_outputs, activation='sigmoid')

    model = Sequential([hidden_layer, output_layer])
    model.compile(optimizer='adam', loss='binary_crossentropy')
    return model

def evaluate_model(X, y):
    """Cross-validate a freshly built model and return the per-fold accuracies.

    Uses 10-fold cross-validation repeated 3 times (random_state=1). For
    every fold a new model from `get_model` is fitted for 100 epochs, its
    rounded predictions are scored with `accuracy_score`, and the score is
    printed and collected.

    Returns
    -------
    list
        One accuracy value per fold, in split order.
    """
    n_inputs = X.shape[1]
    n_outputs = y.shape[1]

    splitter = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = []

    for fit_rows, held_out_rows in splitter.split(X):
        # Each fold gets its own untrained network.
        fold_model = get_model(n_inputs, n_outputs)
        fold_model.fit(X[fit_rows], y[fit_rows], verbose=0, epochs=100)

        # Round the sigmoid outputs to hard 0/1 labels before scoring.
        rounded = fold_model.predict(X[held_out_rows]).round()
        fold_acc = accuracy_score(y[held_out_rows], rounded)

        print('>%.3f' % fold_acc)
        scores.append(fold_acc)

    return scores

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.33,
)

In [10]:
from numpy import asarray

# Layer sizes come from the column counts of the full feature/label matrices.
n_inputs = X.shape[1]
n_outputs = y.shape[1]

model = get_model(n_inputs, n_outputs)

# Kept as the cell's last expression so the returned History object is shown.
model.fit(X_train, y_train, epochs=100, verbose=0)



<keras.callbacks.History at 0x15b183a30>

In [13]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, recall_score, f1_score

# Raw model outputs for the training rows first, then the held-out rows.
y_pred_train, y_pred = (model.predict(part) for part in (X_train, X_test))



In [91]:
def multi_label_column(matrix, threshold=0.99999):
    """Encode each row of a score/indicator matrix as a label string.

    Parameters
    ----------
    matrix : array-like of shape (n_samples, n_class)
        Per-class scores or 0/1 indicators.
    threshold : float, optional
        A class is selected when its value is ``>= threshold``. The default
        keeps the original 0.99999 cutoff; pass a lower value (e.g. 0.5) to
        binarise probabilities less strictly.

    Returns
    -------
    list of str
        One entry per row: the selected column indices in ascending order
        joined by single spaces, or "-1" when no column is selected.

    Raises
    ------
    ValueError
        If `matrix` is not two-dimensional.
    """
    scores = np.asarray(matrix)
    if scores.ndim != 2:
        raise ValueError(
            "matrix must be 2-D (n_samples, n_class), got %d-D" % scores.ndim
        )

    output = []
    for row in scores:
        selected = np.flatnonzero(row >= threshold)
        if selected.size:
            output.append(" ".join(str(j) for j in selected))
        else:
            output.append("-1")

    return output

In [92]:
y_test.shape

(8512, 100)

In [93]:
# Label strings for the training rows: ground truth, then model output.
# NOTE(review): the raw predictions pass through multi_label_column's 0.99999
# cutoff, while kaggle_output selects classes at > 0.5 — confirm the stricter
# cutoff is intended for evaluation.
y_train_list, y_pred_train_list = (
    multi_label_column(y_train),
    multi_label_column(y_pred_train),
)

# Same pair for the held-out rows.
y_test_list, y_pred_list = (
    multi_label_column(y_test),
    multi_label_column(y_pred),
)

In [94]:
# Metrics are computed on the space-joined label strings, so every distinct
# label combination is scored as its own class.
print('='*25 + 'Evaluation results' + '='*25)
print('The accuracy score of prediction is: {}'.format(accuracy_score(y_test_list, y_pred_list)))
# zero_division=0 yields the same value as the default "warn" setting but
# does not emit UndefinedMetricWarning for classes without any samples.
print('The recall score of prediction is: {}'.format(
    recall_score(y_test_list, y_pred_list, average='weighted', zero_division=0)))
print('The f1 score of prediction is: {}'.format(
    f1_score(y_test_list, y_pred_list, average='weighted', zero_division=0)))

The accuracy score of prediction is: 0.7135808270676691
The racall score of prediction is: 0.7135808270676691
The f1 score of prediction is: 0.6105150709801721


  _warn_prf(average, modifier, msg_start, len(result))


In [40]:
def kaggle_output(model, X=None, threshold=0.5):
    """Turn model predictions into a submission DataFrame.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(X)`` that returns a 2-D array of
        per-class scores.
    X : array-like or sparse matrix, optional
        Features to score. When omitted, the notebook-level ``X_kaggle`` is
        looked up at call time, so a rebuilt ``X_kaggle`` is picked up
        without re-defining this function.
    threshold : float, optional
        A class is selected when its score is strictly greater than this
        value. Defaults to 0.5.

    Returns
    -------
    pandas.DataFrame
        Indexed by "ID" (0..n-1) with a single "Predict" column holding the
        selected class indices joined by spaces, or "-1" when none is
        selected.
    """
    if X is None:
        # Resolved here rather than in the signature: a default argument is
        # evaluated once at definition time and would go stale.
        X = X_kaggle

    y_pred = np.asarray(model.predict(X))

    # Collect all rows first and build the frame once, instead of enlarging
    # a DataFrame one row at a time with .loc.
    predictions = []
    for row in y_pred:
        selected = np.flatnonzero(row > threshold)
        if selected.size:
            predictions.append(" ".join(str(j) for j in selected))
        else:
            predictions.append("-1")

    return pd.DataFrame(
        {"Predict": predictions},
        index=pd.RangeIndex(len(predictions), name="ID"),
    )

In [41]:
# Score the default Kaggle matrix with the fitted model and write the
# resulting frame (index "ID", column "Predict") to the submission CSV.
# NOTE(review): the original note here read "count / length"; its meaning is
# not evident from this cell — confirm or drop.
kaggle = kaggle_output(model)
kaggle.to_csv("../kaggle/predict2.csv")

