In [1]:
import pandas as pd
import numpy as np
import keras
import gensim
import pickle
import gc
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
%matplotlib inline

%load_ext autoreload
%autoreload 2
import helper

Using TensorFlow backend.


In [2]:
# Load the training arrays (inputs X, targets y) stored under data/word_vectors.
X_train, y_train = (
    np.load("data/word_vectors/pubmed_stratify_no_wiki_X_train_seeds42.npy"),
    np.load("data/word_vectors/pubmed_stratify_no_wiki_y_train_seeds42.npy"),
)

In [3]:
# Load the matching test arrays (inputs X, targets y) from the same directory.
X_test, y_test = (
    np.load("data/word_vectors/pubmed_stratify_no_wiki_X_test_seeds42.npy"),
    np.load("data/word_vectors/pubmed_stratify_no_wiki_y_test_seeds42.npy"),
)

In [4]:
# Append the test split to the training arrays along the sample axis so the
# model is fit on every available sample.
# NOTE(review): this rebinds X_train / y_train, so running the cell twice
# appends the test split twice. The later evaluation on X_test therefore
# scores samples the model has already seen -- treat those metrics as optimistic.
X_train = np.concatenate([X_train, X_test])
y_train = np.concatenate([y_train, y_test])

In [5]:
# Display the dimensions of the combined training array.
X_train.shape

(45587, 100, 200)

In [8]:
# Per-sample input shape handed to the first layer of the model. Derived from
# the loaded data (every axis after the sample axis) instead of being
# hard-coded, so it cannot drift from the arrays actually used for training.
# For the arrays loaded above this evaluates to (100, 200).
input_shape = X_train.shape[1:]

# Model

In [9]:
# Sequential classifier: one GRU layer (100 units) -> dropout (rate 0.3) ->
# softmax over 23 output classes.
model = keras.models.Sequential()
# Use the public `keras.layers.GRU` export; `keras.layers.recurrent` is an
# internal module path that newer Keras releases no longer expose.
model.add(keras.layers.GRU(100, input_shape=input_shape))
model.add(keras.layers.Dropout(0.3))
model.add(keras.layers.Dense(23, activation='softmax'))

In [10]:
# Compile with Adagrad and categorical cross-entropy, tracking accuracy.
model.compile(optimizer="adagrad",
              loss='categorical_crossentropy',
              metrics=['accuracy'])
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()
model.fit(X_train, y_train, epochs=15, batch_size=256)
# NOTE(review): X_test was concatenated into X_train before fitting, so the
# scores below are computed on samples the model was trained on and are not
# an unbiased estimate of generalisation.
yhat = model.predict(X_test)
_ = helper.score_prediction(y_test, yhat, binary=False)
# Last expression of the cell: its return value is shown as the cell output.
gc.collect()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru_1 (GRU)                  (None, 100)               90300     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 23)                2323      
Total params: 92,623
Trainable params: 92,623
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
             precision    recall  f1-score   support

          0       0.00      0.00      0.00        41
          1       0.48      0.47      0.47       206
          2       0.62      0.77      0.68       482
          3  

7

In [12]:
# Drop the references to the large feature arrays; they are not used again
# in the cells below.
del X_train, X_test

In [13]:
# Run a garbage-collection pass after deleting the feature arrays above.
gc.collect()

25

# Submission

In [34]:
# Load the multiclass dataset through the project helper, keeping the second
# and third return values; the first is discarded. `target.columns` is used
# below as the list of category names.
# NOTE(review): `raw_cats` is not referenced in the visible cells -- confirm it
# is still needed. This cell's execution count is out of sequence with its
# neighbours; verify the notebook runs top-to-bottom on a fresh kernel.
_,target, raw_cats = helper.load_multiclass_data()

In [14]:
# Load the submission inputs and the ids that go with them.
submission_vectors, submission_ids = (
    np.load("data/submission_word_vectors.npy"),
    np.load("data/submission_ids.npy"),
)

In [40]:
# Category names, in the column order of `target`.
cat_names = [*target.columns]
def one_hot_to_cats(one_hot_row, names=None):
    """Map a one-hot encoded row back to its category name.

    Parameters
    ----------
    one_hot_row : iterable of numbers
        Row in which the selected category is marked with a value equal to 1.
    names : sequence of str, optional
        Category names, positionally aligned with ``one_hot_row``. When
        omitted, the notebook-level ``cat_names`` list is used, which keeps
        existing single-argument calls working unchanged.

    Returns
    -------
    str or None
        The name at the position of the first entry equal to 1, or ``None``
        when no entry equals 1.
    """
    if names is None:
        # Fall back to the notebook-level list only when no names are supplied.
        names = cat_names
    for index, value in enumerate(one_hot_row):
        if value == 1:
            return names[index]
    # Explicit: an all-zero row has no category.
    return None

In [69]:
# Display the category names taken from `target.columns`.
cat_names

['Animal Diseases',
 'Bacterial Infections and Mycoses',
 'Cardiovascular Diseases',
 'Digestive System Diseases',
 'Disorders of Environmental Origin',
 'Endocrine Diseases',
 'Eye Diseases',
 'Female Genital Diseases and Pregnancy Complications',
 'Hemic and Lymphatic Diseases',
 'Immunologic Diseases',
 'Musculoskeletal Diseases',
 'Neonatal Diseases and Abnormalities',
 'Neoplasms',
 'Nervous System Diseases',
 'Nutritional and Metabolic Diseases',
 'Otorhinolaryngologic Diseases',
 'Parasitic Diseases',
 'Pathological Conditions, Signs and Symptoms',
 'Respiratory Tract Diseases',
 'Skin and Connective Tissue Diseases',
 'Stomatognathic Diseases',
 'Urologic and Male Genital Diseases',
 'Virus Diseases']

### Reconstruct the categories from the one-hot encoding

In [57]:
# Predict the softmax class scores for the submission inputs.
# NOTE(review): this rebinds `yhat`, which previously held the X_test predictions.
yhat = model.predict(submission_vectors)

In [58]:
# Apply the project helper `max_to_one` to every prediction row (axis 1).
# NOTE(review): presumably it sets the row maximum to 1 and everything else
# to 0 -- confirm against helper.py.
yhat_cats = np.apply_along_axis(helper.max_to_one, 1, yhat)

In [74]:
# Translate every one-hot prediction row into its category name.
submission_cats = [one_hot_to_cats(one_hot_row) for one_hot_row in yhat_cats]
# Preview the first ten predicted categories.
submission_cats[0:10]

['Parasitic Diseases',
 'Immunologic Diseases',
 'Bacterial Infections and Mycoses',
 'Respiratory Tract Diseases',
 'Parasitic Diseases',
 'Cardiovascular Diseases',
 'Urologic and Male Genital Diseases',
 'Musculoskeletal Diseases',
 'Cardiovascular Diseases',
 'Skin and Connective Tissue Diseases']

In [75]:
# Round-trip sanity check: re-encode the predicted category names as one-hot
# rows and confirm they reproduce `yhat_cats` exactly.
#
# pd.get_dummies only creates columns for categories that actually occur in
# `submission_cats`, so the frame is reindexed to the full `cat_names` list.
# Without that, a class that is never predicted would change the column count
# and make the comparison misfire. astype(int) keeps a single numeric dtype
# across original and filled-in columns.
test = (
    pd.get_dummies(submission_cats)
    .reindex(columns=cat_names, fill_value=0)
    .astype(int)
)
# Materialise the array once instead of rebuilding it on every iteration.
test_values = test.values
# A row is an error when any of its entries differs from the re-encoded row.
mismatched_rows = np.flatnonzero((np.asarray(yhat_cats) != test_values).any(axis=1))
for i in mismatched_rows:
    print(f"Error at {i}")

In [76]:
# Assemble the submission table in one step: one row per submission id with
# its predicted category name. Column order is pinned explicitly.
submission_df = pd.DataFrame(
    {"Id": submission_ids, "Category": submission_cats},
    columns=["Id", "Category"],
)

In [77]:
# Preview the first rows of the submission table.
submission_df.head()

Unnamed: 0,Id,Category
0,4132,Parasitic Diseases
1,23198,Immunologic Diseases
2,8,Bacterial Infections and Mycoses
3,34357,Respiratory Tract Diseases
4,3800,Parasitic Diseases


In [78]:
# Write the Id/Category table to CSV; index=False omits the DataFrame index column.
submission_df.to_csv("data/multiclass_submissions.csv", index=False)