In [1]:
# Common imports
import numpy as np 
# Single source of truth for the random seed: used to seed NumPy here and
# passed as random_state when sampling texts from the dataframe.
seed = 42069
np.random.seed(seed)
import os 
import datetime
import pandas as pd 
from functools import reduce
from pprint import pprint
import sys

# Custom imports
import tensorflow_setup

# ML imports
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, Dropout, LSTM, Dense
from tensorflow.keras.utils import plot_model, to_categorical
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
# Project-local helper from the `tensorflow_setup` module. Per the cell output
# it enables dynamic GPU memory and runs `tensorflow_shutup`.
tensorflow_setup.init(shutup=True)

Enabled dynamic gpu memory
Ran tensorflow_shutup


In [3]:
# Load the CORD-19 CSV; the path is relative to the working directory.
df = pd.read_csv('data/cord-19-data.csv')

In [4]:
# Show the column labels of the loaded frame (body_text is the one used below).
df.columns

Index(['cord_uid', 'paper_id', 'source', 'is_pmc', 'title', 'body_text', 'doi',
       'pubmed_id', 'license', 'abstract', 'publish_time', 'authors',
       'journal', 'url', 'language'],
      dtype='object')

In [5]:
n = 2
# Sample n body texts from the dataframe; random_state is fixed so the same
# rows are drawn on every run.
texts = df.body_text.sample(n=n, random_state=seed)
print(texts)
# Concatenate the sampled texts into one training string. str.join builds the
# result in a single pass, whereas reduce(str.__add__, ...) re-copies the
# accumulated string for every element and raises on an empty sample.
text = ''.join(texts)

32005    Infectious diseases have caused and will conti...
25733    Respiratory tract diseases (RTDs) are a leadin...
Name: body_text, dtype: object


In [6]:
# Distinct characters of the corpus in sorted order; a character's position
# in this list is its integer id.
vocab = sorted(set(text))
# Forward lookup: character -> id.
char_to_int = dict(zip(vocab, range(len(vocab))))
# Reverse lookup: id -> character (array indexed by id).
int_to_char = np.asarray(vocab)

In [7]:
# Report the number of distinct characters and the total corpus length.
print(f'Vocab size {len(vocab)}')
print(f'Text size {len(text)}')

Vocab size 83
Text size 26865


In [8]:
# X is the text represented as integer ids, looked up through char_to_int.
# int32 rather than int8: int8 tops out at 127, so a vocabulary with more than
# 128 distinct characters would wrap around silently on older NumPy and raise
# on recent versions.
X = np.array([char_to_int[c] for c in text], dtype=np.int32)
# Split the encoded text into two contiguous, near-equal parts.
X_train, X_val = np.array_split(X, 2)

In [9]:
batch_size = 128
# Trim the training ids to a whole number of batches. The cut is derived from
# batch_size so the two cannot drift apart when the batch size is changed.
X_train = X_train[:len(X_train) // batch_size * batch_size]

In [10]:
# Build (input window, next character) training pairs, all encoded as integers:
# every run of seq_length consecutive characters is an input and the character
# immediately after it is the target.
n_chars = len(text)
seq_length = 64
# Encode the text once, then cut the windows out of the integer list.
encoded = [char_to_int[ch] for ch in text]
dataX = [encoded[start:start + seq_length] for start in range(n_chars - seq_length)]
dataY = [encoded[start + seq_length] for start in range(n_chars - seq_length)]
n_patterns = len(dataX)

In [11]:
# Shape the windows as (patterns, seq_length, 1): one feature per time step.
X = np.reshape(dataX, (n_patterns, seq_length, 1))
# normalize: scale the integer ids by the vocabulary size
X = X / len(vocab)
# One-hot encode the output variable. Pin the width to the vocabulary size:
# by default to_categorical infers it from the largest label present, so the
# target width (and the model's output layer built from y.shape[1]) would
# shrink if the highest-indexed character never occurs as a target.
y = to_categorical(dataY, num_classes=len(vocab))

In [12]:
# Sanity check of the input tensor shape: (patterns, seq_length, 1).
print(X.shape)

(26801, 64, 1)


In [13]:
# def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
#     model = Sequential([
#          Embedding(vocab_size, embedding_dim, batch_input_shape=[batch_size, None]),
#          Dropout(0.2),
#          LSTM(rnn_units, return_sequences=True, stateful=True, recurrent_initializer='glorot_uniform'),
#          Dropout(0.2), 
#          LSTM(rnn_units,
#          return_sequences=True,
#          stateful=True,
#          recurrent_initializer='glorot_uniform'),
#          Dropout(0.2),
#          Dense(vocab_size)
#      ])
 
#     return model
# model = build_model(vocab_size=len(vocab), embedding_dim=69, rnn_units=10, batch_size=64)

def build_model(input_shape, output_shape, rnn_units=256, dropout_rate=0.1):
    """Build an uncompiled two-layer stacked LSTM with a softmax output.

    Args:
        input_shape: Shape of a single input sample, forwarded to the first
            LSTM layer as ``input_shape``.
        output_shape: Number of units in the final softmax ``Dense`` layer.
        rnn_units: Number of units in each LSTM layer. Defaults to 256.
        dropout_rate: Rate of the dropout layer placed between the two LSTM
            layers. Defaults to 0.1.

    Returns:
        A ``Sequential`` model; the caller is responsible for compiling it.
    """
    return Sequential([
        # return_sequences=True so the second LSTM receives the whole sequence.
        LSTM(rnn_units, input_shape=input_shape, return_sequences=True),
        Dropout(dropout_rate),
        LSTM(rnn_units),
        Dense(output_shape, activation='softmax'),
    ])
# Input shape is one sample of X (all dims after the batch axis); the output
# width is the number of one-hot columns in y.
model = build_model(X.shape[1:], y.shape[1])

In [14]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 64, 256)           264192    
_________________________________________________________________
dropout (Dropout)            (None, 64, 256)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               525312    
_________________________________________________________________
dense (Dense)                (None, 83)                21331     
Total params: 810,835
Trainable params: 810,835
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Adam optimizer with categorical cross-entropy, matching the one-hot targets
# produced by to_categorical. No metrics are requested.
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [19]:
# Train on every window for 69 epochs. No validation data or callbacks are
# passed, so the EarlyStopping import at the top of the notebook is unused.
# NOTE(review): the generated text further down looks like a verbatim passage
# of the corpus, which suggests memorisation; consider a held-out split.
model.fit(X, y, epochs=69, batch_size=batch_size)

Train on 26801 samples
Epoch 1/69
Epoch 2/69
Epoch 3/69
Epoch 4/69
Epoch 5/69
Epoch 6/69
Epoch 7/69
Epoch 8/69
Epoch 9/69
Epoch 10/69
Epoch 11/69
Epoch 12/69
Epoch 13/69
Epoch 14/69
Epoch 15/69
Epoch 16/69
Epoch 17/69
Epoch 18/69
Epoch 19/69
Epoch 20/69
Epoch 21/69
Epoch 22/69
Epoch 23/69
Epoch 24/69
Epoch 25/69
Epoch 26/69
Epoch 27/69
Epoch 28/69
Epoch 29/69
Epoch 30/69
Epoch 31/69
Epoch 32/69
Epoch 33/69
Epoch 34/69
Epoch 35/69
Epoch 36/69
Epoch 37/69
Epoch 38/69
Epoch 39/69
Epoch 40/69
Epoch 41/69
Epoch 42/69
Epoch 43/69
Epoch 44/69
Epoch 45/69
Epoch 46/69
Epoch 47/69
Epoch 48/69
Epoch 49/69
Epoch 50/69
Epoch 51/69
Epoch 52/69
Epoch 53/69
Epoch 54/69
Epoch 55/69
Epoch 56/69
Epoch 57/69
Epoch 58/69
Epoch 59/69
Epoch 60/69
Epoch 61/69
Epoch 62/69
Epoch 63/69
Epoch 64/69
Epoch 65/69
Epoch 66/69
Epoch 67/69
Epoch 68/69
Epoch 69/69


<tensorflow.python.keras.callbacks.History at 0x7f6e64149dd0>

In [20]:
# Scratch check of the `>64s` format spec: it right-aligns the string in a
# 64-character field, padding on the left with spaces.
print(f"{'Cat and dog':>64s}")

                                                     Cat and dog


In [23]:
seed_text = 'Oh my goodness Corona Corona'
# The model was built for windows of exactly seq_length characters, so
# right-align the prompt with space padding and keep only its last seq_length
# characters (a longer prompt would otherwise produce a mismatched input).
pattern = seed_text.rjust(seq_length)[-seq_length:]
print(pattern)
# Encode the prompt; every character must already be in the vocabulary.
pattern = [char_to_int[c] for c in pattern]
# generate characters
for i in range(1000):
    # Same shape and scaling as the training inputs.
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(len(vocab))
    prediction = model.predict(x, verbose=0)
    # Greedy decoding: always take the highest-scoring next character.
    index = np.argmax(prediction)
    result = int_to_char[index]
    sys.stdout.write(result)
    # Slide the window: drop the oldest id and append the predicted one.
    pattern = pattern[1:] + [index]
print("\nDone.")

                                    Oh my goodness Corona Corona
cted eroation barrien, and a significant decrease in transepithelial electrical resistance) [20, 21]. Furthermore, serological diagnosis of HBoV has recently confirmed significant increases in IgG antibodies in children with pneumonia. These results support the idea that it is a true pathogen in RTI in children [22, 23]. Prolonged viral shedding has been described, about 2.5 months in outpatients and about 4.5 months to 1 year in hospitalized children, which probably explains why HBoV is detected in asymptomatic cases. This prolonged shedding may also explain why the rate of coinfection with other viruses is so high, ranging from 75% to 85% [24].
In this study, we retrospectively analyzed data for 1352 nasopharyngeal samples (NPSs, aspirates and swabs) that were molecularly tested for the presence of HBoV DNA. Our aim was to determine the prevalence of HBoV in children up to 16 years of age who presented at the hospital w

In [None]:
# The 1000 characters of the training text that follow its first 64-character
# window; presumably for comparison with the generated output above (TODO confirm).
text[64:1064]