In [2]:
###############
### IMPORTS ###
###############

import pandas as pd
import requests
import re
import numpy as np
import seaborn as sns
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Concatenate, TimeDistributed, Dense
from tensorflow.keras.layers import Embedding, GRU
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding, Masking, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential, load_model
sns.set()

In [35]:
#################
### Functions ###
#################

def reformat_text(text):
    text = re.sub(r'\(.+?\)', '', text)
    text = re.sub(r'\{.+?\}', '', text)
    text = re.sub(r'[-_()0-9%$:\^\/°\∼\~π]', '', text)
    text = re.sub(r'(\.)\1+','', text)
    text = re.sub(r'\w*<sub>','', text)
    text = re.sub(r'\$.+?\$', '', text)
    text = re.sub(r'<.+?>','',text)
    text = re.sub(r'[<>]', '', text)
    text = re.sub(r'(\s)\1+', ' ', text)
    return text

def lookup(param):
    if type(param) is int:
        return word_index[param]
    elif type(param) is str:
        return word_lexicon[param]
    else:
        print('Parameter not accepted.')

def shuffle(a, b):
    assert len(a) == len(b)
    p = np.random.permutation(len(a))
    return a[p], b[p]

def remove_words(arr,lexicon):
    arr = arr.split(' ')

    for token in arr:
        if token not in lexicon:
            arr.remove(token)
            
    return arr

def convert_back(arr):
    sentence = []

    for token in arr:
        sentence.append(lookup(token))

    return ' '.join(sentence)

def least_used_words(word_counts,min_freq):
    delwords = []

    for word in word_counts:
        if word_counts[word] <= min_freq:
            delwords.append(word)

    return delwords

def make_model(num_words,
                          embedding_matrix,
                          lstm_cells=64,
                          trainable=False,
                          lstm_layers=1,
                          bi_direc=False):

    model = Sequential()

    # Map words to an embedding
    if not trainable:
        model.add(
            Embedding(
                input_dim=num_words,
                output_dim=embedding_matrix.shape[1],
                weights=[embedding_matrix],
                trainable=False,
                mask_zero=True))
        model.add(Masking())
    else:
        model.add(
            Embedding(
                input_dim=num_words,
                output_dim=embedding_matrix.shape[1],
                weights=[embedding_matrix],
                trainable=True))

    # If want to add multiple LSTM layers
    if lstm_layers > 1:
        for i in range(lstm_layers - 1):
            model.add(
                LSTM(
                    lstm_cells,
                    return_sequences=True,
                    dropout=0.1,
                    recurrent_dropout=0.1))

    # Add final LSTM cell layer
    if bi_direc:
        model.add(
            Bidirectional(
                LSTM(
                    lstm_cells,
                    return_sequences=False,
                    dropout=0.1,
                    recurrent_dropout=0.1)))
    else:
        model.add(
            LSTM(
                lstm_cells,
                return_sequences=False,
                dropout=0.1,
                recurrent_dropout=0.1))
    model.add(Dense(128, activation='relu'))
    # Dropout for regularization
    model.add(Dropout(0.5))

    # Output layer
    model.add(Dense(num_words, activation='softmax'))

    # Compile the model
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'])
    return model

def load_and_evaluate(model_name):

    model = load_model(f'{model_dir}{model_name}.h5')
    r = model.evaluate(X_test, y_test, batch_size=2048, verbose=1)

    valid_crossentropy = r[0]
    valid_accuracy = r[1]

    print(f'Cross Entropy: {round(valid_crossentropy, 4)}')
    print(f'Accuracy: {round(100 * valid_accuracy, 2)}%')


In [None]:
##########################
### Obtaining the Data ###
##########################

token = 'N/A' #Available by request
rows = 1
start = 0
abstracts = []
titles = []

docs = True
while docs:
    result = requests.get("https://api.adsabs.harvard.edu/v1/search/query?" \
                       "q=%20abs%3AThermodynamics" \
                       "&sort=date%20desc%2C%20bibcode%20desc" \
                       "&fl=title,abstract", \
                       "&rows={rows}" \
                       "&start={start}".format(rows=rows,start=start), \
                       headers={'Authorization': 'Bearer ' + token})
#If the qeury is successful
    if result.status_code == 200:
        docs = result.json()['response']['docs']
        if 'abstract' in docs[0] and 'title' in docs[0]:
            tmp_abstract = [d['abstract'] for d in docs]
            tmp_titles = [d['title'] for d in docs]
            titles = titles + tmp_titles
            abstracts = abstracts + tmp_abstract
    elif result.status_code == 429:
        print(f'Collected {len(titles)} samples.')
        break
    start += rows


In [None]:
##################
### COPY LISTS ###
##################
CT = titles[:]
CA = abstracts[:]

In [None]:
####################
### INSPECT Data ###
####################

data = {'Title':CT,'Abstract':CA}
data = pd.DataFrame(data)
data.sample(10)

In [13]:
data = pd.read_csv(r'D:\PythonProjects\AbstractGenerator\data\data.csv')
data = data.drop('Unnamed: 0',axis=1)
data.sample(10)

Unnamed: 0,Title,Abstract
3187,Phase Transformation Modeling for Hypo Peritec...,Phase change of steel during cooling affects t...
4405,Single Crystal Elasticity of MgSiO<SUB>3</SUB>...,The combination of seismic observations and mi...
928,Theoretical Interpretation of Thermophysical P...,"In continuation to our previous publication, n..."
615,Scandium decorated C$_{24}$ fullerene as high ...,Using first-principles density functional theo...
4238,Modified strong-coupling treatment of a spin-1...,A quantum spin-1/2 antiferromagnetic Heisenber...
1255,E<SUP>2</SUP> and gamma distributions in polyg...,From solar supergranulation to salt flats in B...
3954,Disinfection By-Product Removal by Activated C...,This research was aimed to study the efficienc...
1565,Collective variable-based enhanced sampling an...,Collective variable-based enhanced sampling me...
3207,Study on variation of thermodynamic parameters...,To study the variation regularities of the mas...
3614,Contribution of electronic entropy to the orde...,Cu<SUB>3</SUB>Au experiences a phase transitio...


In [14]:
##########################
### Initial formatting ###
##########################

if type(data.iloc[0,0]) == list: 
    data['Title'] = data['Title'].apply(lambda x: x[0])


column_values = data[['Title']].values.ravel()
unique_values =  pd.unique(column_values)

#Tokenizer on the entire dataset

data['Reformatted'] = data['Abstract'].apply(reformat_text)
tokenizer = Tokenizer(lower=False)
tokenizer.fit_on_texts(data['Reformatted'])
data['Tokens'] = tokenizer.texts_to_sequences(data['Reformatted'])

word_lexicon = tokenizer.word_index
word_index = tokenizer.index_word
num_words = len(word_lexicon) + 1
word_counts = tokenizer.word_counts
sorted_counts = dict(sorted(dict(word_counts).items(), key=lambda item: item[1],reverse=True))


print(f'The database contains {len(unique_values)/data.shape[0]*100}% unique entries.\n With {num_words} unique words')


The database contains 99.41398074508162% unique entries.
 With 36919 unique words


In [15]:
##########################
### Further Formatting ###
##########################

### Removing the lowest used words.
delwords = least_used_words(word_counts=word_counts,min_freq=15)

for word in delwords:
    del word_lexicon[word]

data['Reformatted'] = data['Reformatted'].apply(lambda x : remove_words(x,word_lexicon))

#Running tokenizer again to update the word lexicon to be used in the model

tokenizer = Tokenizer(lower=False)
tokenizer.fit_on_texts(data['Reformatted'])
data['Tokens'] = tokenizer.texts_to_sequences(data['Reformatted'])

word_lexicon = tokenizer.word_index
word_index = tokenizer.index_word
num_words = len(word_lexicon) + 1
word_counts = tokenizer.word_counts
sorted_counts = dict(sorted(dict(word_counts).items(), key=lambda item: item[1],reverse=True))


print(f'The database contains {len(unique_values)/data.shape[0]*100}% unique entries.\n With {num_words} unique words')

The database contains 99.41398074508162% unique entries.
 With 17794 unique words


In [16]:
##########################
### GENERATE SEQUENCES ###
##########################
training = []
labels = []

base_length = 35
seq_length = base_length*2

data = data.drop(data[data['Tokens'].map(len) < seq_length].index)

#CUtting the tokens into sequences and adding them to an array, here every 36 words in our token sequence forms
# a training label pair from the begging to the end of the token sequence. 

lengths = [len(sequence) for sequence in data['Tokens']]
if min(lengths) >= seq_length:
     for sequence in data['Tokens']:
        for i in range(seq_length, len(sequence)):
            cut = sequence[i - seq_length:i + 1]
            training.append(cut[:-1])
            labels.append(cut[-1])
else: #Not expected to be used but here to avoid an error
    print(f'The sequence at {lengths.index(min(lengths))} is too short.')


In [17]:
########################
### TRAIN/TEST SPLIT ###
########################

### Shuffle the sets by shuffling a tuple of (sequence, label)

compact = list(zip(training,labels))
np.random.shuffle(compact)
training, labels = zip(*compact)

#split into 75% training to 25% test

X_train = np.array(training[:int(0.75*len(training))])
X_test = np.array(training[int(0.75*len(training)):])

y_train_base = np.array(labels)[:int(0.75*len(labels))]
y_test_base = np.array(labels)[int(0.75*len(labels)):]

y_train = np.zeros((len(y_train_base), num_words), dtype=np.int8)
y_test = np.zeros((len(y_test_base), num_words), dtype=np.int8)

# One hot encoding of labels
for example_index, word_index in enumerate(y_train_base):
    y_train[example_index, word_index] = 1

for example_index, word_index in enumerate(y_test_base):
    y_test[example_index, word_index] = 1

print(f'The training sequence shape is {X_train.shape}, the training label shape is {y_train.shape}')
print(f'The test sequence shape is {X_test.shape}, the test label shape is  {y_test.shape}')

The training sequence shape is (287175, 70), the training label shape is (287175, 17794)
The test sequence shape is (95725, 70), the test label shape is  (95725, 17794)


In [18]:
########################
### Embedding Matrix ###
########################

path = 'D:/PythonProjects/AbstractGenerator/data/glove.6B/glove.6B.100d.txt'
glove = np.loadtxt(path, dtype='str', comments=None, encoding= 'UTF-8')
vectors = glove[:, 1:].astype('float')
words = glove[:, 0]
del glove

#set up embedding matrix

word_lookup = {word: vector for word, vector in zip(words, vectors)}

embedding_matrix = np.zeros((num_words, len(word_lookup['the'])))

not_found = 0

for i, word in enumerate(word_lexicon.keys()):
    # Look up the word embedding
    vector = word_lookup.get(word, None)

    # Record in matrix
    if vector is not None:
        embedding_matrix[i + 1, :] = vector
    else:
        not_found += 1

print(f'There were {not_found} words without pre-trained embeddings.')
embedding_matrix.shape

There were 10879 words without pre-trained embeddings.


(17794, 100)

In [19]:
model_dir = 'D:/PythonProjects/AbstractGenerator/models/'
model_name = 'Main_model'
SAVE_MODEL = True
BATCH_SIZE = 2048
VERBOSE = 0
EPOCHS = 150

#Run the model until the accuracy does not improve significantly and save that model.

def make_callbacks(model_name, save=SAVE_MODEL):
    callbacks = [EarlyStopping(monitor='val_loss', patience=5)]
    if save:
        callbacks.append(
            ModelCheckpoint(
                f'{model_dir}{model_name}.h5',
                save_best_only=True,
                save_weights_only=False))
    return callbacks


callbacks = make_callbacks(model_name)

In [20]:
#Create the model

LSTM_CELLS = 64

model = make_model(
    num_words,
    embedding_matrix,
    lstm_cells=LSTM_CELLS,
    trainable=True,
    lstm_layers=1)
model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         1779400   
                                                                 
 lstm (LSTM)                 (None, 64)                42240     
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 17794)             2295426   
                                                                 
Total params: 4,125,386
Trainable params: 4,125,386
Non-trainable params: 0
_________________________________________________________________


In [None]:
#Compile and run the model, it takes around 80mins on my laptop.

model.compile(
    optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

history = model.fit(
    X_train,
    y_train,
    batch_size=BATCH_SIZE,
    verbose=VERBOSE,
    epochs=EPOCHS,
    callbacks=callbacks,
    validation_data=(X_test, y_test))

In [38]:
#Load the best model and test the accuracy

model_dir = r'D:/PythonProjects/AbstractGenerator/deployment/models/'
model_name = 'Main_model'
model = load_model(f'{model_dir}{model_name}.h5')
model = load_and_evaluate(model_name)