In [None]:
################
### UI SHELL ###
################

# from tkinter import *
# root = Tk()
# root.title("Chatbot")
# def send():
#     send = "You -> "+e.get()
#     txt.insert(END, "n"+send)
#     user = e.get().lower()
#     if(user == "hello"):
#         txt.insert(END, "n" + "Bot -> Hi")
#     elif(user == "hi" or user == "hii" or user == "hiiii"):
#         txt.insert(END, "n" + "Bot -> Hello")
#     elif(e.get() == "how are you"):
#         txt.insert(END, "n" + "Bot -> fine! and you")
#     elif(user == "fine" or user == "i am good" or user == "i am doing good"):
#         txt.insert(END, "n" + "Bot -> Great! how can I help you.")
#     else:
#         txt.insert(END, "n" + "Bot -> Sorry! I dind't got you")
#     e.delete(0, END)
# txt = Text(root)
# txt.grid(row=0, column=0, columnspan=2)
# e = Entry(root, width=100)
# e.grid(row=1, column=0)
# send = Button(root, text="Send", command=send).grid(row=1, column=1)
# root.mainloop()

In [201]:
###############
### IMPORTS ###
###############

import pandas as pd
import requests
import tensorflow as tf
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
import re
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Concatenate, TimeDistributed, Dense
from tensorflow.keras.layers import Embedding, GRU
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding, Masking, Bidirectional
from tensorflow.keras.optimizers import Adam
sns.set()


In [391]:
#################
### Functions ###
#################

def reformat_text(text):
    text = re.sub(r'\(.+?\)', '', text)
    text = re.sub(r'\{.+?\}', '', text)
    text = re.sub(r'[-_()0-9%$:\^\/°\∼\~π]', '', text)
    text = re.sub(r'(\.)\1+','', text)
    text = re.sub(r'\w*<sub>','', text)
    text = re.sub(r'\$.+?\$', '', text)
    text = re.sub(r'<.+?>','',text)
    text = re.sub(r'[<>]', '', text)
    text = re.sub(r'(\s)\1+', ' ', text)
    return text

def lookup(param):
    if type(param) is int:
        return word_index[param]
    elif type(param) is str:
        return word_lexicon[param]
    else:
        print('Parameter not accepted.')

def shuffle(a, b):
    assert len(a) == len(b)
    p = np.random.permutation(len(a))
    return a[p], b[p]

def remove_words(arr,lexicon):
    arr = arr.split(' ')
    for token in arr:
        if token not in lexicon:
            arr.remove(token)
    return arr

def create_model(training_shape,label_shape, n_word_embedding_nodes,
                 n_tag_embedding_nodes, n_hidden_nodes, stateful=True, batch_size=1):
    
    #Layers 1
    word_input = Input(batch_shape=(batch_size, training_shape[1]), name='word_input_layer')
    tag_input = Input(batch_shape=(batch_size, label_shape[1]), name='tag_input_layer')

    #Layers 2
    word_embeddings = Embedding(input_dim=training_shape[0],
                                output_dim=n_word_embedding_nodes, 
                                mask_zero=True, name='word_embedding_layer')(word_input) #mask_zero will ignore 0 padding
    #Output shape = (batch_size, seq_input_len, n_word_embedding_nodes)
    tag_embeddings = Embedding(input_dim=label_shape[0],
                               output_dim=n_tag_embedding_nodes,
                               mask_zero=True, name='tag_embedding_layer')(tag_input) 
    #Output shape = (batch_size, seq_input_len, n_tag_embedding_nodes)
    
    #Layer 3
    merged_embeddings = Concatenate(axis=0, name='concat_embedding_layer')([word_embeddings, tag_embeddings])
    #Output shape =  (batch_size, seq_input_len, n_word_embedding_nodes + n_tag_embedding_nodes)
    
    #Layer 4
    hidden_layer = GRU(units=n_hidden_nodes, return_sequences=True, 
                       stateful=stateful, name='hidden_layer')(merged_embeddings)
    #Output shape = (batch_size, seq_input_len, n_hidden_nodes)
    
    #Layer 5
    output_layer = TimeDistributed(Dense(units=n_tag_input_nodes, 
                                         activation='softmax'), name='output_layer')(hidden_layer)
    # Output shape = (batch_size, seq_input_len, n_tag_input_nodes)
    
    #Specify which layers are input and output, compile model with loss and optimization functions
    model = Model(inputs=[word_input, tag_input], outputs=output_layer)
    model.compile(loss="sparse_categorical_crossentropy",
                  optimizer='adam')
    
    return model

def convert_back(arr):
    sentence = []
    for token in arr:
        sentence.append(lookup(token))
    return ' '.join(sentence)

def least_used_words(word_counts,min_freq):
    delwords = []
    for word in word_counts:
        if word_counts[word] <= min_freq:
            delwords.append(word)
    return delwords

def make_word_level_model(num_words,
                          embedding_matrix,
                          lstm_cells=64,
                          trainable=False,
                          lstm_layers=1,
                          bi_direc=False):
    """Make a word level recurrent neural network with option for pretrained embeddings
       and varying numbers of LSTM cell layers."""

    model = Sequential()

    # Map words to an embedding
    if not trainable:
        model.add(
            Embedding(
                input_dim=num_words,
                output_dim=embedding_matrix.shape[1],
                weights=[embedding_matrix],
                trainable=False,
                mask_zero=True))
        model.add(Masking())
    else:
        model.add(
            Embedding(
                input_dim=num_words,
                output_dim=embedding_matrix.shape[1],
                weights=[embedding_matrix],
                trainable=True))

    # If want to add multiple LSTM layers
    if lstm_layers > 1:
        for i in range(lstm_layers - 1):
            model.add(
                LSTM(
                    lstm_cells,
                    return_sequences=True,
                    dropout=0.1,
                    recurrent_dropout=0.1))

    # Add final LSTM cell layer
    if bi_direc:
        model.add(
            Bidirectional(
                LSTM(
                    lstm_cells,
                    return_sequences=False,
                    dropout=0.1,
                    recurrent_dropout=0.1)))
    else:
        model.add(
            LSTM(
                lstm_cells,
                return_sequences=False,
                dropout=0.1,
                recurrent_dropout=0.1))
    model.add(Dense(128, activation='relu'))
    # Dropout for regularization
    model.add(Dropout(0.5))

    # Output layer
    model.add(Dense(num_words, activation='softmax'))

    # Compile the model
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'])
    return model



In [92]:
##########################
### Obtaining the Data ###
##########################

# token = 'ansPyi7MStG33JewfUJsi0SN4wlRyUFqBUMjAj02'
# rows = 1 # fetch 1 record at a time
# start = 0  # start with the first result
# abstracts = []
# titles = []

# docs = True
# while docs:
#     result = requests.get("https://api.adsabs.harvard.edu/v1/search/query?" \
#                        "q=%20abs%3AThermodynamics" \
#                        "&sort=date%20desc%2C%20bibcode%20desc" \
#                        "&fl=title,abstract", \
#                        "&rows={rows}" \
#                        "&start={start}".format(rows=rows,start=start), \
#                        headers={'Authorization': 'Bearer ' + token})
# #If the qeury is successful
#     if result.status_code == 200:
#         docs = result.json()['response']['docs']
#         if 'abstract' in docs[0] and 'title' in docs[0]:
#             tmp_abstract = [d['abstract'] for d in docs]
#             tmp_titles = [d['title'] for d in docs]
#             titles = titles + tmp_titles
#             abstracts = abstracts + tmp_abstract
#     elif result.status_code == 429:
#         print(f'Collected {len(titles)} samples.')
#         break
#     start += rows # increment the start value to move to the next page of result


Collected 4778 samples.


In [379]:
data = pd.read_csv('data.csv')
data = data.iloc[:2500]
titles = data['Title']
abstracts = data['Abstract']

In [380]:
##################
### COPY LISTS ###
##################
CT = titles[:]
CA = abstracts[:]

In [381]:
####################
### INSPECT Data ###
####################

data = {'Title':CT,'Abstract':CA}
data = pd.DataFrame(data)
data.sample(10)

Unnamed: 0,Title,Abstract
2139,Study the Role of R<SUP>2</SUP> Term in Cosmol...,It has been shown that AdS/CFT correspondence ...
2474,Mechanism and Thermodynamic Characteristics of...,Results are presented from determining the ade...
174,Revealing curvature and stochastic effects on ...,In the theoretical development of normal grain...
2204,Modeling dynamics of the spacecraft power plan...,Modern space programs cover a wide range of mi...
2486,"First principle calculations of structural, el...","Structural, elastic, electronic and optical pr..."
2439,Validity and reliability of oral temperature c...,Complex thermodynamics of the human body and e...
2261,Assessing Past and future Hazardous freezing R...,"Freezing precipitation, in the form of freezin..."
691,Kerr black hole surrounded by a cloud of strin...,"In this paper, an exact solution of the Kerr b..."
1526,Continuous cosmic evolution with diffusive bar...,In the background of homogeneous and isotropic...
2332,Entropy-Driven Microstructure Evolution Calcul...,A Potts model and the Replica Exchange Wang-La...


In [382]:
##########################
### Initial formatting ###
##########################

#filters='!()"%;[\\]^_`{|}~\t\n'
if type(data.iloc[0,0]) == list: 
    data['Title'] = data['Title'].apply(lambda x: x[0])
column_values = data[['Title']].values.ravel()
unique_values =  pd.unique(column_values)
data['Reformatted'] = data['Abstract'].apply(reformat_text)
tokenizer = Tokenizer(lower=False)
tokenizer.fit_on_texts(data['Reformatted'])
data['Tokens'] = tokenizer.texts_to_sequences(data['Reformatted'])

word_lexicon = tokenizer.word_index
word_index = tokenizer.index_word
num_words = len(word_lexicon) + 1
word_counts = tokenizer.word_counts
sorted_counts = dict(sorted(dict(word_counts).items(), key=lambda item: item[1],reverse=True))


print(f'The database contains {len(unique_values)/data.shape[0]*100}% unique entries.\n With {num_words} unique words')

# fig, ax = plt.subplots()
# x, y = zip(*sorted_counts.items()) # unpack a list of pairs into two tuples
# ax.plot(x, y)
# ax.yaxis.set_major_locator(plt.NullLocator())
# ax.xaxis.set_major_formatter(plt.NullFormatter())
# ax.set_xlabel('Words')
# ax.set_ylabel('Word Frequency')
# plt.show()


The database contains 99.6% unique entries.
 With 26012 unique words


In [383]:
##########################
### Further Formatting ###
##########################

### Removing the lowest 10% of words.
delwords = least_used_words(word_counts=word_counts,min_freq=15)

for word in delwords:
    del word_lexicon[word]

data['Reformatted'] = data['Reformatted'].apply(lambda x : remove_words(x,word_lexicon))

tokenizer = Tokenizer(lower=False)
tokenizer.fit_on_texts(data['Reformatted'])
data['Tokens'] = tokenizer.texts_to_sequences(data['Reformatted'])

word_lexicon = tokenizer.word_index
word_index = tokenizer.index_word
num_words = len(word_lexicon) + 1
word_counts = tokenizer.word_counts
sorted_counts = dict(sorted(dict(word_counts).items(), key=lambda item: item[1],reverse=True))


print(f'The database contains {len(unique_values)/data.shape[0]*100}% unique entries.\n With {num_words} unique words')

# fig, ax = plt.subplots()
# x, y = zip(*sorted_counts.items()) # unpack a list of pairs into two tuples
# ax.plot(x, y)
# ax.yaxis.set_major_locator(plt.NullLocator())
# ax.xaxis.set_major_formatter(plt.NullFormatter())
# ax.set_xlabel('Words')
# ax.set_ylabel('Word Frequency')
# plt.show()

The database contains 99.6% unique entries.
 With 12980 unique words


In [384]:
data.iloc[:3000]

Unnamed: 0,Title,Abstract,Reformatted,Tokens
0,Cosmological models reconstructed from jerk: A...,The thermodynamic viability of some dark energ...,"[The, thermodynamic, of, some, dark, energy, m...","[8, 18, 2, 171, 989, 31, 77, 109, 1, 574, 164,..."
1,Investigation into the effect of energy densit...,"In this study, the effects of volume energy de...","[In, this, the, effects, of, volume, energy, d...","[25, 23, 1, 118, 2, 337, 31, 70, 14, 5858, 67,..."
2,The effects of substrate morphology by regulat...,When cells are cultured on the micro- or nano-...,"[When, cells, are, on, the, micro, or, structu...","[713, 1579, 12, 14, 1, 2440, 52, 74, 12, 428, ..."
3,Holographic dual approach to magnetism and mag...,We propose a dual gravitational theory corresp...,"[We, propose, a, dual, gravitational, theory, ...","[17, 353, 5, 2344, 526, 71, 258, 6, 1064, 1065..."
4,Defect induced ferromagnetism in a two-dimensi...,Two-dimensional ferromagnetic materials are po...,"[ferromagnetic, materials, are, potential, can...","[1065, 179, 12, 68, 1464, 10, 27, 24, 1624, 11..."
...,...,...,...,...
2495,Principles of low dissipation computing from a...,"We introduce a thermodynamically consistent, m...","[We, introduce, a, thermodynamically, minimal,...","[17, 707, 5, 220, 1055, 545, 26, 9, 2746, 1928..."
2496,Effect of chaos on the simulation of quantum c...,"We study how chaos, introduced by a weak pertu...","[We, study, how, introduced, by, a, weak, affe...","[17, 44, 214, 878, 13, 5, 450, 1114, 1, 2743, ..."
2497,Non-equilibrium quadratic measurement-feedback...,Measurement and feedback control of thermomech...,"[and, feedback, control, of, thermomechanical,...","[3, 1377, 382, 2, 2360, 603, 4, 5, 5856, 40, 4..."
2498,Harnessing fluctuations in thermodynamic compu...,We experimentally demonstrate that highly stru...,"[We, experimentally, demonstrate, that, highly...","[17, 611, 178, 10, 418, 2778, 875, 2, 98, 2819..."


In [385]:
##########################
### GENERATE SEQUENCES ###
##########################
training = []
labels = []

base_length = 35
seq_length = base_length + base_length//2

data = data.drop(data[data['Tokens'].map(len) < seq_length].index)

lengths = [len(sequence) for sequence in data['Tokens']]
if min(lengths) >= seq_length:
     for sequence in data['Tokens']:
        for i in range(seq_length, len(sequence)):
            cut = sequence[i - seq_length:i + 1]
            training.append(cut[:-1])
            labels.append(cut[-1])
else:
    print(f'The sequence at {lengths.index(min(lengths))} is too short.')


In [386]:
########################
### TRAIN/TEST SPLIT ###
########################

### Shuffle the sets by shuffling a tuple of (sequence, label)

compact = list(zip(training,labels))
np.random.shuffle(compact)
training, labels = zip(*compact)

#split into 75% training to 25% test

X_train = np.array(training[:int(0.75*len(training))])
X_test = np.array(training[int(0.75*len(training)):])

y_train_base = np.array(labels)[:int(0.75*len(labels))]
y_test_base = np.array(labels)[int(0.75*len(labels)):]

y_train = np.zeros((len(y_train_base), num_words), dtype=np.int8)
y_test = np.zeros((len(y_test_base), num_words), dtype=np.int8)

# One hot encoding of labels
for example_index, word_index in enumerate(y_train_base):
    y_train[example_index, word_index] = 1

for example_index, word_index in enumerate(y_test_base):
    y_test[example_index, word_index] = 1

# y_train = keras.utils.to_categorical(y_train,num_classes=len(np.unique(y_train)))
# y_test = keras.utils.to_categorical(y_test,num_classes=len(np.unique(y_test)))

print(f'The training sequence shape is {X_train.shape}, the training label shape is {y_train.shape}')
print(f'The test sequence shape is {X_test.shape}, the test label shape is  {y_test.shape}')

The training sequence shape is (179412, 52), the training label shape is (179412, 12980)
The test sequence shape is (59804, 52), the test label shape is  (59804, 12980)


In [388]:
########################
### Embedding Matrix ###
########################

path = 'D:/PythonProjects/PhysicsChatbot/data/glove.6B/glove.6B.100d.txt'
glove = np.loadtxt(path, dtype='str', comments=None, encoding= 'UTF-8')
vectors = glove[:, 1:].astype('float')
words = glove[:, 0]
del glove

word_lookup = {word: vector for word, vector in zip(words, vectors)}

embedding_matrix = np.zeros((num_words, vectors.shape[1]))

not_found = 0

for i, word in enumerate(word_lexicon.keys()):
    vector = word_lookup.get(word, None)
    if vector is not None:
        embedding_matrix[i + 1, :] = vector
    else:
        not_found += 1

embedding_matrix = np.zeros((num_words, len(word_lookup['the'])))

not_found = 0

for i, word in enumerate(word_lexicon.keys()):
    # Look up the word embedding
    vector = word_lookup.get(word, None)

    # Record in matrix
    if vector is not None:
        embedding_matrix[i + 1, :] = vector
    else:
        not_found += 1

print(f'There were {not_found} words without pre-trained embeddings.')
embedding_matrix.shape

There were 7442 words without pre-trained embeddings.


In [395]:

model_dir = '../models/'
model_name = 'Main_model'

In [399]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

SAVE_MODEL = True
BATCH_SIZE = 2048
VERBOSE = 0
EPOCHS = 150


def make_callbacks(model_name, save=SAVE_MODEL):
    """Make list of callbacks for training"""
    callbacks = [EarlyStopping(monitor='val_loss', patience=5)]

    if save:
        callbacks.append(
            ModelCheckpoint(
                f'{model_dir}{model_name}.h5',
                save_best_only=True,
                save_weights_only=False))
    return callbacks


callbacks = make_callbacks(model_name)

In [397]:
LSTM_CELLS = 64

model = make_word_level_model(
    num_words,
    embedding_matrix,
    lstm_cells=LSTM_CELLS,
    trainable=True,
    lstm_layers=1)
model.summary()



Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 100)         1298000   
                                                                 
 lstm_1 (LSTM)               (None, 64)                42240     
                                                                 
 dense_2 (Dense)             (None, 128)               8320      
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_3 (Dense)             (None, 12980)             1674420   
                                                                 
Total params: 3,022,980
Trainable params: 3,022,980
Non-trainable params: 0
_________________________________________________________________


In [401]:
model.compile(
    optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

history = model.fit(
    X_train,
    y_train,
    batch_size=BATCH_SIZE,
    verbose=VERBOSE,
    epochs=EPOCHS,
    callbacks=callbacks,
    validation_data=(X_test, y_test))

KeyboardInterrupt: 