<a href="https://colab.research.google.com/github/JURASA/USE/blob/main/Next_Word_Prediction_Model_with_Google_Universal_Sentence_Encoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Google Drive for local storage**


In [27]:
# Mount Google Drive at /gdrive. On the first run Colab opens an external
# authorisation URL; if the drive is already mounted the call only reports that.

from google.colab import drive
drive.mount("/gdrive")

# IPython magic (not Python syntax): list the current working directory.
%ls

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
corpus.txt  [0m[01;34mNWP-USE[0m/  [01;34msample_data[0m/  vocabulary.npy


# **Imports**

In [28]:
# Getting all required libraries

import os
import re
import gdown
import numpy
import string
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from absl import logging
import tensorflow_hub as hub
from tensorflow import keras
import matplotlib.pyplot as plt
from keras.models import Sequential
import tensorflow.keras.backend as K
from keras.layers.recurrent import LSTM
from keras.layers import Dense, Activation
from keras.callbacks import LambdaCallback
from keras.utils.data_utils import get_file
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split

## **Data preparation - _Generating Corpus_**
The dataset (arXiv abstracts) comes from the Stanford TensorFlow tutorials repository, so every predicted word is drawn from deep-learning and machine-learning vocabulary.

In [29]:
# Download the corpus from Google Drive and normalise it for training.
#
# Original dataset URL:
#   https://raw.githubusercontent.com/maxim5/stanford-tensorflow-tutorials/master/data/arxiv_abstracts.txt

# Drive mirror of the dataset (the literal must not carry stray whitespace).
url = 'https://drive.google.com/uc?id=1YTBR7FiXssaKXHhOZbUbwoWw6jzQxxKW'
output = 'corpus.txt'
gdown.download(url, output, quiet=False)

# Read back the file that was just written. The encoding is explicit so that
# decoding does not depend on the platform's locale default.
with open(output, encoding='utf-8') as subject:
  cache = subject.readlines()

# One cleaned string per corpus line: lower-cased, ASCII punctuation removed.
translator = str.maketrans('', '', string.punctuation)
lines = [doc.lower().translate(translator) for doc in cache]

Downloading...
From:  https://drive.google.com/uc?id=1YTBR7FiXssaKXHhOZbUbwoWw6jzQxxKW
To: /content/corpus.txt
7.55MB [00:00, 184MB/s]


In [30]:
# Build the vocabulary: every distinct space-separated token in the cleaned corpus.
# The tokens are sorted because iterating a set of strings yields a different
# order on every interpreter start (hash randomisation); sorting keeps the
# word -> index mapping, and therefore the model's output layout, reproducible.
# split(' ') is kept on purpose: it preserves the empty token produced by blank
# or space-terminated lines, which the labelling step looks up in this mapping.
vocabulary = sorted(set(' '.join(lines).replace('\n', '').split(' ')))

# Map each word to its position in `vocabulary`.
primary_store = {word: index for index, word in enumerate(vocabulary)}

In [31]:
# Build the supervised pairs and split them into train and test sets.
# For every corpus line the last token is the target word and everything
# before it is the input text.

X = []           # input text: each line without its last token
target_ids = []  # vocabulary index of each line's last token

for line in lines:
  tokens = line.replace('\n', '').split(' ')
  X.append(' '.join(tokens[:-1]))
  target_ids.append(primary_store[tokens[-1]])

# One-hot encode the targets in a single NumPy allocation instead of building
# one Python list of len(vocabulary) integers per sample.
target_ids = np.asarray(target_ids, dtype=np.intp)
y = np.zeros((len(target_ids), len(vocabulary)), dtype=np.float32)
y[np.arange(len(target_ids)), target_ids] = 1

# y is already an ndarray, so the y splits come back as ndarrays as well.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

## **Embedding**

In [32]:
# Load version 4 of the Universal Sentence Encoder from TF Hub.
# `appreciate` is the loaded encoder; later cells call it on a list of strings
# and convert the result with .numpy().
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
appreciate = hub.load(module_url)


In [33]:
# Embed the input texts with the Universal Sentence Encoder.
# Encoding in fixed-size batches keeps peak memory bounded instead of pushing a
# whole split through the encoder in a single call.
EMBED_BATCH_SIZE = 1024


def embed_in_batches(sentences, batch_size=EMBED_BATCH_SIZE):
  """Return the encoder output for `sentences` as one NumPy array.

  Args:
    sentences: sequence of strings to encode.
    batch_size: number of strings passed to the encoder per call.

  Returns:
    Array with one row per input string, rows in input order.
  """
  sentences = list(sentences)
  if not sentences:
    # 512 matches the input_shape the model cell declares for these vectors.
    return np.empty((0, 512), dtype=np.float32)
  chunks = [
      appreciate(sentences[start:start + batch_size]).numpy()
      for start in range(0, len(sentences), batch_size)
  ]
  return np.concatenate(chunks, axis=0)


# NOTE: these names are rebound from text to embeddings, so this cell must run
# exactly once after the train/test split cell.
X_train = embed_in_batches(X_train)
X_test = embed_in_batches(X_test)

# **Building the model**

In [36]:
# Feed-forward classifier over sentence embeddings:
# 512-d input -> 512 ReLU units -> softmax over the whole vocabulary.
model = Sequential([
    Dense(512, input_shape=[512], activation='relu'),
    Dense(units=len(vocabulary), activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 1024)              525312    
_________________________________________________________________
dense_7 (Dense)              (None, 2694)              2761350   
Total params: 3,286,662
Trainable params: 3,286,662
Non-trainable params: 0
_________________________________________________________________


In [42]:
# Train the model. The held-out test split doubles as validation data.
BATCH_SIZE = 2048
EPOCHS = 30

# Keep the returned History so per-epoch loss/accuracy can be inspected later.
# The former LambdaCallback() had no hooks attached and did nothing.
history = model.fit(
    X_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    shuffle=True,
    validation_data=(X_test, y_test),
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7f660a569320>

# **Testing**

In [44]:
# Predict and print the next word(s) for one or more prompts
def next_word(collection=None, extent=1):
  """Print the model's next-word prediction(s) for each prompt.

  Each prompt is embedded with the sentence encoder (`appreciate`), passed
  through `model`, and the highest-scoring vocabulary entry is appended.

  Args:
    collection: iterable of prompt strings. A single string is treated as one
      prompt rather than being iterated character by character. None or an
      empty iterable prints nothing.
    extent: how many words to append to each prompt. Every predicted word is
      fed back into the prompt before the next prediction, and one report is
      printed per appended word.

  Returns:
    None. Results are printed.

  Note:
    Prompts are passed to the encoder exactly as given. The training text was
    lower-cased and stripped of punctuation in the data-preparation cell, so
    prompts should be supplied in that form.
  """
  if collection is None:
    return
  if isinstance(collection, str):
    collection = [collection]

  for text in collection:
    item = text
    for _ in range(extent):
      # verbose=0 keeps Keras progress output out of the printed report.
      prediction = model.predict(x=appreciate([item]).numpy(), verbose=0)
      word = vocabulary[np.argmax(prediction[-1])]
      item += ' ' + word

      print(text + ' --> ' + item + '\nNEXT WORD: ' + word + '\n')

In [45]:
# Smoke test: predict one next word for a single prompt
single_text = ['we note that']
next_word(single_text)

we note that --> we note that experiments
NEXT WORD: experiments



In [46]:
# Predict the next word for several prompts of different lengths

text_collection = [
    'deep convolutional',
    'simple and effective',
    'complex hilbert',
    'a',
    'there is',
    'that party was',
]
next_word(text_collection)

deep convolutional --> deep convolutional networks
NEXT WORD: networks

simple and effective --> simple and effective estimators
NEXT WORD: estimators

complex hilbert --> complex hilbert relu
NEXT WORD: relu

a --> a accuracy
NEXT WORD: accuracy

there is --> there is accuracy
NEXT WORD: accuracy

that party was --> that party was networks
NEXT WORD: networks



In [41]:
# Persist what inference needs: the vocabulary and the trained model.
# The list is converted to an array only for saving; `vocabulary` itself is
# not rebound, so cells run after this one still see the same list.
# NOTE(review): both paths are relative to the working directory, not the
# Drive mount at /gdrive - confirm this is the intended location.
numpy.save('./vocabulary.npy', numpy.array(vocabulary))
model.save('./NWP-USE')

INFO:tensorflow:Assets written to: ./NWP-USE/assets


INFO:tensorflow:Assets written to: ./NWP-USE/assets


In [None]:
##                                                                  END OF NOTEBOOK