In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#lib

In [2]:
import tarfile
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from gensim.models import Word2Vec
import gensim.downloader as api
from gensim.models import KeyedVectors
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
import nltk
from nltk.corpus import stopwords
import re
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.preprocessing import StandardScaler

# Load Data

In [3]:
# Open the tar archive in read mode
tar = tarfile.open('/content/drive/MyDrive/NLP_Bonus/enronsentv1.tar.gz', 'r:gz')

# Extract all the files in the archive to the specified path
tar.extractall(path='/content/Dataset')

# Close the tar archive
tar.close()


In [4]:
# Define a function to read the text from a file
def read_file(filepath):
    with open(filepath, 'r') as f:
        text = f.read()
    return text

# Load the files into a DataFrame
data = pd.DataFrame(columns=['text'])
# from eronsent00 to eronsent15
for i in range(0, 16):
    filename = f'enronsent{i:02d}'
    filepath = os.path.join('/content/Dataset/enronsent', filename)
    text = read_file(filepath)
    data = pd.concat([data, pd.DataFrame({'text': [text]})], ignore_index=True)



In [5]:
data

Unnamed: 0,text
0,\nAttached are two files that illustrate the ...
1,\n2.\tEntergy - \n\n\t\tThey want a straight 7...
2,\nThanks! Elyse\nSusan White <Susan.White@bus...
3,pulling for Enron. I hope the bank deal comes...
4,Ricki Winters \nAssistant to Shelley Corman \n...
5,"\nIf possible, I'd like to add the ""benchmark""..."
6,confidence in the judgment of Enron's senior m...
7,The new electric transmission business will be...
8,no partnering in the project. There has been n...
9,Apparently GS is going to issue a report sayin...


In [6]:
str=data["text"][0]
print (str)


Attached  are two files that illustrate the following:

As prices rose, supply increased and demand decreased.  Now prices are 
beginning to fall in response these market responses. 

Financial  (6)
  West Desk  (14)
Mid Market (16)

Share information about yourself, create your own public profile at
http://profiles.msn.com.

 - utility.xls
 - utility.xls

Enron-admin@FSDDataSvc.com on 09/06/2000 10:12:33 AM
Executive Impact & Influence Program
* IMMEDIATE ACTION REQUIRED - Do Not Delete *

As part of the Executive Impact and Influence Program, each participant
is asked to gather input on the participant's own management styles and
practices as experienced by their immediate manager, each direct report,
and up to eight peers/colleagues.

You have been requested to provide feedback for a participant attending
the next program.  Your input (i.e., a Self assessment, Manager assessment,
Direct Report assessment, or Peer/Colleague assessment) will be combined
with the input of others and u

#preprocessing

In [5]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
#  list of words ,  lowercase , removes :  digits '\d+' , special characters '[^\w\s] , stopword 
data['text'] = data['text'].apply(lambda x: [re.sub(r'[^\w\s]', '', re.sub(r'\d+', '', word)) for word in x.lower().split() if word not in stop_words])


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
data

Unnamed: 0,text
0,"[attached, two, files, illustrate, following, ..."
1,"[, entergy, , want, straight, x, day, power, p..."
2,"[thanks, elyse, susan, white, susanwhitebusute..."
3,"[pulling, enron, hope, bank, deal, comes, quic..."
4,"[ricki, winters, assistant, shelley, corman, e..."
5,"[possible, id, like, add, benchmark, committee..."
6,"[confidence, judgment, enrons, senior, managem..."
7,"[new, electric, transmission, business, califo..."
8,"[partnering, project, serious, negotiation, da..."
9,"[apparently, gs, going, issue, report, saying,..."


In [6]:
# Prepare the data
tokenizer = Tokenizer()
# unique integer index to each word
tokenizer.fit_on_texts(data['text'])
# sequences of integers
sequences = tokenizer.texts_to_sequences(data['text'])

vocab_size = len(tokenizer.word_index) + 1  # 1 is added for padding that will be added later.
print(vocab_size)

84062


In [8]:
sequences

[[32,
  63,
  1115,
  8093,
  72,
  141,
  2791,
  337,
  1511,
  201,
  5959,
  141,
  1120,
  1165,
  387,
  45,
  1649,
  125,
  1,
  394,
  334,
  1,
  263,
  45,
  1,
  530,
  31,
  3137,
  648,
  255,
  2335,
  7268,
  1,
  34977,
  1,
  34977,
  34978,
  1,
  1,
  631,
  883,
  1,
  3507,
  313,
  1,
  1586,
  558,
  451,
  1,
  870,
  1,
  254,
  631,
  883,
  3507,
  313,
  2122,
  169,
  3690,
  1039,
  1320,
  185,
  13077,
  1928,
  2778,
  1586,
  433,
  219,
  114,
  3271,
  34979,
  498,
  112,
  1016,
  2122,
  1622,
  53,
  313,
  1039,
  568,
  4559,
  2452,
  433,
  2452,
  219,
  114,
  2452,
  34980,
  2452,
  2486,
  1039,
  469,
  297,
  313,
  2122,
  1255,
  558,
  88,
  1696,
  7399,
  185,
  13077,
  1928,
  613,
  596,
  2452,
  324,
  470,
  52,
  202,
  455,
  1,
  98,
  1016,
  613,
  254,
  313,
  2122,
  169,
  2245,
  7399,
  2395,
  577,
  1016,
  101,
  788,
  1016,
  1227,
  596,
  2452,
  2,
  456,
  72,
  867,
  1138,
  282,
  782,
  3526,
  59,
 

In [11]:
# Should be attached as its the 1st word in 1st document"
tokenizer.index_word[32]

'attached'

# preparing data

In [7]:
# Create input and output samples for the LSTM model

maxlen = 3 #  input sequences to 3 words
#  empty lists to store the input and output
X = []
y = []
for sequence in sequences:
    for i in range(maxlen, len(sequence)):
        X.append(sequence[i-maxlen:i]) # Appends a subsequence of maxlen 
        y.append(sequence[i]) # Appends the next word
X = np.array(X)
y = np.array(y)
X = pad_sequences(X, maxlen=maxlen,padding='pre') #  padding to the beginning of the sequences

In [13]:
# attached to files illustrate
# X[0] = [attached, two, files] , y[0]= [illustrate]  
X[0]

array([  32,   63, 1115], dtype=int32)

In [14]:
y[0] 

8093

In [15]:
tokenizer.index_word[8093]

'illustrate'

In [8]:
# Divide the data into training and validation sets
train_idx = sum(len(s) for s in sequences[:11])
X_train, y_train = X[:train_idx], y[:train_idx]    # lengths of the first 11 sequences ,  o Training data from enronsent00 to enronsent10
X_val, y_val = X[train_idx:], y[train_idx:]        # the remaining part , o Validation data from enronsent11 to enronsent15

In [None]:
X_train

array([[  32,   63, 1115],
       [  63, 1115, 8093],
       [1115, 8093,   72],
       ...,
       [ 608, 8368,    1],
       [8368,    1,    2],
       [   1,    2,    8]], dtype=int32)

# word2vec

In [None]:
# Load the pretrained word2vec model
model = api.load('word2vec-google-news-300')



In [None]:
# Save the pre-trained Word2Vec model to Google Drive
model_path = "/content/drive/MyDrive/Models/word2vec-google-news-300.bin"
model.save_word2vec_format(model_path, binary=True)

In [9]:
# Load word2vec Google news model from drive
word2vec_model = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/Models/word2vec-google-news-300.bin', binary=True, encoding='latin1')


In [10]:
# Prepare the embedding matrix
embedding_matrix = np.zeros((vocab_size, word2vec_model.vector_size))
for word, i in tokenizer.word_index.items(): # tokenizer's vocab  and its corresponding integer index.
    if word in word2vec_model:
         embedding_matrix[i] = word2vec_model[word] # each word in the tokenizer's vocabs ===> corresponding word vector in a pre-trained word2vec 

In [20]:
embedding_matrix # word vectors

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.03100586,  0.12109375,  0.13964844, ..., -0.23144531,
        -0.15527344, -0.12402344],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

# LSTM

In [11]:
# LSTM MODEL
model = Sequential()
model.add(Embedding(vocab_size, word2vec_model.vector_size, weights=[embedding_matrix], input_length=maxlen, trainable=False)) #  pretrained word2vec model as an embedding layer 
model.add(LSTM(128)) 
model.add(Dense(vocab_size, activation='softmax')) # likelihood of each word being the next word
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# couldn't make more complex model as it takes long time in training and crashes 

In [13]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3, 300)            25218600  
                                                                 
 lstm (LSTM)                 (None, 128)               219648    
                                                                 
 dense (Dense)               (None, 84062)             10843998  
                                                                 
Total params: 36,282,246
Trainable params: 11,063,646
Non-trainable params: 25,218,600
_________________________________________________________________


In [14]:
# Define the callbacks
early_stop = EarlyStopping(monitor='val_loss', patience=5) # avoid overfiting and meaneaning less resources consumption (training)
# Save Checkpoints Because I suffered from from Sessions Reastart (⋋▂⋌)                                                                    
model_checkpoint = ModelCheckpoint('/content/drive/MyDrive/Models/Lstm_model_{epoch:02d}.h5', save_best_only=True)


In [15]:
# Train the model using callbacks
history = model.fit(X_train, y_train,validation_data=(X_val, y_val), epochs=50, batch_size=256 ,  callbacks=[early_stop, model_checkpoint])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50


In [21]:
# Test the model using the interactive loop
def predict_next_word(model, tokenizer, input_text):
    encoded_text = tokenizer.texts_to_sequences([input_text])[0] # input text to a sequence of integer-encoded words
    pad_encoded = pad_sequences([encoded_text], maxlen=maxlen, padding='pre') # Pads the encoded sequence ensure the same length as the input sequences used to train the LSTM model.
    pred_word_ind = np.argmax(model.predict(pad_encoded), axis=-1) # select the index with the highest predicted probability next word
    return tokenizer.index_word[pred_word_ind[0]] # highest predicted probability inex ===> predicted word.

In [20]:
# Load the saved model
model_path = "/content/drive/MyDrive/Models/Lstm_model_07.h5"
model = tf.keras.models.load_model(model_path)

In [27]:
for _ in range(3):  # loop 3 times
    input_text = ''
    while True:
        word = input("Enter next word (-1 to terminate): ") 
        if word == '-1':
            break
        input_text += ' ' + word # input sequence
        predicted_word = predict_next_word(model, tokenizer, input_text) # predic the next word
        print(f"Is your next word: '{predicted_word}'?") # True or False
        correct = input("Yes/No: ").lower()
        if correct == 'yes':
            input_text += ' ' + predicted_word # if true append  if not write your word
    print(f"Your final sentence is: '{input_text.strip()}'") 

Enter next word (-1 to terminate): thank
Is your next word: 'you'?
Yes/No: yes
Enter next word (-1 to terminate): very
Is your next word: 'good'?
Yes/No: no
Enter next word (-1 to terminate): much
Is your next word: 'better'?
Yes/No: no
Enter next word (-1 to terminate): -1
Your final sentence is: 'thank you very much'
Enter next word (-1 to terminate): new
Is your next word: 'york'?
Yes/No: yes
Enter next word (-1 to terminate): -1
Your final sentence is: 'new york'
Enter next word (-1 to terminate): san
Is your next word: 'francisco'?
Yes/No: yes
Enter next word (-1 to terminate): -1
Your final sentence is: 'san francisco'
