In [2]:
#unzip enronsent1.zip

In [26]:
import tensorflow as tf
import os
import glob
import numpy as np
import gensim.downloader as api
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
import time
from gensim.models import Word2Vec

In [2]:

def load_data(folder_path):
    data = []
    files = glob.glob(os.path.join(folder_path, "*.txt"))
    for file in files:
        with open(file, "r") as f:
            text = f.read()
            data.append(text)
    return data

def tokenize(text):
    tokens = text.split()
    return tokens

def preprocess(tokens):
    stop_words = ["a", "an", "the", "in", "on", "at", "to", "from", "of", "for", "by", "with", "about", "as"]
    punctuation = [",", ".", "!", "?", ";", ":", "-", "--", "(", ")", "[", "]", "{", "}", "'", "\""]
    tokens = [token.lower() for token in tokens if token.lower() not in stop_words and token not in punctuation]
    return tokens


In [3]:

# Load the dataset
data_folder = "D:\\projects\\jupyter\\gam3a\\sem 8\\NLP\\enronsent1\\enronsent1"
data = load_data(data_folder)



In [4]:
# Tokenize and preprocess the data
tokenized_data = [tokenize(text) for text in data]
preprocessed_data = [preprocess(tokens) for tokens in tokenized_data]




In [5]:
# Split the data into training and validation sets
train_data = preprocessed_data[0:10]
val_data = preprocessed_data[11:15]

In [10]:
model_name = "glove-twitter-50"
model_path = f"{model_name}.model"

if not os.path.exists(model_path):
    word2vec_model = api.load(model_name)
    word2vec_model.save(model_path)
else:
    word2vec_model = gensim.models.Word2Vec.load(model_path)
    
    
#word2vec_model = api.load("glove-twitter-50")


In [32]:
def prepare_input_output(data, word2vec_model):
    samples = []
    for paragraph in data:
        # Convert each token to word2vec embedding
        paragraph_embeddings = [word2vec_model.get_vector(token) for token in paragraph if token in word2vec_model.key_to_index]
        # Split each paragraph into fixed time steps
        time_steps = 10  # set a fixed number of time steps
        for i in range(0, len(paragraph_embeddings) - time_steps, time_steps):
            input_sample = paragraph_embeddings[i:i+time_steps]
            output_sample = paragraph_embeddings[i+time_steps]
            samples.append((input_sample, output_sample))
    # Convert samples to numpy array and reshape for input to model
    input_data = np.array([sample[0] for sample in samples])
    output_data = np.array([sample[1] for sample in samples])
    return input_data, output_data


In [12]:
# Prepare training and validation input and output
train_input, train_output = prepare_input_output(train_data, word2vec_model)
val_input, val_output = prepare_input_output(val_data, word2vec_model)

In [13]:
train_input.shape

(205733, 10, 50)

In [14]:
train_output.shape

(205733, 50)

In [16]:

with tf.device('/GPU:0'):
    # Define the model architecture
    model = Sequential()
    model.add(LSTM(128, input_shape=(train_input.shape[1], train_input.shape[2])))
    model.add(Dense(word2vec_model.vector_size, activation='linear'))
    model.compile(loss='mse', optimizer='adam')
t = time.time()
with tf.device('/GPU:0'):
    # Train the model
    model.fit(train_input, train_output, validation_data=(val_input, val_output), epochs=50, batch_size=128)

time_tak = time.time()-t
print('Training time: {} minutes and {} seconds'.format(time_tak/60, time_tak%60))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Training time: 5.099514321486155 minutes and 5.9708592891693115 seconds


In [22]:
while True:
    sentence = []
    word = input("Enter the first word: ")
    if word == '-1':
        break
    sentence.append(word)
    while True:
        # Check if the input word is in the vocabulary
        if word not in word2vec_model.key_to_index:
            word = input("Word not found in vocabulary, please enter another word: ")
            continue
        input_sample = [word2vec_model.get_vector(word)]
        # Repeat the input sample to create a batch of sequences with length 10
        input_sample = np.array([input_sample]*10)
        # Swap the first two dimensions of the input sample
        input_sample = np.swapaxes(input_sample, 0, 1)
        predicted_output = model.predict(input_sample)
        next_word_vec = predicted_output[0]
        next_word = word2vec_model.most_similar(positive=[next_word_vec], topn=1)[0][0]
        print("Is your next word: '{}  (-1 to terminate)'?".format(next_word))
        correct = input("Yes or No? ")
        if correct.lower() == 'yes':
            sentence.append(next_word)
            word = next_word
        else:
            word = input("Please enter the correct word: ")
            sentence.append(word)
        if word == '-1':
            break
            
    if '-1' in sentence:
        sentence.remove('-1')
    print("Your final sentence is: ", ' '.join(sentence))
    break

Enter the first word (-1 to terminate)you
Is your next word: 'have'?
Yes or No? yes
Is your next word: 'any'?
Yes or No? no
Please enter the correct word: have
Is your next word: 'any'?
Yes or No? no
Please enter the correct word: been
Is your next word: 'availability'?
Yes or No? no
Please enter the correct word: requested
Is your next word: 'it'?
Yes or No? no
Please enter the correct word: to
Is your next word: 'any'?
Yes or No? no
Please enter the correct word: provide
Is your next word: 'pricing'?
Yes or No? no
Please enter the correct word: feedback
Is your next word: 'you'?
Yes or No? no
Please enter the correct word: -1
Your final sentence is:  you have have been requested to provide feedback


#so the three sentences ghich we the mode made:<br>

1- You have been requested to provide feedback <br>


# USE skp gram Model


In [45]:
# Define the new model name and model path
model_name = "skp_gram"
model_path = f"{model_name}.model"

# Train the word embeddings using Skip-gram if the model doesn't exist
if not os.path.exists(model_path):

    # Set the desired dimensionality of the embeddings and the context window size
    embedding_size = 100
    #window_size = 5

    # Initialize and train the Skip-gram model
    word2vec_model2 = Word2Vec(tokenized_data, window=window_size, sg=1)
    word2vec_model2.save(model_path)
else:
    # Load the pre-trained Skip-gram model
    word2vec_model2 = gensim.models.Word2Vec.load(model_path)


In [46]:
def prepare_input_output2(data, word2vec_model):
    samples = []
    for paragraph in data:
        # Convert each token to word2vec embedding
        paragraph_embeddings = [word2vec_model.wv.get_vector(token) for token in paragraph if token in word2vec_model.wv.key_to_index]
        # Split each paragraph into fixed time steps
        time_steps = 10  # Set a fixed number of time steps
        for i in range(0, len(paragraph_embeddings) - time_steps, time_steps):
            input_sample = paragraph_embeddings[i:i + time_steps]
            output_sample = paragraph_embeddings[i + time_steps]
            samples.append((input_sample, output_sample))
    # Convert samples to numpy array and reshape for input to model
    input_data = np.array([sample[0] for sample in samples])
    output_data = np.array([sample[1] for sample in samples])
    return input_data, output_data


In [47]:
# Prepare training and validation input and output
train_input2, train_output2 = prepare_input_output2(train_data, word2vec_model2)
val_input2, val_output2 = prepare_input_output2(val_data, word2vec_model2)

In [48]:
with tf.device('/GPU:0'):
    # Define the model architecture
    model = Sequential()
    model.add(LSTM(128, input_shape=(train_input2.shape[1], train_input2.shape[2])))
    model.add(Dense(word2vec_model2.vector_size, activation='linear'))
    model.compile(loss='mse', optimizer='adam')
t = time.time()
with tf.device('/GPU:0'):
    # Train the model
    model.fit(train_input2, train_output2, validation_data=(val_input2, val_output2), epochs=50, batch_size=128)

time_tak = time.time()-t
print('Training time: {} minutes and {} seconds'.format(time_tak/60, time_tak%60))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Training time: 6.019236179192861 minutes and 1.1541707515716553 seconds


In [49]:
while True:
    sentence = []
    word = input("Enter the first word: ")
    if word == '-1':
        break
    sentence.append(word)
    while True:
        # Check if the input word is in the vocabulary
        if word not in word2vec_model2.wv.key_to_index:
            word = input("Word not found in vocabulary, please enter another word: ")
            continue
        input_sample = [word2vec_model2.wv.get_vector(word)]
        # Repeat the input sample to create a batch of sequences with length 10
        input_sample = np.array([input_sample] * 10)
        # Swap the first two dimensions of the input sample
        input_sample = np.swapaxes(input_sample, 0, 1)
        predicted_output = model.predict(input_sample)
        next_word_vec = predicted_output[0]
        next_word = word2vec_model2.wv.most_similar(positive=[next_word_vec], topn=1)[0][0]
        print("Is your next word: '{}  (-1 to terminate)'?".format(next_word))
        correct = input("Yes or No? ")
        if correct.lower() == 'yes':
            sentence.append(next_word)
            word = next_word
        else:
            word = input("Please enter the correct word: ")
            sentence.append(word)
        if word == '-1':
            break

    if '-1' in sentence:
        sentence.remove('-1')
    print("Your final sentence is: ", ' '.join(sentence))
    break

Enter the first word: you
Is your next word: 'come.  (-1 to terminate)'?
Yes or No? no
Please enter the correct word: have
Is your next word: 'e  (-1 to terminate)'?
Yes or No? no
Please enter the correct word: been
Is your next word: 'creating  (-1 to terminate)'?
Yes or No? no
Please enter the correct word: requested
Is your next word: 'e  (-1 to terminate)'?
Yes or No? no
Please enter the correct word: to
Is your next word: 'present  (-1 to terminate)'?
Yes or No? no
Please enter the correct word: provide
Is your next word: 'monthly  (-1 to terminate)'?
Yes or No? no
Please enter the correct word: feedback
Is your next word: 'process.  (-1 to terminate)'?
Yes or No? -1
Please enter the correct word: -1
Your final sentence is:  you have been requested to provide feedback
