# README

> This Google Colab Notebook includes the following:

1. Loading `Pre-trained Word2Vec word embeddings`.
1. Training `our own Word embeddings using the Word2Vec algorithm (CBoW)`
1. Training a `Simple RNN using Keras for sentiment analysis` on `IMDB reviews dataset`.

> References

1. [NLP - Word Embedding with Gensim library for Text-Classification](https://michael-fuchs-python.netlify.app/2021/09/01/nlp-word-embedding-with-gensim-for-text-classification/)
1. [The Unreasonable Effectiveness of Recurrent Neural Networks](https://karpathy.github.io/2015/05/21/rnn-effectiveness/)
1. [Text classification with an RNN](https://www.tensorflow.org/text/tutorials/text_classification_rnn)
1. [Machine Learning — Word Embedding & Sentiment Classification using Keras](https://towardsdatascience.com/machine-learning-word-embedding-sentiment-classification-using-keras-b83c28087456)

> Notes:

1. Make sure to run the notebook on `Google Colab's free-tier NVIDIA T4 GPU`:
    - Runtime -> Change runtime type -> T4 GPU -> Save

# Loading Model

In [81]:
# import pickle
# # Load the model from the file
# file_path = r'D:\AAST\Semester 10\Image Processing\.Marc\Labs\Lab 2\trained_model.pkl'
# with open(file_path, 'rb') as f:
#     loaded_model = pickle.load(f)

# Imports 

In [169]:
import re
import string

import numpy as np
import pandas as pd
import gensim.downloader
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, SimpleRNN
from tensorflow.keras.initializers import Constant
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

import os
from azure.storage.blob import BlobServiceClient
import pickle
import logging
import json


In [83]:

# Read the Azure Storage connection string from the environment instead of
# embedding the account key in the notebook. Any key that was ever committed
# in a notebook must be treated as compromised and rotated in the Azure portal.
connection_string = os.environ.get("AZURE_STORAGE_CONNECTION_STRING")
if not connection_string:
    raise RuntimeError(
        "Set the AZURE_STORAGE_CONNECTION_STRING environment variable to the "
        "storage account connection string before running this cell."
    )

# Connect to Azure Blob Storage
blob_service_client = BlobServiceClient.from_connection_string(connection_string)
container_name = "myconatinerblob"
blob_name = "trained_model.pkl"  # Adjust this to match the name of your model file in Blob Storage

# Retrieve the model file from Blob Storage (the whole blob is read into memory)
blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_name)
model_bytes = blob_client.download_blob().readall()

# Load the machine learning model.
# SECURITY: pickle.loads can execute arbitrary code contained in the payload,
# so only use this with a container whose write access is fully trusted.
loaded_model = pickle.loads(model_bytes)


In [84]:
# Fetch the NLTK resources this notebook needs: 'punkt' backs word_tokenize,
# 'stopwords' backs stopwords.words('english'), and 'wordnet' is presumably
# for the imported WordNetLemmatizer.
for nltk_resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [85]:
# # For reproducible results
# SEED = 42

# keras.utils.set_random_seed(SEED)
# tf.config.experimental.enable_op_determinism()

# Loading IMDB reviews dataset

In [86]:
# # Define the file path
# file_path = r"D:\AAST\Semester 10\Image Processing\.Marc\Labs\Lab 2\IMDB-Dataset.csv"

# # Read the dataset into a DataFrame
# df = pd.read_csv(file_path)


In [87]:
# df.shape

In [88]:
# df.info()

In [89]:
# df.head(10)

In [90]:
# df['sentiment'].value_counts()

In [91]:
# df['sentiment_id'] = df['sentiment'].map({'negative': 0, 'positive': 1})

In [92]:
# df.head(n=10)

In [93]:
# print(stopwords.words('english'))

In [94]:
# Convert the list of English stop words into a set.
# Membership tests (`word in english_stop_words`) are significantly faster on a set than
# on a list, which is useful when filtering every token of a large dataset.
# A set also holds each stop word only once, so any duplicate entries are dropped.

english_stop_words = set(stopwords.words('english'))

In [95]:
# Apply some quick text pre-processing

def preprocess(text):
  """Turn a raw text string into a list of lower-cased word tokens.

  Steps, in order: strip HTML tags, delete every character listed in
  ``string.punctuation``, tokenize with ``word_tokenize``, then keep only
  purely alphabetic tokens that are not in ``english_stop_words``.

  NOTE(review): the stop-word test is applied to each token in its original
  case, before it is lower-cased. Confirm whether that is intended; it is
  left unchanged here so the output stays identical.

  Args:
    text: The raw string to clean.

  Returns:
    A list of the surviving tokens, each lower-cased.
  """
  # Non-greedy pattern, so each <...> tag is matched and removed on its own.
  tag_pattern = re.compile('<.*?>')
  without_tags = tag_pattern.sub('', text)

  # A translation table with only a "delete" set removes all punctuation in one pass.
  punctuation_table = str.maketrans("", "", string.punctuation)
  without_punctuation = without_tags.translate(punctuation_table)

  # Tokenize, filter and lower-case in a single pass.
  return [
      token.lower()
      for token in word_tokenize(without_punctuation)
      if token.isalpha() and token not in english_stop_words
  ]

In [96]:
# sentence = "The movie wasn't as good as i me my expected?"
# preprocessed_sentence = preprocess(sentence)

# preprocessed_sentence

In [97]:
# df['filtered_reviews'] = df['review'].apply(preprocess) # Pre-process sentences

In [98]:
# df.info()

In [99]:
# df.head()

# Word2Vec Pre-trained word embeddings

In [100]:
# # # NOTE: We can either use Pre-trained word embeddings (e.g. word embeddings generated using the word2vec algorithm provided by some organization (e.g. Google) after
# # # training on lots and lots of text (100 billion words)) or We can train our own word embeddings using the word2vec algorithm on our own data (IMDB sentiment dataset)

# wv = gensim.downloader.load('word2vec-google-news-300')

In [101]:
# wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=3) # ========> king - man + woman

In [102]:
# print("Vocab size:", len(wv))                                      

In [103]:
# wv.most_similar("egypt")

In [104]:
# wv.most_similar("good")

In [105]:
# wv.most_similar("bad")

# Training our own word embeddings using Word2Vec `(CBoW)`

In [106]:
# processed_sentences = df['filtered_reviews'].to_list() #Lists in Python are more flexible data structures compared to pandas DataFrames for text data
# processed_labels = df['sentiment_id'].to_numpy() # NumPy arrays offer several advantages for numerical data like sentiment IDs: efficiency and memory management

In [107]:
# len(processed_sentences)

In [108]:
# print("First 10 reviews :",processed_labels [:10] )  
# print ("Nb of rows (reviews) = " , processed_labels.shape)

In [109]:
# print(df['review'].iloc[18400])
# print(processed_sentences[18400])
# print(processed_labels[18400])

In [110]:
# EMBEDDING_DIM = 25 # The dimensionality of each word vector
# # Train our own word embeddings using the word2vec algorithm (CBoW) on IMDB sentiment dataset.
# model = gensim.models.Word2Vec(sentences=processed_sentences, vector_size=EMBEDDING_DIM, window=5, workers=8, min_count=1, sg=0)
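# # workers=8: number of worker threads used for training
# # min_count=1: ignore words that occur fewer than this many times (1 keeps every word)
# # sg=0: use the CBoW algorithm (sg=1 would select skip-gram)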


In [111]:
# print("Vocab size:", len(model.wv))

In [112]:
# model.wv.most_similar("god")

In [113]:
# model.wv.most_similar("bad")

> `NOTE`: Our `custom word embeddings` seem to have correctly captured semantic relationships between words

In [114]:
# # Save word embeddings

# filename="word-vectors.txt"
# model.wv.save_word2vec_format(filename)

In [115]:
# # Load word embeddings

# embeddings = {}

# with open(filename, 'r', encoding='utf-8') as fin:
#   lines = fin.readlines()
#   lines = [line.strip() for line in lines]

#   for line in lines:
#     elements = line.split()
#     word = elements[0]
#     vector = np.array(elements[1:], dtype=np.float32)

#     embeddings[word] = vector

In [116]:
# embeddings["bad"]

# RNNs for Sentiment Analysis

In [117]:
# Keras tokenizer used to convert words to integer ids.
# NOTE(review): the fit_on_texts call below is commented out, so in the visible
# notebook this tokenizer is never fitted before main() calls
# tokenizer.texts_to_sequences. Presumably it then has no vocabulary to map words
# with - confirm that a tokenizer fitted on the training data is restored before serving.
tokenizer = Tokenizer()
# tokenizer.fit_on_texts(processed_sentences) 

In [118]:
# sequences = tokenizer.texts_to_sequences(processed_sentences) # Convert all sentences to integer id sequences

In [119]:
# print(df['review'].iloc[18400])
# print(processed_sentences[18400])
# print(sequences[18400])

In [120]:
# max_length = max([len(sentence) for sentence in processed_sentences]) # => Max sequence length
# print(max_length)

# # Note: max_length is too much, we will default to a more reasonable max_length

# max_length = 128

In [121]:
# word_index = tokenizer.word_index

In [122]:
# type(word_index)

In [123]:
# print("Number of unique tokens:", len(word_index))

In [124]:
# word_index['good']

In [125]:
# # Example: a sentence of 120 words gets 8 padding values appended to reach max_length = 128 (the padding value is always 0)
# padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post') # post: add the padding values at the end of the short message
# padded_sequences.shape # returns (number of rows, max_length)

In [126]:
# padded_sequences [2] # Example: the review at index 2 after padding, with trailing zeros added to reach the max_length of 128

In [127]:
# # Creating an embedding matrix to pass to the embedding layer of the RNN network

# vocab_size = len(tokenizer.word_index) + 1
# embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))

# for word, idx in word_index.items():
#   if idx > vocab_size:
#     continue

#   embedding_vector = embeddings.get(word)

#   if embedding_vector is not None:
#     embedding_matrix[idx] = embedding_vector

In [128]:
# embedding_matrix.shape

In [129]:
# X_train, X_test, y_train, y_test = train_test_split(padded_sequences, processed_labels, test_size=0.1, shuffle=True, random_state=SEED, stratify=processed_labels)

In [130]:
# X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [131]:
# X_train[0] 

In [132]:
# y_train[0]

In [133]:
# model = Sequential()

# # Initialize our embedding layer with our custom trained word embeddings
# model.add(Embedding(vocab_size, EMBEDDING_DIM, embeddings_initializer=Constant(embedding_matrix), input_length=max_length, trainable=False))
# model.add(SimpleRNN(units=16, return_sequences=False, activation='relu'))
# model.add(Dense(16, activation='relu'))
# model.add(Dense(1, activation='sigmoid'))

In [134]:
# model.compile(optimizer=Adam(learning_rate=1e-3), loss='binary_crossentropy', metrics=['accuracy'])

In [135]:
# print(model.summary())

In [136]:
# model.fit(x=X_train, y=y_train, epochs=100, validation_data=(X_test, y_test), verbose=1, batch_size=2048, callbacks=[EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)])

In [137]:
# preds = (model.predict(X_test) > 0.5).astype("int32")

In [138]:
# print(classification_report(y_test, preds))

In [170]:
def main(req):
    """Classify the sentiment of the text supplied in a JSON request.

    The request body is expected to be a JSON object holding a non-empty
    string under the key ``'text'``. That text is appended to a fixed list of
    sample sentences; the whole list is cleaned with ``preprocess``, encoded
    with the module-level ``tokenizer``, padded to 128 tokens and scored with
    ``loaded_model``.

    Args:
        req: Request-like object exposing a ``get_json()`` method.

    Returns:
        On success, a 3-tuple ``(json_body, 200, headers)`` where ``json_body``
        is the JSON string ``{"predictions": [...]}``: one
        ``"Positive"``/``"Negative"`` label per sample sentence, followed by
        the label for the request text as the last element.
        On failure, a 2-tuple ``(message, status)`` with status 400 when the
        body is not valid JSON or carries no usable ``'text'`` string, and 500
        for any other error.
    """
    try:
        # Get the text from the request body
        try:
            req_body = req.get_json()
        except ValueError:
            # A body that cannot be parsed is a client error, not a server fault.
            logging.error('Request body is not valid JSON')
            return ('Error: Invalid JSON in request body', 400)

        # A null or non-object body has no 'text' key; a non-string value cannot
        # be preprocessed. Both are reported as 400 rather than crashing into 500.
        text = req_body.get('text') if isinstance(req_body, dict) else None
        if not text or not isinstance(text, str):
            logging.error('No text found in request body')
            return ('Error: No text found in request body', 400)

        # Built-in sample sentences that are scored together with the request text
        examples = [
            "I really don't understand the plot of this movie",
            "what a brilliant set of characters",
            "What a masterpiece !!!",
            "Couldn't get through the first 5 minutes",
            "Boring",
            "Very nicely executed",
            "Should have been given an oscar",
            "ahmed yehia saiddd that the film is sooo soo excited ",
            "i feel sad",
            "negative",
            "zero",
            "THE FILM IS SO boring and rude actors",
            "This restaurant has terrible service",
            "The food was awful and overpriced",
            "I wouldn't recommend this place to anyone",
            "The staff was rude and unprofessional",
            "Worst dining experience ever",
            "I got sick after eating here",
            "The hygiene standards are appalling",
            "Save your money and go elsewhere"
        ]

        # The request text goes last, so its label is the final list element.
        examples.append(text)

        # Surface an unfitted tokenizer instead of silently scoring with it.
        if not getattr(tokenizer, 'word_index', None):
            logging.warning('Tokenizer has an empty vocabulary; predictions may be meaningless')

        # Same sequence length as the max_length chosen in the training cells.
        max_length = 128

        cleaned_examples = [preprocess(example) for example in examples]
        encoded_examples = tokenizer.texts_to_sequences(cleaned_examples)
        padded_examples = pad_sequences(encoded_examples, maxlen=max_length, padding='post')

        # Make predictions using loaded_model; scores above 0.5 count as positive
        predictions = loaded_model.predict(padded_examples)
        label_preds = ["Positive" if prediction > 0.5 else "Negative" for prediction in predictions]

        # Return predictions as JSON with Content-Type header
        headers = {
            "Content-Type": "application/json"
        }
        return json.dumps({"predictions": label_preds}), 200, headers
    except Exception as e:
        # logging.exception records the traceback along with the message.
        logging.exception('An error occurred: %s', e)
        return ('An error occurred', 500)


In [150]:
# def is_valid_json(my_json_string):
#     try:
#         json.loads(my_json_string)
#         return True
#     except ValueError:
#         return False

# # Usage:
# response_from_function =   # Replace with your actual response
# if is_valid_json(response_from_function):
#     print("Valid JSON")
# else:
#     print("Invalid JSON")


In [151]:
# class RequestMock:
#     def __init__(self, json_data):
#         self.json_data = json_data

#     def get_json(self):
#         return self.json_data

# req = RequestMock({"text": "boring"})
# result = main(req)




In [171]:
# examples = [
#     "I really don't understand the plot of this movie",
#     "what a briliant set of characters",
#     "What a masterpiece !!!",
#     "Couldn't get through the first 5 minutes",
#     "Boring",
#     "Very nicely executed",
#     "Should have been given an oscar",
#     "ahmed yehia saiddd that the film is sooo soo excited ",
#     "i felel sad", 
#     "negative", 
#     "zero", 
#     "THE FILM IS SO boring and rude actors",
#     "This restaurant has terrible service",
#     "The food was awful and overpriced",
#     "I wouldn't recommend this place to anyone",
#     "The staff was rude and unprofessional",
#     "Worst dining experience ever",
#     "I got sick after eating here",
#     "The hygiene standards are appalling",
#     "Save your money and go elsewhere"
# ]


In [172]:
# cleaned_examples = [preprocess(example) for example in examples] # Preprocess Each sequences
# encoded_examples = tokenizer.texts_to_sequences(examples) # Convert each word to an integer id
# padded_examples = pad_sequences(encoded_examples, maxlen=128, padding='post') # Pad each sequence to the max_length for batching
# predictions = loaded_model.predict(padded_examples) # Predict the label for each example

# preds = [1 if prediction > 0.5 else 0 for prediction in predictions]
# label_preds = ["Positive" if pred == 1 else "Negative" for pred in preds]



In [173]:
# for example, prediction in zip(examples, label_preds):
#   print(f"Example: {example}")
#   print(f"Sentiment: {prediction}\n")

In [174]:

# def main(req):
#     # Process input data from examples
#     cleaned_examples = [preprocess(example) for example in examples]
#     encoded_examples = tokenizer.texts_to_sequences(cleaned_examples)
#     padded_examples = pad_sequences(encoded_examples, maxlen=128, padding='post')
    
#     # Make predictions using loaded_model
#     predictions = loaded_model.predict(padded_examples)
#     preds = [1 if prediction > 0.5 else 0 for prediction in predictions]
#     label_preds = ["Positive" if pred == 1 else "Negative" for pred in preds]

#     return {"predictions": label_preds}

In [175]:
# # Call the main function
# result = main(None)  # Pass None as req since we're not using it in this case

# # Print the predictions
# print(result)

In [176]:
# from flask import Flask 
# app = Flask(__name__)
# @app.route('/api')
# def helloworld():
#     return prediction
# if __name__ == '__main__':
#     app.run(host='0.0.0.0', port=5000)  

In [177]:
# import pickle
# # Save the model to a file
# file_path = r'D:\AAST\Semester 10\Image Processing\.Marc\Labs\Lab 2\trained_model.pkl'
# with open(file_path, 'wb') as f:
#     pickle.dump(model, f)
