# Learning Aug 13, 2024

In [7]:
# Import the one_hot function from Keras
from tensorflow.keras.preprocessing.text import one_hot

# List of sentences (corpus)
sent = ['the glass of milk',
        'the glass of juice',
        'the cup of tea',
        'I am a good boy',
        'I am a good developer',
        'understand the meaning of words',
        'your videos are good']

# Define the vocabulary size (the number of distinct "words" or tokens)
voc_size = 10000  # Arbitrary large number for the size of the vocabulary

# One-Hot Representation
# The one_hot function converts each word in a sentence to a unique integer within the range of the vocabulary size
# This representation encodes each sentence into a list of integers
one_hot_rep = [one_hot(words, voc_size) for words in sent]

# Output the one-hot representations of the sentences
one_hot_rep  # Display the one-hot encoded representation of each sentence in the corpus


[[9970, 4748, 2683, 6373],
 [9970, 4748, 2683, 8011],
 [9970, 4227, 2683, 6540],
 [1639, 8660, 8960, 1878],
 [1639, 8660, 8960, 6775],
 [9384, 9970, 7881, 2683, 6468],
 [5139, 8604, 360, 8960]]

In [11]:
# Import necessary libraries and modules
from tensorflow.keras.layers import Embedding  # For creating the Embedding layer
from tensorflow.keras.preprocessing.sequence import pad_sequences  # For padding sequences to a fixed length
from tensorflow.keras.models import Sequential  # For creating a sequential model
import numpy as np  # For numerical operations (though not directly used here)
import pandas as pd  # For data manipulation (though not directly used here)

# Define the fixed length for padding
sent_length = 8  # The maximum length for each sentence after padding

# Pad the sequences
# The one_hot_rep list contains one-hot encoded sentences (each as a list of integers)
# pad_sequences is used to ensure that all sentences have the same length (sent_length)
# padding='pre': Pads at the beginning of each sequence
# maxlen=sent_length: Ensures each sequence is padded or truncated to this length
embedding_docs = pad_sequences(one_hot_rep, padding='pre', maxlen=sent_length)

# Output the padded sequences
embedding_docs  # Display the padded one-hot encoded sentences


array([[   0,    0,    0,    0, 9970, 4748, 2683, 6373],
       [   0,    0,    0,    0, 9970, 4748, 2683, 8011],
       [   0,    0,    0,    0, 9970, 4227, 2683, 6540],
       [   0,    0,    0,    0, 1639, 8660, 8960, 1878],
       [   0,    0,    0,    0, 1639, 8660, 8960, 6775],
       [   0,    0,    0, 9384, 9970, 7881, 2683, 6468],
       [   0,    0,    0,    0, 5139, 8604,  360, 8960]], dtype=int32)

In [15]:
# Feature representation
dim = 10
model = Sequential()
model.add(Embedding(input_dim=voc_size, output_dim=dim, input_length=sent_lenght))
model.compile(optimizer='adam', loss='mse')
model.summary()


Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 8, 10)             100000    
                                                                 
Total params: 100000 (390.62 KB)
Trainable params: 100000 (390.62 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [18]:
model.predict(embeding_docs[0])



array([[ 0.00870134, -0.04319935,  0.03288524, -0.04634121,  0.0322769 ,
         0.03393401, -0.04155124, -0.00121266,  0.02294954,  0.01713358],
       [ 0.00870134, -0.04319935,  0.03288524, -0.04634121,  0.0322769 ,
         0.03393401, -0.04155124, -0.00121266,  0.02294954,  0.01713358],
       [ 0.00870134, -0.04319935,  0.03288524, -0.04634121,  0.0322769 ,
         0.03393401, -0.04155124, -0.00121266,  0.02294954,  0.01713358],
       [ 0.00870134, -0.04319935,  0.03288524, -0.04634121,  0.0322769 ,
         0.03393401, -0.04155124, -0.00121266,  0.02294954,  0.01713358],
       [ 0.03360671, -0.00490683,  0.04243331, -0.03041276,  0.04366989,
         0.01851756, -0.00270362, -0.0348999 , -0.02017366,  0.0052737 ],
       [ 0.01419992, -0.04662051,  0.01390103,  0.04335174,  0.02910485,
         0.0476536 ,  0.01059177, -0.04762473, -0.00821515, -0.04748931],
       [ 0.03951428,  0.01920395,  0.02698118, -0.00295428, -0.01484007,
         0.02776779, -0.02510238, -0.02383589

# Review project with Simple RNN

In [61]:
# Import necessary libraries
import numpy as np  # NumPy for numerical computations
import pandas as pd  # Pandas for data manipulation (though not used in this specific code)
import tensorflow as tf  # TensorFlow for building and training the neural network models
from tensorflow.keras.datasets import imdb  # IMDb dataset for sentiment analysis
from tensorflow.keras.preprocessing import sequence  # Utility for padding sequences to a fixed length
from tensorflow.keras.models import Sequential  # Sequential model for building the neural network
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense  # Layers for the neural network: Embedding, RNN, and Dense (fully connected)

# Load the IMDb dataset
max_features = 100000  # Maximum number of words to consider in the vocabulary (most frequent)
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=max_features)  # Load training and testing data with a limited vocabulary
# Output the shapes of the data arrays to understand their structure
X_train.shape, y_train.shape, X_test.shape, y_test.shape  # Should return the shape of the training and testing datasets

# Preprocess the data by padding sequences
max_len = 500  # Maximum length of each sequence (number of words in each review)
X_train = sequence.pad_sequences(X_train, maxlen=max_len)  # Pad training sequences to ensure uniform length
X_test = sequence.pad_sequences(X_test, maxlen=max_len)  # Pad testing sequences similarly

# Build a simple RNN model
model = Sequential()  # Initialize a Sequential model, which allows stacking layers sequentially
model.add(Embedding(input_dim=max_features, output_dim=128, input_length=max_len))  # Add an Embedding layer to convert word indices into dense vectors of fixed size (128 dimensions)
model.add(SimpleRNN(128, activation='relu'))  # Add a SimpleRNN layer with 128 units and ReLU activation function
model.add(Dense(1, activation="sigmoid"))  # Add a Dense layer with 1 unit and sigmoid activation, suitable for binary classification (positive/negative sentiment)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])  # Compile the model with Adam optimizer, binary cross-entropy loss, and accuracy metric
model.summary()  # Print a summary of the model architecture



Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 500, 128)          12800000  
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 128)               32896     
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 12833025 (48.95 MB)
Trainable params: 12833025 (48.95 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [62]:
# Import EarlyStopping callback from Keras
from tensorflow.keras.callbacks import EarlyStopping

# Initialize the EarlyStopping callback
# monitor='val_loss': The metric to be monitored during training is the validation loss
# patience=5: Training will stop if the validation loss does not improve for 5 consecutive epochs
# restore_best_weights=True: After stopping, the model weights will be restored to those from the epoch with the best validation loss
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model with early stopping
# epochs=10: The model will train for a maximum of 10 epochs
# batch_size=32: The model will update weights after every batch of 32 samples
# validation_split=0.2: 20% of the training data will be used as validation data to monitor performance
# callbacks=[early_stopping]: The EarlyStopping callback is passed to the model, so training can be stopped early if the validation loss does not improve
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, callbacks=[early_stopping])


Epoch 1/10


2024-08-13 09:56:23.214392: W tensorflow/core/grappler/utils/graph_view.cc:849] No registered '' OpKernel for CPU devices compatible with node {{node sequential_6/simple_rnn_1/while/body/_1/sequential_6/simple_rnn_1/while/simple_rnn_cell/Relu}}
	.  Registered:  <no registered kernels>

2024-08-13 09:56:23.252321: E tensorflow/core/grappler/optimizers/tfg_optimizer_hook.cc:134] tfg_optimizer{any(tfg-consolidate-attrs,tfg-toposort,tfg-shape-inference{graph-version=0},tfg-prepare-attrs-export)} failed: INVALID_ARGUMENT: Node sequential_6/simple_rnn_1/while/body/_1/sequential_6/simple_rnn_1/while/simple_rnn_cell/Relu has an empty op name
	when importing GraphDef to MLIR module in GrapplerHook
2024-08-13 09:56:23.274073: E tensorflow/core/grappler/optimizers/tfg_optimizer_hook.cc:134] tfg_optimizer{any(tfg-consolidate-attrs,tfg-functional-to-region,tfg.func(tfg-cf-sink),tfg-region-to-functional{force-control-capture=true},tfg-lift-legacy-call,symbol-privatize{},symbol-dce,tfg-prepare-attrs-

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10


<keras.src.callbacks.History at 0x7f38b812c8b0>

In [64]:
# save model file
model.save('simple_run_imdb.h5')

In [67]:
# Import necessary libraries and modules
from tensorflow.keras.models import load_model  # For loading the pre-trained model
from tensorflow.keras.preprocessing import sequence  # For padding sequences to a fixed length

# Load the pre-trained model
# The model was trained previously and saved as 'simple_run_imdb.h5'
# This model will be used for predicting the sentiment of new movie reviews
model1 = load_model('simple_run_imdb.h5')

# Function to decode the encoded review back to words
# The IMDb dataset encodes reviews as sequences of integers corresponding to word indices
# This function converts those integers back to the original words using a dictionary (reverse_word_index)
def decoded_review(encoded_review):
    # The integer indices are shifted by 3 to account for special tokens, so we subtract 3 from each index
    # If a word index is not found in the reverse_word_index, a '?' is used as a placeholder
    return ' '.join([reverse_word_index.get(i - 3, '?') for i in encoded_review])

# Function to preprocess text for prediction
# Converts a raw text review into a format suitable for the model
def preprocessing_text(text):
    words = text.lower().split()  # Convert the review to lowercase and split it into individual words
    encoded_review = [word_index.get(word, 2) + 3 for word in words]  # Encode the words as integers using word_index
    padded_review = sequence.pad_sequences([encoded_review], maxlen=500)  # Pad the sequence to ensure it has a length of 500
    return padded_review

# Prediction function
# Takes a text review, preprocesses it, and then uses the model to predict the sentiment
def predict_sentiment(review):
    preprocessed_input = preprocessing_text(review)  # Preprocess the input text to prepare it for the model
    prediction = model1.predict(preprocessed_input)  # Use the model to predict sentiment
    sentiment = 'positive' if prediction[0][0] > 0.5 else 'negative'  # Determine the sentiment based on the model's output
    return sentiment, prediction[0][0]  # Return the sentiment and the prediction score (probability)

# Example usage of the prediction function
example_review = "This movie was fantastic! The acting was great and the plot was thrilling."
sentiment, score = predict_sentiment(example_review)  # Predict sentiment for the example review

# Print the results
print(f'Review: {example_review}')  # Output the original review text
print(f'Sentiment: {sentiment}')  # Output the predicted sentiment (positive/negative)
print(f'Prediction score: {score}')  # Output the model's prediction score (probability)



Review: This movie was fantastic! The acting was great and the plot was thrilling.
Sentiment: positive
Prediction score: 0.8728386163711548


In [70]:
#
# Import necessary libraries
import streamlit as st

# Set up the Streamlit app
st.title('IMDB Movie Review Sentiment Analysis')
st.write('Enter a movie review to classify it as positive or negative.')

# User input
user_input = st.text_area('Movie Review')

# Check if the classify button is pressed
if st.button('Classify'):
    # Preprocess the user input
    preprocessed_input = preprocessing_text(user_input)
    
    # Make prediction
    prediction = model1.predict(preprocessed_input)
    sentiment = 'positive' if prediction[0][0] > 0.5 else 'negative'
    
    # Display the result
    st.write(f'Sentiment: {sentiment}')
    st.write(f'Prediction Score: {prediction[0][0]}')
else:
    st.write('Please enter a movie review')


2024-08-13 10:38:11.086 
  command:

    streamlit run /usr/local/lib/python3.8/dist-packages/ipykernel_launcher.py [ARGUMENTS]
2024-08-13 10:38:11.087 Session state does not function when running a script without `streamlit run`
