In [None]:
# Mount Google Drive into the Colab filesystem so that the CSV files and
# other assets under /content/drive can be read by path in later cells.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Fetch the NLTK resources used by this notebook: the 'punkt' tokenizer
# models and the stop-word lists.
for nltk_package in ('punkt', 'stopwords'):
    nltk.download(nltk_package)
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
def load_dataset(train_data, stop_words=None):
    """Load a review CSV and return tokenised reviews with encoded labels.

    Parameters
    ----------
    train_data : str or path-like
        Path of a CSV file containing a ``comment`` column (review text)
        and a ``sentiment`` column (label).
    stop_words : iterable of str, optional
        Lower-case words to drop from every review. When omitted, NLTK's
        English stop-word list is used.

    Returns
    -------
    x_data : pandas.Series
        One list of lower-cased, stop-word-free tokens per review.
    y_data : pandas.Series
        The ``sentiment`` column with ``'positive'`` replaced by 1 and
        ``'negative'`` replaced by 0; other values pass through unchanged.
    """
    df = pd.read_csv(train_data)
    x_data = df['comment']       # Reviews/Input
    y_data = df['sentiment']    # Sentiment/Output

    if stop_words is None:
        stop_words = stopwords.words('english')
    stop_words = set(stop_words)  # set membership keeps the filter O(1) per word

    # PRE-PROCESS REVIEW
    x_data = x_data.replace({'<.*?>': ''}, regex=True)   # remove html tag
    # Lower-case BEFORE filtering: the stop-word list is lower-case, so
    # filtering first would let capitalised stop words ("The", "And") through.
    x_data = x_data.str.lower()
    x_data = x_data.apply(
        lambda review: [w for w in review.split() if w not in stop_words]
    )  # whitespace-tokenise and remove stop words

    # ENCODE SENTIMENT -> 0 & 1
    y_data = y_data.replace({'positive': 1, 'negative': 0})

    return x_data, y_data

# Load the training split from Drive, then preview the processed reviews
# and their encoded sentiment labels.
TRAIN_CSV = "/content/drive/MyDrive/Senti_Task/CSV/Train/merged_train.csv"
x_train, y_train = load_dataset(TRAIN_CSV)

print('Reviews', x_train, sep='\n', end=' \n\n')
print('Sentiment', y_train, sep='\n')

Reviews
0        [story, man, unnatural, feelings, pig, starts,...
1        [airport, 77, starts, brand, new, luxury, 747,...
2        [film, lacked, something, couldnt, put, finger...
3        [sorry, everyone, know, supposed, art, film, w...
4        [little, parents, took, along, theater, see, i...
                               ...                        
24995    [seeing, vote, average, pretty, low, fact, cle...
24996    [plot, wretched, unbelievable, twists, however...
24997    [amazed, movieand, others, average, 5, stars, ...
24998    [christmas, together, actually, came, time, iv...
24999    [workingclass, romantic, drama, director, mart...
Name: comment, Length: 25000, dtype: object 

Sentiment
0        0
1        0
2        0
3        0
4        0
        ..
24995    1
24996    1
24997    1
24998    1
24999    1
Name: sentiment, Length: 25000, dtype: int64


In [None]:
# Vocabulary settings: num_words caps how many of the most frequent words
# are kept when texts are converted to sequences; words outside that
# vocabulary are encoded with the OOV token instead of being dropped.
vocab_size = 10000
oov_tok = "<OOV>"

# Build the word index from the training reviews.
tokenizer = Tokenizer(oov_token=oov_tok, num_words=vocab_size)
tokenizer.fit_on_texts(x_train)

print(f"Number of Documents:  {tokenizer.document_count}")
print(f"Number of Words:  {tokenizer.num_words}")

Number of Documents:  25000
Number of Words:  10000


In [None]:
# Encode every tokenised training review as a list of integer word indices
# using the fitted tokenizer, and print the first one as a sanity check.
train_sequences = tokenizer.texts_to_sequences(x_train)
print(train_sequences[0])

[12, 50, 7492, 1247, 4500, 390, 503, 51, 1157, 346, 1631, 123, 1, 7611, 206, 564, 1964, 978, 2885, 801, 1, 5281, 350, 2467, 1631, 122, 10, 689, 1175, 710, 147, 1346, 8, 951, 564, 1, 301, 9, 25, 2112, 187, 657, 724, 1, 1549, 567, 47, 130, 30, 7, 496, 594, 20, 1, 1, 594, 285, 3344, 1, 1, 8263, 34, 3189]


In [None]:
# Fixed input length for the model. Shorter sequences are padded and
# longer ones truncated, both at the END of the sequence ('post').
sequence_length = 200
train_padded = pad_sequences(train_sequences, maxlen=sequence_length, padding='post', truncating='post')

In [None]:
# Load the test CSV through the same preprocessing function, then encode and
# pad it with the tokenizer and padding settings used for the training data.
x_test, y_test  = load_dataset("/content/drive/MyDrive/Senti_Task/CSV/Test/merged_test.csv")
test_sequences = tokenizer.texts_to_sequences(x_test)
test_padded = pad_sequences(test_sequences, maxlen=sequence_length, padding='post', truncating='post')

In [None]:
# Model hyper-parameters.
embedding_dim = 32   # size of each learned word vector
lstm_out = 64        # LSTM units per direction

# Embedding -> bidirectional LSTM -> small dense head ending in a single
# sigmoid unit that outputs the probability of the positive class.
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=sequence_length),
    Bidirectional(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2)),
    Dense(32, activation='relu'),
    Dropout(0.5),  # dropout for regularization
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() would append a stray "None" line to the output.
model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 32)           320000    
                                                                 
 bidirectional (Bidirection  (None, 128)               49664     
 al)                                                             
                                                                 
 dense (Dense)               (None, 32)                4128      
                                                                 
 dropout (Dropout)           (None, 32)                0         
                                                                 
 dense_1 (Dense)             (None, 10)                330       
                                                                 
 dense_2 (Dense)             (None, 1)                 11        
                                                        

In [None]:
# Save the best full model (lowest validation loss so far) to a dedicated
# file. Pointing the checkpoint at os.getcwd() itself would use the working
# directory as the save target instead of a model file.
checkpoint_filepath = os.path.join(os.getcwd(), 'best_model.keras')
model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=False,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
)

# Stop once val_loss has failed to improve for 2 consecutive epochs, and roll
# the in-memory model back to the weights of its best epoch.
early_stopping_callback = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
)
callbacks = [early_stopping_callback, model_checkpoint_callback]

In [None]:
# Hold out part of the TRAINING data for validation instead of validating on
# the test set: early stopping and checkpoint selection are driven by the
# validation loss, so using the test set here would leak it into model
# selection and bias the accuracy reported on it afterwards.
# The split is stratified (and shuffled by train_test_split) because the
# training labels appear to be ordered by class.
fit_inputs, val_inputs, fit_labels, val_labels = train_test_split(
    train_padded,
    y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=42,
)

history = model.fit(
    fit_inputs,
    fit_labels,
    epochs=10,
    validation_data=(val_inputs, val_labels),
    callbacks=callbacks,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


In [None]:
# Collect the per-epoch metrics recorded by model.fit into a table
# (one row per epoch, one column per metric) and print it.
metrics_df = pd.DataFrame(history.history)
print(metrics_df)

       loss  accuracy  val_loss  val_accuracy
0  0.435463   0.79540  0.352795      0.847355
1  0.263754   0.90516  0.511506      0.826137
2  0.201042   0.92812  0.341468      0.851473
3  0.162614   0.94332  0.570158      0.809837
4  0.137098   0.95284  0.422329      0.829568


The table above summarizes the model's training and validation metrics for the five epochs that ran before early stopping (patience of 2 on validation loss) ended training. The key metrics are analyzed below:

1. **Loss:**
   - Initial epoch: 0.435463
   - Second epoch: 0.263754
   - Third epoch: 0.201042
   - Fourth epoch: 0.162614
   - Fifth epoch: 0.137098

   The decreasing trend in training loss over epochs indicates that the model is learning and improving its performance on the training data.

2. **Accuracy:**
   - Initial epoch: 0.79540 (79.54%)
   - Second epoch: 0.90516 (90.52%)
   - Third epoch: 0.92812 (92.81%)
   - Fourth epoch: 0.94332 (94.33%)
   - Fifth epoch: 0.95284 (95.28%)

   The increasing trend in training accuracy suggests that the model is getting better at correctly classifying instances in the training set.

3. **Validation Loss and Accuracy:**
   - Initial epoch: val_loss = 0.352795, val_accuracy = 0.847355 (84.74%)
   - Second epoch: val_loss = 0.511506, val_accuracy = 0.826137 (82.61%)
   - Third epoch: val_loss = 0.341468, val_accuracy = 0.851473 (85.15%)
   - Fourth epoch: val_loss = 0.570158, val_accuracy = 0.809837 (80.98%)
   - Fifth epoch: val_loss = 0.422329, val_accuracy = 0.829568 (82.96%)

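   The validation metrics provide insight into how well the model generalizes to unseen data. Validation loss fluctuates from epoch to epoch, rising in both the second and fourth epochs, and validation accuracy never exceeds its third-epoch peak of 85.15% even as training accuracy keeps climbing to 95.28%. This widening gap between training and validation performance suggests overfitting, and the epoch-to-epoch swings suggest some instability.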

In summary, the model shows improvement in training accuracy and loss over the five epochs. However, there are signs of potential overfitting or instability, as indicated by the fluctuating validation metrics. Further analysis and potential adjustments, such as regularization techniques or model architecture changes, might be necessary to enhance generalization performance. Monitoring these metrics in future training epochs and iterations is crucial for refining the model.

In [None]:
# Tokenize and pad the testing data exactly as the training data was padded
# (post-padding / post-truncation). pad_sequences defaults to 'pre' for both,
# which would hand the model inputs laid out differently from those it was
# trained on.
X_test_sequence = tokenizer.texts_to_sequences(x_test)
X_test_padded = pad_sequences(X_test_sequence, maxlen=sequence_length, padding='post', truncating='post')

# Restore the weights saved by the ModelCheckpoint callback
model.load_weights(checkpoint_filepath)

# Make predictions: one sigmoid probability per review
predictions = model.predict(X_test_padded)

# Convert probabilities to binary labels (0 or 1). 0.5 is the cut-off the
# 'accuracy' metric applies to a sigmoid output during training, so the
# reported test accuracy stays comparable with the training/validation figures.
threshold = 0.5
binary_predictions = (predictions > threshold).astype(int)

# Display the predictions
print(binary_predictions)


 60/547 [==>...........................] - ETA: 53s

In [None]:

# Tally how many thresholded predictions agree with the ground-truth labels,
# then report the counts and the resulting accuracy as a percentage.
total = len(binary_predictions)
true = sum(
    1
    for label, predicted in zip(y_test, binary_predictions)
    if label == predicted
)

print('Correct Prediction: {}'.format(true))
print('Wrong Prediction: {}'.format(total - true))
print('Accuracy: {}'.format(true/total*100))

Correct Prediction: 13516
Wrong Prediction: 3969
Accuracy: 77.30054332284816
