##**NLP Sentiment Analysis**

   ### Objective: Train → Save → Evaluate → Load → Predict

In [1]:
!pip install tensorflow nltk



Import libraries

In [2]:
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
import string
import os
import pickle

    Downloads the tokenizer (punkt) and list of English stopwords.

In [3]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab') # Download the missing resource

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [54]:
# 1. Sample raw text and labels
raw_sentences = [
    "I love machine learning!",
    "This is a great course.",
    "NLP is fascinating.",
    "I hate this subject.",
    "This is boring.",
    "I do not like winters.",
    "Ecuador is amazing",
    "Soccer is the best sport",
    "I regret coming here",
    "The food is the worst"
]
labels = [1, 1, 1, 0, 0, 0, 1, 1, 0, 0]  # 1 = positive, 0 = negative

In [55]:
# 2. Preprocessing: lowercase, remove punctuation, remove stopwords
stop_words = set(stopwords.words('english'))

In [56]:
print(stop_words)

{'further', 'm', 'only', 'few', 'after', "doesn't", 'shan', 'is', 'having', 'him', 'our', "should've", 'ma', 'under', 'we', 'which', 'are', 'he', 'myself', 'should', 'she', 'had', "haven't", "she'll", 'very', 'until', 'does', "i'll", 'for', 'her', "they've", 'yourselves', 's', 'were', 'you', 'shouldn', 'why', "i'm", 'this', 'it', "that'll", 'those', 'his', 'itself', 'haven', 'when', "wasn't", "he'd", 'below', 'ain', "mightn't", 'at', "weren't", 'or', 'do', 'all', 'now', 'because', 'while', 'did', 'am', "hadn't", "we've", 'between', 'couldn', 'above', 'isn', "i've", 'can', 'of', 'here', "shan't", 'y', 'where', 'my', "won't", 'mustn', 'ourselves', 'about', 're', 'ours', 'these', "couldn't", "they'd", "isn't", 'hasn', 'been', 'hadn', 'then', 'just', 'yours', 'them', 't', "we'd", 'me', 'out', 'than', 'd', 'if', "it's", 'their', 'nor', 'needn', 'through', 'what', 'other', 'before', 'your', 'being', 'down', 'wasn', 'herself', 'on', "aren't", 'hers', "you'll", 'up', 'to', "you'd", 'with', 'su

In [57]:
# 2. Preprocessing: lowercase, remove punctuation, remove stopwords
stop_words = set(stopwords.words('english'))

def preprocess_text(sentence):
    tokens = word_tokenize(sentence.lower())  # lowercase + tokenize
    tokens = [word for word in tokens if word.isalpha()]  # remove punctuation
    tokens = [word for word in tokens if word not in stop_words]  # remove stopwords
    return ' '.join(tokens)

cleaned_sentences = [preprocess_text(sent) for sent in raw_sentences]

In [58]:
print(cleaned_sentences)

['love machine learning', 'great course', 'nlp fascinating', 'hate subject', 'boring', 'like winters', 'ecuador amazing', 'soccer best sport', 'regret coming', 'food worst']


In [59]:
# Tokenize texts
tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')
tokenizer.fit_on_texts(cleaned_sentences)
sequences = tokenizer.texts_to_sequences(cleaned_sentences)
padded = pad_sequences(sequences, padding='post')



In [60]:
print("Word Index (Tokenized Vocabulary):")
print(tokenizer.word_index)

Word Index (Tokenized Vocabulary):
{'<OOV>': 1, 'love': 2, 'machine': 3, 'learning': 4, 'great': 5, 'course': 6, 'nlp': 7, 'fascinating': 8, 'hate': 9, 'subject': 10, 'boring': 11, 'like': 12, 'winters': 13, 'ecuador': 14, 'amazing': 15, 'soccer': 16, 'best': 17, 'sport': 18, 'regret': 19, 'coming': 20, 'food': 21, 'worst': 22}


In [61]:
print("\nTokenized Sentences (as sequences):")
for i, seq in enumerate(sequences):
    print(f"{raw_sentences[i]} → {seq}")


Tokenized Sentences (as sequences):
I love machine learning! → [2, 3, 4]
This is a great course. → [5, 6]
NLP is fascinating. → [7, 8]
I hate this subject. → [9, 10]
This is boring. → [11]
I do not like winters. → [12, 13]
Ecuador is amazing → [14, 15]
Soccer is the best sport → [16, 17, 18]
I regret coming here → [19, 20]
The food is the worst → [21, 22]


In [39]:
# with open("tokenizer.pkl", "rb") as f:
#     tokenizer = pickle.load(f)
#     print(tokenizer)

In [62]:
# Build model with input_shape explicitly declared in build()
model = Sequential([
    Embedding(input_dim=100, output_dim=16, input_length=padded.shape[1]),
    GlobalAveragePooling1D(),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Explicitly build the model before summary
model.build(input_shape=(None, padded.shape[1]))

# Now summary will display shapes and parameters
model.summary()




In [63]:
# Compile and train
import numpy as np

# Convert labels list to numpy array
labels = np.array(labels)

# Compile and train
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(padded, labels, epochs=10, verbose=2)



Epoch 1/10
1/1 - 1s - 1s/step - accuracy: 0.7000 - loss: 0.6913
Epoch 2/10
1/1 - 0s - 44ms/step - accuracy: 0.7000 - loss: 0.6895
Epoch 3/10
1/1 - 0s - 59ms/step - accuracy: 0.9000 - loss: 0.6878
Epoch 4/10
1/1 - 0s - 63ms/step - accuracy: 0.9000 - loss: 0.6861
Epoch 5/10
1/1 - 0s - 44ms/step - accuracy: 1.0000 - loss: 0.6845
Epoch 6/10
1/1 - 0s - 59ms/step - accuracy: 1.0000 - loss: 0.6829
Epoch 7/10
1/1 - 0s - 49ms/step - accuracy: 1.0000 - loss: 0.6814
Epoch 8/10
1/1 - 0s - 60ms/step - accuracy: 1.0000 - loss: 0.6798
Epoch 9/10
1/1 - 0s - 47ms/step - accuracy: 1.0000 - loss: 0.6782
Epoch 10/10
1/1 - 0s - 58ms/step - accuracy: 1.0000 - loss: 0.6765


<keras.src.callbacks.history.History at 0x7de8daedab10>

In [66]:
# 9. Testing on new sentences
test_sentences = ["I love this subject!", "This is terrible.", "Colombia is the best", "Basketball is boring", "The chair is horrible", "I don't like the hair cut"]
cleaned_test = [preprocess_text(sent) for sent in test_sentences]
test_seq = tokenizer.texts_to_sequences(cleaned_test)
test_pad = pad_sequences(test_seq, padding='post', maxlen=padded.shape[1])

predictions = model.predict(test_pad)

for i, sentence in enumerate(test_sentences):
    sentiment = "Positive" if predictions[i][0] > 0.5 else "Negative"
    print(f"'{sentence}' → {sentiment} ({predictions[i][0]:.2f})")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
'I love this subject!' → Negative (0.49)
'This is terrible.' → Negative (0.50)
'Colombia is the best' → Positive (0.51)
'Basketball is boring' → Negative (0.50)
'The chair is horrible' → Positive (0.50)
'I don't like the hair cut' → Positive (0.50)
