In [None]:
import collections
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
import tensorflow as tf

from gensim.models.keyedvectors import KeyedVectors
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
import kagglehub

In [None]:
# Download the GoogleNews word2vec dataset through kagglehub. The call returns
# the local directory that holds the dataset files.
path = kagglehub.dataset_download("leadbest/googlenewsvectorsnegative300")

print("Path to dataset files:", path)

# Derive the file location from the directory kagglehub actually returned
# instead of hard-coding one machine's cache path and dataset version.
embedding_file = f"{path}/GoogleNews-vectors-negative300.bin.gz"

In [None]:
# Read the binary word2vec file; `limit` caps how many vectors are read.
word_vectors = KeyedVectors.load_word2vec_format(
    embedding_file,
    binary=True,
    limit=200000,
)

# Raw embedding of a single word.
vector = word_vectors['apple']
print(vector)

# Nearest neighbours of that same word.
similar = word_vectors.most_similar('apple')
print(similar)


In [None]:
# Look up the embedding for 'pizza', then show its shape and its raw values.
pizza = word_vectors['pizza']
print(f'Vector dimension: {pizza.shape}')
print(pizza)

In [None]:
# Score 'pizza' against a mix of food-related and unrelated words.
# One similarity value is printed per word, in the order listed.
for other_word in ('tomato', 'sauce', 'cheese', 'burger',
                   'car', 'restaurant', 'Italy', 'computer'):
    print(word_vectors.similarity('pizza', other_word))

In [None]:
# Only the last bare expression of a cell is echoed, so the first score used
# to be computed and then silently dropped. Print both explicitly.

# Two phrases built from different words.
print(word_vectors.n_similarity("king rules kingdom".split(), "monarch governs empire".split()))

# The same three words in a different order.
print(word_vectors.n_similarity("dog bites man".split(), "man bites dog".split()))

In [None]:
# Two news headlines to compare as bags of words.
headline_a = "Apple plans to increase iPhone production in India"
headline_b = "Samsung to launch new Galaxy phones with foldable screens"

# Lower-case and split on whitespace to get the token lists.
s1 = headline_a.lower().split()
print(s1)
s2 = headline_b.lower().split()
print(s2)

# Similarity between the two token lists (shown as the cell output).
word_vectors.n_similarity(s1, s2)

In [None]:
# Ten nearest neighbours of 'king' in the embedding space.
word_vectors.most_similar(positive=['king'], topn=10)

In [None]:
# Analogy query: 'king' and 'woman' contribute positively, 'man' negatively
# (i.e. king - man + woman); show the ten best matches.
word_vectors.most_similar(positive=['king', 'woman'], negative=['man'], topn=10)

In [None]:
# Ask the model which word in the list does not belong with the others.
word_vectors.doesnt_match(["cat", "dog", "hamster", "car", "rabbit"])


In [None]:
def display_pca_scatterplot(model, word_list):
    """Project the embeddings of ``word_list`` onto two PCA components and plot them.

    Args:
        model: Embedding lookup; ``model[word]`` must return the vector for ``word``.
        word_list: Words to plot. Each becomes one labelled point.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # One embedding per word, stacked in the same order as word_list.
    embedding_matrix = np.array([model[token] for token in word_list])

    # Keep only the first two principal components.
    points_2d = PCA(n_components=2).fit_transform(embedding_matrix)
    xs, ys = points_2d[:, 0], points_2d[:, 1]

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.scatter(xs, ys, c='r', edgecolors='k', s=128)

    # Nudge each label away from its marker so the two do not overlap.
    for token, px, py in zip(word_list, xs, ys):
        ax.text(px + 0.05, py + 0.05, token)

    ax.set_title("2D PCA of Word Embeddings")
    ax.grid(True)
    plt.show()

# Four fruit words and four animal words to visualise with the PCA plot.
words = ['apple', 'banana', 'orange', 'grape', 'dog', 'cat', 'lion', 'tiger']
display_pca_scatterplot(word_vectors, words)


In [None]:
# Reload the embeddings with a larger cap (1,000,000 vectors instead of 200,000).
# This rebinds `word_vectors`, so every later cell uses the bigger vocabulary.
word_vectors = KeyedVectors.load_word2vec_format(embedding_file, binary=True, limit=1000000)

In [None]:
# Sanity checks on the reloaded vectors. Only a cell's last bare expression is
# echoed, so the first two results are printed explicitly; previously they
# were computed and discarded.
print(word_vectors['king'][:10])  # first 10 components only, to keep output short
print(word_vectors.similarity('car', 'bus'))

# Last expression: shown as the cell's output.
word_vectors.most_similar('apple')

In [None]:
# Download the Yelp review-polarity archive into /root/input/.
# -P sets the target directory; -c continues a partially downloaded file.
!wget -P /root/input/ -c "https://s3.amazonaws.com/fast-ai-nlp/yelp_review_polarity_csv.tgz"

In [None]:
# Unpack the archive into the current working directory (no -C target is given);
# later cells read 'yelp_review_polarity_csv/...' relative to that directory.
!tar xvzf /root/input/yelp_review_polarity_csv.tgz
# Print the working directory so it is clear where the files were extracted.
!pwd

In [None]:
# The CSV has no header row, so the two column names are supplied here.
yelp_train = pd.read_csv(
    'yelp_review_polarity_csv/train.csv',
    names=['sentiment', 'review'],
)
print(yelp_train.shape)

# Preview the first rows.
yelp_train.head()

In [None]:
# Shuffle every row with a fixed seed, then keep the first 100,000 as an
# independent copy.
yelp_train = (
    yelp_train
    .sample(frac=1, random_state=1)
    .iloc[:100000]
    .copy()
)
print(yelp_train.shape)
yelp_train.head()

In [None]:
# Remap the labels 1 -> 0 and 2 -> 1 in one pass and assign the result back.
# The previous `.replace(..., inplace=True)` on a column selection is chained
# assignment: pandas warns about it and, with copy-on-write, it leaves
# yelp_train unchanged. The dict form also matches how the test set is relabelled.
# NOTE: run this cell once per load; running it again would map the new 1s to 0.
yelp_train['sentiment'] = yelp_train['sentiment'].replace({1: 0, 2: 1})
yelp_train.head()

In [None]:
# 85% of the rows go to training, the remainder to validation (fixed seed).
yelp_train_split, yelp_val_split = train_test_split(
    yelp_train,
    train_size=0.85,
    random_state=1,
)

# Review text and label columns for each split.
X_train, y_train = yelp_train_split['review'], yelp_train_split['sentiment']
X_val, y_val = yelp_val_split['review'], yelp_val_split['sentiment']

# Label counts in the training split.
collections.Counter(y_train)

In [None]:
tokenizer = Tokenizer(num_words=20000,  # or adjust based on your vocab size
                      filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                      lower=True)

# Build the vocabulary from the training split only, then encode that split.
tokenizer.fit_on_texts(yelp_train_split['review'])
train_sequences = tokenizer.texts_to_sequences(yelp_train_split['review'])

# Pad/truncate every review to exactly 200 token ids.
X_train = keras.preprocessing.sequence.pad_sequences(train_sequences, maxlen=200)
y_train = yelp_train_split['sentiment'].astype('float32')  # ensure correct type

print(train_sequences[0])

# Preview only the first 20 vocabulary entries. Echoing the full
# tokenizer.word_index would dump the entire vocabulary dict into the output.
dict(list(tokenizer.word_index.items())[:20])

In [None]:
# First two token ids of the second training review, as words and as ids.
first_two_ids = train_sequences[1][:2]
print([tokenizer.index_word[token_id] for token_id in first_two_ids])
print(first_two_ids)

# Two ways to decode the example ids 14 and 382: direct lookup ...
for token_id in (14, 382):
    print(tokenizer.index_word[token_id])
# ... and the tokenizer's own sequence decoder.
for token_id in (14, 382):
    print(tokenizer.sequences_to_texts([[token_id]]))

# First review decoded back to text (first 300 characters), then its padded ids.
print(tokenizer.sequences_to_texts([train_sequences[0]])[0][:300])
print(X_train[0][:300])

In [None]:
# Tokenize and pad training reviews
# Encode both splits with the tokenizer fitted on the training data.
train_sequences = tokenizer.texts_to_sequences(yelp_train_split['review'])
val_sequences = tokenizer.texts_to_sequences(yelp_val_split['review'])

# Bring every review to a fixed length of 200 token ids.
X_train = keras.preprocessing.sequence.pad_sequences(train_sequences, maxlen=200)
X_val = keras.preprocessing.sequence.pad_sequences(val_sequences, maxlen=200)

for padded_split in (X_train, X_val):
    print(padded_split.shape)

In [None]:
# Size of the complete fitted vocabulary (+1 because id 0 is never assigned to a word).
full_vocab_size = len(tokenizer.word_index) + 1

# The tokenizer only emits ids below `num_words`, so rows beyond that cap can
# never be looked up. Sizing the table by the cap avoids allocating (and later
# training) embedding rows for words that never reach the model.
vocab_size = min(tokenizer.num_words or full_vocab_size, full_vocab_size)

# Take the width from the loaded vectors instead of hard-coding it.
embedding_size = word_vectors.vector_size
pretrained_embeddings = np.zeros((vocab_size, embedding_size))

for term, index in tokenizer.word_index.items():
    # Words without a pretrained vector keep their all-zero row.
    if index < vocab_size and word_vectors.has_index_for(term):
        # Row assignment copies the values, so no explicit .copy() is needed.
        pretrained_embeddings[index] = word_vectors[term]

print(pretrained_embeddings[tokenizer.word_index['good']][:50])

In [None]:
# Start the embedding table from the pretrained matrix; the weights stay
# trainable so they can be fine-tuned.
pretrained_init = keras.initializers.Constant(pretrained_embeddings)

embedding_layer = layers.Embedding(
    vocab_size,
    embedding_size,
    embeddings_initializer=pretrained_init,
    input_length=200,
    trainable=True,
)


In [None]:
tf.random.set_seed(0)

# Layer stack, in order:
#   embedding      -> one vector per token
#   average pool   -> a single vector per review
#   128 -> 64 -> 1 -> dense head ending in a sigmoid
# Each Dense layer gets its own RandomNormal(seed=1) kernel initializer.
text_classifier = keras.Sequential([
    embedding_layer,
    layers.GlobalAveragePooling1D(),
    layers.Dense(
        128,
        activation='relu',
        kernel_initializer=tf.keras.initializers.RandomNormal(seed=1),
    ),
    layers.Dense(
        64,
        activation='relu',
        kernel_initializer=tf.keras.initializers.RandomNormal(seed=1),
    ),
    layers.Dense(
        1,
        activation='sigmoid',
        kernel_initializer=tf.keras.initializers.RandomNormal(seed=1),
    ),
])

text_classifier.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [None]:
# A short review to trace through the embedding layer.
sample_review = "football is good game"
print(f"Review: {sample_review}")

# Encode it with the fitted tokenizer (a batch containing one review).
sample_sequence = tokenizer.texts_to_sequences([sample_review])
print(f"Tokenized sequence: {sample_sequence}")

# Look up one embedding vector per token id.
sample_input_array = np.asarray(sample_sequence)
embedded_output = embedding_layer(sample_input_array)

# The output shape unpacks into three dimensions.
batch_size, seq_len, embed_dim = embedded_output.shape
print(f"Embedding shape → (Batch size: {batch_size}, Sequence length: {seq_len}, Embedding size: {embed_dim})")

# Average over axis 1 to collapse the per-token vectors into one per review.
avg_embedding = np.asarray(embedded_output).mean(axis=1)
print(f"Averaged embedding shape: {avg_embedding.shape}")


In [None]:
# Print the model's summary.
text_classifier.summary()

In [None]:
# Train for 20 epochs in mini-batches of 512, scoring the validation split
# after every epoch. The returned history is kept for plotting.
training_history = text_classifier.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    batch_size=512,
    epochs=20,
)


In [None]:
def plot_model_performance(training_history):
    """Plot per-epoch loss and accuracy curves from a Keras training history.

    Args:
        training_history: Object whose ``.history`` dict holds the lists
            ``'loss'`` and ``'accuracy'`` and, when validation data was used,
            ``'val_loss'`` and ``'val_accuracy'``.

    Returns:
        None. A two-panel figure (loss on top, accuracy below) is shown.

    Raises:
        KeyError: If ``'loss'`` or ``'accuracy'`` is missing from the history.
    """
    metrics = training_history.history
    num_epochs = range(1, len(metrics['loss']) + 1)

    # Uses the notebook-level `plt`; no function-local import is needed.
    fig, (loss_ax, acc_ax) = plt.subplots(2, figsize=(15, 15))
    fig.tight_layout(pad=5.0)

    # Both panels share the same layout, so draw them in one loop.
    panels = (
        (loss_ax, 'loss', 'Loss'),
        (acc_ax, 'accuracy', 'Accuracy'),
    )
    for ax, key, label in panels:
        ax.plot(num_epochs, metrics[key], 'bo', label=f'Training {label}')

        # Validation curves exist only when fit() was given validation data;
        # skip them instead of failing with a KeyError.
        val_key = f'val_{key}'
        if val_key in metrics:
            ax.plot(num_epochs, metrics[val_key], 'b', label=f'Validation {label}')

        ax.set_title(f'Training vs. Validation {label}')
        ax.set_xlabel('Epoch')
        ax.set_ylabel(label)
        ax.legend()

    plt.show()
# Plot the loss/accuracy curves stored in `training_history`.
plot_model_performance(training_history)

In [None]:
# Fresh embedding layer, again initialised from the pretrained matrix.
text_embedding_layer = layers.Embedding(
    vocab_size,
    embedding_size,
    embeddings_initializer=keras.initializers.Constant(pretrained_embeddings),
    input_length=200,
    trainable=True,
)

# Same architecture as before, rebuilt from scratch; the Dense layers use
# their default initializers here.
text_classifier = keras.Sequential([
    text_embedding_layer,
    layers.GlobalAveragePooling1D(),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

text_classifier.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for only 3 epochs this time.
history = text_classifier.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    batch_size=512,
    epochs=3,
)


In [None]:
# Load test data
# Read the held-out test file (no header row, so names are supplied).
yelp_test = pd.read_csv(
    'yelp_review_polarity_csv/test.csv',
    names=['sentiment', 'review'],
)

# Map the labels 1 -> 0 and 2 -> 1.
yelp_test['sentiment'] = yelp_test['sentiment'].replace({1: 0, 2: 1})

# Encode with the training tokenizer, then pad to 200 token ids.
test_sequences = tokenizer.texts_to_sequences(yelp_test['review'])
X_test = keras.preprocessing.sequence.pad_sequences(test_sequences, maxlen=200)

# Labels as a float32 NumPy array.
y_test = yelp_test['sentiment'].to_numpy(dtype='float32')

# Score the most recently trained classifier on the test set.
test_loss, test_accuracy = text_classifier.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy:.2%}")


In [None]:
def predict_sentiment(review_list):
    """Run the notebook's current ``text_classifier`` on raw review strings.

    Args:
        review_list: Iterable of review texts.

    Returns:
        Whatever ``text_classifier.predict`` returns for the padded batch.
        Presumably one sigmoid score per review with shape (n, 1); confirm
        against the model in use.
    """
    # Same preprocessing as training: tokenizer ids, padded to 200.
    token_ids = tokenizer.texts_to_sequences(review_list)
    model_input = keras.preprocessing.sequence.pad_sequences(token_ids, maxlen=200)
    return text_classifier.predict(model_input)

# Two real Google reviews: one clearly positive, one clearly negative.
pos_review = "The best seafood joint in East Village San Diego!  Great lobster roll, great fish, great oysters, great bread, great cocktails, and such amazing service.  The atmosphere is top notch and the location is so much fun being located just a block away from Petco Park (San Diego Padres Stadium)."
neg_review = "A thoroughly disappointing experience. When you book a Marriott you expect a certain standard. Albany falls way short. Room cleaning has to be booked 24 hours in advance but nobody thought to mention this at check in. The hotel is tired and needs a face-lift. The only bright light in a sea of mediocrity were the pancakes at breakfast. Sadly they weren't enough to save the experience. If you travel to Albany, then do yourself a big favour and book the Westin."

sample_reviews = [pos_review, neg_review]
results = predict_sentiment(sample_reviews)

# A score of 0.5 or above counts as positive. Print the label, the score and
# the first 100 characters of each review.
for review, score in zip(sample_reviews, results):
    if score >= 0.5:
        sentiment = "Positive"
    else:
        sentiment = "Negative"
    print(f"{sentiment} ({score[0]:.2f}): {review[:100]}...")
