In [20]:
#Run this command in terminal-> python -m spacy download en_core_web_lg
import pandas as pd
import numpy as np
from tqdm import tqdm
import spacy
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import tensorflow as tf
from gensim.models import KeyedVectors

# Load data
df = pd.read_csv("./Week_1/Day_4/movies_sentiment_data.csv")

In [None]:
df['sentiment_numerical'] = df.sentiment.map({'positive': 1, 'negative': 0})


In [None]:
# Load the Word2Vec model
from huggingface_hub import hf_hub_download
repo_id = "NathaNn1111/word2vec-google-news-negative-300-bin"
filename = "GoogleNews-vectors-negative300.bin"
model_path = hf_hub_download(repo_id=repo_id, filename=filename)
word2vec = KeyedVectors.load_word2vec_format(model_path, binary=True)

In [None]:
# Load spacy model
nlp = spacy.load("en_core_web_lg")


# Function to create mean vector for a review
def review_to_vector(review):
    tokens = [token.text.lower() for token in nlp(review) if token.is_alpha]
    vectors = [word2vec[word] for word in tokens if word in word2vec]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(word2vec.vector_size)

In [None]:
# Generate mean vectors for all reviews
tqdm.pandas()
df['vector'] = df['review'].progress_apply(review_to_vector)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 623/623 [00:33<00:00, 18.51it/s]


In [None]:
# Prepare data for training
X = np.stack(df['vector'].values)
y = df['sentiment_numerical'].values

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build a simple neural network
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=500, batch_size=32, validation_split=0.2)

# Evaluate the model
y_pred = (model.predict(X_test) > 0.5).astype(int).flatten()
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Epoch 1/500


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.5047 - loss: 0.6908 - val_accuracy: 0.5000 - val_loss: 0.6898
Epoch 2/500
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5633 - loss: 0.6794 - val_accuracy: 0.5000 - val_loss: 0.6777
Epoch 3/500
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5905 - loss: 0.6674 - val_accuracy: 0.5000 - val_loss: 0.6700
Epoch 4/500
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6133 - loss: 0.6538 - val_accuracy: 0.6400 - val_loss: 0.6490
Epoch 5/500
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7102 - loss: 0.6287 - val_accuracy: 0.6500 - val_loss: 0.6262
Epoch 6/500
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6993 - loss: 0.6049 - val_accuracy: 0.7300 - val_loss: 0.5860
Epoch 7/500
[1m13/13[0m [32m━━━━━━━━━━━━━━

In [None]:
# Function to preprocess and predict sentiment for a new review
def predict_sentiment(review, model, word2vec, nlp):
    # Tokenize and create a mean vector for the review
    tokens = [token.text.lower() for token in nlp(review) if token.is_alpha]
    vectors = [word2vec[word] for word in tokens if word in word2vec]
    if vectors:
        mean_vector = np.mean(vectors, axis=0)
    else:
        mean_vector = np.zeros(word2vec.vector_size)
    
    # Predict sentiment
    prediction = model.predict(mean_vector.reshape(1, -1))[0][0]
    sentiment = "positive" if prediction > 0.5 else "negative"
    confidence = prediction if sentiment == "positive" else 1 - prediction
    return sentiment, confidence

# Example reviews for inference
example_reviews = [
    "The movie was absolutely fantastic, a masterpiece!",
    "It was a total waste of time. Horrible acting and a boring plot.",
    "I enjoyed the performances, but the story was predictable.",
    "The cinematography was stunning, but the dialogues were weak.",
    "An amazing experience, I would watch it again!"
]

# Run inference on example reviews
for review in example_reviews:
    sentiment, confidence = predict_sentiment(review, model, word2vec, nlp)
    print(f"Review: {review}\nPredicted Sentiment: {sentiment} (Confidence: {confidence:.2f})\n")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
Review: The movie was absolutely fantastic, a masterpiece!
Predicted Sentiment: positive (Confidence: 1.00)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
Review: It was a total waste of time. Horrible acting and a boring plot.
Predicted Sentiment: negative (Confidence: 1.00)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
Review: I enjoyed the performances, but the story was predictable.
Predicted Sentiment: positive (Confidence: 1.00)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
Review: The cinematography was stunning, but the dialogues were weak.
Predicted Sentiment: negative (Confidence: 1.00)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
Review: An amazing experience, I would watch it again!
Predicted Sentiment: positive (Confidence: 1.00)

