In [82]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# Process data

In [83]:
# Both files are parsed without a header row, so pandas labels the columns
# with integers starting at 0 (the review text is later read from column 0).
reviews, labels = (
    pd.read_csv(path, header=None) for path in ('reviews.txt', 'labels.txt')
)

# Binary target: 1 where the label equals 'positive', 0 for anything else.
Y = labels.eq('positive').astype(np.int_)

- Split the reviews and labels into training, validation, and test sets.

In [84]:
# Hold out 20% of all samples for testing, then take 20% of what remains as
# the validation set (roughly 64% train / 16% validation / 20% test overall).
X_train, X_test, Y_train, Y_test = train_test_split(
    reviews, Y, test_size=0.2, random_state=42
)
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train, Y_train, test_size=0.2, random_state=42
)

# Bag-of-words encoder capped at the 10,000 most frequent terms
max_features = 10000
vectorizer = CountVectorizer(max_features=max_features)

# The vocabulary is learned from the training reviews only; the validation
# and test reviews are encoded with that same fitted vocabulary.
X_train_bow = vectorizer.fit_transform(X_train[0])
X_val_bow, X_test_bow = (
    vectorizer.transform(split[0]) for split in (X_val, X_test)
)

- Explore the representation of the reviews.

single word

In [85]:
def explore_word(word_to_explore):
    """Print how a single word is represented in the training bag-of-words.

    The word is looked up in the notebook-level ``vectorizer``. When it is
    part of the vocabulary, its column index, its per-document counts in
    ``X_train_bow`` and the sum of those counts are printed; otherwise a
    short notice is printed instead.

    Args:
        word_to_explore: Token to look up in ``vectorizer.vocabulary_``.

    Returns:
        None. Everything is written to stdout.
    """
    word_index = vectorizer.vocabulary_.get(word_to_explore)

    # Guard clause: nothing to report for out-of-vocabulary words.
    # (Compare against None explicitly, since column index 0 is valid.)
    if word_index is None:
        print(f"'{word_to_explore}' is not in the vocabulary.")
        return

    # Dense column vector with one row per training document, holding the
    # number of times the word occurs in that document.
    per_document_counts = X_train_bow[:, word_index].toarray()

    # Total occurrences of the word across the whole training set
    total_occurrences = per_document_counts.sum()

    print(f"Index of '{word_to_explore}' in the vocabulary: {word_index}")
    print(f"Term Frequency (TF) of '{word_to_explore}' in the training data: {per_document_counts}")
    print(f"Count of '{word_to_explore}' appearances in the training data: {total_occurrences}")

In [86]:
# `vectorizer` and `X_train_bow` were already fitted in the data-splitting
# cell. They are deliberately NOT rebuilt here: re-fitting in this cell would
# replace the objects that `X_val_bow` / `X_test_bow` were encoded with, so any
# change made here (e.g. a different `max_features`) would silently leave the
# training matrix out of sync with the validation and test matrices.

# Inspect two in-vocabulary words and one that should be out of vocabulary
explore_word("movie")
print("-----------------")
explore_word("excellent")
print("-----------------")
explore_word("oooooo")

Index of 'movie' in the vocabulary: 5850
Term Frequency (TF) of 'movie' in the training data: [[0]
 [0]
 [2]
 ...
 [0]
 [7]
 [0]]
Count of 'movie' appearances in the training data: 28015
-----------------
Index of 'excellent' in the vocabulary: 3097
Term Frequency (TF) of 'excellent' in the training data: [[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [0]]
Count of 'excellent' appearances in the training data: 1325
-----------------
'oooooo' is not in the vocabulary.


whole review

In [87]:
def explore_whole_review(review):
    """Print the bag-of-words representation of one review.

    The review is encoded with the notebook-level ``vectorizer`` and every
    vocabulary term with a positive count is printed together with that
    count, in vocabulary (column) order.

    Args:
        review: Raw review text to encode.

    Returns:
        None. Everything is written to stdout.
    """
    # transform() takes an iterable of documents, hence the one-element list;
    # row 0 of the dense result is the count vector of this review.
    counts = vectorizer.transform([review]).toarray()[0]

    # Term belonging to each column of the count vector
    terms = vectorizer.get_feature_names_out()

    print("BoW Representation for the Review:")
    # Visit only the columns whose count is positive, in ascending order
    for column in np.flatnonzero(counts > 0):
        print(f"'{terms[column]}': {counts[column]}")

In [88]:
# Two example reviews; a separator line is printed between their outputs.
example_reviews = (
    "The movie was excellent, and the acting was outstanding.",
    "the giant monster movie genre with the living mummy movie genre .",
)
for position, example in enumerate(example_reviews):
    if position > 0:
        print('--------------')
    explore_whole_review(example)


BoW Representation for the Review:
'acting': 1
'and': 1
'excellent': 1
'movie': 1
'outstanding': 1
'the': 2
'was': 2
--------------
BoW Representation for the Review:
'genre': 2
'giant': 1
'living': 1
'monster': 1
'movie': 2
'mummy': 1
'the': 2
'with': 1


# Train network

- Train a neural network with a single hidden layer on the dataset, tuning the relevant hyperparameters to optimize accuracy.

In [89]:
import tensorflow as tf
from tensorflow import keras
from sklearn.metrics import accuracy_score


Define a simple neural network with one hidden layer


In [90]:
# Seed the Python, NumPy and backend random generators so weight
# initialisation and batch shuffling are repeatable between runs
# (some GPU kernels may still introduce small nondeterminism).
keras.utils.set_random_seed(42)

# Size the input layer from the fitted bag-of-words matrix rather than a
# hard-coded 10000: CountVectorizer yields fewer columns than `max_features`
# when the corpus has a smaller vocabulary, and this also keeps the model in
# sync if `max_features` is changed.
n_features = X_train_bow.shape[1]

# One hidden ReLU layer followed by a single sigmoid unit that outputs the
# probability of the positive class.
model = keras.Sequential([
    keras.layers.Input(shape=(n_features,)),
    keras.layers.Dense(150, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

Compile the model

In [91]:
# Binary cross-entropy pairs with the single sigmoid output unit; accuracy is
# tracked as the reporting metric during training and validation.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

Train the model on the training data, using the validation data to monitor performance after each epoch

In [92]:
# Keras needs dense inputs here, so the sparse count matrices are densified.
# CountVectorizer produces int64 counts by default; casting the sparse matrix
# to float32 *before* calling toarray() halves the size of the dense arrays.
# The counts are small integers, so the float32 values are exact.
history = model.fit(
    X_train_bow.astype(np.float32).toarray(),
    Y_train,
    epochs=8,
    validation_data=(X_val_bow.astype(np.float32).toarray(), Y_val),
)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


After tuning and training, evaluate the model on the test data

In [93]:
# Densify the test matrix as float32 rather than the default int64 counts to
# halve the memory of the dense array; the integer counts remain exact.
test_probabilities = model.predict(X_test_bow.astype(np.float32).toarray())

# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 class labels
y_pred = (test_probabilities > 0.5).astype(int)

test_accuracy = accuracy_score(Y_test, y_pred)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.8756


# Use the classifier to classify a few sentences

In [94]:
# Sample sentences to classify
sample_sentences = [
    "I loved the movie, it was amazing!",
    "The weather today is terrible.",
    "The food at that restaurant was outstanding.",
    "I'm feeling really sad today.",
    "The service was terrible, and I had a bad experience.",
]

# Encode the sentences with the vocabulary learned from the training reviews
sample_bow = vectorizer.transform(sample_sentences)

# Threshold the sigmoid outputs at 0.5, then flatten the (n, 1) result to a
# 1-D vector so each prediction below is a scalar rather than a length-1
# array whose comparison relies on implicit array truthiness.
sample_predictions = (model.predict(sample_bow.toarray()) > 0.5).astype(int).ravel()

# Display the results
for sentence, prediction in zip(sample_sentences, sample_predictions):
    sentiment = "positive" if prediction == 1 else "negative"
    print(f'Sentence: "{sentence}" => Predicted Sentiment: {sentiment}')

Sentence: "I loved the movie, it was amazing!" => Predicted Sentiment: positive
Sentence: "The weather today is terrible." => Predicted Sentiment: positive
Sentence: "The food at that restaurant was outstanding." => Predicted Sentiment: positive
Sentence: "I'm feeling really sad today." => Predicted Sentiment: positive
Sentence: "The service was terrible, and I had a bad experience." => Predicted Sentiment: negative
