In [25]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, confusion_matrix
import os
import re

In [18]:
# Directory paths
train_dir = "aclImdb_v1/aclImdb/train"
test_dir = "aclImdb_v1/aclImdb/test"

# Read and preprocess the movie reviews
def read_reviews(directory):
    reviews = []
    ratings = []
    for label in ["pos", "neg"]:
        label_dir = os.path.join(directory, label)
        for filename in os.listdir(label_dir):
            with open(os.path.join(label_dir, filename), "r") as file:
                try:
                    review = file.read()
                except UnicodeDecodeError:
                    with open(os.path.join(label_dir, filename), "r", encoding="latin-1") as file:
                        review = file.read()
                rating = int(filename.split("_")[1].split(".")[0])
                rating = (rating - 1) / 9.0
                reviews.append(review)
                ratings.append(rating)
    return reviews, ratings

# Read training and testing data
train_reviews, train_ratings = read_reviews(train_dir)
test_reviews, test_ratings = read_reviews(test_dir)

In [19]:
#print an example
print("Example")
print("review: ", train_reviews[0])
print("rating: ", train_ratings[0])

Example
review:  Zentropa has much in common with The Third Man, another noir-like film set among the rubble of postwar Europe. Like TTM, there is much inventive camera work. There is an innocent American who gets emotionally involved with a woman he doesn't really understand, and whose naivety is all the more striking in contrast with the natives.<br /><br />But I'd have to say that The Third Man has a more well-crafted storyline. Zentropa is a bit disjointed in this respect. Perhaps this is intentional: it is presented as a dream/nightmare, and making it too coherent would spoil the effect. <br /><br />This movie is unrelentingly grim--"noir" in more than one sense; one never sees the sun shine. Grim, but intriguing, and frightening.
rating:  0.6666666666666666


In [34]:
len(train_ratings)

25000

### Baseline Bag of Words Model

In [20]:
# Create bag of words representation
vectorizer = CountVectorizer()
train_features = vectorizer.fit_transform(train_reviews)
test_features = vectorizer.transform(test_reviews)

# Train the model
model = LinearRegression()
model.fit(train_features, train_ratings)

print('Training finished')

Training finished


In [21]:
# Function to predict the rating of a new review
def predict_rating(review_text):
    # Preprocess the review text
    review_features = vectorizer.transform([review_text])
    
    # Make prediction
    predicted_rating = model.predict(review_features)
    
    return predicted_rating[0]

# Example usage
new_review = "This film powerfully demonstrates the struggle of two women in love in a culture so deeply entrenched in ritual and tradition. All this against a backdrop of an India which itself is struggling for freedom from these same values. This film is both political and personal and never too preachy or idealistic on either front. It is easy to see why 'Fire' has caused riots in India, but tragic nonetheless. A true film such as this one deserves to be seen by all people of the world, not just privileged westerners."
predicted_rating = predict_rating(new_review)
print("Predicted Rating:", predicted_rating)

Predicted Rating: 0.4387218480281554


### Interpretation

In [22]:
top_k = 20
bottom_k = 20

coefficients = model.coef_

# Get the whole vocabulary
tokens = vectorizer.get_feature_names_out()
print(f'Size of the vocabulary (i.e. the dimension of the data): {len(tokens)}')
print(f'Size of the training data: {len(train_reviews)}\n')

# Pick the most and the least important tokens
coeff_tokens = list(zip(coefficients, tokens))
sorted_coeff_tokens = sorted(coeff_tokens, key=lambda x: np.abs(x[0]), reverse=True)
print(f'top {top_k} most important tokens, with their coefficients: ')
for coef, token in sorted_coeff_tokens[:top_k]:
    print(f'{token} : {coef}')
print()
print(f'bottom {bottom_k} least important tokens, with their coefficients: ')
for coef, token in sorted_coeff_tokens[-bottom_k:]:
    print(f'{token} : {coef}')

Size of the vocabulary (i.e. the dimension of the data): 74849
Size of the training data: 25000

top 20 most important tokens, with their coefficients: 
evilest : -0.7773204948059027
reccommend : 0.659615112344284
unexpecting : 0.6172886394607975
labeling : -0.5713097194205504
abductions : -0.562729100747804
chungking : -0.5598040681684273
damnit : 0.5401431725143944
raimy : 0.5271947442153081
patterson : -0.5189534712902111
sharpen : -0.5123163670584275
extremelly : -0.5118941384046103
slovenian : 0.500947631289223
octress : -0.49975323630543117
ailing : 0.49851396794000563
corncobs : -0.49241025557615314
overstatement : 0.4784211466781818
blasphemous : -0.4756101672931148
architectural : 0.46881094901003756
pota : -0.4659927461345996
reeling : 0.460805519632763

bottom 20 least important tokens, with their coefficients: 
coselli : -1.1911651335721926e-05
delarua : -1.1911651335721926e-05
emiliano : -1.1911651335721926e-05
esperando : -1.1911651335721926e-05
menen : -1.191165133572192

### Model performance

In [45]:
# Make predictions on the testing set
predictions = model.predict(test_features)
predictions = np.clip(predictions, 0, 1)

# Evaluate the model
mse = mean_squared_error(test_ratings, predictions)
mae = mean_absolute_error(test_ratings, predictions)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)

Mean Squared Error: 0.16715439239019775
Mean Absolute Error: 0.30340951814718525


In [48]:
# Classify reviews as 'positive' (1) or 'negative' (0)
binary_predictions = np.where(predictions < 0.5, 0, 1)
binary_test_ratings = np.where(np.array(test_ratings) < 0.5, 0, 1)

conf_matrix = confusion_matrix(binary_test_ratings, binary_predictions)
conf_matrix_df = pd.DataFrame(conf_matrix, 
                      index = [ 'actual negative reviews',  'actual positive reviews'], 
                      columns = [ 'predicted negative reviews',  'predicted positive reviews'] )
conf_matrix_df

Unnamed: 0,predicted negative reviews,predicted positive reviews
actual negative reviews,8702,3798
actual positive reviews,3885,8615


In [49]:
tp = conf_matrix[1, 1]  # True Positives
fp = conf_matrix[0, 1]  # False Positives
fn = conf_matrix[1, 0]  # False Negatives
tn = conf_matrix[0, 0]  # True Negatives

precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = (2 * precision * recall) / (precision + recall)

print('Precision: ', precision)
print('Recall: ', recall)
print('F1 score:', f1_score)

Precision:  0.6940304519455409
Recall:  0.6892
F1 score: 0.6916067916348895


### Bag of Words Model removing fill words

In [53]:
# Combine fill words found during dataset analysis with default English stop words
fill_words = ['the', 'and', 'of', 'to', 'is', 'br', 'it', 'in', 'this', 'that', 'was', 'as', 
              'for', 'with', 'movie', 'but', 'film', 'you', 'on', 'he', 'are', 'his', 'have',
              'be', 'one', 'at', 'they', 'by', 'an', 'who', 'so', 'from', 'there', 'her', 'or', 
              'about', 'out', 'if', 'has', 'what', 'some', 'can', 'she', 'when', 'even', 'my', 
              'would', 'which', 'story', 'see', 'their', 'had', 'we', 'were', 'me', 'than', 
              'much', 'get', 'been', 'people', 'will', 'do', 'other', 'also', 'up', 'into', 'first',
              'all', 'no', 'just', 'how', 'because', 'then']

from sklearn.feature_extraction import text 
stop_words = list(text.ENGLISH_STOP_WORDS.union(fill_words))


In [54]:
# Create bag of words representation
vectorizer = CountVectorizer(stop_words = stop_words)
train_features = vectorizer.fit_transform(train_reviews)
test_features = vectorizer.transform(test_reviews)

# Train the model
model = LinearRegression()
model.fit(train_features, train_ratings)

print('Training finished')

Training finished


In [55]:
# Make predictions on the test set and clip to [0, 1]
predictions = model.predict(test_features)
predictions = np.clip(predictions, 0, 1)

# Evaluate the model
mse = mean_squared_error(test_ratings, predictions)
mae = mean_absolute_error(test_ratings, predictions)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)

Mean Squared Error: 0.18345109386501124
Mean Absolute Error: 0.31853475109780754


In [56]:
# Function to predict the rating of a new review
def predict_rating(review_text):
    # Preprocess the review text
    review_features = vectorizer.transform([review_text])
    
    # Make prediction
    predicted_rating = model.predict(review_features)
    
    return predicted_rating[0]

# Example usage
new_review = "This film powerfully demonstrates the struggle of two women in love in a culture so deeply entrenched in ritual and tradition. All this against a backdrop of an India which itself is struggling for freedom from these same values. This film is both political and personal and never too preachy or idealistic on either front. It is easy to see why 'Fire' has caused riots in India, but tragic nonetheless. A true film such as this one deserves to be seen by all people of the world, not just privileged westerners."
predicted_rating = predict_rating(new_review)
print("Predicted Rating:", predicted_rating)

Predicted Rating: 0.23746848485294475


### Model performance

In [57]:
# Make predictions on the testing set
predictions = model.predict(test_features)
predictions = np.clip(predictions, 0, 1)

# Evaluate the model
mse = mean_squared_error(test_ratings, predictions)
mae = mean_absolute_error(test_ratings, predictions)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)

Mean Squared Error: 0.18345109386501124
Mean Absolute Error: 0.31853475109780754


In [58]:
# Classify reviews as 'positive' (1) or 'negative' (0)
binary_predictions = np.where(predictions < 0.5, 0, 1)
binary_test_ratings = np.where(np.array(test_ratings) < 0.5, 0, 1)

conf_matrix = confusion_matrix(binary_test_ratings, binary_predictions)
conf_matrix_df = pd.DataFrame(conf_matrix, 
                      index = [ 'actual negative reviews',  'actual positive reviews'], 
                      columns = [ 'predicted negative reviews',  'predicted positive reviews'] )
conf_matrix_df

Unnamed: 0,predicted negative reviews,predicted positive reviews
actual negative reviews,8440,4060
actual positive reviews,4185,8315


In [59]:
tp = conf_matrix[1, 1]  # True Positives
fp = conf_matrix[0, 1]  # False Positives
fn = conf_matrix[1, 0]  # False Negatives
tn = conf_matrix[0, 0]  # True Negatives

precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = (2 * precision * recall) / (precision + recall)

print('Precision: ', precision)
print('Recall: ', recall)
print('F1 score:', f1_score)

Precision:  0.6719191919191919
Recall:  0.6652
F1 score: 0.6685427135678391
