In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
import os
import re

In [2]:
# Directory paths
train_dir = "aclImdb/train"
test_dir = "aclImdb/test"

# Read and preprocess the movie reviews
def read_reviews(directory):
    reviews = []
    ratings = []
    for label in ["pos", "neg"]:
        label_dir = os.path.join(directory, label)
        for filename in os.listdir(label_dir):
            with open(os.path.join(label_dir, filename), "r") as file:
                review = file.read()
                rating = int(filename.split("_")[1].split(".")[0])
                rating = (rating - 1) / 9.0
                reviews.append(review)
                ratings.append(rating)
    return reviews, ratings

# Read training and testing data
train_reviews, train_ratings = read_reviews(train_dir)
test_reviews, test_ratings = read_reviews(test_dir)

In [3]:
# Create bag of words representation
vectorizer = CountVectorizer()
train_features = vectorizer.fit_transform(train_reviews)
test_features = vectorizer.transform(test_reviews)

# Train the model
model = LinearRegression()
model.fit(train_features, train_ratings)

print('Training finished')

Training finished


In [7]:
# Make predictions on the testing set
predictions = model.predict(test_features)

# Evaluate the model
mse = mean_squared_error(test_ratings, np.ones_like(test_ratings) * 0.5)
msa = mean_absolute_error(test_ratings, np.ones_like(test_ratings) * 0.5)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", msa)

Mean Squared Error: 0.15044543209876546
Mean Absolute Error: 0.36553777777777774


In [5]:
# Function to predict the rating of a new review
def predict_rating(review_text):
    # Preprocess the review text
    review_features = vectorizer.transform([review_text])
    
    # Make prediction
    predicted_rating = model.predict(review_features)
    
    return predicted_rating[0]

# Example usage
new_review = "This film powerfully demonstrates the struggle of two women in love in a culture so deeply entrenched in ritual and tradition. All this against a backdrop of an India which itself is struggling for freedom from these same values. This film is both political and personal and never too preachy or idealistic on either front. It is easy to see why 'Fire' has caused riots in India, but tragic nonetheless. A true film such as this one deserves to be seen by all people of the world, not just privileged westerners."
predicted_rating = predict_rating(new_review)
print("Predicted Rating:", predicted_rating)

Predicted Rating: 0.43882057512453565
