In [182]:
import nltk
import numpy as np
import requests
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [183]:
def preprocess_text(text):
    """Lower-case ``text`` and remove English stop words.

    The lower-cased text is split on whitespace and every token that appears
    in NLTK's English stop-word list is dropped. Tokens are compared as-is,
    so punctuation attached to a word stays part of that token.

    Args:
        text: The string to clean.

    Returns:
        The surviving tokens joined by single spaces.
    """
    english_stop_words = frozenset(stopwords.words('english'))
    kept_tokens = filter(
        lambda token: token not in english_stop_words,
        text.lower().split(),
    )
    return ' '.join(kept_tokens)

In [184]:
# Download the reviews page and parse it into a BeautifulSoup tree.
url = 'https://www.imdb.com/title/tt1745960/reviews/?ref_=tt_ql_2'
# Without a timeout requests can wait indefinitely on an unresponsive server.
response = requests.get(url, timeout=30)
# Fail here on an HTTP error status instead of parsing an error page and
# silently ending up with zero reviews further down.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

In [185]:
# Two separate, initially empty lists: review texts and their labels.
reviews, labels = [], []

In [186]:
# Start from empty lists so that re-running this cell rebuilds the data
# instead of appending a second copy of every review.
reviews = []
labels = []

for review in soup.find_all(class_='review-container'):
    text_elem = review.find(class_='text show-more__control')
    if text_elem is None:
        # find() returns None when the container has no text element; skip the
        # container entirely so reviews and labels stay the same length.
        continue
    text = text_elem.get_text(strip=True)

    # NOTE(review): the label is the text of the 'spoiler-warning' element
    # (or 'N/A' when it is absent), so this looks like a spoiler flag rather
    # than a sentiment rating -- confirm this is the intended target.
    sentiment_elem = review.find(class_='spoiler-warning')
    if sentiment_elem is not None:
        sentiment = sentiment_elem.get_text(strip=True)
    else:
        sentiment = 'N/A'

    reviews.append(text)
    labels.append(sentiment)

In [187]:
# Clean the scraped reviews and turn the labels into an array, provided the
# scrape actually produced data. train_test_split is imported in the imports
# cell rather than inside this branch, so it is defined regardless of which
# branch runs.
if len(reviews) == 0 or len(labels) == 0:
    print("Error: No reviews or labels found.")
else:
    # Preprocess the reviews
    reviews = [preprocess_text(review) for review in reviews]

    # Convert labels to numpy array
    labels = np.array(labels)


In [188]:
 # Hold out 20% of the reviews for testing; the fixed random_state keeps the
 # split the same from run to run.
 X_train, X_test, y_train, y_test = train_test_split(
     reviews,
     labels,
     test_size=0.2,
     random_state=42,
 )

In [189]:
# English stop-word list passed to the vectorizer.
stop_words = stopwords.words('english')

# TF-IDF features, with max_features limiting the vocabulary to 5000 terms.
vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words=stop_words,
)

# The vectorizer is fitted on the training reviews only; the test reviews are
# transformed with that already-fitted vocabulary.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [190]:
# Multinomial Naive Bayes: train on the TF-IDF features and predict the test
# set. Predictions are an empty list when either split has no samples.
nb_model = MultinomialNB()
if len(X_train) == 0 or len(X_test) == 0:
    nb_preds = []
else:
    nb_preds = nb_model.fit(X_train_vec, y_train).predict(X_test_vec)

In [191]:
# Support vector classifier with default settings: train on the TF-IDF
# features and predict the test set. Predictions are an empty list when either
# split has no samples.
svm_model = SVC()
if len(X_train) == 0 or len(X_test) == 0:
    svm_preds = []
else:
    svm_preds = svm_model.fit(X_train_vec, y_train).predict(X_test_vec)

In [192]:
# Logistic regression with default settings: train on the TF-IDF features and
# predict the test set. Predictions are an empty list when either split has no
# samples.
lr_model = LogisticRegression()
if len(X_train) == 0 or len(X_test) == 0:
    lr_preds = []
else:
    lr_preds = lr_model.fit(X_train_vec, y_train).predict(X_test_vec)

In [193]:
def _score_predictions(y_true, y_pred):
    """Return ``(accuracy, precision, recall)`` for one set of predictions.

    Precision and recall are class-weighted averages. ``zero_division=0``
    scores a class that has no predicted (or no true) samples as 0, which is
    the value scikit-learn falls back to by default, but without emitting an
    undefined-metric warning for every call.

    Args:
        y_true: The true labels of the test set.
        y_pred: The labels predicted by a model for the same samples.

    Returns:
        A 3-tuple ``(accuracy, precision, recall)``.
    """
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, average='weighted', zero_division=0),
        recall_score(y_true, y_pred, average='weighted', zero_division=0),
    )


if len(X_train) > 0 and len(X_test) > 0:
    nb_accuracy, nb_precision, nb_recall = _score_predictions(y_test, nb_preds)
    svm_accuracy, svm_precision, svm_recall = _score_predictions(y_test, svm_preds)
    lr_accuracy, lr_precision, lr_recall = _score_predictions(y_test, lr_preds)

    # Print evaluation results, one line per model/metric pair.
    evaluation = (
        ("Naive Bayes", (nb_accuracy, nb_precision, nb_recall)),
        ("SVM", (svm_accuracy, svm_precision, svm_recall)),
        ("Logistic Regression", (lr_accuracy, lr_precision, lr_recall)),
    )
    for model_name, scores in evaluation:
        for metric_name, score in zip(("Accuracy", "Precision", "Recall"), scores):
            print("{} {}: ".format(model_name, metric_name), score)
else:
    print("Error: Empty train or test set.")

Naive Bayes Accuracy:  0.8
Naive Bayes Precision:  0.64
Naive Bayes Recall:  0.8
SVM Accuracy:  0.8
SVM Precision:  0.64
SVM Recall:  0.8
Logistic Regression Accuracy:  0.8
Logistic Regression Precision:  0.64
Logistic Regression Recall:  0.8


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
