In [1]:
from datasets import load_dataset
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import re
import nltk
from nltk.corpus import stopwords

In [2]:
# Load the IMDB dataset
ds = load_dataset("stanfordnlp/imdb")

In [3]:
# Convert to pandas DataFrame
df_train = pd.DataFrame(ds['train'])
df_test = pd.DataFrame(ds['test'])

In [4]:
# Combine train and test data for processing
df = pd.concat([df_train, df_test])


In [5]:
# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jorgen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Preprocess text: remove punctuation, capitalization, and stop words
def preprocess_text(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = text.lower()  # Convert to lowercase
    text = ' '.join([word for word in text.split() if word not in stop_words])  # Remove stop words
    return text

df['text'] = df['text'].apply(preprocess_text)

In [7]:
# Split back into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(df['text'], df['label'], test_size=0.5, random_state=42)

In [8]:
# TF-IDF
vectorizer_tfidf = TfidfVectorizer()
X_train_tfidf = vectorizer_tfidf.fit_transform(train_texts)
X_test_tfidf = vectorizer_tfidf.transform(test_texts)

In [9]:
# Logistic Regression with TF-IDF
model_tfidf = LogisticRegression(max_iter=1000)
model_tfidf.fit(X_train_tfidf, train_labels)
predictions_tfidf = model_tfidf.predict(X_test_tfidf)

In [10]:
accuracy_tfidf = accuracy_score(test_labels, predictions_tfidf)
report_tfidf = classification_report(test_labels, predictions_tfidf)

print(f"TF-IDF Accuracy: {accuracy_tfidf}")
print(f"TF-IDF Classification Report:\n{report_tfidf}")

TF-IDF Accuracy: 0.89044
TF-IDF Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.87      0.89     12603
           1       0.88      0.91      0.89     12397

    accuracy                           0.89     25000
   macro avg       0.89      0.89      0.89     25000
weighted avg       0.89      0.89      0.89     25000



### Reflection:

- **How well does this model perform?**
- The model performs well with an accuracy of 89%. Both precision and recall are high for both positive and negative classes, resulting in a balanced F1-score.
- - **Context Ignorance:** TF-IDF, while effective, does not consider the context or order of words. More advanced models like BERT can capture semantic meaning better.
- **Simplicity of Model:** Logistic Regression is a simple model. Exploring more complex models like neural networks might improve performance further.