In [5]:
import nltk
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score, classification_report
import pandas as pd
import sentencepiece as spm
import os
from datasets import load_dataset

# Fetch the NLTK stopword corpus; this has to run before stopwords.words()
# is called further down in this section.
nltk.download('stopwords')

# Load the "imdb" dataset through the `datasets` library.
dataset = load_dataset("imdb")

# Only the 'train' split is turned into a DataFrame; the 'text' and 'label'
# columns are the ones used below.
data = pd.DataFrame(dataset['train'])

# Basic preprocessing
data['text'] = data['text'].str.lower()  # Convert text to lowercase
# The mapping is the identity for 0 and 1; any other label value would be
# turned into NaN by Series.map because it has no entry in the dict.
data['label'] = data['label'].map({1: 1, 0: 0})  # Ensure labels are binary

# Initialize stemmers (module-level instances shared by the *_stemmer helpers)
porter = PorterStemmer()
lancaster = LancasterStemmer()
snowball = SnowballStemmer(language="english")
# A set gives O(1) membership tests when filtering tokens.
stop_words = set(stopwords.words('english'))

def porter_stemmer(text):
    """Drop stop words from *text* and Porter-stem what is left.

    Tokens come from ``str.split()`` (whitespace only, punctuation stays
    attached), are compared against the module-level ``stop_words`` set
    as-is, and the stemmed survivors are re-joined with single spaces.
    """
    kept = (token for token in text.split() if token not in stop_words)
    return " ".join(porter.stem(token) for token in kept)

def lancaster_stemmer(text):
    """Drop stop words from *text* and Lancaster-stem what is left.

    Tokens come from ``str.split()`` (whitespace only, punctuation stays
    attached), are compared against the module-level ``stop_words`` set
    as-is, and the stemmed survivors are re-joined with single spaces.
    """
    kept = (token for token in text.split() if token not in stop_words)
    return " ".join(lancaster.stem(token) for token in kept)

def snowball_stemmer(text):
    """Drop stop words from *text* and Snowball-stem what is left.

    Tokens come from ``str.split()`` (whitespace only, punctuation stays
    attached), are compared against the module-level ``stop_words`` set
    as-is, and the stemmed survivors are re-joined with single spaces.
    """
    kept = (token for token in text.split() if token not in stop_words)
    return " ".join(snowball.stem(token) for token in kept)

# Add one stemmed copy of the (already lowercased) text per stemmer.
data['porter_stemmed'] = data['text'].apply(porter_stemmer)
data['lancaster_stemmed'] = data['text'].apply(lancaster_stemmer)
data['snowball_stemmed'] = data['text'].apply(snowball_stemmer)

# Dump the un-stemmed text, one review per line, to serve as the
# SentencePiece training corpus.
with open("text_data.txt", "w", encoding="utf-8") as f:
    for review in data['text']:
        f.write(review + "\n")

# Train a BPE model with a 5000-piece vocabulary on every row of `data`;
# with model_prefix='bpe' this produces bpe.model and bpe.vocab in the
# working directory (both are removed at the end of the script).
# NOTE(review): SentencePiece's trainer has a max_sentence_length option and
# presumably skips input lines longer than its default -- long reviews may be
# left out of BPE training. TODO confirm against the trainer log.
spm.SentencePieceTrainer.train(input='text_data.txt', model_prefix='bpe', vocab_size=5000, model_type='bpe')

# Load the freshly trained model; bpe_tokenizer reads this module-level object.
sp = spm.SentencePieceProcessor(model_file='bpe.model')

def bpe_tokenizer(text):
    """Split *text* into pieces with the module-level SentencePiece model.

    Returns the pieces produced by ``sp.encode_as_pieces`` joined by single
    spaces, so the result can be stored as an ordinary text column.
    """
    pieces = sp.encode_as_pieces(text)
    return " ".join(pieces)

data['bpe_tokenized'] = data['text'].apply(bpe_tokenizer)

tfidf = TfidfVectorizer(max_features=5000)

# Apply TF-IDF to original, stemmed, and BPE tokenized text.
#
# The matrices are deliberately left as SciPy sparse matrices: a dense
# 25000 x 5000 float64 array is about 1 GB, and five of them (plus the copies
# train_test_split makes) can exhaust memory. train_test_split and
# MultinomialNB both accept sparse input, and the same random_state yields the
# same row partition, so the metrics should match the dense version up to
# floating-point rounding.
#
# The single vectorizer is re-fit for every column, so after this section
# `tfidf` holds the vocabulary of the BPE column only.
#
# NOTE(review): every fit below sees all rows, i.e. it runs before the
# train/test split, so vocabulary selection and IDF weights are computed with
# the held-out rows included. Left unchanged to keep the reported numbers
# comparable; consider splitting first and calling transform() on the test rows.
# NOTE(review): TfidfVectorizer's default token_pattern keeps only runs of two
# or more word characters, so SentencePiece's word-boundary marker and
# single-character pieces are presumably dropped from the BPE column -- confirm
# whether token_pattern=r"\S+" was intended for that column.
X = tfidf.fit_transform(data['text'])
X_porter = tfidf.fit_transform(data['porter_stemmed'])
X_lancaster = tfidf.fit_transform(data['lancaster_stemmed'])
X_snowball = tfidf.fit_transform(data['snowball_stemmed'])
X_bpe = tfidf.fit_transform(data['bpe_tokenized'])

y = data['label']

# Identical test_size and random_state give every variant the same row
# partition, so the scores are computed on the same held-out reviews.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train_porter, X_test_porter, y_train_porter, y_test_porter = train_test_split(X_porter, y, test_size=0.3, random_state=42)
X_train_lancaster, X_test_lancaster, y_train_lancaster, y_test_lancaster = train_test_split(X_lancaster, y, test_size=0.3, random_state=42)
X_train_snowball, X_test_snowball, y_train_snowball, y_test_snowball = train_test_split(X_snowball, y, test_size=0.3, random_state=42)
X_train_bpe, X_test_bpe, y_train_bpe, y_test_bpe = train_test_split(X_bpe, y, test_size=0.3, random_state=42)


def _fit_and_report(label, X_tr, X_te, y_tr, y_te):
    """Fit a fresh MultinomialNB on one feature variant and print its metrics.

    Args:
        label: Human-readable variant name inserted into the printed headers.
        X_tr, X_te: Training and test feature matrices (sparse or dense).
        y_tr, y_te: Training and test labels.

    Returns:
        A ``(model, predictions)`` tuple: the fitted classifier and its
        predictions for ``X_te``.
    """
    model = MultinomialNB()
    model.fit(X_tr, y_tr)
    predictions = model.predict(X_te)
    print(f"Accuracy ({label}):", accuracy_score(y_te, predictions))
    print(f"F1 Score ({label}):", f1_score(y_te, predictions))
    print(f"Classification Report ({label}):\n", classification_report(y_te, predictions))
    return model, predictions


# Original (un-stemmed) text
nb, y_pred = _fit_and_report("Original", X_train, X_test, y_train, y_test)

# Porter Stemmed text
nb_porter, y_pred_porter = _fit_and_report(
    "Porter Stemmer", X_train_porter, X_test_porter, y_train_porter, y_test_porter)

# Lancaster Stemmed text
nb_lancaster, y_pred_lancaster = _fit_and_report(
    "Lancaster Stemmer", X_train_lancaster, X_test_lancaster, y_train_lancaster, y_test_lancaster)

# Snowball Stemmed text
nb_snowball, y_pred_snowball = _fit_and_report(
    "Snowball Stemmer", X_train_snowball, X_test_snowball, y_train_snowball, y_test_snowball)

# BPE Tokenized text
nb_bpe, y_pred_bpe = _fit_and_report(
    "BPE Tokenization", X_train_bpe, X_test_bpe, y_train_bpe, y_test_bpe)

# Cleanup temporary files written while training the SentencePiece model
os.remove("text_data.txt")
os.remove("bpe.model")
os.remove("bpe.vocab")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mayaa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Accuracy (Original): 0.852
F1 Score (Original): 0.8509266720386786
Classification Report (Original):
               precision    recall  f1-score   support

           0       0.85      0.86      0.85      3752
           1       0.86      0.85      0.85      3748

    accuracy                           0.85      7500
   macro avg       0.85      0.85      0.85      7500
weighted avg       0.85      0.85      0.85      7500

Accuracy (Porter Stemmer): 0.8473333333333334
F1 Score (Porter Stemmer): 0.8473129750633418
Classification Report (Porter Stemmer):
               precision    recall  f1-score   support

           0       0.85      0.85      0.85      3752
           1       0.85      0.85      0.85      3748

    accuracy                           0.85      7500
   macro avg       0.85      0.85      0.85      7500
weighted avg       0.85      0.85      0.85      7500

Accuracy (Lancaster Stemmer): 0.8498666666666667
F1 Score (Lancaster Stemmer): 0.8501862692921767
Classificatio