In [None]:
!tar -xzvf aclImdb_v1.tar.gz

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
aclImdb/train/unsup/20402_0.txt
aclImdb/train/unsup/20401_0.txt
aclImdb/train/unsup/20400_0.txt
aclImdb/train/unsup/20399_0.txt
aclImdb/train/unsup/20398_0.txt
aclImdb/train/unsup/20397_0.txt
aclImdb/train/unsup/20396_0.txt
aclImdb/train/unsup/20395_0.txt
aclImdb/train/unsup/20394_0.txt
aclImdb/train/unsup/20393_0.txt
aclImdb/train/unsup/20392_0.txt
aclImdb/train/unsup/20391_0.txt
aclImdb/train/unsup/20390_0.txt
aclImdb/train/unsup/20389_0.txt
aclImdb/train/unsup/20388_0.txt
aclImdb/train/unsup/20387_0.txt
aclImdb/train/unsup/20386_0.txt
aclImdb/train/unsup/20385_0.txt
aclImdb/train/unsup/20384_0.txt
aclImdb/train/unsup/20383_0.txt
aclImdb/train/unsup/20382_0.txt
aclImdb/train/unsup/20381_0.txt
aclImdb/train/unsup/20380_0.txt
aclImdb/train/unsup/20379_0.txt
aclImdb/train/unsup/20378_0.txt
aclImdb/train/unsup/20377_0.txt
aclImdb/train/unsup/20376_0.txt
aclImdb/train/unsup/20375_0.txt
aclImdb/train/unsup/20374_0.txt
aclImdb

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
import os

In [None]:
# Download NLTK data (run once if not already downloaded):
#   - 'stopwords' is read by stopwords.words('english') in preprocess_text
#   - 'wordnet' is the corpus used by WordNetLemmatizer in preprocess_text
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Load the raw reviews and their sentiment labels from disk
def load_reviews(data_path):
    """Read every ``.txt`` review under ``data_path/pos`` and ``data_path/neg``.

    Positive reviews come first (label 1), followed by negative reviews
    (label 0). Within each class the files are read in sorted filename order
    so that the returned arrays are identical from run to run.

    Args:
        data_path: Directory that contains the ``pos`` and ``neg`` subfolders.

    Returns:
        A ``(texts, labels)`` tuple of NumPy arrays of equal length, where
        ``texts[i]`` is the full content of a review file and ``labels[i]``
        is 1 for a positive review and 0 for a negative one.

    Raises:
        OSError: If either subfolder is missing or a file cannot be read.
    """
    texts = []
    labels = []

    for subfolder, label in (('pos', 1), ('neg', 0)):
        class_path = os.path.join(data_path, subfolder)
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # sample order (and therefore any seeded split) reproducible.
        for filename in sorted(os.listdir(class_path)):
            if not filename.endswith('.txt'):
                continue
            with open(os.path.join(class_path, filename), 'r', encoding='utf-8') as file:
                texts.append(file.read())
            labels.append(label)

    return np.array(texts), np.array(labels)

In [None]:
# Preprocess text function
def preprocess_text(texts):
    """Normalize each review: drop stop words, then stem and lemmatize.

    Every text is split on whitespace. Tokens whose lowercase form is an
    English stop word are discarded; each remaining token is passed through
    the Porter stemmer and then the WordNet lemmatizer, and the results are
    re-joined with single spaces.

    Args:
        texts: Iterable of raw review strings.

    Returns:
        A list with one processed string per input text, in the same order.
    """
    english_stops = set(stopwords.words('english'))
    porter = PorterStemmer()
    wordnet = WordNetLemmatizer()

    def _normalize(document):
        # Stop-word filtering compares the lowercased token, but the
        # original token is what gets stemmed.
        kept = (token for token in document.split() if token.lower() not in english_stops)
        return ' '.join(wordnet.lemmatize(porter.stem(token)) for token in kept)

    return [_normalize(document) for document in texts]

In [None]:
# Load the training data and run it through the text preprocessing step.
# NOTE(review): the absolute path assumes the archive was extracted under
# /content (presumably the Colab working directory) — confirm before running elsewhere.
train_texts, train_labels = load_reviews('/content/aclImdb/train')
train_texts_preprocessed = preprocess_text(train_texts)

In [None]:
# Hold out 20% of the preprocessed training reviews as a validation set.
# The fixed random_state keeps the shuffle the same on every run.
X_train_texts, X_val_texts, y_train_labels, y_val_labels = train_test_split(
    train_texts_preprocessed,
    train_labels,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Vectorize the data using TF-IDF, capped at 10,000 features.
# The vectorizer is fitted on the training split only; the validation split
# is transformed with that already-fitted vectorizer.
vectorizer = TfidfVectorizer(max_features=10000)
X_train = vectorizer.fit_transform(X_train_texts)
X_val = vectorizer.transform(X_val_texts)

In [None]:
# Load and preprocess the test data, then vectorize it with the vectorizer
# fitted on the training split (transform only, no refitting).
# NOTE(review): the absolute path assumes the archive was extracted under
# /content — confirm before running elsewhere.
test_texts, test_labels = load_reviews('/content/aclImdb/test')
test_texts_preprocessed = preprocess_text(test_texts)
X_test = vectorizer.transform(test_texts_preprocessed)

In [None]:
# Train the Naive Bayes model on the TF-IDF training matrix and its labels
# (MultinomialNB is constructed with its default settings).
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train_labels)

In [None]:
# Evaluate the model on the validation set: predict a label for every row of
# X_val and score the predictions against y_val_labels with accuracy_score.
y_val_pred = nb_model.predict(X_val)
val_accuracy = accuracy_score(y_val_labels, y_val_pred)
print("Validation accuracy with Naive Bayes model:", val_accuracy)

Validation accuracy with Naive Bayes model: 0.8664


In [None]:
# Evaluate the model on the test set: predict a label for every row of
# X_test and score the predictions against test_labels with accuracy_score.
y_test_pred = nb_model.predict(X_test)
test_accuracy = accuracy_score(test_labels, y_test_pred)
print("Test accuracy with Naive Bayes model:", test_accuracy)


Test accuracy with Naive Bayes model: 0.83448
