In [None]:
!tar -xzvf aclImdb_v1.tar.gz

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
aclImdb/train/pos/6854_10.txt
aclImdb/train/pos/6853_10.txt
aclImdb/train/pos/6852_8.txt
aclImdb/train/pos/6851_8.txt
aclImdb/train/pos/6850_7.txt
aclImdb/train/pos/6849_7.txt
aclImdb/train/pos/6848_7.txt
aclImdb/train/pos/6847_10.txt
aclImdb/train/pos/6846_10.txt
aclImdb/train/pos/6845_10.txt
aclImdb/train/pos/6844_10.txt
aclImdb/train/pos/6843_9.txt
aclImdb/train/pos/6842_7.txt
aclImdb/train/pos/6841_7.txt
aclImdb/train/pos/6840_10.txt
aclImdb/train/pos/6839_10.txt
aclImdb/train/pos/6838_7.txt
aclImdb/train/pos/6837_10.txt
aclImdb/train/pos/6836_9.txt
aclImdb/train/pos/6835_10.txt
aclImdb/train/pos/6834_10.txt
aclImdb/train/pos/6833_10.txt
aclImdb/train/pos/6832_10.txt
aclImdb/train/pos/6831_9.txt
aclImdb/train/pos/6830_10.txt
aclImdb/train/pos/6829_8.txt
aclImdb/train/pos/6828_10.txt
aclImdb/train/pos/6827_10.txt
aclImdb/train/pos/6826_9.txt
aclImdb/train/pos/6825_10.txt
aclImdb/train/pos/6824_10.txt
aclImdb/train/pos/

In [None]:
# Complete Code for MLP Model Training and Evaluation

# Import necessary libraries
import os

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

# Fetch the NLTK resources the preprocessing step needs: the stopword list and
# WordNet (run once if not already downloaded).
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Load and preprocess the dataset
def load_reviews(data_path):
    """Read labelled review files from ``data_path/pos`` and ``data_path/neg``.

    Every ``*.txt`` file under ``pos`` is labelled 1 and every ``*.txt`` file
    under ``neg`` is labelled 0; other files are ignored. Files are read in
    sorted filename order, positives first, so the returned arrays are the
    same on every run and machine. ``os.listdir`` on its own returns entries
    in arbitrary order, which would make any seeded split of the result
    irreproducible.

    Args:
        data_path: Directory containing ``pos`` and ``neg`` subdirectories.

    Returns:
        A ``(texts, labels)`` tuple of NumPy arrays of equal length: the file
        contents (decoded as UTF-8) and the matching 0/1 labels.

    Raises:
        OSError: If a subdirectory or file cannot be listed or opened
            (for example ``FileNotFoundError`` when ``pos`` is missing).
    """
    texts = []
    labels = []

    for subdir, label in (('pos', 1), ('neg', 0)):
        class_dir = os.path.join(data_path, subdir)
        # Sort so the sample order does not depend on the filesystem.
        for filename in sorted(os.listdir(class_dir)):
            if not filename.endswith('.txt'):
                continue
            with open(os.path.join(class_dir, filename), 'r', encoding='utf-8') as file:
                texts.append(file.read())
            labels.append(label)

    return np.array(texts), np.array(labels)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Preprocess text function
def preprocess_text(texts):
    """Normalise each document: drop stopwords, then stem and lemmatise.

    Each text is split on whitespace only (punctuation stays attached to its
    token). Tokens whose lowercase form is in NLTK's English stopword list are
    removed; each remaining token is Porter-stemmed and the stem is passed
    through the WordNet lemmatiser. Surviving tokens are re-joined with single
    spaces.

    Args:
        texts: Iterable of raw document strings.

    Returns:
        A list with one processed string per input document, in input order.
    """
    english_stops = set(stopwords.words('english'))
    porter = PorterStemmer()
    wordnet = WordNetLemmatizer()

    def _normalise(document):
        # Stopword filtering is case-insensitive; the token itself is passed on unchanged.
        kept = (token for token in document.split() if token.lower() not in english_stops)
        return ' '.join(wordnet.lemmatize(porter.stem(token)) for token in kept)

    return [_normalise(document) for document in texts]


In [None]:
# Load the training data
import os
import numpy as np
# Use a path relative to the working directory (where the tar archive is unpacked)
# rather than an absolute, environment-specific one.
train_texts, train_labels = load_reviews(os.path.join('aclImdb', 'train'))
train_texts_preprocessed = preprocess_text(train_texts)

In [None]:
# Hold out 20% of the preprocessed training reviews as a validation set.
# The fixed random_state keeps the shuffle repeatable for a given input order.
X_train_texts, X_val_texts, y_train_labels, y_val_labels = train_test_split(
    train_texts_preprocessed,
    train_labels,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Build TF-IDF features. The vocabulary (capped at 10,000 terms) is learned from
# the training split only, and the validation split is encoded with that same
# fitted vectorizer so no validation text influences the features.
vectorizer = TfidfVectorizer(max_features=10_000)
X_train = vectorizer.fit_transform(raw_documents=X_train_texts)
X_val = vectorizer.transform(raw_documents=X_val_texts)

In [None]:
# Load and preprocess the test data.
# Use a path relative to the working directory (where the tar archive is unpacked)
# rather than an absolute, environment-specific one.
test_texts, test_labels = load_reviews(os.path.join('aclImdb', 'test'))
test_texts_preprocessed = preprocess_text(test_texts)
# Encode with the vectorizer already fitted on the training split (transform only, no refit).
X_test = vectorizer.transform(test_texts_preprocessed)

In [None]:
# Train the MLP model: a single hidden layer of 100 ReLU units, optimised with
# Adam for at most 300 iterations, seeded for repeatable weight initialisation.
mlp_model = MLPClassifier(
    hidden_layer_sizes=(100,),
    activation='relu',
    solver='adam',
    max_iter=300,
    random_state=42,
)
mlp_model.fit(X_train, y_train_labels)

In [None]:
# Score the fitted MLP on the held-out validation split.
y_val_pred = mlp_model.predict(X_val)
val_accuracy = accuracy_score(y_true=y_val_labels, y_pred=y_val_pred)
print("Validation accuracy with MLP model:", val_accuracy)

Validation accuracy with MLP model: 0.8559983566146261


In [None]:
# Score the fitted MLP on the separate test set.
y_test_pred = mlp_model.predict(X_test)
test_accuracy = accuracy_score(y_true=test_labels, y_pred=y_test_pred)
print("Test accuracy with MLP model:", test_accuracy)


Test accuracy with MLP model: 0.82324
