In [1]:
import pandas as pd

In [2]:
import glob

# Collect every review file from both the original test and train folders.
# glob.glob() returns matches in arbitrary, filesystem-dependent order, so each
# listing is sorted: the seeded train/test split further down only reproduces
# the same partition when the input order is stable across machines and runs.
pos_files = sorted(glob.glob('aclImdb/test/pos/*.txt')) + sorted(glob.glob('aclImdb/train/pos/*.txt'))
neg_files = sorted(glob.glob('aclImdb/test/neg/*.txt')) + sorted(glob.glob('aclImdb/train/neg/*.txt'))

In [3]:
# Parallel accumulators: reviews[i] is the raw text, labels[i] its sentiment.
reviews, labels = [], []


In [4]:
# Read every review file once, pairing each file list with the sentiment label
# it stands for; positive files are loaded before negative ones.
for paths, sentiment in ((pos_files, 'positive'), (neg_files, 'negative')):
    for path in paths:
        with open(path, 'r', encoding='utf-8') as handle:
            text = handle.read()
            reviews.append(text)
            labels.append(sentiment)


In [5]:
# Column-oriented mapping: one 'review' column of texts, one 'label' column.
data = dict(review=reviews, label=labels)

In [6]:
# Build the frame from the column mapping (keys become column names).
df = pd.DataFrame.from_dict(data)


In [7]:
# Preview the first rows of the frame (review text and its label).
df.head()


Unnamed: 0,review,label
0,I went and saw this movie last night after bei...,positive
1,Actor turned director Bill Paxton follows up h...,positive
2,As a recreational golfer with some knowledge o...,positive
3,"I saw this film in a sneak preview, and it is ...",positive
4,Bill Paxton has taken the true story of the 19...,positive


In [8]:
# Summary of both columns. The saved output reports fewer unique reviews than
# rows, i.e. some review texts occur more than once in the combined corpus.
df.describe()


Unnamed: 0,review,label
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [9]:
# Count rows per label to check how the two classes are balanced.
df['label'].value_counts()


positive    25000
negative    25000
Name: label, dtype: int64

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for evaluation; the fixed seed keeps the
# partition repeatable for a given input order.
X_train, X_test, y_train, y_test = train_test_split(
    reviews,
    labels,
    test_size=0.2,
    random_state=42,
)

In [11]:
!pip install nltk



In [12]:
import nltk
# Fetch the NLTK data packages named 'punkt', 'stopwords' and 'wordnet'.
# NOTE(review): presumably these back word_tokenize, the stopword list and the
# WordNet lemmatizer used in the preprocessing cell — confirm against NLTK docs.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mathl\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mathl\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mathl\AppData\Roaming\nltk_data...


True

In [16]:
import nltk
# Fetch the 'omw-1.4' NLTK data package.
# NOTE(review): presumably required by the WordNet lemmatizer in some NLTK
# releases — confirm for the installed version.
nltk.download('omw-1.4')


[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\mathl\AppData\Roaming\nltk_data...


True

In [23]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# Make sure the data packages used below are present before preprocessing
for resource_name in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource_name)

# Characters treated as punctuation (a plain string of single characters).
punctuation = string.punctuation

# English stopwords as a set for fast membership tests. Bound to `stop_words`
# on purpose: assigning the set back to the name `stopwords` would replace the
# imported corpus reader, and re-running this cell would then fail on
# `stopwords.words(...)` because a set has no `.words` attribute.
stop_words = set(stopwords.words('english'))

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()


def preprocess_text(text):
    """Normalize one raw review into a space-separated string of lemmas.

    Steps, in order: tokenize with ``word_tokenize``, drop tokens that consist
    only of punctuation characters, lowercase, drop English stopwords,
    lemmatize each remaining token with the module-level ``lemmatizer``, and
    join the result with single spaces.

    Args:
        text: The raw review text.

    Returns:
        The cleaned text as a single string (empty if no token survives).
    """
    tokens = word_tokenize(text)

    # Test each character of the token individually. `token not in punctuation`
    # against a string is a substring test, so a multi-character punctuation
    # token (for example '...') is not a substring of string.punctuation and
    # would be kept.
    tokens = [
        token.lower()
        for token in tokens
        if not all(char in punctuation for char in token)
    ]

    # Stopword filtering happens after lowercasing so capitalized stopwords
    # are matched as well.
    tokens = [token for token in tokens if token not in stop_words]

    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return ' '.join(tokens)

# Clean every training review with preprocess_text, keeping the original order
preprocessed_reviews = list(map(preprocess_text, X_train))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mathl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mathl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mathl\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [24]:
from sklearn.feature_extraction.text import CountVectorizer


In [25]:
# Token-count vectorizer with default settings.
# NOTE(review): `vectorizer` is rebound to a TfidfVectorizer a few cells below,
# so this instance is not the one the final model is trained with.
vectorizer = CountVectorizer()


In [26]:
# Fit the vectorizer on the preprocessed training reviews and build the
# training matrix in one step.
# NOTE(review): X_train_vectorized is assigned again by a later fit_transform
# call, so this result is overwritten before the classifier is trained.
X_train_vectorized = vectorizer.fit_transform(preprocessed_reviews)


In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [28]:
# TF-IDF vectorizer with default settings; rebinds `vectorizer`, replacing the
# CountVectorizer created in an earlier cell.
vectorizer = TfidfVectorizer()


In [29]:
# Fit on the preprocessed training reviews only and build the training matrix;
# the test reviews are later passed through transform() without refitting.
X_train_vectorized = vectorizer.fit_transform(preprocessed_reviews)


In [30]:
from sklearn.naive_bayes import MultinomialNB


In [31]:
# Multinomial naive Bayes classifier with default parameters.
classifier = MultinomialNB()


In [32]:
# Train on the vectorized training reviews and their labels.
classifier.fit(X_train_vectorized, y_train)


In [34]:
# Push the held-out reviews through the same cleaning and the already-fitted
# vectorizer (transform only, no refit), then predict their labels.
X_test_preprocessed = list(map(preprocess_text, X_test))
X_test_vectorized = vectorizer.transform(X_test_preprocessed)
predictions = classifier.predict(X_test_vectorized)


In [35]:
from sklearn.metrics import accuracy_score
# Accuracy of the predictions against the true held-out labels.
# NOTE(review): the same value is computed again in the later metrics cell.
accuracy = accuracy_score(y_test, predictions)


In [36]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [37]:
# Predict labels for the vectorized test reviews.
# NOTE(review): repeats the predict() call already made in an earlier cell.
predictions = classifier.predict(X_test_vectorized)


In [38]:
# Accuracy takes no averaging argument; the other three scores are each
# computed with average='weighted' on the same labels and predictions.
accuracy = accuracy_score(y_test, predictions)
precision, recall, f1 = (
    metric(y_test, predictions, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)


In [39]:
# Report each score on its own line as "<caption> <value>".
for caption, score in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-score:", f1),
):
    print(caption, score)


Accuracy: 0.8633
Precision: 0.8637983603899607
Recall: 0.8633
F1-score: 0.8632660961375841
