In [41]:
import pandas as pd

# Provenance of the IMDb sentiment data (archive hosted at ai.stanford.edu).
# `url` is recorded for reference only: nothing in this cell downloads or reads it.
# NOTE(review): presumably imdb.csv was derived from this archive — confirm.
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
# Load the reviews from the local CSV file (relative path, so it is resolved
# against the notebook's working directory).
df = pd.read_csv("imdb.csv")

# Display the first few rows of the DataFrame as plain text
# (the recorded output shows two columns: 'review' and 'sentiment').
print(df.head())


                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [42]:
import nltk
from nltk.tokenize import word_tokenize

# Download the tokenizer data that word_tokenize needs (no-op when already present).
# Older NLTK releases load the pickled 'punkt' model, while NLTK >= 3.9 loads the
# pickle-free 'punkt_tab' package instead; fetching both keeps this cell runnable
# on a fresh environment regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

# Tokenize the raw review text: one list of tokens per review.
# Note: a later cell overwrites this column with tokens of the cleaned text.
df['tokenized_text'] = df['review'].apply(word_tokenize)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [43]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Hold out 20% of the reviews for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['review'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

# Bag-of-words counts: the vocabulary is learned from the training reviews only
# and then reused, unchanged, to encode the test reviews.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Fit the multinomial Naive Bayes model on the training counts (fit returns the estimator).
nb_classifier = MultinomialNB().fit(X_train_vectorized, y_train)

# Predict the held-out sentiments and report the fraction predicted correctly.
y_pred = nb_classifier.predict(X_test_vectorized)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.8488


In [44]:
# Define functions for data cleaning and tokenization
def clean_text(text):
    """Normalize one raw review string before tokenization.

    Lower-cases the text and replaces every run of HTML line-break tags
    (``<br>``, ``<br/>``, ``<br />``, plus any whitespace that follows them)
    with a single space. The raw reviews embed such tags
    (e.g. ``production. <br /><br />The``); left in place, the markup would
    leak into the tokenized output as noise.

    Args:
        text: The review text as a ``str``.

    Returns:
        The lower-cased text with line-break tags replaced by spaces.
    """
    import re  # local import: keeps this cell self-contained

    lowered = text.lower()
    # Lower-casing first means the pattern only has to match lower-case tags.
    return re.sub(r"(?:<br\s*/?>\s*)+", " ", lowered)

def tokenize_text(text):
    """Split ``text`` into a list of tokens.

    Thin wrapper around ``word_tokenize`` (imported from ``nltk.tokenize``
    in an earlier cell); any extra tokenization rules belong here.

    Args:
        text: The string to tokenize.

    Returns:
        Whatever ``word_tokenize(text)`` returns for the given string.
    """
    return word_tokenize(text)

# Apply data cleaning and tokenization using Pandas' apply().
# Order matters: 'cleaned_review' must exist before it is tokenized, and the
# second statement overwrites any 'tokenized_text' column created earlier.
df['cleaned_review'] = df['review'].apply(clean_text)
df['tokenized_text'] = df['cleaned_review'].apply(tokenize_text)

# TODO: the classifier trained earlier was fitted on the raw df['review'] column;
# to measure the effect of cleaning, repeat the split / vectorize / fit steps
# on df['cleaned_review'].


In [45]:
 from sklearn.metrics import classification_report

# Requires y_test and y_pred from the earlier train/evaluate cell
# (train_test_split + nb_classifier.predict); run that cell first.

# Generate the per-class precision / recall / F1 report.
# `labels` pins the class order explicitly so that each entry of `target_names`
# is guaranteed to describe the matching class, instead of relying on the
# implicit sorted order of whatever label values appear in the data.
class_report = classification_report(
    y_test,
    y_pred,
    labels=['negative', 'positive'],
    target_names=['negative', 'positive'],
)

print("Classification Report:\n", class_report)


Classification Report:
               precision    recall  f1-score   support

    negative       0.83      0.88      0.85      4961
    positive       0.87      0.82      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



In [46]:
from sklearn.metrics import confusion_matrix

# Print the raw confusion matrix for the held-out predictions.
# Reading aid: in the recorded run, each row total equals a per-class support
# from the classification report (negative first, then positive), i.e. rows
# are the true classes and columns the predicted ones.
print(
    "\nConfusion Matrix:\n",
    confusion_matrix(y_true=y_test, y_pred=y_pred),
)


Confusion Matrix:
 [[4361  600]
 [ 912 4127]]
