In [1]:
!pip install pandas openpyxl nltk scikit-learn




In [2]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report


# Load IMDB Dataset

In [4]:
# Load IMDb reviews from the uploaded Excel file
file_path = "/content/imdb reviews.xlsx"  # Update filename if needed
df = pd.read_excel(file_path, engine="openpyxl")

# Display first few rows to check the structure
print(df.head())


                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [5]:
# Convert sentiment labels: 'positive' -> 1, 'negative' -> 0
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})


In [6]:
nltk.download('stopwords')
nltk.download('punkt')

# Ensure stopwords are loaded properly
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


#

# Preprocess the Text Data

In [10]:
import pandas as pd
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Function for text cleaning without using NLTK
def preprocess_text(text):
    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    words = text.split()  # Simple tokenization (split by space)
    words = [word for word in words if word not in ENGLISH_STOP_WORDS]  # Remove stopwords
    return ' '.join(words)

# Load dataset (update with actual file path)
df = pd.read_excel("/content/imdb reviews.xlsx")  # Load XLSX file in Colab

# Apply preprocessing to the reviews column
df['cleaned_review'] = df['review'].apply(preprocess_text)

# Display cleaned data
print(df[['review', 'cleaned_review']].head())



                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                      cleaned_review  
0  reviewers mentioned watching just oz episode y...  
1  wonderful little production filming technique ...  
2  thought wonderful way spend time hot summer we...  
3  basically theres family little boy jake thinks...  
4  petter matteis love time money visually stunni...  


# Split Data into Training & Testing Sets

In [11]:
# 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_review'], df['sentiment'], test_size=0.2, random_state=42)


# Convert Text to Numeric Format Using TF-IDF

In [12]:
vectorizer = TfidfVectorizer(max_features=5000)  # Convert text to numerical format
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


# Train a Machine Learning Model

In [13]:
# Train Naive Bayes model
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)


# Make Predictions & Evaluate Performance

In [14]:
# Predict on test data
y_pred = model.predict(X_test_tfidf)

# Evaluate model performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.8218623481781376
Classification Report:
               precision    recall  f1-score   support

    negative       0.78      0.89      0.83       123
    positive       0.87      0.76      0.81       124

    accuracy                           0.82       247
   macro avg       0.83      0.82      0.82       247
weighted avg       0.83      0.82      0.82       247

