In [1]:
pip install pandas nltk scikit-learn matplotlib


Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the NLTK resources used by the preprocessing step.
# Recent NLTK releases make word_tokenize look up 'punkt_tab' (older releases
# use 'punkt'), so both tokenizer packages are requested.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load the dataset.
# NOTE: this is a machine-specific absolute path; point DATA_PATH at your own
# copy of movie_review.csv before running.
DATA_PATH = r"C:\Users\hp\OneDrive\Desktop\movie_review.csv"
df = pd.read_csv(DATA_PATH)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Quick sanity pass over the loaded frame, printed one report after another:
# a preview of the first rows, the number of missing values per column, and
# the number of rows per sentiment label.
print(
    df.head(),
    df.isnull().sum(),
    df['tag'].value_counts(),
    sep='\n',
)


   fold_id cv_tag  html_id  sent_id  \
0        0  cv000    29590        0   
1        0  cv000    29590        1   
2        0  cv000    29590        2   
3        0  cv000    29590        3   
4        0  cv000    29590        4   

                                                text  tag  
0  films adapted from comic books have had plenty...  pos  
1  for starters , it was created by alan moore ( ...  pos  
2  to say moore and campbell thoroughly researche...  pos  
3  the book ( or " graphic novel , " if you will ...  pos  
4  in other words , don't dismiss this film becau...  pos  
fold_id    0
cv_tag     0
html_id    0
sent_id    0
text       0
tag        0
dtype: int64
pos    32937
neg    31783
Name: tag, dtype: int64


In [6]:
# Cache for the NLTK English stop-word set. It is filled on first use so the
# stop-word list is fetched once rather than once per processed row.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the English stop-word set, loading it from NLTK only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


# Define a function for text preprocessing
def preprocess_text(text: str) -> str:
    """Normalise one piece of text before vectorisation.

    Steps, in order:
      1. lowercase the text;
      2. drop every character that is neither alphabetic nor whitespace
         (punctuation and digits are deleted, not replaced by a space);
      3. tokenise with ``word_tokenize`` and drop English stop words.

    Args:
        text: The raw text to clean.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and numbers.
    # NOTE(review): because apostrophes are deleted here, stop-word entries
    # that contain an apostrophe can never match in the filter below.
    text = ''.join(char for char in text if char.isalpha() or char.isspace())
    # Remove stopwords (the set is loaded once and reused across calls)
    stop_words = _english_stop_words()
    return ' '.join(word for word in word_tokenize(text) if word not in stop_words)

# Clean every entry of the 'text' column element by element and store the
# result alongside the raw text in a new 'processed_text' column.
df['processed_text'] = df['text'].map(preprocess_text)


In [7]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split repeatable between runs.
# NOTE(review): the preview of df suggests rows are individual sentences that
# share an html_id, so a plain random split may place sentences from the same
# review on both sides - confirm whether a grouped split is wanted.
X_train, X_test, y_train, y_test = train_test_split(
    df['processed_text'], df['tag'], test_size=0.2, random_state=42
)


In [8]:
# Convert the cleaned text into TF-IDF features with a vocabulary capped at
# 5000 terms (max_features can be tuned). The vectorizer is fitted on the
# training split only; the test split is transformed with that fitted state.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf, X_test_tfidf = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)


In [9]:
# Train a multinomial Naive Bayes sentiment classifier on the TF-IDF training
# features. fit() hands back the estimator, so creation and training are chained.
model = MultinomialNB().fit(X_train_tfidf, y_train)
model  # keep the fitted estimator as the cell's displayed value


In [10]:
# Predict sentiment labels for the held-out TF-IDF features.
y_pred = model.predict(X_test_tfidf)

# Score the predictions against the true labels: overall accuracy, the
# confusion matrix, and the per-class precision/recall/F1 report.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Emit the three results in order, one after another.
print(
    f'Accuracy: {accuracy}',
    f'Confusion Matrix:\n{conf_matrix}',
    f'Classification Report:\n{class_report}',
    sep='\n',
)


Accuracy: 0.6634734239802225
Confusion Matrix:
[[4087 2284]
 [2072 4501]]
Classification Report:
              precision    recall  f1-score   support

         neg       0.66      0.64      0.65      6371
         pos       0.66      0.68      0.67      6573

    accuracy                           0.66     12944
   macro avg       0.66      0.66      0.66     12944
weighted avg       0.66      0.66      0.66     12944

