In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline

In [48]:
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Predator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Predator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Predator\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Predator\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [50]:
# Step 1: Load the dataset
data = pd.read_csv(r"C:\Users\Predator\Downloads\IMDB Dataset.csv", encoding='utf-8')

In [52]:
data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


### Text Preprocessing

In [54]:
stopwords.words('nepali')

['छ',
 'र',
 'पनि',
 'छन्',
 'लागि',
 'भएको',
 'गरेको',
 'भने',
 'गर्न',
 'गर्ने',
 'हो',
 'तथा',
 'यो',
 'रहेको',
 'उनले',
 'थियो',
 'हुने',
 'गरेका',
 'थिए',
 'गर्दै',
 'तर',
 'नै',
 'को',
 'मा',
 'हुन्',
 'भन्ने',
 'हुन',
 'गरी',
 'त',
 'हुन्छ',
 'अब',
 'के',
 'रहेका',
 'गरेर',
 'छैन',
 'दिए',
 'भए',
 'यस',
 'ले',
 'गर्नु',
 'औं',
 'सो',
 'त्यो',
 'कि',
 'जुन',
 'यी',
 'का',
 'गरि',
 'ती',
 'न',
 'छु',
 'छौं',
 'लाई',
 'नि',
 'उप',
 'अक्सर',
 'आदि',
 'कसरी',
 'क्रमशः',
 'चाले',
 'अगाडी',
 'अझै',
 'अनुसार',
 'अन्तर्गत',
 'अन्य',
 'अन्यत्र',
 'अन्यथा',
 'अरु',
 'अरुलाई',
 'अर्को',
 'अर्थात',
 'अर्थात्',
 'अलग',
 'आए',
 'आजको',
 'ओठ',
 'आत्म',
 'आफू',
 'आफूलाई',
 'आफ्नै',
 'आफ्नो',
 'आयो',
 'उदाहरण',
 'उनको',
 'उहालाई',
 'एउटै',
 'एक',
 'एकदम',
 'कतै',
 'कम से कम',
 'कसै',
 'कसैले',
 'कहाँबाट',
 'कहिलेकाहीं',
 'का',
 'किन',
 'किनभने',
 'कुनै',
 'कुरा',
 'कृपया',
 'केही',
 'कोही',
 'गए',
 'गरौं',
 'गर्छ',
 'गर्छु',
 'गर्नुपर्छ',
 'गयौ',
 'गैर',
 'चार',
 'चाहनुहुन्छ',
 'चाहन्छु',
 'चाहिए

In [56]:
# Step 2: Text Preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [58]:
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    # Remove stopwords and lemmatize
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(tokens)

In [65]:
# Apply preprocessing to the 'review' column
data['processed_review'] = data['review'].apply(preprocess_text)

In [69]:
data

Unnamed: 0,review,sentiment,processed_review
0,One of the other reviewers has mentioned that ...,positive,one reviewer mentioned watching oz episode you...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production br br filming tech...
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,basically there family little boy jake think t...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love time money visually stunni...
...,...,...,...
49995,I thought this movie did a down right good job...,positive,thought movie right good job wasnt creative or...
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative,bad plot bad dialogue bad acting idiotic direc...
49997,I am a Catholic taught in parochial elementary...,negative,catholic taught parochial elementary school nu...
49998,I'm going to have to disagree with the previou...,negative,im going disagree previous comment side maltin...


In [73]:
# Step 3: Split the dataset into training and testing sets
X = data['processed_review']
y = data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [77]:
help(TfidfVectorizer)

Help on class TfidfVectorizer in module sklearn.feature_extraction.text:

class TfidfVectorizer(CountVectorizer)
 |  TfidfVectorizer(*, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, tokenizer=None, analyzer='word', stop_words=None, token_pattern='(?u)\\b\\w\\w+\\b', ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False, dtype=<class 'numpy.float64'>, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)
 |
 |  Convert a collection of raw documents to a matrix of TF-IDF features.
 |
 |  Equivalent to :class:`CountVectorizer` followed by
 |  :class:`TfidfTransformer`.
 |
 |  For an example of usage, see
 |  :ref:`sphx_glr_auto_examples_text_plot_document_classification_20newsgroups.py`.
 |
 |  For an efficiency comparison of the different feature extractors, see
 |  :ref:`sphx_glr_auto_examples_text_plot_hashing_vs_dict_vectorizer.py`.
 |
 |  For an example of document clusteri

In [79]:
# Step 4: Create a pipeline with TfidfVectorizer and Naive Bayes
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2), max_features=5000, min_df=5, max_df=0.7)),
    ('classifier', MultinomialNB())
])

In [83]:
# Step 5: Train the model
pipeline.fit(X_train, y_train)

In [85]:
# Step 6: Make predictions on the test set
y_pred = pipeline.predict(X_test)

In [89]:
# Step 7: Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.8565

Confusion Matrix:
[[4190  771]
 [ 664 4375]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.86      0.84      0.85      4961
    positive       0.85      0.87      0.86      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



In [101]:
# Step 8: Test with new reviews
# new_reviews = [
#     "This movie was excellent! I loved it.",
#     "Terrible film. I hated every minute of it.",
#     "An average movie, nothing special."
# ]
new_reviews = [
    "Movie is very good but i don't like the movie. Although, concept is good! but the scences are not good enough",
    "I am unable to understand the concept behind the movie but i like the scence very well everything was planned very nicely but concept" 
]

In [103]:
# Preprocess new reviews
processed_new_reviews = [preprocess_text(review) for review in new_reviews]

# Make predictions
new_predictions = pipeline.predict(processed_new_reviews)

for review, sentiment in zip(new_reviews, new_predictions):
    print(f"Review: {review}")
    print(f"Predicted Sentiment: {sentiment}\n")

Review: Movie is very good but i don't like the movie. Although, concept is good! but the scences are not good enough
Predicted Sentiment: negative

Review: I am unable to understand the concept behind the movie but i like the scence very well everything was planned very nicely but concept
Predicted Sentiment: positive

