# TASK 2 Text Sentiment Analysis

In [29]:
import pandas as pd
import regex
import numpy as np

import nltk
nltk.download("stopwords")
nltk.download("punkt")

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB,GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Haseeb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Haseeb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Load dataset
data = pd.read_csv("IMDB Dataset.csv")
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
data['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [6]:
data.isnull().sum()

review       0
sentiment    0
dtype: int64

In [7]:
data.shape

(50000, 2)

In [9]:
def cleanText(text):
    # Remove HTML tags
    text = regex.sub(r"<[^<]+?>", "", text)
    
    # Remove Special chars
    text = regex.sub(r"[^a-zA-Z0-9\s]", "", text)
    
    # Convet to LowerCase
    text = text.lower()
    
    return text

# Applying the function to data
data["review"] = data["review"].apply(cleanText)
data.head(10)

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive
5,probably my alltime favorite movie a story of ...,positive
6,i sure would like to see a resurrection of a u...,positive
7,this show was an amazing fresh innovative ide...,negative
8,encouraged by the positive comments about this...,negative
9,if you like original gut wrenching laughter yo...,positive


In [15]:
# Get list of stopwords
stopword_list = stopwords.words('english')

# Tokenize and Remove StopWords
def remove_stopwords(text):
    tokens = [token.strip() for token in word_tokenize(text)]
    filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)

    return filtered_text

# Applying the function to stemm words
data['review'] = data['review'].apply(simple_stemmer)
data.head(10)

Unnamed: 0,review,sentiment
0,one of the other review ha mention that after ...,positive
1,a wonder littl product the film techniqu is ve...,positive
2,i thought thi wa a wonder way to spend time on...,positive
3,basic there a famili where a littl boy jake th...,negative
4,petter mattei love in the time of money is a v...,positive
5,probabl my alltim favorit movi a stori of self...,positive
6,i sure would like to see a resurrect of a up d...,positive
7,thi show wa an amaz fresh innov idea in the 70...,negative
8,encourag by the posit comment about thi film o...,negative
9,if you like origin gut wrench laughter you wil...,positive


In [14]:
# PorterStemmer to Convert the words to base form
def simple_stemmer(text):
    ps = PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])
    return text

# Applying the function to stemm words
data['review'] = data['review'].apply(simple_stemmer)
data.head(10)

Unnamed: 0,review,sentiment
0,one of the other review ha mention that after ...,positive
1,a wonder littl product the film techniqu is ve...,positive
2,i thought thi wa a wonder way to spend time on...,positive
3,basic there a famili where a littl boy jake th...,negative
4,petter mattei love in the time of money is a v...,positive
5,probabl my alltim favorit movi a stori of self...,positive
6,i sure would like to see a resurrect of a up d...,positive
7,thi show wa an amaz fresh innov idea in the 70...,negative
8,encourag by the posit comment about thi film o...,negative
9,if you like origin gut wrench laughter you wil...,positive


# Vectorizing Text

In [16]:
X = data['review']
y = data['sentiment']

In [None]:
# Initializing the TF-IDFvectorizer
vect = TfidfVectorizer(ngram_range=(1, 3))

# Vectorizing the text using TF-IDF
X = vect.fit_transform(X)
X

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 27378179 stored elements and shape (50000, 8779085)>

In [23]:
# spltting training  and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y)

# Training Model

In [30]:
def classifier_testing(clf, X_train, X_test, y_train, y_test):
    # Training the Classifier
    clf.fit(X_train, y_train)

    # Getting Predictions
    y_pred = clf.predict(X_test)

    # Accuracy Score
    clf_accuracy_score = accuracy_score(y_test, y_pred)
    print("Accuracy Score:\n", clf_accuracy_score, "\n")

    # Classification Report
    class_rep = classification_report(y_test, y_pred)
    print("Classification Report:\n", class_rep, "\n")

    # Confusion Matrix
    conf_mtx = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", conf_mtx, "\n")

# Evaluation

In [31]:
# Initializing NaiveBayes-MultinomialNB Classifier
MNB = MultinomialNB(fit_prior=False, alpha=1.3)

# Training / Testing
classifier_testing(MNB, X_train, X_test, y_train, y_test)

Accuracy Score:
 0.8884 

Classification Report:
               precision    recall  f1-score   support

    negative       0.88      0.90      0.89      5000
    positive       0.90      0.88      0.89      5000

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000
 

Confusion Matrix:
 [[4498  502]
 [ 614 4386]] 

