# Sentiment Analysis Using Term Frequency-Inverse Document Frequency (TF-IDF)

### Imported Packages

In [None]:
#Load the libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from bs4 import BeautifulSoup
import spacy
import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from textblob import TextBlob
from textblob import Word
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

import os
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the dataset from the CSV file expected in the working directory.
df=pd.read_csv('IMDB Dataset.csv')
# Print (rows, columns) as a quick sanity check on the load.
print(df.shape)
# Preview the first ten rows (shown as the cell output in a notebook).
df.head(10)

### Strip Text Entries of Stop Words and Insignificant Characters/Speech

In [None]:
# Tokenizer used by remove_stopwords to split a review into tokens.
tokenizer=ToktokTokenizer()
# NLTK's English stop-word list; remove_stopwords filters tokens against it.
stopword_list=nltk.corpus.stopwords.words('english')

#Removing html strips
def strip_html(text):
    """Return the text content of ``text`` with HTML markup removed.

    The input is parsed with BeautifulSoup's built-in "html.parser" backend
    and only the extracted text is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

#Removing square brackets
def remove_between_square_brackets(text):
    """Delete every ``[...]`` span, brackets included, from ``text``.

    A span runs from a ``[`` to the first ``]`` that follows it. An opening
    bracket that is never closed is left in place.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all bracketed spans removed.
    """
    # Raw string: in a normal literal '\[' is an invalid escape sequence,
    # which newer Python versions warn about at compile time.
    return re.sub(r'\[[^]]*\]', '', text)

#Removing noisy text
def denoise_text(text):
    """Clean ``text`` in two steps and return the result.

    HTML markup is stripped first (strip_html), then every square-bracketed
    span is removed (remove_between_square_brackets).
    """
    return remove_between_square_brackets(strip_html(text))

#Apply the HTML / square-bracket denoising to the review column
df['review']=df['review'].apply(denoise_text)

def remove_special_characters(text, remove_digits=True):
    """Strip everything except ASCII letters and whitespace from ``text``.

    Args:
        text: The string to clean.
        remove_digits: When true (the default) digits are removed as well;
            when false, the digits 0-9 are kept.

    Returns:
        The cleaned string. Whitespace is left untouched, so removing a
        character between two spaces leaves both spaces in place.
    """
    # The upper-case range must be 'A-Z'. 'A-z' also covers the ASCII
    # characters that sit between 'Z' and 'a' ([ \ ] ^ _ `), which would let
    # those symbols survive the clean-up.
    pattern = r'[^a-zA-Z\s]' if remove_digits else r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)
#Strip special characters from every review (function defaults are used)
df['review']=df['review'].apply(remove_special_characters)

#Stemming the text
def simple_stemmer(text):
    """Stem each whitespace-separated word of ``text`` with NLTK's Porter stemmer.

    The stemmed words are joined back together with single spaces, so the
    original spacing between words is not preserved.
    """
    stemmer = nltk.porter.PorterStemmer()
    return ' '.join(stemmer.stem(token) for token in text.split())

#Apply function on review column
# NOTE(review): stemming runs before the stop-word removal further down, so
# stemmed tokens may no longer match NLTK's stop-word list - confirm that this
# order is intended.
df['review']=df['review'].apply(simple_stemmer)

#Set stopwords to english
# NOTE(review): in the code visible here `stop` is only printed;
# remove_stopwords filters against `stopword_list` instead - confirm whether
# `stop` is still needed.
stop=set(stopwords.words('english'))
print(stop) # Printing all stopwords

#Removing stopwords
def remove_stopwords(text, is_lower_case=False):
    """Return ``text`` with every token found in ``stopword_list`` removed.

    The text is split with the module-level ``tokenizer``, each token is
    stripped of surrounding whitespace, and the surviving tokens are joined
    back together with single spaces.

    Args:
        text: The string to filter.
        is_lower_case: When true, tokens are compared to the stop-word list
            exactly as they appear. When false (the default), each token is
            lower-cased for the comparison only; kept tokens retain their
            original casing.

    Returns:
        The filtered text as a single space-separated string.
    """
    # Build the lookup set once per call: membership tests on a set are O(1),
    # whereas testing every token against the list scans it each time.
    stopword_set = set(stopword_list)
    tokens = (token.strip() for token in tokenizer.tokenize(text))
    if is_lower_case:
        kept_tokens = [token for token in tokens if token not in stopword_set]
    else:
        kept_tokens = [token for token in tokens if token.lower() not in stopword_set]
    return ' '.join(kept_tokens)

#Apply function on review column
# Called with the default is_lower_case=False, so tokens are lower-cased
# before being compared with the stop-word list.
df['review']=df['review'].apply(remove_stopwords)

In [None]:
#Labeling the sentiment data
lb=LabelBinarizer()


#Transformed sentiment data
df['sentiment'] = lb.fit_transform(df['sentiment'])

x = df['review']
y = df['sentiment']

# Hold out 30% of the reviews for testing; random_state fixes the shuffle so
# the split is reproducible.
x_train_text,x_test_text,y_train,y_test = train_test_split(x,y,test_size=0.3,random_state=50)

# The classifiers fitted below need numeric features, not raw strings, so each
# review is turned into a TF-IDF vector. The vocabulary and IDF weights are
# learned from the training reviews only, so no statistics from the test split
# leak into the features.
tfidf=TfidfVectorizer()
x_train = tfidf.fit_transform(x_train_text)
x_test = tfidf.transform(x_test_text)

In [None]:
# x_train[14] -> training dataset example output

In [None]:
# Creating Logistic Regression Model
lr=LogisticRegression()

# Fit on the training split only. Re-fitting on the test split would throw
# away what was learned from the training data and make the scores computed
# from y_pred measure memorisation of the test set instead of generalisation.
# NOTE(review): LogisticRegression needs numeric features - x_train and x_test
# must already be vectorised when this cell runs.
train = lr.fit(x_train,y_train)

# Predict labels for the held-out reviews.
y_pred=lr.predict(x_test)

In [None]:
# Calculating accuracy of logistic regression model
# Fraction of test reviews whose predicted label equals the true label.
lr_score=accuracy_score(y_test,y_pred)
print("lr_score :",lr_score)

In [None]:
# Printing report of logistic regression model results
# classification_report builds a per-class text summary from the true and predicted labels.
report = classification_report(y_test,y_pred)
print(report)

In [None]:
#word cloud for positive review words
plt.figure(figsize=(10,10))
# The reviews are held in `df`; no `data` object is defined in the code
# visible here, so the original lookup raised a NameError.
# NOTE(review): the review at index 1 is assumed to be a positive one -
# confirm against the dataset.
positive_text=df['review'][1]
WC=WordCloud(width=1000,height=500,max_words=500,min_font_size=5)
positive_words=WC.generate(positive_text)
plt.imshow(positive_words,interpolation='bilinear')
# plt.show must be called; the bare name only references the function.
plt.show()

In [None]:
# Comparing logistic regression model with multinomial Naive Bayes model
mnb=MultinomialNB()

# Fit on the training split; fit() returns the estimator, so `mnb` is rebound to it.
# NOTE(review): MultinomialNB needs non-negative numeric features - confirm
# that x_train and x_test are vectorised before this cell runs.
mnb=mnb.fit(x_train,y_train)

# Predict labels for the held-out reviews.
mnb_y_pred=mnb.predict(x_test)

In [None]:
# Calculating accuracy of Naive Bayes model
# Fraction of test reviews whose predicted label equals the true label.
mnb_score=accuracy_score(y_test,mnb_y_pred)
print("mnb_score :",mnb_score)

In [None]:
# Printing report of Naive Bayes model results
# Rebinds `report`, replacing the logistic regression report built earlier.
report = classification_report(y_test,mnb_y_pred)
print(report)