<a href="https://colab.research.google.com/github/GammaKing2000/Sentiment-Analysis-ML-Project/blob/main/Sentiment_Analysis_Classifier_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Gathering
Link to the Kaggle dataset used in the following model: https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Drive Link for the same dataset: https://drive.google.com/file/d/1kTGOEWWEFFlFWySbPRu10I51MOJTsDZR/view?usp=sharing

In [None]:
import pandas as pd
# Read the IMDB review CSV from the Google Drive folder under /content/drive
# (the path assumes Drive is already mounted in the Colab session).
df = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/DATASET/IMDB Dataset.csv')
df  # last expression of the cell: render the loaded frame

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


## PREPROCESSING

In [None]:
#Importing some important libraries
import nltk
import string
import re

In [None]:
#Lowercasing the texts of review column
def text_lowercase(text):
    """Return ``text`` with all cased characters converted to lowercase."""
    lowered = text.lower()
    return lowered

#Alternate method for text lowercase
#df['review'] = df['review'].str.lower()


#Converting the numbers if any present in the review column into words
import inflect
p = inflect.engine()

def convert_number(text):
    """Spell out digit-only tokens of ``text`` as words.

    The text is split on whitespace. Every token for which ``str.isdigit()``
    is true is replaced by ``p.number_to_words(token)``; all other tokens are
    kept unchanged. The tokens are then re-joined with single spaces.
    """
    converted = [
        p.number_to_words(token) if token.isdigit() else token
        for token in text.split()
    ]
    return ' '.join(converted)


#Removing Punctuation
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted.

    Characters are removed, not replaced by spaces, so words separated only
    by punctuation end up joined together.
    """
    # Mapping a code point to None tells str.translate to drop that character.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)


#Removing Whitespaces
def remove_whitespace(text):
    """Collapse each run of whitespace in ``text`` to one space and trim both ends."""
    words = text.split()
    return " ".join(words)


#Removing Stopwords
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')
# Take 'no' and 'not' out of the stop-word list so they are kept in the text.
stop_words.remove('no')
stop_words.remove('not')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
def remove_stopwords(text):
    """Return ``text`` without the tokens listed in ``stop_words``.

    The text is tokenized with ``word_tokenize``; tokens that appear in the
    module-level ``stop_words`` list are dropped and the remaining tokens are
    joined with single spaces. Matching is exact and case-sensitive.
    """
    # Membership tests on a list scan the whole list for every token; a set
    # gives the same answer with a hash lookup. It is rebuilt per call so any
    # later change to ``stop_words`` is still honoured.
    excluded = set(stop_words)
    kept = [word for word in word_tokenize(text) if word not in excluded]
    return ' '.join(kept)


#Tokenization
#Stemming
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
nltk.download('punkt')
stemmer = PorterStemmer()

def stem_words(text):
    """Return ``text`` with each token replaced by its Porter stem.

    The text is tokenized with ``word_tokenize``, each token is passed through
    ``stemmer.stem``, and the stems are joined with single spaces.
    """
    return ' '.join(stemmer.stem(token) for token in word_tokenize(text))


#Lemmatizing
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

def lemmatize_word(text):
    """Return ``text`` with each token lemmatized as a verb.

    The text is tokenized with ``word_tokenize``, each token is passed to
    ``lemmatizer.lemmatize`` with ``pos='v'``, and the results are joined with
    single spaces.
    """
    return ' '.join(
        lemmatizer.lemmatize(token, pos='v') for token in word_tokenize(text)
    )

[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
# Clean the review column in place, one pass per step, in this exact order.
# NOTE(review): stop words are filtered only after stemming and lemmatizing,
# so stemmed forms that no longer match the list (the df.head() output shows
# "thi", "wa", "hi") are kept -- confirm this order is intended. Any change
# here must be mirrored wherever single texts are preprocessed for prediction.
df.review = (
    df.review
    .apply(text_lowercase)
    .apply(remove_punctuation)
    .apply(convert_number)
    .apply(stem_words)
    .apply(lemmatize_word)
    .apply(remove_stopwords)
)

In [None]:
# Preview the first five rows of the preprocessed frame.
df.head(n=5)

Unnamed: 0,review,sentiment
0,one review ha mention watch one oz episod youl...,positive
1,wonder littl product br br film techniqu veri ...,positive
2,think thi wa wonder way spend time hot summer ...,positive
3,basic famili littl boy jake think zombi hi clo...,negative
4,petter mattei love time money visual stun film...,positive


## Vectorization ( TFIDF Vectorizer ) and Creating a Model using Classification Algorithm

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Features are the preprocessed review strings; targets are the sentiment labels.
x = df['review'].values
y = df['sentiment'].values

# Hold out part of the data using train_test_split's default split size;
# the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

# Bundle the TF-IDF vectorizer and the multinomial Naive Bayes classifier so
# the fitted object accepts raw strings for both fit and predict.
text_model = Pipeline(
    steps=[
        ('TFIDF', TfidfVectorizer()),
        ('model', MultinomialNB()),
    ]
)

text_model.fit(x_train, y_train)

Pipeline(memory=None,
         steps=[('TFIDF',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('model',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [None]:
text = "Resurgence of old familiar sounding brands with just the same products. not good at all. It's like a cheap facsimilie to the current cheap Chinese products in b the market, not worth more than 999"

# Run the sample through the same cleaning steps, in the same order, as the
# training reviews before handing it to the fitted pipeline.
for step in (
    text_lowercase,
    remove_punctuation,
    convert_number,
    stem_words,
    lemmatize_word,
    remove_stopwords,
):
    text = step(text)

print(text)
text_model.predict([text])

resurg old familiar sound brand product not good like cheap facsimili current cheap chin product b market not worth nine hundr ninety-nin


array(['negative'], dtype='<U8')

## Saving the Model using joblib

In [None]:
import joblib

# Persist the fitted pipeline (vectorizer + classifier) to a file in the
# current working directory; dump's return value is the cell output.
joblib.dump(value=text_model, filename='SentimentAnalysis_model')

['SentimentAnalysis_model']

In [None]:
# Accuracy on the data the model was fitted on says nothing about unseen
# reviews, so report the held-out split from train_test_split alongside it.
{
    'train_accuracy': text_model.score(x_train, y_train),
    'test_accuracy': text_model.score(x_test, y_test),
}

0.9066666666666666