In [4]:
#Dataset :- https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
import pandas as pd
df = pd.read_csv('/content/IMDB Dataset.csv.zip')
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [8]:
import nltk
import string
import re

In [9]:
#removing punctuations
def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

In [10]:
#Stopwords
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')
stop_words.remove('no')
stop_words.remove('not')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
def remove_stopwords(text):
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    filtered_text = ' '.join(filtered_text)
    return filtered_text



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
#Stemming
nltk.download('punkt')
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
stemmer = PorterStemmer()

def stem_words(text):
    word_tokens = word_tokenize(text)
    stems = [stemmer.stem(word) for word in word_tokens]
    stems = ' '.join(stems)
    return stems
  

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
#Lemmetizing
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
lemmatizer = WordNetLemmatizer()
def lemmatize_word(text):
  word_tokens = word_tokenize(text)
  lemmas = [lemmatizer.lemmatize(word, pos ='v') for word in word_tokens]
  lemmas = ' '.join(lemmas)
  return lemmas  
    
    

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [13]:
#Removing Numbers
def remove_numbers(text):
    result = re.sub(r'\d+', '', text)
    return result

In [14]:
#Lower Case
def lower(text):
    return text.lower()

In [15]:
df.review = df.review.apply(lambda x: x.lower())
df.review = df.review.apply(remove_punctuation)
df.review = df.review.apply(remove_numbers)
df.review = df.review.apply(stem_words)
df.review = df.review.apply(lemmatize_word)
df.review = df.review.apply(remove_stopwords)

df.head()

Unnamed: 0,review,sentiment
0,one review ha mention watch oz episod youll ho...,positive
1,wonder littl product br br film techniqu veri ...,positive
2,think thi wa wonder way spend time hot summer ...,positive
3,basic famili littl boy jake think zombi hi clo...,negative
4,petter mattei love time money visual stun film...,positive


In [16]:
x = df['review'].values
y = df['sentiment'].values

In [18]:
#Split Data
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=0)

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [20]:
#pipeline
from sklearn.pipeline import Pipeline
text_model = Pipeline([('TFIDF',TfidfVectorizer()),('model',MultinomialNB())])

In [21]:
text_model.fit(x_train,y_train)

Pipeline(memory=None,
         steps=[('TFIDF',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('model',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [26]:
text ="Resurgence of old familiar sounding brands with just the same products. not good at all. It's like a cheap facsimilie to the current cheap Chinese products in b the market, not worth more than 999"
text = lower(text)
text = remove_punctuation(text)
text =remove_numbers(text)
text =stem_words(text)
text =lemmatize_word(text)
text =remove_stopwords(text)
print(text)
text_model.predict([text])

resurg old familiar sound brand product not good like cheap facsimili current cheap chin product b market not worth


array(['negative'], dtype='<U8')

In [27]:
import joblib
joblib.dump(text_model,'SentimentAnalysis_model')

['SentimentAnalysis_model']

In [28]:
text_model.score(x_train,y_train)

0.9065333333333333

In [30]:
!pip install streamlit --quiet
!pip install pyngrok==4.1.1 --quiet
from pyngrok import ngrok

  Building wheel for pyngrok (setup.py) ... [?25l[?25hdone


In [41]:
%%writefile app.py
import streamlit as st
import sklearn
import joblib
model=joblib.load('SentimentAnalysis_model')
st.title('Sentiment Analysis')
ip=st.text_input('Enter the text')
op=model.predict([ip])
if st.button('Predict'):
  st.title(op[0])

Overwriting app.py


In [None]:
!nohup streamlit run app.py &
url=ngrok.connect(port='8501')
url

http://39fbb649255c.ngrok.io/