**Similarity Analysis**

In [2]:
# Two short sample sentences for the similarity demo, plus a list that keeps
# the raw strings together.
doc1, doc2 = 'Hello, my name is Elizabeth', 'Hi, my name is Eliza'
string = [doc1, doc2]

In [3]:
import nltk
# Fetch the NLTK resources the later cells rely on. Per the download log,
# 'punkt' is a tokenizer package and 'stopwords' is a corpus package.
nltk.download('punkt')
# Last expression of the cell: its return value is what the notebook displays.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Tokenise from the raw strings kept in `string`, not from doc1/doc2
# themselves: this cell rebinds doc1/doc2 to token lists, so reading them back
# as input would hand a list to word_tokenize on a second run of the cell.
doc1 = word_tokenize(string[0])
doc2 = word_tokenize(string[1])

sw = stopwords.words('english')

# Empty containers for the binary term vectors built in the next cell.
l1 = []; l2 = []

# Distinct non-stopword tokens of each document.
# NOTE(review): tokens keep their original case and punctuation (the printed
# sets still contain ','), so matching is case-sensitive - confirm that this
# is the intended notion of "same word".
X_set = {w for w in doc1 if w not in sw}
Y_set = {w for w in doc2 if w not in sw}
print(X_set)
print(Y_set)

{'name', 'Hello', ',', 'Elizabeth'}
{'Hi', 'name', ',', 'Eliza'}


In [5]:
# Shared vocabulary: every distinct token that occurs in either document.
rvector = X_set.union(Y_set)

# Build both binary presence vectors from scratch instead of appending to
# lists created in another cell; appending made every re-run of this cell
# lengthen l1/l2 and silently change the similarity computed afterwards.
# Both comprehensions iterate the same unmodified set, so positions line up.
l1 = [1 if w in X_set else 0 for w in rvector]
l2 = [1 if w in Y_set else 0 for w in rvector]

# Not read by any cell visible here; kept so existing names are unchanged.
c = 0
print(rvector)
print(l1)
print(l2)

{'Hello', 'Eliza', 'Hi', ',', 'Elizabeth', 'name'}
[1, 0, 0, 1, 1, 1]
[0, 1, 1, 1, 0, 1]


In [6]:
import numpy as np

# Cosine similarity between the two binary term vectors:
#     cos = (A . B) / (|A| * |B|)
A = np.array(l1)
B = np.array(l2)
dot = np.dot(A, B)
norma = np.linalg.norm(A)
normb = np.linalg.norm(B)

# An all-zero vector (e.g. a document made only of stopwords) has norm 0 and
# would turn the division into 0/0 -> nan; report zero similarity instead.
cos = dot / (norma * normb) if norma and normb else 0.0
print(cos)

0.5


**Based on Euclidean Distance**

In [7]:
# Euclidean distance between the two term vectors: the L2 norm of their
# element-wise difference.
diff = A - B
d = np.linalg.norm(diff)
print(d)

2.0


**Sentiment Analysis**

In [8]:
import pandas as pd

# Location of the labelled review CSV (absolute path; it looks like a mounted
# Drive folder - adjust when running elsewhere).
csv_path = '/content/drive/MyDrive/test.csv'
data = pd.read_csv(csv_path)

# Preview the first rows; the displayed columns are the text and its label.
data.head()

Unnamed: 0,text,sentiment
0,"My daughter liked it but I was aghast, that a ...",neg
1,I... No words. No words can describe this. I w...,neg
2,this film is basically a poor take on the old ...,neg
3,"This is a terrible movie, and I'm not even sur...",neg
4,First of all this movie is a piece of reality ...,pos


In [9]:
from nltk.corpus import stopwords

# Lower-case every review first so the stopword comparison below matches
# regardless of the original capitalisation.
data['text'] = data['text'].str.lower()
sw = stopwords.words('english')

# Membership is tested once per word of every review; a set makes each test
# constant-time instead of a scan through the whole stopword list.
sw_lookup = set(sw)

# Re-join each review from its whitespace-separated words minus stopwords.
data['text'] = data['text'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in sw_lookup)
)

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer

# Tokens are maximal runs of ASCII letters/digits; everything else separates.
token = RegexpTokenizer(r'[a-zA-Z0-9]+')

# A custom tokenizer replaces the built-in token_pattern. Passing
# token_pattern=None states that explicitly, which avoids scikit-learn's
# "token_pattern will not be used" warning; the resulting features are the same.
cv = CountVectorizer(ngram_range=(1, 1), tokenizer=token.tokenize, token_pattern=None)

# NOTE(review): the vocabulary is fitted on the whole dataset, i.e. before the
# train/test split made later - consider fitting on the training part only.
text_counts = cv.fit_transform(data['text'])



In [11]:
from sklearn.model_selection import train_test_split

# Features are the bag-of-words counts; the target is the sentiment column.
X, y = text_counts, data['sentiment']

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=30,
)

**Fitting Multinomial Naive Bayes model**

In [12]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fit a multinomial Naive Bayes classifier on the bag-of-words training rows.
MNB = MultinomialNB()
MNB.fit(X_train, y_train)

predicted = MNB.predict(X_test)

# Store the score under its own name: assigning it to `accuracy_score` would
# overwrite the imported function with a float and break any later call to it.
# Arguments follow the documented (y_true, y_pred) order.
mnb_accuracy = accuracy_score(y_test, predicted)

print('MultinominalNB model accuracy is', f'{mnb_accuracy * 100:04.2f}%')

MultinominalNB model accuracy is 87.68%


**Preprocessing for Neural Network models**

In [13]:
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# factorize() yields (integer codes, unique labels): element [0] is used as
# the training target and element [1] maps a code back to its label.
sentiment_label = data['sentiment'].factorize()
review = data['text'].values

# Cap on the word indices the tokenizer will emit.
num_words = 5000
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(review)

# word_index lists every word seen, but texts_to_sequences only emits indices
# below num_words, so the embedding table never needs more than num_words
# rows. Sizing it from the full word_index allocated rows that can never be
# looked up.
vocab_size = min(num_words, len(tokenizer.word_index) + 1)

encoded_docs = tokenizer.texts_to_sequences(review)
# Bring every review to exactly 200 token ids.
padded_sequence = pad_sequences(encoded_docs, maxlen=200)
embedding_vector_length = 32

**Fitting RNN model**

In [14]:
from tensorflow.keras.layers import SimpleRNN, Dense, Embedding, Dropout
from tensorflow.keras.models import Sequential

# Layer stack: embedding -> simple recurrent layer -> dropout -> one sigmoid
# unit producing the binary sentiment score.
rnn_layers = [
    Embedding(vocab_size, embedding_vector_length, input_length=200),
    SimpleRNN(128, activation='tanh', return_sequences=False),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
]
RNN_model = Sequential(rnn_layers)
RNN_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=['accuracy'])

# Train on the padded sequences against the integer label codes; the last 20%
# of the rows is held out for validation by validation_split.
RNN_model.fit(
    padded_sequence,
    sentiment_label[0],
    batch_size=64,
    epochs=10,
    verbose=1,
    validation_split=0.2,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7c2be76c5660>

**Fitting LSTM model**

In [15]:
from tensorflow.keras.layers import LSTM

# Same overall shape as the RNN model, with an LSTM layer (input and recurrent
# dropout of 0.5) in place of the simple recurrent layer.
lstm_layers = [
    Embedding(vocab_size, embedding_vector_length, input_length=200),
    LSTM(50, dropout=0.5, recurrent_dropout=0.5, activation='tanh', return_sequences=False),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
]
LSTM_model = Sequential(lstm_layers)
LSTM_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=['accuracy'])

# Train for 5 epochs on the same inputs and label codes as the RNN model.
LSTM_model.fit(
    padded_sequence,
    sentiment_label[0],
    batch_size=64,
    epochs=5,
    verbose=1,
    validation_split=0.2,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7c2bd373fc10>

**Making Predictions**

In [16]:
def predict_sentiment(text):
    """Print the sentiment label each trained model predicts for `text`.

    The text is encoded with the notebook's fitted `tokenizer`, padded to 200
    token ids, and passed to `RNN_model` and then `LSTM_model`. Each model's
    output is rounded to 0 or 1 and looked up in `sentiment_label[1]`.
    Nothing is returned; both results are printed.
    """
    labels = sentiment_label[1]
    padded = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=200)

    rnn_class = int(RNN_model.predict(padded).round().item())
    print("RNN Model Predicted label: ", labels[rnn_class])

    lstm_class = int(LSTM_model.predict(padded).round().item())
    print("LSTM Model Predicted label: ", labels[lstm_class])

import random

# Pick one review at random by position. The row range comes from the data
# itself: a hard-coded upper bound of 5000 fails on a shorter dataset and
# never samples the rows beyond it on a longer one. `iloc` also keeps the
# lookup positional whatever the frame's index labels are.
random_text = data['text'].iloc[random.randrange(len(data))]
print(random_text)
predict_sentiment(random_text)

i've described film surprising... true many respects. subject material (black man wrongly accused), characters (people expect stereotypes often show uncharacteristic attitudes film), production...<br /><br />all factors make refreshingly unusual film, especially time (1949). possible spoilers sometimes cheesy dialogue occasional high moral stance.<br /><br />but, happen across planning watch film, might find like - staying middle night see happens.<br /><br />
RNN Model Predicted label:  pos
LSTM Model Predicted label:  pos
