<a href="https://colab.research.google.com/github/Mohit1-K/NLP-Similarity-Score-Using-Word2Vec-/blob/main/word2vec%20implementation%20to%20get%20similarity%20score.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# *The purpose of Word2Vec is to group the vectors of similar words together in vector space; that is, it detects similarities mathematically. Word2Vec creates vectors that are distributed numerical representations of word features, such as the context of individual words.*

# Calculating similarity score using ***Word2Vec*** (Deep Learning)

## Importing necessary modules

In [None]:
import re

import numpy as np
import pandas as pd
import scipy
import scipy.spatial.distance
import tensorflow as tf
from tensorflow import keras

## Loading Dataset

In [None]:
# Raw CSV of text pairs, fetched over the network from GitHub.
url = 'https://raw.githubusercontent.com/Mohit1-K/Data/main/Text_Similarity_Dataset.csv'
df = pd.read_csv(url)
# Preview the first rows to confirm the Unique_ID / text1 / text2 layout.
df.head()

Unnamed: 0,Unique_ID,text1,text2
0,0,savvy searchers fail to spot ads internet sear...,newcastle 2-1 bolton kieron dyer smashed home ...
1,1,millions to miss out on the net by 2025 40% o...,nasdaq planning $100m share sale the owner of ...
2,2,young debut cut short by ginepri fifteen-year-...,ruddock backs yapp s credentials wales coach m...
3,3,diageo to buy us wine firm diageo the world s...,mci shares climb on takeover bid shares in us ...
4,4,be careful how you code a new european directi...,media gadgets get moving pocket-sized devices ...


In [None]:
# Column dtypes and non-null counts, to check for missing text before tokenizing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4023 entries, 0 to 4022
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Unique_ID  4023 non-null   int64 
 1   text1      4023 non-null   object
 2   text2      4023 non-null   object
dtypes: int64(1), object(2)
memory usage: 94.4+ KB


## Exploratory Data Analysis

In [None]:
# Only the two text columns are used by the preprocessing below.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it does not raise KeyError once the column is gone.
df = df.drop(columns='Unique_ID', errors='ignore')

In [None]:
# Display the frame after the drop; only text1 and text2 should remain.
df

Unnamed: 0,text1,text2
0,savvy searchers fail to spot ads internet sear...,newcastle 2-1 bolton kieron dyer smashed home ...
1,millions to miss out on the net by 2025 40% o...,nasdaq planning $100m share sale the owner of ...
2,young debut cut short by ginepri fifteen-year-...,ruddock backs yapp s credentials wales coach m...
3,diageo to buy us wine firm diageo the world s...,mci shares climb on takeover bid shares in us ...
4,be careful how you code a new european directi...,media gadgets get moving pocket-sized devices ...
...,...,...
4018,labour plans maternity pay rise maternity pay ...,no seasonal lift for house market a swathe of ...
4019,high fuel costs hit us airlines two of the lar...,new media battle for bafta awards the bbc lead...
4020,britons growing digitally obese gadget lover...,film star fox behind theatre bid leading actor...
4021,holmes is hit by hamstring injury kelly holmes...,tsunami to hit sri lanka banks sri lanka s b...


### NLTK is a very powerful tool for Natural Language Processing

In [None]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Tokenize: each document is cast to str and passed through word_tokenize,
# producing one token list per row for each of the two text columns.
tokensent1 = list(map(word_tokenize, map(str, df['text1'])))
tokensent2 = list(map(word_tokenize, map(str, df['text2'])))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Tokens of the first text1 document (punctuation and numbers are still present).
tokensent1[0]

['savvy',
 'searchers',
 'fail',
 'to',
 'spot',
 'ads',
 'internet',
 'search',
 'engine',
 'users',
 'are',
 'an',
 'odd',
 'mix',
 'of',
 'naive',
 'and',
 'sophisticated',
 'suggests',
 'a',
 'report',
 'into',
 'search',
 'habits',
 '.',
 'the',
 'report',
 'by',
 'the',
 'us',
 'pew',
 'research',
 'center',
 'reveals',
 'that',
 '87',
 '%',
 'of',
 'searchers',
 'usually',
 'find',
 'what',
 'they',
 'were',
 'looking',
 'for',
 'when',
 'using',
 'a',
 'search',
 'engine',
 '.',
 'it',
 'also',
 'shows',
 'that',
 'few',
 'can',
 'spot',
 'the',
 'difference',
 'between',
 'paid-for',
 'results',
 'and',
 'organic',
 'ones',
 '.',
 'the',
 'report',
 'reveals',
 'that',
 '84',
 '%',
 'of',
 'net',
 'users',
 'say',
 'they',
 'regularly',
 'use',
 'google',
 'ask',
 'jeeves',
 'msn',
 'and',
 'yahoo',
 'when',
 'online',
 '.',
 'almost',
 '50',
 '%',
 'of',
 'those',
 'questioned',
 'said',
 'they',
 'would',
 'trust',
 'search',
 'engines',
 'much',
 'less',
 'if',
 'they',
 'k

In [None]:
# Tokens of the first text2 document (punctuation and numbers are still present).
tokensent2[0]

['newcastle',
 '2-1',
 'bolton',
 'kieron',
 'dyer',
 'smashed',
 'home',
 'the',
 'winner',
 'to',
 'end',
 'bolton',
 's',
 '10-game',
 'unbeaten',
 'run',
 '.',
 'lee',
 'bowyer',
 'put',
 'newcastle',
 'ahead',
 'when',
 'he',
 'fed',
 'stephen',
 'carr',
 'on',
 'the',
 'right',
 'flank',
 'then',
 'sprinted',
 'into',
 'the',
 'area',
 'to',
 'power',
 'home',
 'a',
 'header',
 'from',
 'the',
 'resultant',
 'cross',
 '.',
 'wanderers',
 'hit',
 'back',
 'through',
 'stelios',
 'giannakopoulos',
 'who',
 'ended',
 'a',
 'fluid',
 'passing',
 'move',
 'with',
 'a',
 'well-struck',
 'volley',
 '.',
 'but',
 'dyer',
 'had',
 'the',
 'last',
 'word',
 'in',
 'a',
 'game',
 'of',
 'few',
 'chances',
 'pouncing',
 'on',
 'a',
 'loose',
 'ball',
 'after',
 'alan',
 'shearer',
 's',
 'shot',
 'was',
 'blocked',
 'and',
 'firing',
 'into',
 'the',
 'top',
 'corner',
 '.',
 'neither',
 'side',
 'lacked',
 'urgency',
 'in',
 'the',
 'early',
 'stages',
 'of',
 'the',
 'game',
 'with',
 'ple

### Filtering all the special characters from text columns

In [None]:

def _letters_only(tokens):
    """Strip every non-letter character from one tokenized document.

    The token list is rendered with ``str()``, every character outside
    ``A-Za-z`` (quotes, commas, brackets, digits, punctuation) is replaced by a
    space, and the text is split on each single whitespace character.

    Args:
        tokens: list of string tokens for one document.

    Returns:
        list[str]: the remaining words, interleaved with '' entries wherever
        several spaces were adjacent (they are removed in the next step).
    """
    letters = re.sub('[^A-Za-z]', ' ', str(tokens))
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    return re.split(r"\s", letters)


# Both columns go through the same helper so they cannot drift apart. text2 was
# previously split on the literal "/s", which never matches after the
# substitution and left each document as a single long string.
filtered1 = [_letters_only(tokens) for tokens in tokensent1]
filtered2 = [_letters_only(tokens) for tokens in tokensent2]

In [None]:
# First filtered document; the '' entries come from splitting on each single whitespace character.
filtered1[0]

['',
 '',
 'savvy',
 '',
 '',
 '',
 'searchers',
 '',
 '',
 '',
 'fail',
 '',
 '',
 '',
 'to',
 '',
 '',
 '',
 'spot',
 '',
 '',
 '',
 'ads',
 '',
 '',
 '',
 'internet',
 '',
 '',
 '',
 'search',
 '',
 '',
 '',
 'engine',
 '',
 '',
 '',
 'users',
 '',
 '',
 '',
 'are',
 '',
 '',
 '',
 'an',
 '',
 '',
 '',
 'odd',
 '',
 '',
 '',
 'mix',
 '',
 '',
 '',
 'of',
 '',
 '',
 '',
 'naive',
 '',
 '',
 '',
 'and',
 '',
 '',
 '',
 'sophisticated',
 '',
 '',
 '',
 'suggests',
 '',
 '',
 '',
 'a',
 '',
 '',
 '',
 'report',
 '',
 '',
 '',
 'into',
 '',
 '',
 '',
 'search',
 '',
 '',
 '',
 'habits',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'the',
 '',
 '',
 '',
 'report',
 '',
 '',
 '',
 'by',
 '',
 '',
 '',
 'the',
 '',
 '',
 '',
 'us',
 '',
 '',
 '',
 'pew',
 '',
 '',
 '',
 'research',
 '',
 '',
 '',
 'center',
 '',
 '',
 '',
 'reveals',
 '',
 '',
 '',
 'that',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 'of',
 '',
 '',
 '',
 'searchers',
 '',
 '',
 '',
 'usually',
 '',
 

### Removing empty strings and converting to lower case

In [None]:
# Drop the '' placeholders produced by the whitespace split. Slice assignment
# keeps the update in place on filtered1/filtered2 while rebuilding each list in
# one pass; repeated list.remove('') would rescan the list for every empty
# string, which is quadratic on long documents.
for sent in filtered1:
    sent[:] = [tok for tok in sent if tok != '']

for sent in filtered2:
    sent[:] = [tok for tok in sent if tok != '']

# Lowercasing
filtered_lower1 = [[tok.lower() for tok in sent] for sent in filtered1]
filtered_lower2 = [[tok.lower() for tok in sent] for sent in filtered2]

In [None]:
# First document after removing empty strings and lowercasing.
filtered_lower1[0]

['savvy',
 'searchers',
 'fail',
 'to',
 'spot',
 'ads',
 'internet',
 'search',
 'engine',
 'users',
 'are',
 'an',
 'odd',
 'mix',
 'of',
 'naive',
 'and',
 'sophisticated',
 'suggests',
 'a',
 'report',
 'into',
 'search',
 'habits',
 'the',
 'report',
 'by',
 'the',
 'us',
 'pew',
 'research',
 'center',
 'reveals',
 'that',
 'of',
 'searchers',
 'usually',
 'find',
 'what',
 'they',
 'were',
 'looking',
 'for',
 'when',
 'using',
 'a',
 'search',
 'engine',
 'it',
 'also',
 'shows',
 'that',
 'few',
 'can',
 'spot',
 'the',
 'difference',
 'between',
 'paid',
 'for',
 'results',
 'and',
 'organic',
 'ones',
 'the',
 'report',
 'reveals',
 'that',
 'of',
 'net',
 'users',
 'say',
 'they',
 'regularly',
 'use',
 'google',
 'ask',
 'jeeves',
 'msn',
 'and',
 'yahoo',
 'when',
 'online',
 'almost',
 'of',
 'those',
 'questioned',
 'said',
 'they',
 'would',
 'trust',
 'search',
 'engines',
 'much',
 'less',
 'if',
 'they',
 'knew',
 'information',
 'about',
 'who',
 'paid',
 'for',
 '

### Lemmatizing

In [None]:
nltk.download('wordnet')

# lemmatize() is called without a POS tag, so every token is reduced with the
# lemmatizer's default settings (e.g. 'searchers' -> 'searcher').
# NOTE: short words can be over-reduced this way; the outputs below show
# 'us' -> 'u' and 'less' -> 'le'.
wordnet = WordNetLemmatizer()

lemmatized1 = [[wordnet.lemmatize(word) for word in sent] for sent in filtered_lower1]
lemmatized2 = [[wordnet.lemmatize(word) for word in sent] for sent in filtered_lower2]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# First text1 document after lemmatization.
lemmatized1[0]

['savvy',
 'searcher',
 'fail',
 'to',
 'spot',
 'ad',
 'internet',
 'search',
 'engine',
 'user',
 'are',
 'an',
 'odd',
 'mix',
 'of',
 'naive',
 'and',
 'sophisticated',
 'suggests',
 'a',
 'report',
 'into',
 'search',
 'habit',
 'the',
 'report',
 'by',
 'the',
 'u',
 'pew',
 'research',
 'center',
 'reveals',
 'that',
 'of',
 'searcher',
 'usually',
 'find',
 'what',
 'they',
 'were',
 'looking',
 'for',
 'when',
 'using',
 'a',
 'search',
 'engine',
 'it',
 'also',
 'show',
 'that',
 'few',
 'can',
 'spot',
 'the',
 'difference',
 'between',
 'paid',
 'for',
 'result',
 'and',
 'organic',
 'one',
 'the',
 'report',
 'reveals',
 'that',
 'of',
 'net',
 'user',
 'say',
 'they',
 'regularly',
 'use',
 'google',
 'ask',
 'jeeves',
 'msn',
 'and',
 'yahoo',
 'when',
 'online',
 'almost',
 'of',
 'those',
 'questioned',
 'said',
 'they',
 'would',
 'trust',
 'search',
 'engine',
 'much',
 'le',
 'if',
 'they',
 'knew',
 'information',
 'about',
 'who',
 'paid',
 'for',
 'result',
 'wa

### Removing Stop words

In [None]:
nltk.download('stopwords')

# Lemmatization has already run, so some stop words no longer match their
# dictionary spelling when they reach this step (the 'wa' token that survives
# in the output is presumably a lemmatized 'was'). Extend the stop list with the
# lemmatized form of every stop word so those tokens are filtered as well.
Stopwords = set(stopwords.words('english'))
_stopword_lemmatizer = WordNetLemmatizer()
Stopwords |= {_stopword_lemmatizer.lemmatize(word) for word in Stopwords}

# Keep only the tokens that are not stop words, one list per document.
filter_words1 = [[w for w in sent if w not in Stopwords] for sent in lemmatized1]
filter_words2 = [[w for w in sent if w not in Stopwords] for sent in lemmatized2]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# First text1 document after stop-word removal.
filter_words1[0]

['savvy',
 'searcher',
 'fail',
 'spot',
 'ad',
 'internet',
 'search',
 'engine',
 'user',
 'odd',
 'mix',
 'naive',
 'sophisticated',
 'suggests',
 'report',
 'search',
 'habit',
 'report',
 'u',
 'pew',
 'research',
 'center',
 'reveals',
 'searcher',
 'usually',
 'find',
 'looking',
 'using',
 'search',
 'engine',
 'also',
 'show',
 'spot',
 'difference',
 'paid',
 'result',
 'organic',
 'one',
 'report',
 'reveals',
 'net',
 'user',
 'say',
 'regularly',
 'use',
 'google',
 'ask',
 'jeeves',
 'msn',
 'yahoo',
 'online',
 'almost',
 'questioned',
 'said',
 'would',
 'trust',
 'search',
 'engine',
 'much',
 'le',
 'knew',
 'information',
 'paid',
 'result',
 'wa',
 'hidden',
 'according',
 'figure',
 'gathered',
 'pew',
 'researcher',
 'average',
 'user',
 'spends',
 'minute',
 'per',
 'month',
 'carrying',
 'separate',
 'search',
 'look',
 'webpage',
 'hunt',
 'significant',
 'chunk',
 'net',
 'user',
 'carry',
 'search',
 'least',
 'weekly',
 'asked',
 'look',
 'every',
 'week',
 

## Modelling

Initialising a fresh Word2Vec model (no pretrained weights are loaded) and training it from scratch on our data.

In [None]:
from gensim.models import Word2Vec, KeyedVectors

In [None]:
# Embedding hyper-parameters: 32-dimensional vectors, a 5-word context window,
# and min_count=1 so that every token in the corpus receives a vector.
# gensim 4.0 renamed `size` to `vector_size`; try the new keyword first and fall
# back to the old one so the cell runs on both gensim 3.x and 4.x.
try:
    w2v = Word2Vec(window = 5, min_count = 1, vector_size = 32)
except TypeError:
    w2v = Word2Vec(window = 5, min_count = 1, size = 32)

In [None]:
# Training corpus: every preprocessed text1 document followed by every
# preprocessed text2 document, each as a list of tokens.
allwords = [*filter_words1, *filter_words2]

In [None]:
# Build the model vocabulary from the combined corpus; progress_per controls
# how often gensim reports progress while scanning.
w2v.build_vocab(allwords, progress_per=100)

In [None]:
# Train on the same corpus, passing the model's own corpus_count and epochs
# attributes as the example count and number of passes.
w2v.train(allwords, total_examples=w2v.corpus_count, epochs=w2v.epochs)

(4345083, 4513985)

In [None]:
# Persist the trained model to the current working directory.
w2v.save('word2vec_5.model')

In [None]:
# Sanity check: look up the learned 32-dimensional vector of one vocabulary word.
w2v.wv.get_vector('bolton')

array([ 0.1125015 , -0.14489532, -0.29933545,  1.0057585 ,  0.22655983,
        0.38653448, -1.2733945 , -0.4720744 , -0.5477413 , -0.52612734,
       -0.17089006, -0.16578138, -0.8443088 ,  0.10045854,  0.5816555 ,
        0.7172789 ,  0.47068483, -0.7847902 , -0.10888976, -0.11961161,
       -0.5139477 , -0.16394445,  0.2562801 , -0.35449833, -0.8587196 ,
        0.21850507,  0.13962021,  0.0036268 , -0.91473573,  0.31184912,
       -0.17569824,  1.8681045 ], dtype=float32)

## Getting results

In [None]:
def _mean_vector(tokens):
    """Return the element-wise mean of the Word2Vec vectors of `tokens`.

    Returns None when `tokens` is empty, so the caller can handle a document
    that lost all of its tokens explicitly instead of averaging an empty list.
    """
    if not tokens:
        return None
    return np.mean([w2v.wv.get_vector(word) for word in tokens], axis = 0)


result1 = []
for sent1, sent2 in zip(filter_words1, filter_words2):
    vector1 = _mean_vector(sent1)
    vector2 = _mean_vector(sent2)
    if vector1 is None or vector2 is None:
        # No tokens survived preprocessing, so the similarity is undefined.
        result1.append(np.nan)
        continue
    cosine = scipy.spatial.distance.cosine(vector1, vector2)
    # 1 - cosine distance is the cosine similarity; scale it to a percentage.
    result1.append((1-cosine)*100)

# gensim can compute the same score directly with w2v.wv.n_similarity(sent1, sent2).
# The explicit version is kept to show how document vectors are obtained by
# averaging word vectors and how scipy measures the cosine distance between them.

df['Sentence_Similarity_in_percentage'] = result1

In [None]:
# Unique_ID was dropped from df earlier, so it is rebuilt here as 0..n-1.
# The length comes from the frame itself rather than a hard-coded row count, so
# the cell keeps working if the dataset size changes.
# NOTE(review): assumes the original IDs were the consecutive row numbers, as in
# the df.head() preview - confirm against the source CSV.
res = {
    'Unique_ID': np.arange(len(df)),
    'Similarity_Score': df['Sentence_Similarity_in_percentage']
}

result = pd.DataFrame(data = res)

### Saving result to a csv file.

In [None]:
# Write the Unique_ID / Similarity_Score table to disk without the DataFrame index.
result.to_csv('Result.csv', index=False)