# Semantic Textual Similarity

Author: Mayank Bazari 29-05-2021

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Importing libaries for data cleaing
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to C:\Users\Mayank
[nltk_data]     Bazari\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Importing Datasets

In [3]:
text = pd.read_csv("Text_Similarity_Dataset.csv")
print("shape"+str(text.shape))
text.head()

shape(4023, 3)


Unnamed: 0,Unique_ID,text1,text2
0,0,savvy searchers fail to spot ads internet sear...,newcastle 2-1 bolton kieron dyer smashed home ...
1,1,millions to miss out on the net by 2025 40% o...,nasdaq planning $100m share sale the owner of ...
2,2,young debut cut short by ginepri fifteen-year-...,ruddock backs yapp s credentials wales coach m...
3,3,diageo to buy us wine firm diageo the world s...,mci shares climb on takeover bid shares in us ...
4,4,be careful how you code a new european directi...,media gadgets get moving pocket-sized devices ...


# Data Cleaning

In [4]:
# Cleaning puncuations
def removing_puncuations(sentance):
    # Replace common words with puncuations marks
    sentance = re.sub(r"n\'t", " not", sentance)
    sentance = re.sub(r"won't", "will not", sentance)
    sentance = re.sub(r"can\'t", "can not",sentance)  
    sentance = re.sub(r"\'ll", " will", sentance)
    sentance = re.sub(r"\'ve", " have", sentance)
    sentance = re.sub(r"\'re", " are", sentance)
    sentance = re.sub(r"\'t", " not", sentance)
    sentance = re.sub(r"\'s", " is",sentance)
    # Removing Everything(number,puncuations etc) except letter A-Z and a-z
    sentance  = re.sub('[^a-zA-Z]',' ',sentance)
    return  sentance

Data before cleaning

In [17]:
example = text['text1'][0][0:500] + " can\'t"
example

"savvy searchers fail to spot ads internet search engine users are an odd mix of naive and sophisticated  suggests a report into search habits.  the report by the us pew research center reveals that 87% of searchers usually find what they were looking for when using a search engine. it also shows that few can spot the difference between paid-for results and organic ones. the report reveals that 84% of net users say they regularly use google  ask jeeves  msn and yahoo when online.  almost 50% of t can't"

Data after cleaning

In [18]:
example = removing_puncuations(example)
example = example.split() #stopword Accept arguments as list of words
ps = PorterStemmer()  #stemming(Remving tenses)
# removing stopwors(preposition,models) and stemming
example = [ps.stem(word) for word in example if not word in set(stopwords.words('english'))] 
example = ' '.join(example)
print(example)

savvi searcher fail spot ad internet search engin user odd mix naiv sophist suggest report search habit report us pew research center reveal searcher usual find look use search engin also show spot differ paid result organ one report reveal net user say regularli use googl ask jeev msn yahoo onlin almost ca


# Making Corpus of text1 and text2

In [25]:
corpus_text1 = []
for sentance in  text['text1'].values:
    sentance = removing_puncuations(sentance)
    sentance = sentance.split()
    ps = PorterStemmer()
    sentance = [ps.stem(word) for word in sentance if not word in set(stopwords.words('english'))] 
    sentance = ' '.join(sentance)
    corpus_text1.append(sentance) 


In [26]:
# Replacing corpus_text1 in text
text['text1'] = corpus_text1
text.head()

Unnamed: 0,Unique_ID,text1,text2
0,0,savvi searcher fail spot ad internet search en...,newcastle 2-1 bolton kieron dyer smashed home ...
1,1,million miss net uk popul still without intern...,nasdaq planning $100m share sale the owner of ...
2,2,young debut cut short ginepri fifteen year old...,ruddock backs yapp s credentials wales coach m...
3,3,diageo buy us wine firm diageo world biggest s...,mci shares climb on takeover bid shares in us ...
4,4,care code new european direct could put softwa...,media gadgets get moving pocket-sized devices ...


In [27]:
corpus_text2 = []
for sentance in  text['text2'].values:
    sentance = removing_puncuations(sentance)
    sentance = sentance.split()
    ps = PorterStemmer()
    sentance = [ps.stem(word) for word in sentance if not word in set(stopwords.words('english'))] 
    sentance = ' '.join(sentance)
    corpus_text2.append(sentance) 

In [28]:
# Replacing corpus_text1 in text
text['text2'] = corpus_text2
text.head()

Unnamed: 0,Unique_ID,text1,text2
0,0,savvi searcher fail spot ad internet search en...,savvi searcher fail spot ad internet search en...
1,1,million miss net uk popul still without intern...,million miss net uk popul still without intern...
2,2,young debut cut short ginepri fifteen year old...,young debut cut short ginepri fifteen year old...
3,3,diageo buy us wine firm diageo world biggest s...,diageo buy us wine firm diageo world biggest s...
4,4,care code new european direct could put softwa...,care code new european direct could put softwa...


# Word Embeddings

A word embedding is a learned representation for text where words that have the same meaning have a similar representation. Word embeddings are low dimensional vectors. One of the benefits of using dense and low-dimensional vectors is computational the majority of neural network toolkits do not play well with very high-dimensional, sparse vectors.<br>  
Each word is mapped to one vector and the vector values are learned in a way that resembles a neural network, and hence the technique is often lumped into the field of deep learning. <br>
Key to the approach is the idea of using a dense distributed representation for each word.<br>
Each word is represented by a real-valued vector, often tens or hundreds of dimensions. This is contrasted to the thousands or millions of dimensions required for sparse word representations, such as a one-hot encoding. <br>

# Importing gensim

In [165]:
import gensim.downloader as api
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.similarities import WordEmbeddingSimilarityIndex
from gensim.similarities import SparseTermSimilarityMatrix
from gensim.similarities import SoftCosineSimilarity
from gensim.models import word2vec # For represent words in vectors
import gensim

# Transfer Leaning

A pre-trained model is nothing more than a file containing tokens and their associated word vectors. The pre-trained Google word2vec model was trained on Google news data (about 100 billion words); it contains 3 million words and phrases and was fit using 300-dimensional word vectors.

In [169]:
wordmodelfile="GoogleNews-vectors-negative300.bin.gz"
wordmodel= gensim.models.KeyedVectors.load_word2vec_format(wordmodelfile, binary=True)

In [48]:
glove = api.load("glove-wiki-gigaword-50")    
similarity_index = WordEmbeddingSimilarityIndex(glove)



In [144]:
from gensim import corpora
dictionary = corpora.Dictionary([(str(s1)+str(" ")+str(s2)).split()])
tfidf = TfidfModel(dictionary=dictionary)

In [146]:
# Create the term similarity matrix.  
similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dictionary, tfidf)

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 43.60it/s]


# Calculating similarlity based on Cosine Distance

In [180]:
similarity_score = [] # List for store similarity score
# word_tokenize

# Combining all the above stundents 
from tqdm import tqdm

for index in tqdm(text.index):
    
        sentance1 = text['text1'][index]
        sentance2 = text['text2'][index]
           

        words1 = word_tokenize(sentance1)
        words2 = word_tokenize(sentance2)           
           
            
        new_corpus = list(wordmodel.index_to_key)
            
        if len(words1 and words2)==0:
                    similarity_score.append(1.0)

        else:               
            for word in words1.copy(): 
                    if (word not in new_corpus):
                        words1.remove(word)
            for word in words2.copy():
                    if (word not in new_corpus):                           
                            words2.remove(word)            
                            
            similarity_score.append((1-wordmodel.n_similarity(words1, words2)))                    
             
                            
                            
            nv k[]

100%|████████████████████████████████████████████████████████████████████████████| 4023/4023 [5:36:47<00:00,  5.02s/it]


# Exporting Final Score

In [181]:
final_score = pd.DataFrame({'Unique_ID':text.Unique_ID,
                     'Similarity_score':similarity_score})
final_score.head()

Unnamed: 0,Unique_ID,Similarity_score
0,0,0.0
1,1,5.960464e-08
2,2,0.0
3,3,0.0
4,4,5.960464e-08


In [185]:
# SAVE DF as CSV file 

final_score.to_csv('final_score.csv',index=False)