<a href="https://colab.research.google.com/github/abhijitsahoo0790/text_similarity_using_GLOVE/blob/master/main_text_similarity_GLOVE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Mount google drive to google Colab environment
from os.path import join
from google.colab import drive

ROOT = "/content/drive"
drive.mount(ROOT)

In [None]:
#Just run this for the first time to fetch and unzip the dataset
#This will download and unzip the data in the desired folder as specified by DATA_PATH
#DATA_PATH = "My Drive/Datasets/glove.6B"
#DATA_DESTINATION_PATH = join(ROOT, DATA_PATH)
#!mkdir "{DATA_DESTINATION_PATH}"
#!wget http://nlp.stanford.edu/data/glove.6B.zip -P "{DATA_DESTINATION_PATH}"
#!unzip glove.6B.zip -d "{DATA_DESTINATION_PATH}"

In [None]:
#import all the necessary packages here only
import pandas as pd
import numpy as np
import copy
import math
from scipy import spatial
import os
from os.path import join
import sys
import traceback
import logging
#Configure the root logger to write timestamped records to log.txt
logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', 
                    filename='log.txt', filemode='w', level=logging.DEBUG, 
                    datefmt='%Y-%m-%d %H:%M:%S')

In [None]:
#Location of the GLOVE files below the Google Drive mount point
ROOT = "/content/drive"
DATA_PATH = "My Drive/Datasets/glove.6B/"
DATA_DESTINATION_PATH = join(ROOT, DATA_PATH)

#read_word_vectors_as_text builds the path of the file to read from these two names:
#the directory below, one entry of the list selected by index, and a ".txt" extension
PATH_GLOVE_VECTORS = DATA_DESTINATION_PATH
LIST_GLOVE_FILES = ["glove.6B.50d", "glove.6B.100d", "glove.6B.200d", "glove.6B.300d"]

In [None]:
#This block contains all the necessary functions at one place
def convert_str_to_float(list1):
    """
    Parse every component of a word vector into a floating point number.

    Parameters
    ----------
    list1 : list of str
        Components of a word vector, each still in its textual form.

    Returns
    -------
     : list of float
        The same components, in the same order, as floats.
    """
    return list(map(float, list1))

def add(list1, list2):
    """
    Return the element-wise sum of two real-valued lists.

    The result has one entry per element of list1; list2 is read at the
    same positions.

    Parameters
    ----------
    list1 : list
        Word vector in list of float.
    list2 : list
        Word vector in list of float.

    Returns
    -------
     : list of float
        List holding list1[i] + list2[i] for every position i of list1.

    """
    return [value + list2[position] for position, value in enumerate(list1)]

def sub(list1, list2):
    """
    Return the element-wise difference of two real-valued lists.

    The result has one entry per element of list1; list2 is read at the
    same positions.

    Parameters
    ----------
    list1 : list
        Word vector in list of float (the minuend).
    list2 : list
        Word vector in list of float (the subtrahend).

    Returns
    -------
     : list of float
        List holding list1[i] - list2[i] for every position i of list1.

    """
    return [value - list2[position] for position, value in enumerate(list1)]

def read_word_vectors_as_text(vector_file_index=1):
    """
    Read a GLOVE vector file and return its whole content as one string.

    The file is selected by using vector_file_index as the index over the
    list of file names stored in LIST_GLOVE_FILES. The selected name is
    looked up inside PATH_GLOVE_VECTORS with a ".txt" extension.

    Parameters
    ----------
    vector_file_index : int, optional
        The index to select the name of the file to be read from the list of
        files in LIST_GLOVE_FILES. The default is 1.

    Returns
    -------
    word_vectors_text : str or None
        The content of the file (possibly an empty string), or None if the
        file doesn't exist or could not be read. Failures are written to the
        log instead of being raised.
    """
    #os.path.join works whether or not PATH_GLOVE_VECTORS ends with a separator
    word_vector_path = os.path.join(PATH_GLOVE_VECTORS, LIST_GLOVE_FILES[vector_file_index])
    word_vector_file = word_vector_path + ".txt"
    logging.info("Path of word vector file is :"+ word_vector_path)
    logging.info("Reading :"+ word_vector_path)

    if not os.path.exists(word_vector_file):
        logging.warning("The file doesn't exist")
        return None

    #Opening the file is inside the try as well, so that a failing open() is logged too
    try:
        with open(word_vector_file, "r", encoding='utf-8') as f:
            word_vectors_text = f.read()
    except Exception:
        #logging.exception records the message together with the active traceback
        logging.exception("Failed to read :" + word_vector_file)
        return None

    if word_vectors_text:
        logging.info("file read successfully!")
    else:
        logging.warning("empty file read")
    return word_vectors_text

def create_word_vector_dictionary_from_text(word_vectors_text):
    """
    Create the word vector of each word by parsing the text of a vector file.

    Every non-blank line is expected to hold a word followed by the
    components of its vector, all separated by single spaces.

    Parameters
    ----------
    word_vectors_text : str or None
        Content of word to vector file in text format. None or an empty
        string is treated as "nothing to parse".

    Returns
    -------
    word_vector_dict : dict
        Dictionary with word as keys and value as its corresponding vector
        (list of float). Empty if there is nothing to parse.

    Raises
    ------
    ValueError
        If a vector component cannot be converted to float.
    """
    word_vector_dict = {}
    if not word_vectors_text:
        return word_vector_dict

    for line in word_vectors_text.split('\n'):
        #Skip blank lines (typically the one after the final newline) so that
        #no bogus '' -> [] entry ends up in the dictionary
        if not line.strip():
            continue
        word, *components = line.split(" ")
        word_vector_dict[word] = [float(component) for component in components]
    return word_vector_dict

def find_most_similar_one(vec0, word_list):
    """
    Find the word whose vector is closest to the input vector.

    Every entry of the module-level word_vector_dict is scored against vec0
    with the cosine similarity, except the words listed in word_list.

    Parameters
    ----------
    vec0 : list of real-valued numbers
        Word vector in list of float.
    word_list : list of str
        List of words whose vectors are used to construct vec0. These words
        are left out of the search.

    Returns
    -------
    best_word : str
        The closest word to the input vector vec0, or "" if no entry could
        be scored.
    """
    #Cosine similarity is never below -1, so any scored entry beats this start value
    max_score = -100
    best_word = ""
    #Built once so that the membership test inside the loop is cheap
    excluded_words = set(word_list)

    #Iterate over the entire word vector dictionary and retain the closest word so far
    for item, vector in word_vector_dict.items():
        if item in excluded_words:
            continue
        try:
            vec1 = convert_str_to_float(vector)
            sim_score = 1 - spatial.distance.cosine(vec1, vec0)
        except (ValueError, TypeError):
            #Best effort: an entry that cannot be compared with vec0 (for example
            #a vector of a different length) is skipped instead of aborting the search
            continue
        if sim_score > max_score:
            max_score = sim_score
            best_word = item

    return best_word

def convert_text_to_vec(text1):
    """
    Convert the input text to a vector (like doc2vec) by averaging the vectors
    of its constituent words as given by GLOVE.

    The text is split on single spaces and every piece is looked up, as is,
    in the module-level word_vector_dict. Pieces without a usable vector are
    left out of the average.

    Parameters
    ----------
    text1 : str
        Input text data

    Returns
    -------
    doc2vec : list of floats
        Component-wise mean of the word vectors that were found. If no word
        of the text has a vector, a zero vector is returned and a warning is
        logged.
    """
    #Take the dimension from the dictionary itself instead of relying on one fixed word
    dimension = len(next(iter(word_vector_dict.values()), []))
    vec_resultant = [0.0] * dimension
    count = 0
    for item in text1.split(" "):
        vec_temp = word_vector_dict.get(item)
        #Unknown words and vectors of a different length cannot be averaged in
        if vec_temp is None or len(vec_temp) != dimension:
            continue
        vec_resultant = add(vec_resultant, vec_temp)
        count += 1

    if count == 0:
        #Nothing to average: return the zero vector rather than dividing by zero
        logging.warning("No word of the input text has a word vector")
        return vec_resultant

    doc2vec = [item/count for item in vec_resultant]
    return doc2vec

In [None]:
if __name__ == "__main__":
  #Read the word vectors (index 3 of LIST_GLOVE_FILES, i.e. glove.6B.300d) and store them
  #in a dictionary data structure, alternatively DataFrame can be used
  word_vectors_text = read_word_vectors_as_text(vector_file_index=3)
  #The reader reports a missing file by returning None; stop here with a clear
  #message instead of failing later on an unrelated error
  if word_vectors_text is None:
    raise RuntimeError("GLOVE vectors could not be read, see log.txt for details")
  word_vector_dict = create_word_vector_dictionary_from_text(word_vectors_text)
  #Only the dictionary is used from here on, so release the raw text
  del word_vectors_text

In [None]:
"""
TASK 1: Find the related fourth word with the inter-relation of these 3 words  
"""
#The three known words of the analogy; their vectors are looked up in the same order
word_list = ["delhi","india","germany"]
vec1, vec2, vec3 = [word_vector_dict[word] for word in word_list]

#delhi - india + germany is expected to land closest to "berlin"
vec_new = add(sub(vec1, vec2), vec3)
#The three input words themselves are excluded from the search
inferred_word = find_most_similar_one(vec_new, word_list)
print ("The inferred related word is:", inferred_word)

The inferred related word is: berlin


In [None]:
  """
  TASK 2: Find similary score between any two words. Score of 100 indicates the words are 
  exactly related and a score of 0 indicate that the words are not at all related  
  """
  #Look up the vectors of the two words to compare
  vec1 = word_vector_dict["spoon"]
  vec2 = word_vector_dict["forest"]
  #scipy gives the cosine distance; 1 - distance is the cosine similarity
  similarity_score = 1 - spatial.distance.cosine(vec1,vec2)
  #First print: the score shifted and scaled as (s+1)/2*100, i.e. 0..100 for s in -1..1
  print("Similarity Score is =", ((similarity_score+1.0)/2.0)*100, "%")
  #Second print: the raw score times 100, not rescaled, so it can be negative
  print("Similarity Score is =", (similarity_score)*100, "%")

In [None]:
  """
  TASK 2: Find similary score between any text pieces. Score of 100 indicates the texts are 
  exactly related and a score of 0 indicate that the texts are not at all related  
  """
  #NOTE(review): this is the third task cell although the text above is labelled "TASK 2" - confirm and renumber
  #NOTE(review): "semms" looks like a typo for "seems"; a word that is not in the vocabulary is
  #left out of the average by convert_text_to_vec - confirm whether that is intended here
  text1 = "the girl semms beautiful"
  text2 = "lady looks pretty"
  vec1 = convert_text_to_vec(text1) #the function uses a naive doc2vec extension of GLOVE word vectors
  vec2 = convert_text_to_vec(text2)
  #scipy gives the cosine distance; 1 - distance is the cosine similarity
  similarity_score = 1 - spatial.distance.cosine(vec1,vec2)
  #print("Similarity Score is =", ((similarity_score+1.0)/2.0)*100, "%")
  #Prints the raw score times 100, not rescaled to 0..100, so it can be negative
  print("Similarity Score is =", (similarity_score)*100, "%")

# **The following blocks are related to pushing code to and pulling code from its corresponding repository**


In [None]:
!git config --global user.email "abhijitsahoo0790@gmail.com"
!git config --global user.name "abhijitsahoo0790"

In [None]:
"""
For creating a new project in GitHub, it will throw error if it is executed after the project dir is created
"""
PROJ = "My Drive/Colab Notebooks/text_similarity_using_GLOVE" # This is a custom path.
PROJECT_PATH = join(ROOT, PROJ)
!mkdir "{PROJECT_PATH}"
!git clone https://github.com/abhijitsahoo0790/text_similarity_using_GLOVE.git "{PROJECT_PATH}"


In [None]:
"""
For updating an existing project from GitHub
"""
ROOT = "/content/drive"
PROJ = "My Drive/Colab Notebooks/text_similarity_using_GLOVE" # This is a custom path.
PROJECT_PATH = join(ROOT, PROJ)
%cd "{PROJECT_PATH}"
!git pull origin master

In [None]:
!git add -A
!git commit -a -m "completed all functions"

In [None]:
!git remote add origin "https://github.com/abhijitsahoo0790/text_similarity_using_GLOVE.git"

In [None]:
!git remote remove origin

In [None]:
#git remote add takes one remote name and one URL; the duplicated second URL has been dropped
!git remote add origin "https://github.com/abhijitsahoo0790/text_similarity_using_GLOVE.git"

In [None]:
!git push -u origin master

In [None]:
!ssh-keygen

In [None]:
!cat key2.pub

In [None]:
!ssh -T git@github.com