In [None]:


import numpy as np 
import string 
import pandas as pd 
from numpy.linalg import norm
import nltk
import re 

# modules for stop-word removal
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords


# modules for stemming
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer

# modules for lemmatization
from nltk.stem import WordNetLemmatizer 
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet

nltk.download('stopwords')
nltk.download('wordnet')

class StringSimilarity:
    """Bag-of-words similarity search over a pool of named text documents.

    Each added document is cleaned (punctuation and digits stripped, English
    stop words removed, words lemmatized) and stored as a list of words. A
    query text is cleaned the same way; query and documents are then encoded
    as binary word-presence vectors over the pool's dictionary and compared
    by dot product, cosine similarity or Euclidean distance.
    """

    def __init__(self):
        # name -> cleaned word list of every document added so far
        self.document_pool = {}

        # not populated by any method of this class yet
        self.vector_pool = {}

        # every distinct cleaned word seen across all added documents
        self.dictionary = set()

    def add_documents(self, name, document):
        """Clean ``document`` and store it in the pool under ``name``.

        Args:
            name: Key under which the document is stored. Re-using a name
                replaces the previously stored word list.
            document: Raw text of the document.

        Raises:
            ValueError: If a document with the same cleaned word list is
                already in the pool.
        """
        processed_document = StringSimilarity.main_cleaning(document)

        # Compare against the stored word lists: the keys are document names,
        # so a word list can never be equal to one of them.
        if processed_document in self.document_pool.values():
            raise ValueError("Text has already been added to pool")

        self.document_pool[name] = processed_document
        self.dictionary.update(processed_document)

    # methods to clean and prepare the text documents

    @staticmethod
    def cleaning_text(text):
        """Return ``text`` lower-cased with only letters and single spaces left.

        Args:
            text: Raw input string.

        Returns:
            The normalized string. A trailing space can remain when the input
            ends in a non-word character or digit.
        """
        text = text.strip()
        # drop '_' / '-' that touch a word character, joining hyphenated words
        text = re.sub(r'(?<=\w)[_-]|[_-](?=\w)', '', text)
        # blank out dotted single-letter abbreviations such as "i.e"
        text = re.sub(r'\b(?:[a-zA-Z]\.)+[a-zA-Z]?[,]*\b', ' ', text)
        text = re.sub(r"\W", " ", text)  # remove non-word characters
        text = re.sub(r"\d", " ", text)  # remove digits
        text = re.sub(r"[\s]+", " ", text)  # collapse runs of whitespace
        text = text.lower()  # lower-case for matching
        return text

    @staticmethod
    def string_to_list(string1):
        """Clean ``string1`` and split it into a list of words."""
        return StringSimilarity.cleaning_text(string1).split()

    @staticmethod
    def removing_stopwords(list_words):
        """Return ``list_words`` without NLTK's English stop words."""
        stop_words = set(stopwords.words('english'))
        return [word for word in list_words if word not in stop_words]

    @staticmethod
    def stemming_words(word_list, stemmer=None):
        """Stem every word in ``word_list``.

        Args:
            word_list: Iterable of words.
            stemmer: Optional object exposing ``stem(word)``. Defaults to a
                ``PorterStemmer``, which keeps the result this method always
                produced (earlier stemmer results were overwritten).

        Returns:
            List of stemmed words, in input order.
        """
        if stemmer is None:
            stemmer = PorterStemmer()
        return [stemmer.stem(word) for word in word_list]

    @staticmethod
    def pos_tagger(nltk_tag):
        """Map an NLTK POS tag to the matching WordNet constant.

        Returns:
            ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN`` or
            ``wordnet.ADV`` for tags starting with J, V, N or R respectively;
            ``None`` for any other tag.
        """
        if nltk_tag.startswith('J'):
            return wordnet.ADJ
        elif nltk_tag.startswith('V'):
            return wordnet.VERB
        elif nltk_tag.startswith('N'):
            return wordnet.NOUN
        elif nltk_tag.startswith('R'):
            return wordnet.ADV
        else:
            return None

    @staticmethod
    def lemming_words(word_list):
        """Lemmatize ``word_list`` using POS tags from ``nltk.pos_tag``.

        Words whose tag has no WordNet equivalent are kept unchanged.

        Returns:
            List of lemmatized words, in input order.
        """
        lemmatizer = WordNetLemmatizer()

        lemmatized_sentence = []
        for word, nltk_tag in nltk.pos_tag(word_list):
            tag = StringSimilarity.pos_tagger(nltk_tag)
            if tag is None:
                lemmatized_sentence.append(word)
            else:
                lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))

        return lemmatized_sentence

    @staticmethod
    def main_cleaning(text):
        """Run the full pipeline: clean and split, drop stop words, lemmatize."""
        text_list = StringSimilarity.string_to_list(text)
        text_list = StringSimilarity.removing_stopwords(text_list)
        text_list = StringSimilarity.lemming_words(text_list)  # Use lemmatization
        return text_list

    def update_vectors(self):
        """Placeholder; currently does nothing."""
        pass

    def create_vetor(self, word_list):
        """Encode ``word_list`` as a binary vector over ``self.dictionary``.

        Args:
            word_list: Iterable of cleaned words.

        Returns:
            List with one entry per dictionary word (in the set's iteration
            order): 1 if the word occurs in ``word_list``, else 0. Words that
            are not in the dictionary do not affect the result.
        """
        # set lookup instead of scanning the list once per dictionary word
        present = set(word_list)
        return [1 if word in present else 0 for word in self.dictionary]

    @staticmethod
    def rank_vectors(dict1, descending=True):
        """Return a copy of ``dict1`` ordered by value.

        Args:
            dict1: Mapping of name -> mutually comparable score.
            descending: Largest value first when True (default), smallest
                first when False.
        """
        return dict(sorted(dict1.items(), key=lambda item: item[1], reverse=descending))

    def create_matrix(self):
        """Placeholder; currently does nothing."""
        pass

    def dot_product_normal(self, new_doc):
        """Rank pool documents by the number of dictionary words shared with ``new_doc``.

        Returns:
            Dict of name -> dot product of the binary vectors, largest first.
        """
        new_vector = self.create_vetor(self.main_cleaning(new_doc))

        final_dict = {
            name: np.dot(new_vector, self.create_vetor(words))
            for name, words in self.document_pool.items()
        }

        return StringSimilarity.rank_vectors(final_dict)

    def cosine_Similarity(self, new_doc):
        """Rank pool documents by cosine similarity to ``new_doc``.

        Returns:
            Dict of name -> cosine similarity, largest first. Documents for
            which the similarity is undefined (query or document vector has
            zero norm) map to the string ``'no matches'`` and are listed after
            all scored documents, in pool order.
        """
        new_vector = self.create_vetor(self.main_cleaning(new_doc))
        new_norm = norm(new_vector)  # loop-invariant

        cosine_values = {}
        unmatched = {}
        for name, words in self.document_pool.items():
            temp_vector = self.create_vetor(words)
            denominator = new_norm * norm(temp_vector)

            if denominator != 0:
                cosine_values[name] = np.dot(new_vector, temp_vector) / denominator
            else:
                unmatched[name] = 'no matches'

        # Rank only the numeric scores: sorting floats together with the
        # 'no matches' strings is not a valid comparison.
        ranked = StringSimilarity.rank_vectors(cosine_values)
        ranked.update(unmatched)
        return ranked

    def Euclidean_distance(self, new_doc):
        """Rank pool documents by Euclidean distance to ``new_doc``.

        Returns:
            Dict of name -> distance between the binary vectors, smallest
            distance (closest document) first.
        """
        new_vector = np.array(self.create_vetor(self.main_cleaning(new_doc)))

        euclidean_values = {
            name: norm(np.array(self.create_vetor(words)) - new_vector)
            for name, words in self.document_pool.items()
        }

        # a smaller distance means a closer document, so sort ascending
        return StringSimilarity.rank_vectors(euclidean_values, descending=False)
            

In [None]:
# Build a small demo pool of three named documents.
document_pool3 = StringSimilarity()

text1 = 'Create a python program that will, compute the text document similarity between different documents one two three!!'
text2 = 'Your implementation will take a list of documents as an input text corpus, and it will compute a dictionary of words for the given corpus.'
text3 = 'Later, when a new document (i.e, search document) is provided, your implementation should provide a list of documents that are similar to the given search document, in descending order of their similarity with the search document.'

# Register the documents in order: doc1, doc2, doc3.
for doc_name, doc_text in (('doc1', text1), ('doc2', text2), ('doc3', text3)):
    document_pool3.add_documents(doc_name, doc_text)

In [None]:
# Query the demo pool with each of the three similarity measures.
test_string = 'A text document can be represented as a word vector against a given dictionary of words.'

for ranking in (
    document_pool3.dot_product_normal,
    document_pool3.cosine_Similarity,
    document_pool3.Euclidean_distance,
):
    print(ranking(test_string))