In [11]:


import os 
import numpy as np 
import string 
import pandas as pd 
from numpy.linalg import norm
import re 


class StringSimilarity:
    """Bag-of-words document pool with simple vector similarity measures.

    Every document is cleaned, stripped of stopwords and stored as a list of
    tokens in ``document_pool``.  Each stored document is additionally
    represented as a binary presence vector over the shared vocabulary
    (``dictionary``) in ``vector_pool``.  A query text can then be compared
    against all stored documents by dot product, cosine similarity and
    Euclidean distance.
    """

    def __init__(self):
        """Create an empty pool with the built-in English stopword list."""

        # document name -> list of cleaned tokens
        self.document_pool = {}

        # document name -> binary vector (list of 0/1) aligned with self.dictionary
        self.vector_pool = {}

        # vocabulary of all tokens seen so far (no punctuation / special characters)
        self.dictionary = set()

        self.stopwords = [

        "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours",
        "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers",
        "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves",
        "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are",
        "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does",
        "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until",
        "while", "of", "at", "by", "for", "with", "about", "against", "between", "into",
        "through", "during", "before", "after", "above", "below", "to", "from", "up", "down",
        "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here",
        "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more",
        "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so",
        "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"
        ]

    def add_documents(self, name, document):
        """Clean ``document`` and store it in the pool under ``name``.

        Parameters
        ----------
        name : str
            Key under which the document is stored.
        document : str
            Raw text of the document.

        Raises
        ------
        ValueError
            If a document with the same ``name`` is already in the pool.
        """
        # The pool is keyed by name, so the duplicate check has to look at the
        # name.  (Comparing the token list against the keys can never match.)
        # Checking before any mutation keeps the pool untouched on failure.
        if name in self.document_pool:
            raise ValueError("Text has already been added to pool")

        processed_document = self.main_cleaning(document)

        self.document_pool[name] = processed_document
        self.dictionary.update(processed_document)

        # The vocabulary may have grown, so every stored vector must be rebuilt.
        self.update_vectorpool()

    def add_texts(self):
        """Placeholder; not implemented yet."""
        pass

    # methods to clean and prepare the text documents

    @staticmethod
    def cleaning_text(text):
        """Normalise raw text to lowercase words separated by single spaces.

        Hyphens/underscores attached to a word are dropped (joining the parts),
        dotted abbreviations such as ``e.g.`` are removed, and every remaining
        non-word character and digit is replaced by whitespace.
        """
        text = text.strip()
        text = re.sub(r'(?<=\w)[_-]|[_-](?=\w)', '', text)  # join hyphenated / underscored words
        text = re.sub(r'\b(?:[a-zA-Z]\.)+[a-zA-Z]?[,]*\b', ' ', text)  # drop dotted abbreviations
        text = re.sub(r"\W", " ", text)  # remove non-word characters
        text = re.sub(r"\d", " ", text)  # remove digits
        text = re.sub(r"[\s]+", " ", text)  # collapse runs of whitespace
        return text.lower()  # lowercase for matching

    @staticmethod
    def load_text(path, encoding=None):
        """Read the file at ``path`` and return its cleaned tokens as a list.

        ``encoding`` is forwarded to ``open``; the default ``None`` keeps the
        platform default encoding.
        """
        with open(path, 'r', encoding=encoding) as handle:  # closed after reading
            raw_text = handle.read()

        return StringSimilarity.string_to_list(raw_text)

    @staticmethod
    def create_doc_list(curr_path):
        """Return the directory entries of the ``Corpus`` folder below ``curr_path``."""
        corpus_path = os.path.join(curr_path, 'Corpus')

        return os.listdir(corpus_path)

    def create_corpus(self, path=None):
        """Load every new ``.txt`` file from ``<path>/Corpus`` into the pool.

        Parameters
        ----------
        path : str, optional
            Directory that contains the ``Corpus`` folder.  Defaults to the
            current working directory.

        Returns
        -------
        str
            A status message stating how many new documents were added.
        """
        if path is None:
            path = os.getcwd()

        corpus_path = os.path.join(path, 'Corpus')
        new_count = 0

        for file_name in StringSimilarity.create_doc_list(path):

            # Skip non-text files and files that were loaded earlier.
            if not file_name.endswith('.txt') or file_name in self.document_pool:
                continue

            tokens = StringSimilarity.load_text(os.path.join(corpus_path, file_name))
            tokens = self.removing_stopwords(tokens)

            self.dictionary.update(tokens)
            # Store each token once; dict.fromkeys keeps first-seen order so
            # the stored list is deterministic.
            self.document_pool[file_name] = list(dict.fromkeys(tokens))
            new_count += 1

        self.update_vectorpool()

        if new_count == 0:
            return "no new documents in folder"

        return f"where have been {new_count} new documents in the folder"

    @staticmethod
    def string_to_list(string1):
        """Clean ``string1`` and split it into a list of tokens."""
        return StringSimilarity.cleaning_text(string1).split()

    # removing stopwords

    def removing_stopwords(self, list_words):
        """Return ``list_words`` without the entries of ``self.stopwords`` (order kept)."""
        # A set gives constant-time membership tests instead of scanning the list.
        stopword_set = set(self.stopwords)

        return [word for word in list_words if word not in stopword_set]

    def main_cleaning(self, text):
        """Full preprocessing pipeline: clean, tokenise and remove stopwords."""
        text_list = StringSimilarity.string_to_list(text)

        return self.removing_stopwords(text_list)

    def create_vector(self, word_list):
        """Build a binary presence vector for ``word_list`` over the vocabulary.

        Position ``i`` is 1 when the ``i``-th word of ``self.dictionary`` (in
        its current iteration order) occurs in ``word_list``, otherwise 0.
        Words that are not in the vocabulary are ignored.
        """
        # Set lookup avoids rescanning word_list for every vocabulary entry.
        present = set(word_list)

        return [1 if word in present else 0 for word in self.dictionary]

    def update_vectorpool(self):
        """Recompute the vector of every stored document against the current vocabulary."""
        for doc_name, tokens in self.document_pool.items():
            self.vector_pool[doc_name] = self.create_vector(tokens)

        print("all vectors are updated")

    @staticmethod
    def rank_vectors(dict1, reverse=True):
        """Return a new dict with the items of ``dict1`` ordered by value.

        Parameters
        ----------
        dict1 : dict
            Mapping of document name to score.
        reverse : bool, optional
            ``True`` (default) puts the largest score first; ``False`` puts
            the smallest first.

        String values (such as the ``'no matches'`` placeholder) cannot be
        compared with numbers, so they are ordered among themselves and placed
        after all numeric scores.
        """
        numeric_items = []
        text_items = []
        for item in dict1.items():
            (text_items if isinstance(item[1], str) else numeric_items).append(item)

        # list.sort is stable (also with reverse=True), so ties keep insertion order.
        numeric_items.sort(key=lambda item: item[1], reverse=reverse)
        text_items.sort(key=lambda item: item[1], reverse=reverse)

        return dict(numeric_items + text_items)

    def _vectorize_query(self, new_doc):
        """Clean a raw query text and turn it into a vocabulary vector."""
        return self.create_vector(self.main_cleaning(new_doc))

    def dot_product_normal(self, new_doc):
        """Rank stored documents by dot product with ``new_doc`` (largest first).

        With binary vectors the dot product equals the number of vocabulary
        words shared by the query and the document.
        """
        new_vector = self._vectorize_query(new_doc)

        final_dict = {
            doc_name: np.dot(new_vector, self.vector_pool[doc_name])
            for doc_name in self.document_pool
        }

        return StringSimilarity.rank_vectors(final_dict)

    def cosine_Similarity(self, new_doc):
        """Rank stored documents by cosine similarity with ``new_doc`` (largest first).

        When the query vector or a document vector is all zeros the cosine is
        undefined; such documents get the value ``'no matches'`` and are
        listed after the numeric scores.
        """
        cosine_values = {}

        new_vector = self._vectorize_query(new_doc)
        query_norm = norm(new_vector)  # loop-invariant

        for doc_name in self.document_pool:

            doc_vector = self.vector_pool[doc_name]
            denominator = query_norm * norm(doc_vector)

            if denominator != 0:
                cosine_values[doc_name] = np.dot(new_vector, doc_vector) / denominator
            else:
                cosine_values[doc_name] = 'no matches'

        return StringSimilarity.rank_vectors(cosine_values)

    def Euclidean_distance(self, new_doc):
        """Rank stored documents by Euclidean distance to ``new_doc``.

        A smaller distance means a closer match, so the result is ordered
        smallest distance first.
        """
        query_array = np.array(self._vectorize_query(new_doc))

        euclidean_values = {
            doc_name: np.linalg.norm(np.array(self.vector_pool[doc_name]) - query_array)
            for doc_name in self.document_pool
        }

        return StringSimilarity.rank_vectors(euclidean_values, reverse=False)

    @staticmethod
    def create_dataframe(dict1, dict2, dict3):
        """Combine three score dicts into one DataFrame.

        The result has one row per document name and the columns
        ``dot_product``, ``cosine`` and ``Euclidean`` (in the order the dicts
        are passed).
        """
        df = pd.DataFrame([dict1, dict2, dict3])

        # One row per input dict -> transpose to one row per document.
        df = df.T

        df.columns = ["dot_product", "cosine", "Euclidean"]

        return df

    def user_interaction(self):
        """Ask for a query text on stdin and return all three measures as a DataFrame."""
        q1 = input('Please Enter the text you want to compare and press Enter')

        result1 = self.dot_product_normal(q1)

        result2 = self.cosine_Similarity(q1)

        result3 = self.Euclidean_distance(q1)

        return StringSimilarity.create_dataframe(result1, result2, result3)
            
            
            
            

In [12]:
# Build the pool from the .txt files in the "Corpus" folder below the current
# working directory; the returned status message is shown as the cell output.
document_pool3 = StringSimilarity()
document_pool3.create_corpus()


all vectors are updated


'where have been 2 new documents in the folder'

In [13]:

# Add one more document from an in-memory string under the name "text1".
test_string = "If this is the case, they have the right to access their personal data held by the controller, thereby ensuring the transparency of the data processing. The controller should provide the data subject with a copy of all information about the data subject upon receipt of the request. The information should include the purpose of the data processing, the categories of personal data, the duration of storage, the recipients of the data, the rights to rectification, erasure or restriction of the data, the right to lodge a complaint, the source of the data if it was not collected from the data subject, and information on automated decision-making. (Trzaskowski and Gersvang Sørensen, 2022) Grindr has introduced two methods for applying Article 15. The first allows the user to download their data within the Grindr application and secondly the user can request the data with a form. The online form allows users to make personalized written requests about their data and request the deletion of their account and data"
document_pool3.add_documents("text1", test_string)

all vectors are updated


In [16]:
# Prompts for a query text and displays dot product, cosine and Euclidean
# scores for every document in the pool as a DataFrame.
document_pool3.user_interaction()

Unnamed: 0,dot_product,cosine,Euclidean
text2.txt,25.0,0.392546,9.643651
example1.txt,21.0,0.332956,9.949874
text1,18.0,0.375244,7.874008
