In [23]:


import os 
import numpy as np 
import pandas as pd 
from numpy.linalg import norm
import re 


class StringSimilarity: 
    
    """
    A class for computing string similarity using various metrics.
    This class provides functionality to clean and process text documents,
    calculate similarity scores, and manage a collection of text documents.
    
    """
    
    def __init__(self): 
        
        """
        Initializes the StringSimilarity class with empty structures for storing documents.
        
        """
        
        # Dictionary to store the processed documents
        self.document_pool = {}
        
        # Dictionary to store vector representations of the documents
        self.vector_pool = {}
        
        # Set to store unique words across all documents
        self.dictionary = set()
        
        
        # list of stopwords for basic text filtering
        self.stopwords = [
            
        "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", 
        "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", 
        "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", 
        "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", 
        "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", 
        "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", 
        "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", 
        "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", 
        "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", 
        "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", 
        "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", 
        "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now" 
        ]
     
    
    def add_documents(self,name, document): 
        
        """
        Manual adds a document to the document pool after processing it.

        Args:
            name (str): The name or identifier for the document.
            document (str): The text of the document to be added.

        Raises:
            ValueError: If document or name is string
            ValueError: If name or document is empty 
            ValueError: Processed document is empty. It might contain only stopwords or non-words
            ValueError: If the processed document already exists in the document pool.
        """
        
        
        if not isinstance(name, str) or not isinstance(document, str):
            
            
            raise TypeError("Both name and document must be strings.")
        
        if not name:
            raise ValueError("Document name is empty.")
    
        if not document:
            raise ValueError("Document content is empty.")
        
        
        processed_document = self.main_cleaning(document)
        
        if not processed_document:
            raise ValueError("Processed document is empty. It might contain only stopwords or non-words.")
        
        
        # Check if the document is not already in the pool
        if processed_document not in list(self.document_pool.keys()): 
            
            self.document_pool[name] = processed_document
            
            self.dictionary.update(set(processed_document))
            
            # after a new document is added to pool, all vectors have to be updated because dictionary is longer. 
            self.update_vectorpool() 
            
        else: 
            raise ValueError(f"The text {processed_document} has already been added to pool")
    


    
    @staticmethod
    def cleaning_text(text): 
        
        
        """
        Static method to clean a given text.

        Args:
            text (str): The text to be cleaned.

        Returns:
            str: The cleaned text.
            
        Raises: 
            TypeError: If the input text is not a string.
            ValueError: If Input text is empty or only contains whitespace
            
        """
        
        if not isinstance(text, str):
            raise TypeError("Input text must be a string.")
        
        if text.strip() == "":
            
            raise ValueError("Input text is empty or only contains whitespace.")
        
        
        text = text.strip() # removes whitespaces in the beginning and end
        text = re.sub(r'(?<=\w)[_-]|[_-](?=\w)', '', text) # Removes hyphens or underscores that are surrounded by word characters.
        text = re.sub(r'\b(?:[a-zA-Z]\.)+[a-zA-Z]?[,]*\b', ' ', text) # Replaces abbreviations or initials and optional trailing commas with a space.
        text = re.sub(r"\W", " ", text)  #remove non words char
        text = re.sub(r"\d", " ", text)  #remove digits char
        text = re.sub(r"[\s]+", " ", text) # remove extra white space
        text = text.lower() #lower char for matching
        
        return text 

    
    @staticmethod
    def load_text(path):
        
        """
        Static method to load text from a given file path.

        Args:
            path (str): The file path from which to load the text.

        Returns:
            list: The processed list of words from the file.
        
        Raises: 
            ValueError: If input is not a string
            FileNotFoundError: If the path can not be found within the operating system 
        """
        if not isinstance(path, str): 
            
            raise ValueError("The file path must be a string.")
        
        if not os.path.exists(path):
            raise FileNotFoundError(f"The file does not exist at the path: {path}")
        
        
        # add try ... except???
        with open(path, 'r') as file: #Automatically closes the file after reading
            
            file = StringSimilarity.string_to_list(file.read())
        
        return file
    
    
    @staticmethod
    def create_doc_list(curr_path): 
        
        """
        Static method to create a list of document names in the 'Corpus' directory.

        Args:
            curr_path (str): The current working directory path.

        Returns:
            list: A list of filenames found in the 'Corpus' subdirectory.
            
        Raises: 
            FileNotFoundError: If the path can not be found within the operating system 
        """
        
        # Construct the path to the 'Corpus' directory which contains .txt files
        corpus_path = os.path.join(curr_path, 'Corpus')
        
        if not os.path.exists(corpus_path):
            raise FileNotFoundError(f"The file does not exist at the path: {corpus_path}")

        # List all files in the 'Corpus' directory
        objects = os.listdir(corpus_path)
        
        return objects 

    def create_corpus(self): 
        
        """
        Method to create a corpus by processing and adding text files from the 'Corpus' directory.
        Updates the document pool with new documents and their processed content.
        
        Returns:
            str: A message indicating the outcome of the corpus creation
        
        Raises: 
            Exception: If an unexpected error occurs during file processing.
        
        """
        
        # Get the current working directory
        path = os.getcwd()
        
        # Retrieve the list of text files in the 'Corpus' directory
        
        try: 
            
            text_files = StringSimilarity.create_doc_list(path)
            
        except Exception as e: 
            
            raise Exception(f'Failed to create document list: {e}')
        
        # create path to Corpus folder 
        corpus_path = os.path.join(path, 'Corpus')
        
        # count number of documents
        new_count = 0
        
        
        for i in text_files: 
            
            # Process only text files and avoid duplicates
            if i.endswith('.txt'): 
                
                # avoid duplicates in document pool
                if i not in self.document_pool.keys(): 
                    
                    
                    try: 
                    # Load and process the text file
                        temp_text = StringSimilarity.load_text(os.path.join(corpus_path, i))
                        temp_text = self.removing_stopwords(temp_text)

                        # Update the dictionary and document pool
                        self.dictionary.update(set(temp_text))
                        self.document_pool[i] = list(set(temp_text))
                        new_count+= 1 
                    except Exception as e: 
                        
                        raise Exception(f'Failed to load document {i} because of {e}')
                        
                    
                else: 
                    continue
            else: 
                continue 
        
        # Update the vector pool with new vectors
        self.update_vectorpool()  
        
        if new_count == 0: 
            
            return "no new documents in folder"
        else: 
            
            return f"where have been {str(new_count)} new documents in the folder"
            
    
    
    
    @staticmethod
    def string_to_list(string1): 
        
        """
        Static method to convert a cleaned string into a list of words.

        Args:
            string1 (str): The string to be converted.

        Returns:
            list: A list of words from the string.
            
        Raises: 
            TypeError: If string is not string
            ValueError: If string is empty after cleaning 
        """
        
        if not isinstance(string1, str):
            raise TypeError("Input must be a string.")
        
        
        # Convert the cleaned string into a list of words
        clean_text = StringSimilarity.cleaning_text(string1)

        if not clean_text.strip():
            raise ValueError("Input string is empty or contains only whitespace after cleaning.")
        
        return clean_text.split()


    def removing_stopwords(self, list_words): 
        
        """
        Method to remove stopwords from a list of words.

        Args:
            list_words (list): The list of words from which stopwords are to be removed.

        Returns:
            list: A list of words with stopwords removed.
            
        Raises: 
            TypError: If Type of Input is not a list of words 
            ValueError: If list from Input is empty

            
        """
        if not isinstance(list_words, list):
            raise TypeError("Input must be a list of words.")

        if not list_words:
            raise ValueError("Input list of words is empty.")
        
        
        # Filter out stopwords from the list of words
        text_without_stop = [word for word in list_words if word not in self.stopwords]
        
        return text_without_stop
    
    
    def main_cleaning(self, text): 
        
        """
        Method to perform cleaning of the text, converting it into a list of words and removing stopwords.

        Args:
            text (str): The text to be cleaned.

        Returns:
            list: A list of cleaned words from the text.
            
        Raises: 
            TypeError: If input is not a string 
            ValueError: If the input text is empty.
        """
        
        if not isinstance(text, str):
            raise TypeError("Input must be a string.")
        
        
        if text.strip() == "":
            raise ValueError("Input text is empty or only contains whitespace.")
        
        # Clean text, convert text to a list of words and remove stopwords
        text_list = StringSimilarity.string_to_list(text)
        text_list = self.removing_stopwords(text_list)
        
        return text_list      
    
   

    def create_vector(self, word_list): 
        
        """
        Creates a binary vector representation for a given list of words.

        Args:
            word_list (list): A list of words to be converted into a vector.

        Returns:
            list: A binary vector where 1 represents the presence of a word from the word list in the dictionary.
            
        Raises: 
            TypeError: If the input is not a list.
            ValueError: If the input list is empty or the dictionary is not initialized.
            
        """


        if not isinstance(word_list, list):
            raise TypeError("Input must be a list of words.")

        if not word_list:
            raise ValueError("Input word list is empty.")

        if not self.dictionary:
            raise ValueError("Dictionary is not initialized. Add some documents first.")
        
        # Initialize a vector of zeros with the same length as the dictionary
        vector = [0] * len(self.dictionary)
        

        # Set elements to 1 in the vector for words present in the word list
        for i, word in enumerate(self.dictionary): 
            
            if word in word_list: 
                vector[i] = 1
            else: 
                continue 
            
        return vector 
    
    
    def update_vectorpool(self):
        
        """
        Updates the vector representations for all documents in the document pool.
        
        Raises: 
            ValueError: If the document pool is empty.
        """ 
        
        
        if not self.document_pool:
            raise ValueError("Document pool is empty. Add some documents before updating the vector pool.")

        # Check if the dictionary is initialized
        if not self.dictionary:
            raise ValueError("Dictionary is not initialized. Add some documents to create the dictionary.")

        try:
            # Update vector for each document in the document pool
            for i in self.document_pool.keys():
                self.vector_pool[i] = self.create_vector(self.document_pool[i])

            print("All vectors are updated")
            
    
        except Exception as e:
            raise Exception(f"An error occurred while updating the vector pool: {e}")
        
    

    @staticmethod
    def rank_vectors(dict1): 
        
        """
        Ranks vectors based on their values.

        Args:
            dict1 (dict): A dictionary of vectors to be ranked.

        Returns:
            dict: A dictionary with vectors ranked in descending order of their values.
        
        Raises:
            TypeError: If the input is not a dictionary.
            ValueError: If the input dictionary is empty.
        """
        
        
        if not isinstance(dict1, dict):
            raise TypeError("Input must be a dictionary.")

        if not dict1:
            raise ValueError("Input dictionary is empty.")  

        
        # Sort the dictionary in descending order based on values
        return dict(sorted(dict1.items(), key=lambda item: item[1], reverse=True))

        
        
        
    def dot_product_normal(self, new_doc): 
        
        """
        Calculates the dot product similarity between a new document and all documents in the document pool.

        Args:
            new_doc (str): The text of the new document.

        Returns:
            dict: A dictionary of dot product similarity scores.
        
        Raises:
            TypeError: If the new document is not a string.
            ValueError: If the new document is empty or only contains whitespace.
            ValueError: If the document pool is empty.
        """
        if not isinstance(new_doc, str):
            raise TypeError("The new document must be a string.")
        
        if new_doc.strip() == "":
            raise ValueError("The new document is empty or only contains whitespace.")

        if not self.document_pool:
            raise ValueError("Document pool is empty. Add some documents before calculating dot product.")
        
        final_dict = {}
        
        
        # cleans new text and create vector
        clean_text = self.main_cleaning(new_doc)
        new_vector = self.create_vector(clean_text)
        
        
        # Calculate dot product with each document vector
        for text in self.document_pool.keys(): 

            final_dict[text] = np.dot(new_vector, self.vector_pool[text])
        
        return StringSimilarity.rank_vectors(final_dict)
    
    

    def cosine_Similarity(self, new_doc): 
        
        """
        Calculates the cosine similarity between a new document and all documents in the document pool.

        Args:
            new_doc (str): The text of the new document.

        Returns:
            dict: A dictionary of cosine similarity scores.
            
        Raises:
            TypeError: If the new document is not a string.
            ValueError: If the new document is empty or only contains whitespace.
            ValueError: If the document pool is empty.
        """
        if not isinstance(new_doc, str):
            raise TypeError("The new document must be a string.")

        if new_doc.strip() == "":
            raise ValueError("The new document is empty or only contains whitespace.")

        if not self.document_pool:
            raise ValueError("Document pool is empty. Add some documents before calculating cosine similarity.")
        
        
        cosine_values = {}
        
        
        # cleans new text and create vector
        clean_text = self.main_cleaning(new_doc)
        
        new_vector = self.create_vector(clean_text)
        
        
        # Calculate cosine similarity with each document vector
        for i in self.document_pool.keys(): 
            
            temp_vector = self.vector_pool[i]
            
            if norm(new_vector)*norm(temp_vector) != 0: 
                
                cosine = np.dot(new_vector,temp_vector)/(norm(new_vector)*norm(temp_vector))
                
                cosine_values[i] = cosine
                
            else: 
                cosine_values[i] = 'no matches'
            
        return StringSimilarity.rank_vectors(cosine_values)
    
    
    def Euclidean_distance(self, new_doc): 
        
        """
        Calculates the Euclidean distance between a new document and all documents in the document pool.

        Args:
            new_doc (str): The text of the new document.

        Returns:
            dict: A dictionary of Euclidean distance scores.
            
        Raises:
            TypeError: If the new document is not a string.
            ValueError: If the new document is empty or only contains whitespace.
            ValueError: If the document pool is empty.
            
        """
        if not isinstance(new_doc, str):
            raise TypeError("The new document must be a string.")

        if new_doc.strip() == "":
            raise ValueError("The new document is empty or only contains whitespace.")

        if not self.document_pool:
            raise ValueError("Document pool is empty. Add some documents before calculating Euclidean distance.")
        
        euclidean_values = {}
        
        # cleans new text and create vector
        clean_text = self.main_cleaning(new_doc)
        new_vector = self.create_vector(clean_text)
        
        
        # Calculate Euclidean distance with each document vector
        for i in self.document_pool.keys(): 
            
            temp_vector = self.vector_pool[i]
            
            dist = np.linalg.norm(np.array(temp_vector) - np.array(new_vector))
            euclidean_values[i] = dist 
            
        return StringSimilarity.rank_vectors(euclidean_values)
    
    def Jaccard_similarity(self, new_doc): 
        
        """
        Calculates the Jaccard similarity between a new document and all documents in the document pool.

        Args:
            new_doc (str): The text of the new document.

        Returns:
            dict: A dictionary of Jaccard similarity scores.
            
        Raises:
            TypeError: If the new document is not a string.
            ValueError: If the new document is empty or only contains whitespace.
            ValueError: If the document pool is empty.
        """
        if not isinstance(new_doc, str):
            raise TypeError("The new document must be a string.")

        if new_doc.strip() == "":
            raise ValueError("The new document is empty or only contains whitespace.")

        if not self.document_pool:
            raise ValueError("Document pool is empty. Add some documents before calculating Jaccard similarity.")
        jaccard_values = {}
        
        # cleans new text and create set of words
        clean_text = self.main_cleaning(new_doc)
        set_new_words = set(clean_text)
        
        # Iterate over each document in the document pool
        for name, words in self.document_pool.items(): 
            
            set_old_words = set(words)
            
            # Calculate the intersection and union
            intersection = set_new_words.intersection(set_old_words)
            union = set_new_words.union(set_old_words)

            # Calculate Jaccard similarity and add to the dictionary
            jaccard_sim = len(intersection) / len(union) if union else 0
            jaccard_values[name] = jaccard_sim
        
        return  jaccard_values 
        
    
    @staticmethod
    def create_dataframe(dict1, dict2, dict3, dict4): 
        
        """
        Creates a DataFrame from four dictionaries of similarity scores by each method.

        Args:
            dict1, dict2, dict3 (dict): Dictionaries of similarity scores seperated by method.

        Returns:
            DataFrame: A DataFrame with the similarity scores from the three dictionaries.

        """

        
        df = pd.DataFrame([dict1,dict2, dict3, dict4])
        
        df = df.T # Transpose to have keys as rows
    
        df.columns = ["dot_product", "cosine", "Euclidean", "jaccard"]
        
        return df 
    
    def user_interaction(self): 
        
        """
        Facilitates user interaction for comparing a new text with the document pool.

        Returns:
            DataFrame: A DataFrame showing the similarity scores of the new text with each document in the pool.
        Raises: 
            ValueError: If User input is empty

        """
        
        # Prompt the user to enter text
        q1 = input('Please Enter the text you want to compare and press Enter')
        
        if q1.strip() == "":
            raise ValueError("Entered text is empty or only contains whitespace. Please enter valid text.")
        
        try: 
        # Compute similarity scores
            result1 = self.dot_product_normal(q1)
            result2 = self.cosine_Similarity(q1)
            result3 = self.Euclidean_distance(q1)
            result4 = self.Jaccard_similarity(q1)
            
            
            # Create and return a DataFrame with the results               
            return StringSimilarity.create_dataframe(result1, result2, result3, result4)
        
        except Exception as e:
            
            raise Exception(f"An error occurred while calculating similarity scores: {e}") 
            
            
            
            
            

In [24]:
Document_similarity = StringSimilarity()
Document_similarity.create_corpus()


All vectors are updated


'where have been 4 new documents in the folder'

In [25]:
Document_similarity.document_pool

{'text1.txt': ['lengths',
  'today',
  'shape',
  'historical',
  'ranging',
  'computer',
  'notion',
  'system',
  'foundational',
  'realm',
  'problemsolving',
  'groundwork',
  'rowan',
  'cornerstone',
  'determining',
  'reasoning',
  'like',
  'euclidean',
  'concepts',
  'back',
  'vectors',
  'know',
  'significant',
  'electromagnetism',
  'acting',
  'century',
  'reduction',
  'work',
  'processing',
  'developing',
  'mechanics',
  'linear',
  'array',
  'capture',
  'known',
  'product',
  'phenomena',
  'meaningful',
  'pivotal',
  'projection',
  'numbers',
  'finds',
  'remains',
  'beyond',
  'operation',
  'origins',
  'moving',
  'intensity',
  'multiplication',
  'geometry',
  'however',
  'science',
  'explored',
  'scalar',
  'dimensionality',
  'made',
  'rich',
  'image',
  'facilitates',
  'geometric',
  'mathematical',
  'produce',
  'roots',
  'impact',
  'widely',
  'mathematicians',
  'conclusion',
  'involves',
  'spaces',
  'modern',
  'particularly',
 

In [8]:

test_string = "If this is the case, they have the right to access their personal data held by the controller, thereby ensuring the transparency of the data processing. The controller should provide the data subject with a copy of all information about the data subject upon receipt of the request. The information should include the purpose of the data processing, the categories of personal data, the duration of storage, the recipients of the data, the rights to rectification, erasure or restriction of the data, the right to lodge a complaint, the source of the data if it was not collected from the data subject, and information on automated decision-making. (Trzaskowski and Gersvang Sørensen, 2022) Grindr has introduced two methods for applying Article 15. The first allows the user to download their data within the Grindr application and secondly the user can request the data with a form. The online form allows users to make personalized written requests about their data and request the deletion of their account and data"
Document_similarity.add_documents("text1", test_string)

all vectors are updated


In [26]:
Document_similarity.user_interaction()

Unnamed: 0,dot_product,cosine,Euclidean,jaccard
text3.txt,20.0,0.277617,12.767145,0.090498
text2.txt,17.0,0.252581,12.124356,0.084158
text1.txt,12.0,0.150827,14.73092,0.044944
text4.txt,11.0,0.161312,12.767145,0.051887
