In [None]:
# Mount Google Drive inside the Colab runtime; the CSV files read and written
# further down live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install tqdm



In [None]:
import functools
import hashlib
import random
from collections import defaultdict

import pandas as pd
from nltk.tokenize import word_tokenize
from tqdm import tqdm

class FilterDataset:
    """Static helpers for length filtering and MinHash/LSH near-duplicate detection.

    Typical flow: filter rows by word count, turn each sentence into a set of
    word k-grams ("shingles"), compute one MinHash signature per sentence,
    split the signatures into bands, and compare only the signatures that
    share at least one band.
    """

    @staticmethod
    def eliminate_sentences_lower_than_size(pandas_dataset, column_to_use, min_length=5, max_length=50):
        """Return the rows whose text has strictly between ``min_length`` and ``max_length`` words.

        Only alphabetic tokens produced by ``word_tokenize`` are counted. Both
        bounds are exclusive: a row with exactly ``min_length`` or exactly
        ``max_length`` words is removed. Cells that are not strings (for
        example NaN) count as zero words.

        Args:
            pandas_dataset: DataFrame to filter. It is not modified.
            column_to_use: Name of the text column to measure.
            min_length: Exclusive lower bound on the word count.
            max_length: Exclusive upper bound on the word count.

        Returns:
            A new DataFrame with the surviving rows; the original index labels
            are kept, so the index may contain gaps.
        """
        def count_words(text):
            if not isinstance(text, str):
                return 0
            return sum(1 for word in word_tokenize(text) if word.isalpha())

        # The counts live in a separate Series so that no helper column is
        # added to (or clobbered in) the caller's DataFrame.
        word_counts = pandas_dataset[column_to_use].apply(count_words)
        keep_mask = (word_counts > min_length) & (word_counts < max_length)
        return pandas_dataset[keep_mask].copy()

    @staticmethod
    def generate_word_pairs(sentence, k=3):
        """Return the set of lower-cased, space-joined word ``k``-grams of ``sentence``.

        Non-alphabetic tokens are discarded before the k-grams are built. The
        result is empty when the sentence has fewer than ``k`` alphabetic tokens.
        """
        words = [word.lower() for word in word_tokenize(sentence) if word.isalpha()]
        return {' '.join(words[i:i + k]) for i in range(len(words) - k + 1)}

    @staticmethod
    @functools.lru_cache(maxsize=65536, typed=True)
    def _stable_hash(value):
        """Return a 64-bit integer digest of ``value`` that is the same in every process.

        The builtin ``hash`` of a ``str`` is salted per interpreter process, so
        it cannot give reproducible signatures. The cache keeps the digest from
        being recomputed when every hash function is applied to the same shingle.
        """
        data = value if isinstance(value, bytes) else str(value).encode("utf-8")
        return int.from_bytes(hashlib.blake2b(data, digest_size=8).digest(), "big")

    @staticmethod
    def generate_hash_functions(number_of_hash_functions=20, seed=None):
        """Create ``number_of_hash_functions`` random linear hash functions.

        Each function maps a shingle ``x`` to ``(a * h(x) + b) % p`` where ``h``
        is a process-independent digest and ``p`` is the prime 104729.

        Args:
            number_of_hash_functions: How many functions to create.
            seed: Optional seed for the coefficients. With ``None`` the global
                ``random`` module state is used, as before.

        Returns:
            A list of one-argument callables returning an int in ``[0, p)``.
        """
        large_prime_number = 104729
        rng = random if seed is None else random.Random(seed)
        stable_hash = FilterDataset._stable_hash

        hash_functions = []
        for _ in range(number_of_hash_functions):
            # a is drawn from [1, p - 1]: a == p would be 0 modulo p and would
            # collapse the function to the constant b.
            a = rng.randint(1, large_prime_number - 1)
            b = rng.randint(0, large_prime_number - 1)
            # a and b are bound as defaults so each lambda keeps its own pair.
            hash_functions.append(
                lambda x, a=a, b=b: (a * stable_hash(x) + b) % large_prime_number
            )
        return hash_functions

    @staticmethod
    def get_min_hashes_of_tokenized_sentence(word_pairs_sentence, hash_functions):
        """Return the MinHash signature of a shingle set as a list of strings.

        Element ``i`` is the smallest value of ``hash_functions[i]`` over all
        shingles, converted to ``str``.

        Raises:
            ValueError: If ``word_pairs_sentence`` is empty (no minimum exists).
        """
        if not word_pairs_sentence:
            raise ValueError("cannot compute a MinHash signature for an empty set of word k-grams")
        return [
            str(min(hash_function(pair) for pair in word_pairs_sentence))
            for hash_function in hash_functions
        ]

    @staticmethod
    def get_buckets(hash_list, number_of_elems_per_bucket, number_of_buckets):
        """Split a signature into ``number_of_buckets`` consecutive tuples.

        Each tuple holds ``number_of_elems_per_bucket`` entries; entries beyond
        ``number_of_buckets * number_of_elems_per_bucket`` are ignored.
        """
        return [
            tuple(hash_list[i * number_of_elems_per_bucket:(i + 1) * number_of_elems_per_bucket])
            for i in range(number_of_buckets)
        ]

    @staticmethod
    def get_jaccard_similarity(min_hash_list1, min_hash_list2):
        """Estimate Jaccard similarity as the fraction of equal signature positions.

        The denominator is ``len(min_hash_list1)``. Two empty signatures give 0.0.
        """
        length_of_hash = len(min_hash_list1)
        if length_of_hash == 0:
            return 0.0
        count_same = sum(
            1 for i in range(length_of_hash) if min_hash_list1[i] == min_hash_list2[i]
        )
        return count_same / length_of_hash

    @staticmethod
    def create_buckets_and_search_forpotential_duplicates(min_hashes, number_of_buckets=4, threshold=0.8):
        """Find pairs of signatures whose estimated similarity reaches ``threshold``.

        Signatures are split into ``number_of_buckets`` bands; two signatures
        are compared only when they agree on a whole band at the same band
        position.

        Args:
            min_hashes: List of signatures (lists of equal length).
            number_of_buckets: Number of bands per signature.
            threshold: Minimum estimated similarity for a pair to be reported.

        Returns:
            A set of ``(i, j, similarity)`` tuples where ``i < j`` are positions
            in ``min_hashes``. Empty when ``min_hashes`` is empty.
        """
        potential_duplicates = set()
        if not min_hashes:
            return potential_duplicates

        number_of_elems_per_bucket = len(min_hashes[0]) // number_of_buckets

        buckets_dictionary = defaultdict(list)
        for idx, hash_list in enumerate(min_hashes):
            buckets = FilterDataset.get_buckets(hash_list, number_of_elems_per_bucket, number_of_buckets)
            # The band position is part of the key so that band 0 of one
            # signature cannot collide with band 1 of another.
            for band_index, bucket in enumerate(buckets):
                buckets_dictionary[(band_index, bucket)].append(idx)

        for members in tqdm(buckets_dictionary.values()):
            if len(members) < 2:
                continue  # a single signature has nothing to be compared with
            for i in range(len(members) - 1):
                for j in range(i + 1, len(members)):
                    first, second = members[i], members[j]
                    similarity = FilterDataset.get_jaccard_similarity(min_hashes[first], min_hashes[second])
                    if similarity >= threshold:
                        potential_duplicates.add((first, second, similarity))

        return potential_duplicates

    @staticmethod
    def _collect_shingles(text_column, show_progress=False):
        """Return ``(labels, shingle_sets)`` for every usable row of a text Series.

        Rows are visited through ``Series.items()`` so each row is reached by its
        own index label even when the index has gaps. Rows whose value is not a
        string, or that yield no word k-gram, are skipped.
        """
        rows = text_column.items()
        if show_progress:
            rows = tqdm(rows, total=len(text_column))

        labels = []
        shingle_sets = []
        for label, text in rows:
            if not isinstance(text, str):
                continue
            shingles = FilterDataset.generate_word_pairs(text)
            if not shingles:
                continue
            labels.append(label)
            shingle_sets.append(shingles)
        return labels, shingle_sets

    @staticmethod
    def search_potential_similarity_from_pandas_dataset(file_path, column_to_analyze):
        """Read a CSV file, search ``column_to_analyze`` for near-duplicates and print them.

        Returns:
            The printed set of ``(label, label, similarity)`` tuples.
        """
        hashes = FilterDataset.generate_hash_functions(20)

        dataset = pd.read_csv(file_path, encoding="utf8")

        labels, tokens = FilterDataset._collect_shingles(dataset[column_to_analyze])
        min_hashes = [FilterDataset.get_min_hashes_of_tokenized_sentence(tok, hashes) for tok in tokens]
        similarity = FilterDataset.create_buckets_and_search_forpotential_duplicates(min_hashes)
        similarity_translated = {(labels[first], labels[second], score) for first, second, score in similarity}
        print(similarity_translated)
        return similarity_translated

    @staticmethod
    def search_potential_similarity_from_in_memory_dataset(dataset, column_to_analyze):
        """Search a DataFrame column for near-duplicate rows.

        Args:
            dataset: DataFrame holding the text; its index may have gaps.
            column_to_analyze: Name of the text column.

        Returns:
            A list of ``(label_a, label_b, similarity)`` tuples. The labels are
            index labels of ``dataset`` and ``label_b`` belongs to the row that
            appears later, so the list can be passed to
            ``drop_indexes_in_dataset``.
        """
        hashes = FilterDataset.generate_hash_functions(20)

        print("Tokenizing the input...")
        # Rows are enumerated by label, not by range(len(...)): after rows have
        # been dropped the labels are no longer 0..n-1, and a positional counter
        # would both hit missing labels and never reach the highest ones.
        original_positions_in_dataset, tokens = FilterDataset._collect_shingles(
            dataset[column_to_analyze], show_progress=True
        )
        if not tokens:
            return []

        print("Creating the hashes...")
        min_hashes = [FilterDataset.get_min_hashes_of_tokenized_sentence(tok, hashes) for tok in tqdm(tokens)]
        print("Creating the similarities...")
        similarity = FilterDataset.create_buckets_and_search_forpotential_duplicates(min_hashes, threshold=0.8)
        return [
            (original_positions_in_dataset[first], original_positions_in_dataset[second], score)
            for first, second, score in similarity
        ]

    @staticmethod
    def drop_indexes_in_dataset(dataset, indexes_duplicates):
        """Drop the second row of every reported duplicate pair.

        Args:
            dataset: DataFrame to clean. It is not modified.
            indexes_duplicates: Iterable of ``(label_a, label_b, similarity)``
                tuples; every ``label_b`` is removed.

        Returns:
            A new DataFrame without the removed rows.
        """
        indexes_to_eliminate = {pair[1] for pair in indexes_duplicates}
        return dataset.drop(index=list(indexes_to_eliminate))


In [None]:
import pandas as pd
import nltk

# FilterDataset is defined in the cell above. If it is moved to FilterDataset.py,
# import it with: from FilterDataset import FilterDataset
# Tokenizer data for nltk's word_tokenize.
nltk.download('punkt_tab')

# Alternative input: "dataset_opus_nllb.csv"
dataset = pd.read_csv("/content/drive/MyDrive/disertatie/dataset_opus_ccMATRIX.csv")
print(len(dataset))

# The length filter removes rows but keeps the original index labels, leaving
# gaps. Renumbering to 0..n-1 makes every remaining row reachable by position,
# so the duplicate search covers the whole filtered set.
dataset = FilterDataset.eliminate_sentences_lower_than_size(dataset, "ro").reset_index(drop=True)
print(len(dataset))

# Each result is (kept_row, duplicate_row, similarity); the duplicate rows are dropped.
results = FilterDataset.search_potential_similarity_from_in_memory_dataset(dataset, "ro")
df = FilterDataset.drop_indexes_in_dataset(dataset, results)

output_file = "/content/drive/MyDrive/disertatie/cleaned_dataset_opus_ccMATRIX.csv"
df.to_csv(output_file, index=False)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


5087974
4247098
Tokenizing the input...


100%|██████████| 4247098/4247098 [09:45<00:00, 7255.19it/s]


Creating the hashes...


100%|██████████| 3621733/3621733 [05:32<00:00, 10881.83it/s]


Creating the similarities...


100%|██████████| 12364753/12364753 [00:16<00:00, 763598.88it/s] 


In [None]:
# Number of (row, row, similarity) tuples returned by the near-duplicate search.
len(results)

636455