In [None]:
from datasets import load_dataset
import pandas as pd

In [168]:
import re
import string

class DuplicateDeleter:

    def __init__(self, valid_set, external_dataset):
        self.valid_set = valid_set
        self.external_dataset = external_dataset

    def preprocess_query(self, q):
        punct = '[' + ''.join([c for c in string.punctuation if c != "'"]) + ']'
        q = q.lower()
        q = re.sub(punct, ' ', q)
        q = re.sub('[ ]{2,}', ' ', q)
        return q

    def search_in_base(self, q, kb):
        q = self.preprocess_query(q)
        return int(q in kb)

    def delete(self):
        index_to_delete = []
        original_length =  self.external_dataset.shape[0]
        self.external_dataset_preprocessed = self.external_dataset['premise'].apply(self.preprocess_query)  # preprocess the external dataset
        self.knowledge_base = set(self.external_dataset['premise'].apply(self.preprocess_query))            # create a set of the external dataset for searching duplicates
        
        self.valid_set['duplicate'] = self.valid_set['premise'].apply(lambda q: self.search_in_base(q, self.knowledge_base))    # search for duplicates in the valid set and mark them

        for i in self.valid_set[self.valid_set.duplicate > 0.5].index:

            print("index from valid set to drop: ", i)

            # search duplicates in external dataset
            print("found in:", self.external_dataset_preprocessed[self.external_dataset_preprocessed == self.preprocess_query(self.valid_set.iloc[i,1])].index)

            for i in self.external_dataset_preprocessed[self.external_dataset_preprocessed == self.preprocess_query(self.valid_set.iloc[i,1])].index:
                index_to_delete.append(i)
            
            # drop duplicates in external dataset
            print("index in external dataset to drop: ", index_to_delete)

        print("*******************************")
        print(set(index_to_delete))
        print("index_to_delete")
        self.external_dataset.drop(set(index_to_delete), inplace=True)
        
        

        print(original_length - self.external_dataset.shape[0], " duplicates deleted")
        return self.external_dataset.reset_index(drop=True)

In [None]:
valid_set_translated = pd.read_csv("data/valid_translated.csv")
valid_set_translated.head()

### MNLI

In [170]:
mnli = load_dataset('glue', 'mnli')
df_mnli = pd.DataFrame.from_dict(mnli["train"])
df_mnli.drop(columns=['idx'], inplace=True)
original_count = df_mnli.shape[0]

Found cached dataset glue (/Users/dominikbieri/.cache/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
100%|██████████| 5/5 [00:00<00:00, 494.18it/s]


In [171]:
mnli_deleter = DuplicateDeleter(valid_set_translated, df_mnli)
mnli_deleter.delete()

index from valid set to drop:  32
found in: Int64Index([97830, 165644, 335527], dtype='int64')
index in external dataset to drop:  [97830, 165644, 335527]
index from valid set to drop:  51
found in: Int64Index([85441, 102236, 309003], dtype='int64')
index in external dataset to drop:  [97830, 165644, 335527, 85441, 102236, 309003]
index from valid set to drop:  175
found in: Int64Index([28082, 142019, 148058, 162540, 341322, 385947], dtype='int64')
index in external dataset to drop:  [97830, 165644, 335527, 85441, 102236, 309003, 28082, 142019, 148058, 162540, 341322, 385947]
index from valid set to drop:  310
found in: Int64Index([243257, 286742, 309989], dtype='int64')
index in external dataset to drop:  [97830, 165644, 335527, 85441, 102236, 309003, 28082, 142019, 148058, 162540, 341322, 385947, 243257, 286742, 309989]
index from valid set to drop:  352
found in: Int64Index([2069, 169475, 244908], dtype='int64')
index in external dataset to drop:  [97830, 165644, 335527, 85441, 1022

Unnamed: 0,premise,hypothesis,label
0,Conceptually cream skimming has two basic dime...,Product and geography are what make cream skim...,1
1,you know during the season and i guess at at y...,You lose the things to the following level if ...,0
2,One of our number will carry out your instruct...,A member of my team will execute your orders w...,0
3,How do you know? All this is their information...,This information belongs to them.,0
4,yeah i tell you what though if you go price so...,The tennis shoes have a range of prices.,1
...,...,...,...
392622,"Clearly, California can - and must - do better.",California cannot do any better.,2
392623,It was once regarded as the most beautiful str...,So many of the original buildings had been rep...,1
392624,Houseboats are a beautifully preserved traditi...,The tradition of houseboats originated while t...,0
392625,Obituaries fondly recalled his on-air debates ...,The obituaries were beautiful and written in k...,1


### SNLI

In [172]:
snli = load_dataset('snli')
df_snli = pd.DataFrame.from_dict(snli["train"])

Found cached dataset snli (/Users/dominikbieri/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b)
100%|██████████| 3/3 [00:00<00:00, 42.61it/s]


In [173]:
snli_deleter = DuplicateDeleter(valid_set_translated, df_snli)
snli_deleter.delete()

*******************************
set()
index_to_delete
0  duplicates deleted


Unnamed: 0,premise,hypothesis,label
0,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,1
1,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",2
2,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",0
3,Children smiling and waving at camera,They are smiling at their parents,1
4,Children smiling and waving at camera,There are children present,0
...,...,...,...
550147,Four dirty and barefooted children.,four kids won awards for 'cleanest feet',2
550148,Four dirty and barefooted children.,"four homeless children had their shoes stolen,...",1
550149,A man is surfing in a bodysuit in beautiful bl...,A man in a bodysuit is competing in a surfing ...,1
550150,A man is surfing in a bodysuit in beautiful bl...,A man in a business suit is heading to a board...,2


### XNLI

In [174]:
valid_set = pd.read_csv("data/valid.csv")
valid_set.head()

Unnamed: 0,id,premise,hypothesis,lang_abv,language,label
0,a7b0b9498c,"It's just the beginning!""",A great journey is about to begin!,en,English,1
1,00c0cdf348,Lạnh hơn và xa hơn bao giờ hết đã phát triển t...,Giọng của Chúa cảm thấy thật xa xôi và lạnh lẽo,vi,Vietnamese,0
2,a2a8b36437,"Κατά συνέπεια, οι κυβερνητικοί υπεύθυνοι λήψης...",Οι κυβερνητικοί εκπρόσωποι αρνούνται να αλλάξο...,el,Greek,2
3,097b4dfe2a,You have to walk through it).,You need to pass through it on foot.,en,English,0
4,aab0894630,"Rather, kids today are not only little bundles...",While kids today are symbols of success and st...,en,English,1


In [175]:
xnli_languages = ['ar', 'bg', 'de', 'el', 'en', 'es', 'fr', 'hi', 'ru', 'sw', 'th', 'tr', 'ur', 'vi', 'zh']
df_ar = pd.DataFrame()
df_bg = pd.DataFrame()
df_de = pd.DataFrame()
df_el = pd.DataFrame()
df_en = pd.DataFrame() 
df_es = pd.DataFrame()
df_fr = pd.DataFrame()
df_hi = pd.DataFrame()
df_ru = pd.DataFrame()
df_sw = pd.DataFrame()
df_th = pd.DataFrame()
df_tr = pd.DataFrame()
df_ur = pd.DataFrame()
df_vi = pd.DataFrame()
df_zh = pd.DataFrame()

dataframes = [df_ar, df_bg, df_de, df_el, df_en, df_es, df_fr, df_hi, df_ru, df_sw, df_th, df_tr, df_ur, df_vi, df_zh]

for i in range(len(xnli_languages)):
    dataset = load_dataset('xnli', xnli_languages[i])
    dataframes[i]= pd.DataFrame.from_dict(dataset["train"])
    #print(dataframes[i].head())

Found cached dataset xnli (/Users/dominikbieri/.cache/huggingface/datasets/xnli/ar/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd)
100%|██████████| 3/3 [00:00<00:00, 47.04it/s]


                                             premise  \
0  - و قد ال كريم المفاهيمية اثنان اساسيين - المن...   
1  انت تعرف خلال الموسم و اعتقد انه عند المستوى ا...   
2  واحدة من رقابنا ستقوم بتنفيذ تعليماتك كلها بكل...   
3     وكيف تعرف ذلك ? كل هذا هو معلوماتهم مرة اخرى .   
4  نعم , انا اقول لك ماذا لو ذهبت ل السعر بعض احذ...   

                                          hypothesis  label  
0    المنتج والجغرافيا هو ما يجعل كريم القشط العمل .      1  
1  انت تفقد الاشياء الى المستوى التالي اذا كان ال...      0  
2       احد اعضاء فريقي سوف ينفذ اوامرك بدقة هائلة .      0  
3                              هذه المعلومات تخصهم .      0  
4                احذية التنس لها مجموعة من الاسعار .      1  


Found cached dataset xnli (/Users/dominikbieri/.cache/huggingface/datasets/xnli/bg/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd)
100%|██████████| 3/3 [00:00<00:00, 47.64it/s]


                                             premise  \
0  концептуално крем краде има две основни измере...   
1  по време на сезона и предполагам , че на твоет...   
2      един от нашите номера ще ви даде инструкции .   
3  от къде знаеш ? всичко това е тяхната информац...   
4  да , ще ти кажа какво , ако отидеш да цените н...   

                                          hypothesis  label  
0  продукт и география са това , което прави крем...      1  
1  губиш нещата на следното ниво , ако хората си ...      0  
2  член на моя екип ще изпълни заповедите ви с ог...      0  
3                тази информация принадлежи на тях .      0  
4          обувките за тенис имат диапазон от цени .      1  


Found cached dataset xnli (/Users/dominikbieri/.cache/huggingface/datasets/xnli/de/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd)
100%|██████████| 3/3 [00:00<00:00, 504.63it/s]


                                             premise  \
0  Konzeptionell cream abschöpfen hat zwei grundl...   
1  Du weißt , während der Saison und ich schätze ...   
2  Eine unserer Nummer wird ihre Anweisungen gena...   
3  - woher weißt du das ? All das sind ihre Infor...   
4  Ja , ich sag dir was , wenn du ein paar dieser...   

                                          hypothesis  label  
0  Produkt und Geographie sind das , was creme ab...      1  
1  Man verliert die Dinge auf die folgende Ebene ...      0  
2  Ein Mitglied meines Teams wird ihre Aufträge m...      0  
3                   Diese Information gehört ihnen .      0  
4   Die Tennis Schuhe haben eine Reihe von Preisen .      1  


Found cached dataset xnli (/Users/dominikbieri/.cache/huggingface/datasets/xnli/el/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd)
100%|██████████| 3/3 [00:00<00:00, 49.17it/s]


                                             premise  \
0  Η εννοιολογικά κρέμα κρέμα έχει δύο βασικές δι...   
1  Ξέρεις κατά τη διάρκεια της σεζόν και υποθέτω ...   
2  Ένας από τους αριθμούς μας θα μεταφέρει τις οδ...   
3  Πώς το ξέρεις ; Όλες αυτές είναι οι πληροφορίε...   
4  Ναι , θα σου πω τι θα γίνει αν πας να πάρεις μ...   

                                          hypothesis  label  
0  Το προϊόν και η γεωγραφία είναι αυτά που κάνου...      1  
1  Χάνεις τα πράγματα στο επόμενο επίπεδο , αν οι...      0  
2  Ένα μέλος της ομάδας μου θα εκτελέσει τις διατ...      0  
3          Αυτές οι πληροφορίες ανήκουν σ ' αυτούς .      0  
4  Τα παπούτσια του τένις έχουν μια σειρά από τιμ...      1  


In [None]:
xnli_deleter = DuplicateDeleter(valid_set, dataframes[4])
xnli_deleter.delete()

0  duplicates deleted


Unnamed: 0,premise,hypothesis,label
0,Conceptually cream skimming has two basic dime...,Product and geography are what make cream skim...,1
1,you know during the season and i guess at at y...,You lose the things to the following level if ...,0
2,One of our number will carry out your instruct...,A member of my team will execute your orders w...,0
3,How do you know ? All this is their informatio...,This information belongs to them .,0
4,yeah i tell you what though if you go price so...,The tennis shoes have a range of prices .,1
...,...,...,...
392631,"Clearly , California can - and must - do better .",California cannot do any better .,2
392632,It was once regarded as the most beautiful str...,So many of the original buildings had been rep...,1
392633,Houseboats are a beautifully preserved traditi...,The tradition of houseboats originated while t...,0
392634,Obituaries fondly recalled his on-air debates ...,The obituaries were beautiful and written in k...,1
