In [8]:
from datasets import load_dataset
import pandas as pd

In [17]:
import re
import string

class DuplicateDeleter:

    def __init__(self, valid_set, external_dataset):
        self.valid_set = valid_set
        self.external_dataset = external_dataset

    def preprocess_query(self, q):
        punct = '[' + ''.join([c for c in string.punctuation if c != "'"]) + ']'
        q = q.lower()
        q = re.sub(punct, ' ', q)
        q = re.sub('[ ]{2,}', ' ', q)
        return q

    def search_in_base(self, q, kb):
        q = self.preprocess_query(q)
        return int(q in kb)

    def delete(self):
        original_length =  self.external_dataset.shape[0]
        self.knowledge_base = set(self.external_dataset['premise'].apply(self.preprocess_query))
        self.valid_set['duplicate'] = self.valid_set['premise'].apply(lambda q: self.search_in_base(q, self.knowledge_base))

        for i in self.valid_set[self.valid_set.duplicate > 0.5].index:
            print("index dropped: ", i)
            self.external_dataset.drop(self.external_dataset[self.external_dataset['premise'] == self.valid_set.loc[i,:].premise].index, inplace=True)

        print(original_length - self.external_dataset.shape[0], " duplicates deleted")
        return self.external_dataset.reset_index(drop=True)

In [18]:
valid_set_translated = pd.read_csv("data/valid_translated.csv")
valid_set_translated.head()

Unnamed: 0,id,premise,hypothesis,lang_abv,language,label
0,a7b0b9498c,"It's just the beginning!""",A great journey is about to begin!,en,English,1
1,00c0cdf348,Colder and farther than ever has developed the...,God's voice feels so far away and cold,en,English,0
2,a2a8b36437,"Consequently, government decision-makers and s...",Government representatives refuse to change th...,en,English,2
3,097b4dfe2a,You have to walk through it).,You need to pass through it on foot.,en,English,0
4,aab0894630,"Rather, kids today are not only little bundles...",While kids today are symbols of success and st...,en,English,1


### MNLI

In [19]:
mnli = load_dataset('glue', 'mnli')
df_mnli = pd.DataFrame.from_dict(mnli["train"])
df_mnli.drop(columns=['idx'], inplace=True)
original_count = df_mnli.shape[0]

Found cached dataset glue (/Users/dominikbieri/.cache/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
100%|██████████| 5/5 [00:00<00:00, 230.59it/s]


In [20]:
mnli_deleter = DuplicateDeleter(valid_set_translated, df_mnli)
mnli_deleter.delete()

index dropped:  32
index dropped:  51
index dropped:  175
index dropped:  310
index dropped:  352
index dropped:  468
index dropped:  479
index dropped:  483
index dropped:  555
index dropped:  632
index dropped:  659
index dropped:  705
index dropped:  763
index dropped:  846
index dropped:  964
index dropped:  973
index dropped:  1022
index dropped:  1060
index dropped:  1124
index dropped:  1140
index dropped:  1154
index dropped:  1158
72  duplicates deleted


Unnamed: 0,premise,hypothesis,label
0,Conceptually cream skimming has two basic dime...,Product and geography are what make cream skim...,1
1,you know during the season and i guess at at y...,You lose the things to the following level if ...,0
2,One of our number will carry out your instruct...,A member of my team will execute your orders w...,0
3,How do you know? All this is their information...,This information belongs to them.,0
4,yeah i tell you what though if you go price so...,The tennis shoes have a range of prices.,1
...,...,...,...
392625,"Clearly, California can - and must - do better.",California cannot do any better.,2
392626,It was once regarded as the most beautiful str...,So many of the original buildings had been rep...,1
392627,Houseboats are a beautifully preserved traditi...,The tradition of houseboats originated while t...,0
392628,Obituaries fondly recalled his on-air debates ...,The obituaries were beautiful and written in k...,1


### SNLI

In [21]:
snli = load_dataset('snli')
df_snli = pd.DataFrame.from_dict(snli["train"])

Found cached dataset snli (/Users/dominikbieri/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b)
100%|██████████| 3/3 [00:00<00:00, 38.55it/s]


In [22]:
snli_deleter = DuplicateDeleter(valid_set_translated, df_snli)
snli_deleter.delete()

0  duplicates deleted


Unnamed: 0,premise,hypothesis,label
0,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.,1
1,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette.",2
2,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse.",0
3,Children smiling and waving at camera,They are smiling at their parents,1
4,Children smiling and waving at camera,There are children present,0
...,...,...,...
550147,Four dirty and barefooted children.,four kids won awards for 'cleanest feet',2
550148,Four dirty and barefooted children.,"four homeless children had their shoes stolen,...",1
550149,A man is surfing in a bodysuit in beautiful bl...,A man in a bodysuit is competing in a surfing ...,1
550150,A man is surfing in a bodysuit in beautiful bl...,A man in a business suit is heading to a board...,2


### XNLI

In [23]:
valid_set = pd.read_csv("data/valid.csv")
valid_set.head()

Unnamed: 0,id,premise,hypothesis,lang_abv,language,label
0,a7b0b9498c,"It's just the beginning!""",A great journey is about to begin!,en,English,1
1,00c0cdf348,Lạnh hơn và xa hơn bao giờ hết đã phát triển t...,Giọng của Chúa cảm thấy thật xa xôi và lạnh lẽo,vi,Vietnamese,0
2,a2a8b36437,"Κατά συνέπεια, οι κυβερνητικοί υπεύθυνοι λήψης...",Οι κυβερνητικοί εκπρόσωποι αρνούνται να αλλάξο...,el,Greek,2
3,097b4dfe2a,You have to walk through it).,You need to pass through it on foot.,en,English,0
4,aab0894630,"Rather, kids today are not only little bundles...",While kids today are symbols of success and st...,en,English,1


In [31]:
xnli = load_dataset('xnli', 'all_languages')
df_xnli = pd.DataFrame.from_dict(xnli["train"])

Found cached dataset xnli (/Users/dominikbieri/.cache/huggingface/datasets/xnli/all_languages/1.1.0/818164464f9c9fd15776ca8a00423b074344c3e929d00a2c1a84aa5a50c928bd)
100%|██████████| 3/3 [00:00<00:00, 34.99it/s]


In [32]:
df_xnli.head(100)

Unnamed: 0,premise,hypothesis,label
0,{'ar': '- و قد ال كريم المفاهيمية اثنان اساسيي...,"{'language': ['ar', 'bg', 'de', 'el', 'en', 'e...",1
1,{'ar': 'انت تعرف خلال الموسم و اعتقد انه عند ا...,"{'language': ['ar', 'bg', 'de', 'el', 'en', 'e...",0
2,{'ar': 'واحدة من رقابنا ستقوم بتنفيذ تعليماتك ...,"{'language': ['ar', 'bg', 'de', 'el', 'en', 'e...",0
3,{'ar': 'وكيف تعرف ذلك ? كل هذا هو معلوماتهم مر...,"{'language': ['ar', 'bg', 'de', 'el', 'en', 'e...",0
4,"{'ar': 'نعم , انا اقول لك ماذا لو ذهبت ل السعر...","{'language': ['ar', 'bg', 'de', 'el', 'en', 'e...",1
...,...,...,...
95,{'ar': 'اضغط على المزيد من الروابط ( على الجان...,"{'language': ['ar', 'bg', 'de', 'el', 'en', 'e...",2
96,{'ar': 'ولذا بدات بمراقبته وفجاة بقيت ضبط الاس...,"{'language': ['ar', 'bg', 'de', 'el', 'en', 'e...",1
97,"{'ar': 'لا , لا , حسنا , اعتني بالامر', 'bg': ...","{'language': ['ar', 'bg', 'de', 'el', 'en', 'e...",0
98,"{'ar': '( مرحبا , ( بين ' ' ' ' ' ' '', 'bg': ...","{'language': ['ar', 'bg', 'de', 'el', 'en', 'e...",2


In [25]:
xnli_deleter = DuplicateDeleter(valid_set, df_xnli)
xnli_deleter.delete()

AttributeError: 'dict' object has no attribute 'lower'