In [1]:
from datasets import load_dataset
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Validation split; its `premise` column is what DuplicateDeleter compares the
# external dataset against.
valid_set = pd.read_csv("data/valid_translated.csv")
valid_set.head()

Unnamed: 0,id,premise,hypothesis,lang_abv,language,label
0,a7b0b9498c,"It's just the beginning!""",A great journey is about to begin!,en,English,1
1,00c0cdf348,Colder and farther than ever has developed the...,God's voice feels so far away and cold,en,English,0
2,a2a8b36437,"Consequently, government decision-makers and s...",Government representatives refuse to change th...,en,English,2
3,097b4dfe2a,You have to walk through it).,You need to pass through it on foot.,en,English,0
4,aab0894630,"Rather, kids today are not only little bundles...",While kids today are symbols of success and st...,en,English,1


In [3]:
import re
import string

class DuplicateDeleter:
    """Drop rows of an external dataset whose premise also occurs in a validation set.

    Premises are compared after normalisation (see ``preprocess_query``), so
    differences in case, punctuation or spacing do not hide a duplicate.

    Parameters
    ----------
    valid_set : pandas.DataFrame
        Frame with a ``premise`` column. ``delete`` adds an ``mnli`` column to it.
    external_dataset : pandas.DataFrame
        Frame with a ``premise`` column. ``delete`` removes rows from it in place.
    """

    # Every ASCII punctuation character except the apostrophe. The characters
    # are escaped so that ``\`` and ``]`` are taken literally inside the
    # character class instead of the backslash silently escaping the bracket.
    _PUNCT_RE = re.compile('[' + re.escape(string.punctuation.replace("'", '')) + ']')
    _SPACE_RE = re.compile(r'\s+')

    def __init__(self, valid_set, external_dataset):
        self.valid_set = valid_set
        self.external_dataset = external_dataset

    def preprocess_query(self, q):
        """Return the normalised comparison key for the text ``q``.

        The text is lower-cased, punctuation (apostrophes excepted) is replaced
        by spaces, whitespace runs are collapsed to one space and the ends are
        stripped, so that ``"It's fine!"`` and ``"it's fine"`` share a key.
        """
        q = self._PUNCT_RE.sub(' ', q.lower())
        return self._SPACE_RE.sub(' ', q).strip()

    def search_in_base(self, q, kb):
        """Return 1 if the normalised form of ``q`` is contained in ``kb``, else 0."""
        return int(self.preprocess_query(q) in kb)

    def delete(self):
        """Remove from the external dataset every row duplicating a validation premise.

        Side effects
        ------------
        * ``self.knowledge_base`` is set to the normalised premises of the
          external dataset as it was before deletion.
        * ``self.valid_set`` gains an integer ``mnli`` column: 1 where the
          premise occurs in the external dataset, 0 otherwise.
        * Matching rows are dropped from ``self.external_dataset`` in place.
        * One line is printed per flagged validation row, plus a final count.

        Returns
        -------
        pandas.DataFrame
            The de-duplicated external dataset with a fresh 0..n-1 index.
        """
        original_length = self.external_dataset.shape[0]

        # Normalise each side once; both the flagging and the deletion below
        # work on these keys, so a row that is flagged is also actually removed.
        external_keys = self.external_dataset['premise'].apply(self.preprocess_query)
        valid_keys = self.valid_set['premise'].apply(self.preprocess_query)

        self.knowledge_base = set(external_keys)
        self.valid_set['mnli'] = valid_keys.isin(self.knowledge_base).astype(int)

        # Use the instance's validation frame, not a notebook-level global.
        for i in self.valid_set.index[self.valid_set['mnli'] > 0]:
            print("index dropped: ", i)

        # A single vectorised pass instead of one full-frame scan per match.
        is_duplicate = external_keys.isin(set(valid_keys))
        self.external_dataset.drop(index=external_keys[is_duplicate].index, inplace=True)

        print(original_length - self.external_dataset.shape[0], " duplicates deleted")
        return self.external_dataset.reset_index(drop=True)

In [4]:
# Pull the MNLI task from GLUE and keep only its training split as a frame,
# without the `idx` column.
mnli = load_dataset('glue', 'mnli')
df_mnli = pd.DataFrame.from_dict(mnli["train"]).drop(columns=['idx'])
# Row count before any de-duplication.
original_count = len(df_mnli)

Found cached dataset glue (/Users/dominikbieri/.cache/huggingface/datasets/glue/mnli/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
100%|██████████| 5/5 [00:00<00:00, 545.25it/s]


In [5]:
# The constructor only stores references to both frames; no rows are compared
# or removed until delete() is called.
deleter = DuplicateDeleter(valid_set, df_mnli)

In [6]:
# delete() drops the matching rows from df_mnli in place (it is the frame that
# was passed to the constructor) and adds an `mnli` flag column to valid_set.
# The value displayed here is a re-indexed copy (reset_index), so df_mnli
# itself keeps its original index labels, now with gaps.
deleter.delete()

index dropped:  32
index dropped:  51
index dropped:  175
index dropped:  310
index dropped:  352
index dropped:  468
index dropped:  479
index dropped:  483
index dropped:  555
index dropped:  632
index dropped:  659
index dropped:  705
index dropped:  763
index dropped:  846
index dropped:  964
index dropped:  973
index dropped:  1022
index dropped:  1060
index dropped:  1124
index dropped:  1140
index dropped:  1154
index dropped:  1158
72  duplicates deleted


Unnamed: 0,premise,hypothesis,label
0,Conceptually cream skimming has two basic dime...,Product and geography are what make cream skim...,1
1,you know during the season and i guess at at y...,You lose the things to the following level if ...,0
2,One of our number will carry out your instruct...,A member of my team will execute your orders w...,0
3,How do you know? All this is their information...,This information belongs to them.,0
4,yeah i tell you what though if you go price so...,The tennis shoes have a range of prices.,1
...,...,...,...
392625,"Clearly, California can - and must - do better.",California cannot do any better.,2
392626,It was once regarded as the most beautiful str...,So many of the original buildings had been rep...,1
392627,Houseboats are a beautifully preserved traditi...,The tradition of houseboats originated while t...,0
392628,Obituaries fondly recalled his on-air debates ...,The obituaries were beautiful and written in k...,1
