In [1]:
import os

from lkae.utils.data_loading import AuredDataset, root_dir

In [2]:
file_names = ['English_train.jsonl', 'English_dev.jsonl', 'English_test.jsonl']

# Marker text found in entries whose translation failed.
TRANSLATION_ISSUE_MARKER = "ISSUE: couldn't translate"

# The dataset options are the same for every split, so build them once.
config = {
    'preprocess': False,
    'add_author_name': False,
    'add_author_bio': False,
    'author_info_filepath': os.path.join(root_dir, 'data', 'combined-author-data-translated.json'),
}

blacklist = []  # rumor ids where every timeline tweet has a translation issue
maybelist = []  # rumor ids where only part of the timeline is affected

for file_name in file_names:
    issue_dict = {}  # issues per id, reset for each split and displayed below

    ds = AuredDataset(os.path.join(root_dir, 'data', file_name), **config)

    for rumor in ds:
        rumor_id = rumor['id']
        timeline = rumor['timeline']

        # Index 2 of each timeline entry is searched for the marker
        # (presumably the tweet text -- confirm against AuredDataset).
        total_issues = sum(
            1 for tweet in timeline if TRANSLATION_ISSUE_MARKER in tweet[2]
        )

        # Only rumors with at least one affected timeline tweet are of interest.
        if total_issues == 0:
            continue

        # Check whether a rumor with translation issues in its timeline also
        # carries evidence, and whether that evidence itself is affected.
        evidence = rumor['evidence'] if 'evidence' in rumor else None
        has_evidence = bool(evidence)
        if has_evidence:
            print(f'rumor {rumor_id} has non-empty evidence array')
            for ev in evidence:
                # Any evidence with translation issues?
                # If not, we could just cull the affected tweets from the
                # timeline, as the rumor would stay verifiable without them.
                if TRANSLATION_ISSUE_MARKER in ev[2]:
                    print(f'OH NO! transl issue in evidence for {rumor_id}')

        # Percentage of timeline tweets that have the issue (for reporting only).
        issue_percent = round((total_issues / len(timeline)) * 100, 1)

        # Compare the raw counts rather than the rounded percentage: a long
        # timeline with a few clean tweets can round up to 100.0 without
        # actually being fully affected.
        if total_issues == len(timeline):
            blacklist.append(rumor_id)
        else:
            maybelist.append(rumor_id)

        issue_dict[f"{file_name}-{rumor_id}"] = {'issue_perc': issue_percent, 'has_ev': has_evidence}

    display(issue_dict)

rumor AuRED_090 has non-empty evidence array


{'English_train.jsonl-AuRED_052': {'issue_perc': 44.0, 'has_ev': False},
 'English_train.jsonl-AuRED_058': {'issue_perc': 100.0, 'has_ev': False},
 'English_train.jsonl-AuRED_063': {'issue_perc': 100.0, 'has_ev': False},
 'English_train.jsonl-AuRED_074': {'issue_perc': 100.0, 'has_ev': False},
 'English_train.jsonl-AuRED_075': {'issue_perc': 100.0, 'has_ev': False},
 'English_train.jsonl-AuRED_081': {'issue_perc': 73.3, 'has_ev': False},
 'English_train.jsonl-AuRED_069': {'issue_perc': 100.0, 'has_ev': False},
 'English_train.jsonl-AuRED_090': {'issue_perc': 0.6, 'has_ev': True},
 'English_train.jsonl-AuRED_071': {'issue_perc': 100.0, 'has_ev': False},
 'English_train.jsonl-AuRED_061': {'issue_perc': 100.0, 'has_ev': False},
 'English_train.jsonl-AuRED_055': {'issue_perc': 100.0, 'has_ev': False}}

{'English_dev.jsonl-AuRED_066': {'issue_perc': 100.0, 'has_ev': False},
 'English_dev.jsonl-AuRED_076': {'issue_perc': 96.7, 'has_ev': False}}

{'English_test.jsonl-AuRED_092': {'issue_perc': 0.9, 'has_ev': False},
 'English_test.jsonl-AuRED_054': {'issue_perc': 100.0, 'has_ev': False},
 'English_test.jsonl-AuRED_042': {'issue_perc': 1.4, 'has_ev': False},
 'English_test.jsonl-AuRED_060': {'issue_perc': 100.0, 'has_ev': False},
 'English_test.jsonl-AuRED_070': {'issue_perc': 100.0, 'has_ev': False},
 'English_test.jsonl-AuRED_056': {'issue_perc': 100.0, 'has_ev': False}}

In [3]:
# We will definitely discard all rumors whose timelines have 100% translation issues,
# i.e. the ids collected in `blacklist` by the scan above.
display(blacklist)


['AuRED_058',
 'AuRED_063',
 'AuRED_074',
 'AuRED_075',
 'AuRED_069',
 'AuRED_071',
 'AuRED_061',
 'AuRED_055',
 'AuRED_066',
 'AuRED_054',
 'AuRED_060',
 'AuRED_070',
 'AuRED_056']

In [4]:
# Rumors whose timelines are only partially affected by translation issues.
# Maybe keep these? There are only 6 of them, though (across all dataset splits).
display(maybelist)

['AuRED_052', 'AuRED_081', 'AuRED_090', 'AuRED_076', 'AuRED_092', 'AuRED_042']

In [5]:
# Combined list of rumor ids to cull: the fully affected ones first,
# followed by the partially affected ones.
cull_list = blacklist + maybelist
cull_list

['AuRED_058',
 'AuRED_063',
 'AuRED_074',
 'AuRED_075',
 'AuRED_069',
 'AuRED_071',
 'AuRED_061',
 'AuRED_055',
 'AuRED_066',
 'AuRED_054',
 'AuRED_060',
 'AuRED_070',
 'AuRED_056',
 'AuRED_052',
 'AuRED_081',
 'AuRED_090',
 'AuRED_076',
 'AuRED_092',
 'AuRED_042']