In [174]:
import mtl_functions as mtl

import os
import pandas as pd
import csv
import numpy as np

## constants

In [189]:
'''

FN = 'D550_project'

TASKS = [STANCE, FNC, NLI, TOPIC, TOPIC_5WAY, LAPTOP, RESTAURANT, TARGET,FN]

TASK_NAMES_SHORT = {"semeval2016-task6-stance": "STANCE", "fakenewschallenge": "FNC", "topic-based": "TOPIC", "multinli": "NLI",
                    "topic-based-5way": "TOPIC_5WAY", "absa-laptops": "LAPTOP", "absa-restaurants":"RESTAURANT", "target-dependent": "TARGET","D550_project":"FN"}


FN_LABELS = ['0','1','10','2','2 pinnochios','3','3 pinnochios','4','4 pinnochios','None','a little baloney','a lot of baloney','accurate','authorship confirmed!','bogus warning','cherry picks','commentary!','compromise','conclusion: accurate','conclusion: false','conclusion: unclear','confirmed authorship!','correct','correct attribution','correct attribution!','determination: a stretch','determination: barely true','determination: false','determination: huckster propaganda','determination: misleading','determination: mostly true','determination: true','disputed!','distorts the facts','exaggerated','exaggerates','facebook scams','fact','factscan score: false','factscan score: misleading','factscan score: true','fake','fake news','false','fiction','fiction!','fiction! & satire!','full flop','grass roots movement!','half flip','half true','half-true','in the works','in-between','in-the-green','in-the-red','inaccurate attribution!','incorrect','incorrect attribution!','investigation pending!','legend','misattributed','miscaptioned','misleading','misleading recommendations','misleading!','mixture','mostly false','mostly fiction!','mostly true','mostly truth!','mostly-correct','mostly_false','mostly_true','needs context','no evidence','no flip','none','not the whole story','not yet rated','opinion!','outdated','outdated!','pants on fire!','partially true','partly true','previously truth! now resolved!','promise broken','promise kept','rating: false','scam','scam!','some baloney','spins the facts','stalled','statirical reports','true','true messages','truth!','truth! & disputed!','truth! & fiction!','truth! & misleading!','truth! & outdated!','truth! & unproven!','understated','unobservable','unproven','unproven!','unsubstantiated messages','unsupported','unverified','verdict: false','verdict: true','verdict: unsubstantiated','verified','virus!','we rate this claim false']

'''
# Build the label inventory from the `label` column of `df_data`:
# unique values, converted to a plain list, then sorted.
# NOTE(review): `df_data` is not defined in any cell visible in this notebook,
# so on a fresh kernel this line presumably raises NameError unless another
# cell creates it first — TODO confirm where `df_data` comes from.
# NOTE(review): the literal FN_LABELS list in the data_reader section rebinds
# this name when that cell runs.
FN_LABELS = sorted(df_data['label'].unique().tolist())


## data_reader

In [191]:
# Hard-coded inventory of label strings for the fake-news (FN) task.
# readFakeNews checks the labels observed in the training split against this list.
# Near-duplicates such as 'None'/'none' and 'half true'/'half-true' are distinct
# entries here; the list stores the raw label strings without normalisation.
FN_LABELS = ['0','1','10','2','2 pinnochios','3','3 pinnochios','4','4 pinnochios','None','a little baloney','a lot of baloney','accurate','authorship confirmed!','bogus warning','cherry picks','commentary!','compromise','conclusion: accurate','conclusion: false','conclusion: unclear','confirmed authorship!','correct','correct attribution','correct attribution!','determination: a stretch','determination: barely true','determination: false','determination: huckster propaganda','determination: misleading','determination: mostly true','determination: true','disputed!','distorts the facts','exaggerated','exaggerates','facebook scams','fact','factscan score: false','factscan score: misleading','factscan score: true','fake','fake news','false','fiction','fiction!','fiction! & satire!','full flop','grass roots movement!','half flip','half true','half-true','in the works','in-between','in-the-green','in-the-red','inaccurate attribution!','incorrect','incorrect attribution!','investigation pending!','legend','misattributed','miscaptioned','misleading','misleading recommendations','misleading!','mixture','mostly false','mostly fiction!','mostly true','mostly truth!','mostly-correct','mostly_false','mostly_true','needs context','no evidence','no flip','none','not the whole story','not yet rated','opinion!','outdated','outdated!','pants on fire!','partially true','partly true','previously truth! now resolved!','promise broken','promise kept','rating: false','scam','scam!','some baloney','spins the facts','stalled','statirical reports','true','true messages','truth!','truth! & disputed!','truth! & fiction!','truth! & misleading!','truth! & outdated!','truth! & unproven!','understated','unobservable','unproven','unproven!','unsubstantiated messages','unsupported','unverified','verdict: false','verdict: true','verdict: unsubstantiated','verified','virus!','we rate this claim false']




def task2data_reader(task):
    """Return the reader function that loads the data for ``task``.

    ``task`` is compared with ``==`` against each task constant in turn and
    the reader paired with the first match is returned (the function object
    itself, not its result).

    Raises:
        ValueError: if ``task`` matches none of the known task constants.
    """
    # The chain is evaluated lazily: only the constants up to the first match
    # and the single matching reader name are ever looked up.
    if task == STANCE:  # all data available
        reader = readSemEval2016Task6
    elif task == FNC:  # all data available
        reader = readFakeNewsChallengeData
    elif task == NLI:  # test data not available - so we use every other dev example as test example
        reader = readMultinliData
    elif task == TOPIC:  # all data available
        reader = readTopicBased
    elif task == TOPIC_5WAY:
        reader = readTopic5Way
    elif task == LAPTOP:  # all data available
        reader = read_absa_laptops
    elif task == RESTAURANT:  # all data available
        reader = read_absa_restaurants
    elif task == TARGET:  # all data available
        reader = read_target_dependent
    elif task == FN:  # all data available
        reader = readFakeNews
    else:
        raise ValueError('No data reader available for %s.' % task)

    return reader



def parseFakeNews(datafolder, datafile, snip_folder, data_dict, debug, num_instances):
    """Load (claim, snippet text, label) triples from a TSV index into ``data_dict``.

    Every row of ``datafile`` is read as ``(snippet file name, claim, label)``.
    The snippet file is looked up in ``snip_folder`` and parsed as a header-less,
    tab-separated table with quoting disabled; the text of its columns 1 and 2
    is flattened into one space-separated string.

    Rows that cannot be loaded (missing or unparsable snippet file, too few
    columns, non-text columns, short index row) are skipped as a whole, so
    ``seq1``, ``seq2`` and ``stance`` always stay the same length. A single
    warning reports how many rows were skipped.

    Args:
        datafolder: Directory that contains ``datafile``.
        datafile: File name of the tab-separated index.
        snip_folder: Directory holding one snippet file per index row.
        data_dict: Dict with list values under the keys ``"seq1"``, ``"seq2"``,
            ``"stance"`` and ``"labels"``. It is extended in place.
        debug: Not used by this function; kept so existing callers keep working.
        num_instances: Maximum number of index rows to read (rows that end up
            skipped still count). Pass ``np.inf`` to read the whole file.

    Returns:
        The same ``data_dict`` object, with the distinct values of ``"stance"``
        appended to ``"labels"`` in sorted order.
    """
    import warnings

    skipped = 0

    # newline='' is what the csv module asks for when it is handed a file object.
    with open(os.path.join(datafolder, datafile), encoding='utf-8', newline='') as tsv:
        reader = csv.reader(tsv, delimiter="\t")
        for i, row in enumerate(reader):
            # Check the limit *before* touching the row so that exactly
            # num_instances rows are read (np.inf never trips this).
            if i >= num_instances:
                break

            try:
                snippets = pd.read_csv(os.path.join(snip_folder, row[0]), sep='\t', header=None,
                                       engine='python', encoding='utf8', quoting=3)
                claim = row[1]
                # Separate the two columns with a space so the last word of
                # column 1 is not fused with the first word of column 2.
                evidence = snippets[1].str.cat(sep=' ') + ' ' + snippets[2].str.cat(sep=' ')
                label = row[2]
            except Exception:
                # Best-effort loading: drop the row, but do not swallow
                # KeyboardInterrupt/SystemExit and do not hide the loss.
                skipped += 1
                continue

            # Append only once every field is available, so a failure above
            # can never leave the three lists misaligned.
            data_dict["seq1"].append(claim)
            data_dict["seq2"].append(evidence)
            data_dict["stance"].append(label)

    if skipped:
        warnings.warn('parseFakeNews: skipped %d row(s) of %s that could not be loaded.'
                      % (skipped, datafile), stacklevel=2)

    # Sorted so the label order does not depend on set iteration order.
    data_dict["labels"].extend(sorted(set(data_dict["stance"])))

    return data_dict



def readFakeNews(datafolder="../data/", debug=True, num_instances=20):
    """Read the train/dev/test splits of the fake-news task.

    ``train.tsv``, ``dev.tsv`` and ``test.tsv`` are read from ``datafolder``
    and their snippet files from the ``snippets`` sub-folder of ``datafolder``
    (for the default ``datafolder`` that is ``../data/snippets``).

    Args:
        datafolder: Directory holding the three index files and ``snippets``.
        debug: Passed through unchanged to ``parseFakeNews``.
        num_instances: Row limit passed to ``parseFakeNews`` for every split;
            use ``np.inf`` to load everything.

    Returns:
        ``(data_train, data_dev, data_test)``: three dicts with the keys
        ``"seq1"``, ``"seq2"``, ``"stance"`` and ``"labels"``. All three share
        one ``"labels"`` list, which is a copy of ``FN_LABELS``.

    Raises:
        AssertionError: if the training split contains a label that is not
            listed in ``FN_LABELS``.
    """
    # Follow the datafolder argument instead of a fixed "../data/snippets".
    snip_folder = os.path.join(datafolder, "snippets")

    loaded = []
    for datafile in ("train.tsv", "dev.tsv", "test.tsv"):
        split = {"seq1": [], "seq2": [], "stance": [], "labels": []}
        loaded.append(parseFakeNews(datafolder, datafile, snip_folder, split, debug, num_instances))
    data_train, data_dev, data_test = loaded

    # A truncated sample (e.g. the default num_instances=20) cannot contain
    # every label, so require "no unknown labels" rather than exact equality.
    observed = sorted(set(data_train["labels"]))
    unknown = [lab for lab in observed if lab not in FN_LABELS]
    assert not unknown, 'Training labels missing from FN_LABELS: %r' % (unknown,)

    # Always expose the full label inventory so label positions are the same
    # no matter how many rows were loaded.
    labels = list(FN_LABELS)
    data_train["labels"] = labels
    data_dev["labels"] = labels
    data_test["labels"] = labels
    return data_train, data_dev, data_test



In [182]:
# Load every row of all three splits (np.inf disables the row limit).
train, dev, test = readFakeNews(datafolder="../data/", debug=True, num_instances=np.inf)

# Report the size of each field of the training split; the first three
# counts should agree if claims, snippets and labels are aligned.
for caption, field in (('Claim:', 'seq1'),
                       ('Snip:', 'seq2'),
                       ('y:', 'stance'),
                       ('nr of labels', 'labels')):
    print(caption, len(train[field]))

Claim: 24004
Snip: 24004
y: 24004
nr of labels 117
