In [51]:
import pandas as pd
import numpy as np
from transformers import pipeline
from textblob import TextBlob

In [52]:
# Load the training split. The TSV has no header row; its first column
# (position 0) is dropped before the remaining 15 columns are named.
# NOTE(review): column 0 is presumably a saved row index — confirm against the file.
train_data = (
    pd.read_csv("dataset/train2.tsv", sep='\t', header=None)
    .drop(columns=0)
)
train_data.columns = [
    'ID', 'label', 'statement', 'subject', 'speaker', 'speaker_title',
    'state', 'party_affliation',
    'barely_true_counts', 'false_counts', 'half_true_counts',
    'mostly_true_counts', 'pants_on_fire_counts',
    'context', 'extracted_justification',
]
# Keep only rows that carry a label, then renumber the index from 0.
train_data = train_data.dropna(subset=['label']).reset_index(drop=True)
train_data.head()

Unnamed: 0,ID,label,statement,subject,speaker,speaker_title,state,party_affliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context,extracted_justification
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer,That's a premise that he fails to back up. Ann...
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.,"Surovell said the decline of coal ""started whe..."
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver,Obama said he would have voted against the ame...
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release,The release may have a point that Mikulskis co...
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN,"Crist said that the economic ""turnaround start..."


In [53]:
# Built once at definition time rather than on every call: the function below
# is applied row-by-row over the whole label column.
_LABEL_TO_VERACITY = {
    'pants-fire': 0,
    'false': 0.2,
    'barely-true': 0.4,
    'half-true': 0.6,
    'mostly-true': 0.8,
    'true': 1,
}


def label_to_veracity(label):
    """Map a categorical truthfulness label to a numeric veracity score.

    The six known labels are spaced evenly from 0 ('pants-fire') to
    1 ('true') in steps of 0.2.

    Args:
        label: One of 'pants-fire', 'false', 'barely-true', 'half-true',
            'mostly-true', 'true'.

    Returns:
        The numeric score associated with ``label``.

    Raises:
        KeyError: If ``label`` is not one of the six known labels. The
            message names the offending value and the accepted labels.
    """
    try:
        return _LABEL_TO_VERACITY[label]
    except KeyError:
        # Same exception type as a plain dict lookup, but with a message that
        # tells the reader which value was bad and what is accepted.
        raise KeyError(
            f"Unknown label {label!r}; expected one of {sorted(_LABEL_TO_VERACITY)}"
        ) from None

In [54]:
# Convert every categorical label to its numeric score, then store the
# result as a new 'veracity' column.
veracity_scores = train_data['label'].apply(label_to_veracity)
train_data['veracity'] = veracity_scores
train_data.head()

Unnamed: 0,ID,label,statement,subject,speaker,speaker_title,state,party_affliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context,extracted_justification,veracity
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer,That's a premise that he fails to back up. Ann...,0.2
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.,"Surovell said the decline of coal ""started whe...",0.6
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver,Obama said he would have voted against the ame...,0.8
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release,The release may have a point that Mikulskis co...,0.2
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN,"Crist said that the economic ""turnaround start...",0.6


In [55]:
# Select the column by name rather than by position -1: cells further down
# append more columns to train_data, so "last column" stops meaning
# 'veracity' if this cell is re-run after them.
train_data.loc[0, 'veracity']

0.2

In [56]:
# Show the full (untruncated) text of the statement in the row labelled 0.
train_data.at[0, 'statement']

'Says the Annies List political group supports third-trimester abortions on demand.'

In [57]:
# List the distinct label values present, in order of first appearance.
pd.unique(train_data['label'])

array(['false', 'half-true', 'mostly-true', 'true', 'barely-true',
       'pants-fire'], dtype=object)

## Use Naive Realism to determine the veracity score
* Perspective Analysis
* Dissenting View Checks
* Isolation Analysis

Perspective Analysis can be performed by analyzing the speaker's background (their track record of truthfulness) together with a sentiment analysis of the statement. A statement that portrays itself as the sole correct viewpoint usually comes from a speaker with a bad reputation, or carries high polarity (strongly positive or strongly negative). If the sentiment model is confident that a statement is clearly positive or clearly negative, the statement may be considering only one perspective.

In [58]:
# Pin the checkpoint and revision explicitly instead of relying on the
# library's implicit default, so a re-run loads the same model. These are
# the model name and revision that the unpinned call reported defaulting to.
sentiment_analyzer = pipeline(
    'sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english',
    revision='af0f99b',
)
# Sanity check on the first statement: returns a list with one
# {'label': ..., 'score': ...} dict.
sentiment_analyzer(train_data.loc[0, 'statement'])

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'NEGATIVE', 'score': 0.8292973637580872}]

In [59]:
# Try TextBlob on the first statement and read off its subjectivity score.
blob = TextBlob(train_data.at[0, 'statement'])
subjectivity = blob.sentiment.subjectivity
subjectivity

0.1

In [60]:
# Each statement is truncated to at most 512 tokens before scoring. With more
# time, a model that does not need truncation would be preferable so the full
# text could be used.
# The stored value is the 'score' of the first result returned per statement.
train_data['confidence'] = [
    sentiment_analyzer(text, truncation=True, max_length=512)[0]['score']
    for text in train_data['statement']
]
train_data.head()

In [None]:
# Score every statement with TextBlob and keep only the subjectivity
# component of its sentiment.
train_data['subjectivity'] = train_data['statement'].map(
    lambda statement: TextBlob(statement).sentiment.subjectivity
)
train_data.head()

Unnamed: 0,ID,label,statement,subject,speaker,speaker_title,state,party_affliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context,extracted_justification,veracity,sentiment,subjectivity
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer,That's a premise that he fails to back up. Ann...,0.2,0.829297,0.1
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.,"Surovell said the decline of coal ""started whe...",0.6,0.992363,0.4
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver,Obama said he would have voted against the ame...,0.8,0.99097,0.0
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release,The release may have a point that Mikulskis co...,0.2,0.837077,0.9
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN,"Crist said that the economic ""turnaround start...",0.6,0.970115,0.2


In [None]:
# Feature matrix: the five *_counts columns plus the two text-derived scores
# computed above.
X_train = train_data.loc[:, [
    'barely_true_counts',
    'false_counts',
    'half_true_counts',
    'mostly_true_counts',
    'pants_on_fire_counts',
    'confidence',
    'subjectivity',
]]
# Target: the numeric veracity score derived from the label.
y_train = train_data.loc[:, 'veracity']