In [24]:
import pandas as pd
import numpy as np
from transformers import pipeline
from textblob import TextBlob

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

import xgboost as xgb
import pickle

In [5]:
# Read the raw training split (tab-separated, no header row) and discard the first file column.
train_data = pd.read_csv("dataset/train2.tsv", sep='\t', header=None)
train_data = train_data.drop(columns=0)
train_data.columns = [
    'ID', 'label', 'statement', 'subject', 'speaker', 'speaker_title', 'state',
    'party_affliation', 'barely_true_counts', 'false_counts', 'half_true_counts',
    'mostly_true_counts', 'pants_on_fire_counts', 'context', 'extracted_justification',
]
# Keep only the rows that actually carry a label.
train_data = train_data[train_data['label'].notna()].reset_index(drop=True)
train_data.head()

Unnamed: 0,ID,label,statement,subject,speaker,speaker_title,state,party_affliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context,extracted_justification
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer,That's a premise that he fails to back up. Ann...
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.,"Surovell said the decline of coal ""started whe..."
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver,Obama said he would have voted against the ame...
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release,The release may have a point that Mikulskis co...
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN,"Crist said that the economic ""turnaround start..."


For cleaning, we drop the rows with a missing speaker (only 2, so it does not matter) and add columns that contain the ratio of each type of statement that each speaker has made.

In [6]:
# Remove the rows that have no speaker recorded.
train_data = train_data.dropna(subset=['speaker'])

In [7]:
# Total number of previously rated statements per row (sum of the five history counts).
train_data['total_statements'] = (
    train_data['barely_true_counts']
    + train_data['false_counts']
    + train_data['half_true_counts']
    + train_data['mostly_true_counts']
    + train_data['pants_on_fire_counts']
)

# Express each history count as a share of that total.
for verdict in ('barely_true', 'false', 'half_true', 'mostly_true', 'pants_on_fire'):
    train_data[f'{verdict}_ratio'] = train_data[f'{verdict}_counts'] / train_data['total_statements']

train_data = train_data.reset_index(drop=True)
train_data.head()

Unnamed: 0,ID,label,statement,subject,speaker,speaker_title,state,party_affliation,barely_true_counts,false_counts,...,mostly_true_counts,pants_on_fire_counts,context,extracted_justification,total_statements,barely_true_ratio,false_ratio,half_true_ratio,mostly_true_ratio,pants_on_fire_ratio
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,...,0.0,0.0,a mailer,That's a premise that he fails to back up. Ann...,1.0,0.0,1.0,0.0,0.0,0.0
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,...,1.0,0.0,a floor speech.,"Surovell said the decline of coal ""started whe...",2.0,0.0,0.0,0.5,0.5,0.0
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,...,163.0,9.0,Denver,Obama said he would have voted against the ame...,473.0,0.147992,0.150106,0.338266,0.344609,0.019027
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,...,5.0,44.0,a news release,The release may have a point that Mikulskis co...,78.0,0.089744,0.24359,0.038462,0.064103,0.564103
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,...,19.0,2.0,an interview on CNN,"Crist said that the economic ""turnaround start...",65.0,0.230769,0.138462,0.307692,0.292308,0.030769


## Use Naive Realism to determine the veracity score
* Perspective Analysis
* Dissenting View Checks
* Isolation Analysis


Perspective analysis can be performed by analyzing the speaker's background (record of truthfulness) and by running sentiment analysis on the statement. A statement that portrays itself as the sole correct viewpoint usually comes from a speaker with a bad reputation or is highly polarized (strongly positive or strongly negative). If the sentiment model is confident that a statement is objectively positive or negative, then the statement likely considers only one perspective.

In [9]:
# Unfortunately, statements longer than the model's input limit have to be truncated
# (this only applies to two statements). With more time, I would build my own model from the ground up.
# The checkpoint and revision are pinned to the ones the unpinned pipeline resolved to,
# so the 'confidence' feature stays reproducible if the library default changes.
sentiment_analyzer = pipeline(
    'sentiment-analysis',
    model='distilbert/distilbert-base-uncased-finetuned-sst-2-english',
    revision='af0f99b',
    truncation=True,
)
sentiment_analyzer(train_data.loc[0, 'statement'])

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'NEGATIVE', 'score': 0.8292973637580872}]

In [10]:
# Subjectivity score TextBlob assigns to the first statement.
first_statement = train_data.loc[0, 'statement']
subjectivity = TextBlob(first_statement).sentiment.subjectivity
subjectivity

0.1

In [11]:
def preprocessing(file_name, drop_unrated_speakers=True):
    """Load one tab-separated dataset split and build the model feature table.

    The first file column is discarded, the remaining 15 columns are named,
    and rows without a label or a speaker are removed. Each speaker-history
    count is converted to a share of the row's total count, and two text
    features are derived from the statement.

    Parameters
    ----------
    file_name : str
        Path to a header-less TSV file with the 16-column layout read here.
    drop_unrated_speakers : bool, default True
        Drop rows whose five history counts sum to zero or are missing.
        For those rows every ratio would be NaN (0 / 0), which estimators
        that do not accept missing values cannot be fitted on. Pass False
        to keep the rows (with NaN ratios) for NaN-tolerant models.

    Returns
    -------
    pandas.DataFrame
        Columns ``barely_true_ratio``, ``false_ratio``, ``half_true_ratio``,
        ``mostly_true_ratio``, ``pants_on_fire_ratio``, ``confidence``,
        ``subjectivity`` and ``label``, with a fresh 0..n-1 index.

    Notes
    -----
    Relies on the notebook-level ``sentiment_analyzer`` pipeline and on
    ``TextBlob`` being imported.
    """
    data = pd.read_csv(file_name, sep='\t', header=None).drop(0, axis=1)
    data.columns = [
        'ID', 'label', 'statement', 'subject', 'speaker', 'speaker_title', 'state',
        'party_affliation', 'barely_true_counts', 'false_counts', 'half_true_counts',
        'mostly_true_counts', 'pants_on_fire_counts', 'context', 'extracted_justification',
    ]
    data = data.dropna(subset=['label', 'speaker']).reset_index(drop=True)

    data['total_statements'] = (
        data['barely_true_counts']
        + data['false_counts']
        + data['half_true_counts']
        + data['mostly_true_counts']
        + data['pants_on_fire_counts']
    )

    if drop_unrated_speakers:
        # Some rows have a speaker history that sums to 0 even though the row itself
        # is a statement by that speaker (around 2% of the data, e.g. 2932.json).
        # Their ratios would all be NaN, so they are removed here -- before the
        # expensive sentiment pass -- to keep the feature table trainable.
        # The `> 0` test also removes rows whose counts are missing (NaN total).
        data = data[data['total_statements'] > 0].reset_index(drop=True)

    for verdict in ('barely_true', 'false', 'half_true', 'mostly_true', 'pants_on_fire'):
        data[f'{verdict}_ratio'] = data[f'{verdict}_counts'] / data['total_statements']

    # Score of the label predicted by the sentiment pipeline for each statement.
    data['confidence'] = data['statement'].apply(lambda x: sentiment_analyzer(x)[0]['score'])
    # TextBlob subjectivity of each statement.
    data['subjectivity'] = data['statement'].apply(lambda x: TextBlob(x).sentiment.subjectivity)

    data = data[['barely_true_ratio', 'false_ratio', 'half_true_ratio', 'mostly_true_ratio', 'pants_on_fire_ratio', 'confidence', 'subjectivity', 'label']]
    return data

In [12]:
# Build the feature table for each of the three dataset splits.
train_data, val_data, test_data = (
    preprocessing(split_path)
    for split_path in ('dataset/train2.tsv', 'dataset/val2.tsv', 'dataset/test2.tsv')
)

In [13]:
# Model inputs: the five speaker-history ratios plus the two text-derived scores.
feature_columns = [
    'barely_true_ratio',
    'false_ratio',
    'half_true_ratio',
    'mostly_true_ratio',
    'pants_on_fire_ratio',
    'confidence',
    'subjectivity',
]

# Separate features and target for every split.
X_train, y_train = train_data[feature_columns], train_data['label']
X_val, y_val = val_data[feature_columns], val_data['label']
X_test, y_test = test_data[feature_columns], test_data['label']

# Model Selection
Our features are the speaker's credibility (expressed through the verdict ratios) plus the confidence and subjectivity scores produced by ML models. We will try machine learning models from simple to more complex to see which one gives us the best performance.\
Ideally, we would like an accuracy better than 20%, because that is better than simply guessing the label. \
In terms of evaluation, we only report accuracy, because in this multi-class setting there is no single positive class for which to count true or false positives.

## Logistic Regression
The simplest of the models we will try. 

In [16]:
# Baseline classifier; fit() returns the estimator itself, which is displayed as the cell result.
model = LogisticRegression(max_iter=200).fit(X_train, y_train)
model

In [17]:
# Accuracy on the data the logistic regression was fitted on.
# The header used to read "Validation Set Performance:", mislabeling this train-set score.
y_train_pred = model.predict(X_train)
print("Train Set Performance:")
print(f"Accuracy: {accuracy_score(y_train, y_train_pred)}")

Validation Set Performance:
Accuracy: 0.44641955358044644


In [18]:
# Accuracy of the logistic regression on the validation split.
y_val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print("Validation Set Performance:")
print(f"Accuracy: {val_accuracy}")

Validation Set Performance:
Accuracy: 0.48044692737430167


In [19]:
# Accuracy of the logistic regression on the held-out test split.
y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test Set Performance:")
print(f"Accuracy: {test_accuracy}")

Test Set Performance:
Accuracy: 0.4407467532467532


Wow! This is surprising! The accuracy is already at 44%! I'm not sure whether I did something wrong, so make sure to ask **Dr. Arsanjani** about this

## XGBoosted Decision Trees

This can handle null values directly (though we are using the same data for fairness).

In [14]:
# Learn the label -> integer mapping from the training labels, then apply it to every split.
label_encoder = LabelEncoder().fit(y_train)
y_train_encoded, y_val_encoded, y_test_encoded = (
    label_encoder.transform(labels) for labels in (y_train, y_val, y_test)
)

In [15]:
# Wrap each split in an XGBoost DMatrix together with its encoded labels.
dtrain, dval, dtest = (
    xgb.DMatrix(features, label=labels)
    for features, labels in (
        (X_train, y_train_encoded),
        (X_val, y_val_encoded),
        (X_test, y_test_encoded),
    )
)

In [16]:
# Search space for the XGBoost grid search: 5 x 3 x 3 = 45 candidate settings.
param_grid = dict(
    max_depth=[2, 3, 4, 5, 7],
    eta=[0.01, 0.1, 0.3],
    n_estimators=[50, 100, 150],
)

In [17]:
# Multi-class XGBoost classifier evaluated with multi-class log loss during boosting.
xgb_model = xgb.XGBClassifier(
    objective="multi:softmax",
    num_class=6,
    eval_metric='mlogloss',
)
# Exhaustive search over param_grid, scored by accuracy with 3-fold cross-validation.
grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    scoring='accuracy',
    cv=3,
    verbose=1,
)

In [18]:
# Run the cross-validated search on the training features and integer-encoded labels.
grid_search.fit(X=X_train, y=y_train_encoded)

Fitting 3 folds for each of 45 candidates, totalling 135 fits


In [19]:
# Report the winning parameter combination and its cross-validation score.
for heading, value in (
    ("Best Parameters:", grid_search.best_params_),
    ("Best Cross-Validation Score:", grid_search.best_score_),
):
    print(heading, value)

Best Parameters: {'eta': 0.01, 'max_depth': 3, 'n_estimators': 100}
Best Cross-Validation Score: 0.45526468974166884


In [20]:
# Keep a handle on the best estimator found by the grid search; the cells below evaluate and save it.
bst = grid_search.best_estimator_

In [21]:
# Accuracy of the tuned XGBoost model on the training split (compared against encoded labels).
y_train_pred = bst.predict(X_train)
train_accuracy = accuracy_score(y_train_encoded, y_train_pred)
print("Train Set Performance:", f"Accuracy: {train_accuracy}", sep="\n")

Train Set Performance:
Accuracy: 0.46307872631373315


In [22]:
# Accuracy of the tuned XGBoost model on the validation split (compared against encoded labels).
y_val_pred = bst.predict(X_val)
val_accuracy = accuracy_score(y_val_encoded, y_val_pred)
print("Validation Set Performance:", f"Accuracy: {val_accuracy}", sep="\n")

Validation Set Performance:
Accuracy: 0.4742990654205608


In [23]:
# Accuracy of the tuned XGBoost model on the held-out test split (compared against encoded labels).
y_test_pred = bst.predict(X_test)
test_accuracy = accuracy_score(y_test_encoded, y_test_pred)
print("Test Set Performance:", f"Accuracy: {test_accuracy}", sep="\n")

Test Set Performance:
Accuracy: 0.4451460142067877


In [25]:
# Persist the tuned model. The context manager guarantees the file handle is
# flushed and closed even if pickling fails (a bare open() left it dangling).
with open("model/XGModel.sav", 'wb') as model_file:
    pickle.dump(bst, model_file)

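XGBoost with hyperparameter tuning actually performs slightly worse than logistic regression on the validation set (0.474 vs. 0.480), although it is marginally better on the test set (0.445 vs. 0.441) :(. I wonder why that is the case.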

In [26]:
# Reload the saved model to check that it round-trips; the context manager closes the file handle.
# NOTE: unpickling can execute arbitrary code -- only load files this notebook produced.
with open("model/XGModel.sav", 'rb') as model_file:
    loaded_model = pickle.load(model_file)
loaded_model