In [None]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import os
import sys
import spacy
from collections import Counter
import matplotlib.pyplot as plt
sys.path.append('..')
from preprocess import parse_debate_dataset, preprocess_hp_dataset

In [None]:
# Default size (width, height in inches) for every figure drawn below.
plt.rcParams['figure.figsize'] = [5, 3]

In [None]:
# Load spaCy's small English pipeline with the dependency parser disabled, then
# add a 'sentencizer' component - presumably so sentence boundaries are still
# available without the parser (TODO confirm against parse_debate_dataset).
# NOTE(review): passing nlp.create_pipe(...) to add_pipe is the spaCy v2 call
# style; spaCy v3 expects nlp.add_pipe('sentencizer') - confirm the pinned version.
nlp = spacy.load("en_core_web_sm", disable=["parser"]) 
nlp.add_pipe(nlp.create_pipe('sentencizer'))

In [None]:
def get_baseline(labels, binary=False, seed=None):
    """Return the accuracy of a trivial classifier on ``labels``.

    Parameters
    ----------
    labels : pandas.Series or array-like
        Gold labels. With ``binary=True`` they are expected to be 0/1 so that
        their mean is the positive-class rate.
    binary : bool, default False
        If False, score a uniform random guesser that picks among the distinct
        labels occurring in ``labels``. If True, return the fraction of
        positive labels, i.e. the accuracy of always predicting 1 (this equals
        the majority-class accuracy only when positives are the majority).
    seed : int or None, default None
        Seed for the random guesser. ``None`` draws from NumPy's global random
        state, as before; ignored when ``binary`` is True.

    Returns
    -------
    float
        Accuracy in ``[0, 1]``.

    Raises
    ------
    ValueError
        If ``labels`` is empty.
    """
    values = np.asarray(labels)
    if values.size == 0:
        raise ValueError('labels must not be empty')

    if binary:
        return float(values.mean())

    # Guess among the labels that actually occur rather than the integer range
    # min..max: values that never appear would only deflate the baseline, and
    # sampling the observed set also works for non-integer labels.
    label_set = np.unique(values)
    rng = np.random if seed is None else np.random.RandomState(seed)
    preds = rng.choice(label_set, size=values.size)
    return float(np.mean(preds == values))

## Persuasiveness

In [None]:
# Parse the raw debate-arguments text file with the shared spaCy pipeline.
# The result is used below as a table with a 'Persuasiveness' column.
debates_df = parse_debate_dataset('./DebatePersuasiveness/DebateArguments.txt', nlp)

In [None]:
# Distribution of persuasiveness scores over the full parsed dataset (6 equal-width bins).
plt.hist(debates_df['Persuasiveness'], bins=6, edgecolor='white')

In [None]:
# Distinct persuasiveness scores that occur in the data, in ascending order.
sorted(debates_df['Persuasiveness'].unique())

In [None]:
# One uniform random integer in 1..6 per debate row (the upper bound 7 is exclusive).
# NOTE(review): `preds` is not used by any later cell visible here; get_baseline
# draws its own predictions - confirm whether this cell is still needed.
preds = np.random.randint(1, high=7, size=len(debates_df))

In [None]:
# Random-guess accuracy on the full dataset (non-binary mode of get_baseline).
# The value changes between runs unless NumPy's global random state is seeded.
get_baseline(debates_df['Persuasiveness'])

In [None]:
# Training split of the persuasiveness data, read from a records-oriented JSON file.
debates_train = pd.read_json('DebatePersuasiveness/persuasiveness_dataset-train.json', orient='records')

In [None]:
# Validation split of the persuasiveness data, read from a records-oriented JSON file.
debates_valid = pd.read_json('DebatePersuasiveness/persuasiveness_dataset-valid.json', orient='records')

In [None]:
# Test split of the persuasiveness data, read from a records-oriented JSON file.
debates_test = pd.read_json('DebatePersuasiveness/persuasiveness_dataset-test.json', orient='records')

In [None]:
# Persuasiveness score distribution in the training split (6 equal-width bins).
plt.hist(debates_train['Persuasiveness'], bins=6, edgecolor='white')

In [None]:
# Persuasiveness score distribution in the validation split (6 equal-width bins).
plt.hist(debates_valid['Persuasiveness'], bins=6, edgecolor='white')

In [None]:
# Persuasiveness score distribution in the test split (6 equal-width bins).
plt.hist(debates_test['Persuasiveness'], bins=6, edgecolor='white')

In [None]:
# Size and random-guess baseline for each persuasiveness split. Each baseline is
# computed on that split's own labels (the Valid and Test rows previously scored
# the training labels). Values vary between runs unless NumPy's global random
# state is seeded.
print(f'Train: samples {len(debates_train)}, baseline {get_baseline(debates_train["Persuasiveness"]):.4f}')
print(f'Valid: samples {len(debates_valid)}, baseline {get_baseline(debates_valid["Persuasiveness"]):.4f}')
print(f'Test:  samples {len(debates_test)}, baseline {get_baseline(debates_test["Persuasiveness"]):.4f}')

## GCDC

### Clinton

In [None]:
# Read the Clinton train and test CSVs in a single tuple assignment.
clinton_train, clinton_test = (
    pd.read_csv('./GCDC/Clinton_train.csv'),
    pd.read_csv('./GCDC/Clinton_test.csv'),
)

In [None]:
# Distribution of the 'labelA' column in the Clinton training split (3 equal-width bins).
plt.hist(clinton_train['labelA'], bins=3, edgecolor='white')

In [None]:
# Distribution of the 'labelA' column in the Clinton test split (3 equal-width bins).
plt.hist(clinton_test['labelA'], bins=3, edgecolor='white')

In [None]:
# Random-guess baseline accuracy on 'labelA' for each Clinton split, printed at
# full precision. Values vary between runs unless NumPy's global random state is seeded.
print(f'Clinton train baseline: {get_baseline(clinton_train["labelA"])}')
print(f'Clinton test  baseline: {get_baseline(clinton_test["labelA"])}')

### Enron

In [None]:
# Read the Enron train and test CSVs in a single tuple assignment.
enron_train, enron_test = (
    pd.read_csv('./GCDC/Enron_train.csv'),
    pd.read_csv('./GCDC/Enron_test.csv'),
)

In [None]:
# Distribution of the 'labelA' column in the Enron training split (3 equal-width bins).
plt.hist(enron_train['labelA'], bins=3, edgecolor='white')

In [None]:
# Distribution of the 'labelA' column in the Enron test split (3 equal-width bins).
plt.hist(enron_test['labelA'], bins=3, edgecolor='white')

In [None]:
# Random-guess baseline accuracy on 'labelA' for each Enron split, printed at
# full precision. Values vary between runs unless NumPy's global random state is seeded.
print(f'Enron train baseline: {get_baseline(enron_train["labelA"])}')
print(f'Enron test  baseline: {get_baseline(enron_test["labelA"])}')

### Yahoo

In [None]:
# Read the Yahoo train and test CSVs in a single tuple assignment.
yahoo_train, yahoo_test = (
    pd.read_csv('./GCDC/Yahoo_train.csv'),
    pd.read_csv('./GCDC/Yahoo_test.csv'),
)

In [None]:
# Distribution of the 'labelA' column in the Yahoo training split (3 equal-width bins).
plt.hist(yahoo_train['labelA'], bins=3, edgecolor='white')

In [None]:
# Distribution of the 'labelA' column in the Yahoo test split (3 equal-width bins).
plt.hist(yahoo_test['labelA'], bins=3, edgecolor='white')

In [None]:
# Random-guess baseline accuracy on 'labelA' for each Yahoo split, printed at
# full precision. Values vary between runs unless NumPy's global random state is seeded.
print(f'Yahoo train baseline: {get_baseline(yahoo_train["labelA"])}')
print(f'Yahoo test  baseline: {get_baseline(yahoo_test["labelA"])}')

### Yelp

In [None]:
# Read the Yelp train and test CSVs in a single tuple assignment.
yelp_train, yelp_test = (
    pd.read_csv('./GCDC/Yelp_train.csv'),
    pd.read_csv('./GCDC/Yelp_test.csv'),
)

In [None]:
# Distribution of the 'labelA' column in the Yelp training split (3 equal-width bins).
plt.hist(yelp_train['labelA'], bins=3, edgecolor='white')

In [None]:
# Distribution of the 'labelA' column in the Yelp test split (3 equal-width bins).
plt.hist(yelp_test['labelA'], bins=3, edgecolor='white')

In [None]:
# Random-guess baseline accuracy on 'labelA' for each Yelp split, printed at
# full precision. Values vary between runs unless NumPy's global random state is seeded.
print(f'Yelp train baseline: {get_baseline(yelp_train["labelA"])}')
print(f'Yelp test  baseline: {get_baseline(yelp_test["labelA"])}')

## Hyperpartisan

In [None]:
# Read the three by-article hyperpartisan splits (records-oriented JSON) in one
# tuple assignment.
hyper_train, hyper_valid, hyper_test = (
    pd.read_json('./SemEval/byarticle-train.json', orient='records'),
    pd.read_json('./SemEval/byarticle-valid.json', orient='records'),
    pd.read_json('./SemEval/byarticle-test.json', orient='records'),
)

In [None]:
# Row count of each hyperpartisan split; the closing tuple is the cell's displayed value.
n_train_hyp, n_valid_hyp, n_test_hyp = (
    len(frame) for frame in (hyper_train, hyper_valid, hyper_test)
)
(n_train_hyp, n_valid_hyp, n_test_hyp)

In [None]:
# Add an integer target column to each split: 1 where 'label' equals the string
# 'true', 0 for anything else.
# NOTE(review): assumes 'label' holds the strings 'true'/'false'; values of any
# other type (e.g. parsed booleans) would all map to 0 - confirm.
hyper_train['label_int'] = hyper_train['label'].apply(lambda label: int(label == 'true'))
hyper_valid['label_int'] = hyper_valid['label'].apply(lambda label: int(label == 'true'))
hyper_test['label_int'] = hyper_test['label'].apply(lambda label: int(label == 'true'))

In [None]:
# Count of positive (label_int == 1) rows per split, i.e. the sum of the 0/1 column.
train_pos_hyp, valid_pos_hyp, test_pos_hyp = (
    sum(frame['label_int']) for frame in (hyper_train, hyper_valid, hyper_test)
)

In [None]:
# Per-split summary: row count, number of positives, and baseline. With
# binary=True, get_baseline returns the fraction of positive labels, i.e. the
# accuracy of always predicting the positive class.
print(f'Train: samples {n_train_hyp}, pos {train_pos_hyp}, baseline {get_baseline(hyper_train["label_int"], binary=True):.4f}')
print(f'Valid: samples {n_valid_hyp}, pos {valid_pos_hyp}, baseline {get_baseline(hyper_valid["label_int"], binary=True):.4f}')
print(f'Test:  samples {n_test_hyp}, pos {test_pos_hyp}, baseline {get_baseline(hyper_test["label_int"], binary=True):.4f}')

In [None]:
# Class balance (0 vs 1) in the hyperpartisan training split.
plt.hist(hyper_train['label_int'], bins=2, edgecolor='white')

In [None]:
# Class balance (0 vs 1) in the hyperpartisan validation split.
plt.hist(hyper_valid['label_int'], bins=2, edgecolor='white')

In [None]:
# Class balance (0 vs 1) in the hyperpartisan test split.
plt.hist(hyper_test['label_int'], bins=2, edgecolor='white')

## Fake News (PolitiFact)

In [None]:
# Read the three PolitiFact TSV splits in one tuple assignment. header=0 combined
# with names=[...] replaces each file's own header row with 'text' and 'label'.
fake_news_train, fake_news_valid, fake_news_test = (
    pd.read_csv('./FakeNews/politifact/train.tsv', sep='\t', header=0, names=['text', 'label']),
    pd.read_csv('./FakeNews/politifact/val.tsv', sep='\t', header=0, names=['text', 'label']),
    pd.read_csv('./FakeNews/politifact/test.tsv', sep='\t', header=0, names=['text', 'label']),
)

In [None]:
# Row count of each fake-news split; the closing tuple is the cell's displayed value.
n_train_fake, n_valid_fake, n_test_fake = (
    len(frame) for frame in (fake_news_train, fake_news_valid, fake_news_test)
)
(n_train_fake, n_valid_fake, n_test_fake)

In [None]:
# Sum of the 'label' column per split.
# NOTE(review): this is the positive count only if 'label' is 0/1 - confirm.
train_pos_fake, valid_pos_fake, test_pos_fake = (
    sum(frame['label']) for frame in (fake_news_train, fake_news_valid, fake_news_test)
)

In [None]:
# Per-split summary: row count, label sum, and baseline. With binary=True,
# get_baseline returns sum(labels) / len(labels), which is the positive-class
# rate when labels are 0/1.
print(f'Train: samples {n_train_fake}, pos {train_pos_fake}, baseline {get_baseline(fake_news_train["label"], binary=True):.4f}')
print(f'Valid: samples {n_valid_fake}, pos {valid_pos_fake}, baseline {get_baseline(fake_news_valid["label"], binary=True):.4f}')
print(f'Test:  samples {n_test_fake}, pos {test_pos_fake}, baseline {get_baseline(fake_news_test["label"], binary=True):.4f}')

In [None]:
# Distribution of 'label' in the fake-news training split (2 equal-width bins).
plt.hist(fake_news_train['label'], bins=2, edgecolor='white')

In [None]:
# Distribution of 'label' in the fake-news validation split (2 equal-width bins).
plt.hist(fake_news_valid['label'], bins=2, edgecolor='white')

In [None]:
# Distribution of 'label' in the fake-news test split (2 equal-width bins).
plt.hist(fake_news_test['label'], bins=2,edgecolor='white')