In [37]:
import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

In [38]:
# Marker tokens that open and close the tagged span inside each text sample.
BEGINNING_OF_SPAN, END_OF_SPAN = "<BOS>", "<EOS>"

In [39]:
# Directory holding the propaganda TSV splits. Kept relative to the notebook's
# working directory so the cell runs from a fresh kernel on any machine; point
# it at your own copy of the dataset if it lives elsewhere.
DATASET = 'propaganda_dataset_v2'

# quoting=3 is csv.QUOTE_NONE: quote characters inside the text column are read
# as ordinary characters instead of being interpreted as field delimiters.
train = pd.read_csv(f'{DATASET}/propaganda_train.tsv', sep='\t', header=0, quoting=3)
val = pd.read_csv(f'{DATASET}/propaganda_val.tsv', sep='\t', header=0, quoting=3)

In [40]:
# Drop the 'not_propaganda' rows from both splits so only the remaining labels
# are kept. The explicit .copy() makes each result an independent frame, so
# assigning to its columns afterwards does not raise SettingWithCopyWarning.
train = train[train['label'] != 'not_propaganda'].copy()
val = val[val['label'] != 'not_propaganda'].copy()

In [41]:
def extract_snippet(sample: str) -> str:
    """
    Extract the text between the first <BOS> tag and the <EOS> tag that follows it.

    Args:
        sample (str): A text sample containing <BOS> followed, later, by <EOS>

    Returns:
        str: The text strictly between the two tags; the tags themselves are
            dropped and any whitespace inside the span is left untouched

    Raises:
        AssertionError: If <BOS> is missing, <EOS> is missing, or no <EOS>
            occurs after the <BOS> tag
    """

    assert (
        BEGINNING_OF_SPAN in sample
    ), "Text sample should contain beginning of span tag (<BOS>)"
    assert END_OF_SPAN in sample, "Text sample should contain end of span tag (<EOS>)"
    s_idx = sample.index(BEGINNING_OF_SPAN) + len(BEGINNING_OF_SPAN)
    # Search from the end of <BOS> so a stray <EOS> earlier in the text cannot
    # close the span before it opens (that case would otherwise slice to an
    # empty string without any error).
    e_idx = sample.find(END_OF_SPAN, s_idx)
    assert (
        e_idx != -1
    ), "End of span tag (<EOS>) should appear after beginning of span tag (<BOS>)"
    return sample[s_idx:e_idx]

# Reduce each split's 'tagged_in_context' text to just the tagged span,
# overwriting the column in place on both frames (train first, then val).
for _split in (train, val):
    _split['tagged_in_context'] = _split['tagged_in_context'].apply(extract_snippet)

In [46]:
# Fit the TF-IDF vocabulary and IDF weights on the training snippets only, then
# reuse that fitted vectorizer to transform the validation snippets.
tfidf = TfidfVectorizer(max_features=1_000_000)
train_vecs = tfidf.fit_transform(train['tagged_in_context'])
test_vecs = tfidf.transform(val['tagged_in_context'])

# Logistic regression over the TF-IDF features, predicting the 'label' column.
model = LogisticRegression()
model.fit(train_vecs, train['label'])

test_predictions = model.predict(test_vecs)
# classification_report expects (y_true, y_pred); passing the predictions first
# would swap the reported precision and recall for every class.
metrics = classification_report(val['label'], test_predictions)
# The report is a pre-formatted multi-line string, so print it rather than
# relying on the cell's repr (which would show escaped newlines).
print(metrics)