In [1]:
import pandas as pd 
import numpy as np 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
BEGINNING_OF_SPAN = "<BOS>"
END_OF_SPAN = "<EOS>"

In [3]:
DATASET = '../propaganda_dataset_v2/'
train = pd.read_csv(f'{DATASET}/propaganda_train.tsv', sep='\t', header=0, quoting=3)
val = pd.read_csv(f'{DATASET}/propaganda_val.tsv', sep='\t', header=0, quoting=3)

In [4]:
train = train[train['label'] != 'not_propaganda']
val = val[val['label'] != 'not_propaganda']

In [5]:
def extract_snippet(sample: str) -> str:
    """
    Extract text within the <BOS> and <EOS> tags.

    Args:
        sample (str): A text sample containing both <BOS> and <EOS>

    Returns:
        str: A string with only the text within the span
    """

    assert (
        BEGINNING_OF_SPAN in sample
    ), "Text sample should contain beginning of span tag (<BOS>)"
    assert END_OF_SPAN in sample, "Text sample should contain end of span tag (<EOS>)"
    s_idx = sample.index(BEGINNING_OF_SPAN) + len(BEGINNING_OF_SPAN)
    e_idx = sample.index(END_OF_SPAN)
    return sample[s_idx:e_idx]

train['tagged_in_context'] = train['tagged_in_context'].apply(extract_snippet)
val['tagged_in_context'] = val['tagged_in_context'].apply(extract_snippet)

In [9]:
overall_metrics = {
    'linear': {
        'precision': [],
        'recall': [],    
        'f1': [],
        'acc': []
    },
    'poly': {
        'precision': [],
        'recall': [],    
        'f1': [],
        'acc': []
    },
    'rbf': {
        'precision': [],
        'recall': [],    
        'f1': [],
        'acc': []
    },
    'sigmoid': {
        'precision': [],
        'recall': [],    
        'f1': [],
        'acc': []
    },
}

In [19]:
tfidf = TfidfVectorizer()
train_vecs = tfidf.fit_transform(train['tagged_in_context'])
test_vecs = tfidf.transform(val['tagged_in_context'])

for i in range(10):
    for kernel in ['linear', 'poly', 'rbf', 'sigmoid']:
        model = SVC(kernel=kernel)
        model.fit(train_vecs, train['label'])

        test_predictions = model.predict(test_vecs)
        this_metrics = classification_report(val['label'], test_predictions, output_dict=True)
    overall_metrics[kernel]['precision'].append(this_metrics['weighted avg']['precision'])
    overall_metrics[kernel]['recall'].append(this_metrics['weighted avg']['recall'])
    overall_metrics[kernel]['f1'].append(this_metrics['weighted avg']['f1-score'])
    overall_metrics[kernel]['acc'].append(this_metrics['accuracy'])

In [17]:
for kernel, metrics in overall_metrics.items():
    print(f"====={kernel}===== ")
    print(f"Precision:\t{np.round( np.mean(metrics['precision']), decimals=2 )}")
    print(f"Recall:\t\t{np.round(np.mean(metrics['recall']), decimals=2)}")
    print(f"F1:\t\t{np.round(np.mean(metrics['f1']), decimals=2)}")
    print(f"Acc:\t\t{np.round(np.mean(metrics['acc']), decimals=2)}")

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
