## 3.2 ML models with features

For this approach, we trained the models on the following
features:

* Sentence BERT (Reimers and Gurevych, 2019) sentence embeddings,
* Sentiment analysis, based on the Python package pysentimiento (Pérez et al., 2023),
* The number of adjectives, adverbs, and words contained in the bias lexicons reported or collected by Recasens (Recasens et al., 2013), each normalized by the sentence length.

In [None]:
import pandas as pd # data processing
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
import re #regex
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import spacy
import os
nlp_en = spacy.load('en_core_web_lg')
from pysentimiento import create_analyzer
analyzer = create_analyzer(task="sentiment", lang="en")

## Only needed if you run it in Colab
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
def get_wikibias(path='data/wikibias_re.csv', random_state=None):
    '''
    We read the wikibias corpus, prepare the partitions and return them.

    Args:
        path (str): CSV file with the columns `sentence`, `type` and
                    `partition` (0 = train, 1 = test).
        random_state (int | None): Seed forwarded to the shuffles. The
                    default (None) keeps the original non-deterministic
                    shuffling; pass an int for reproducible partitions.

    Returns:
        tuple: (x_train, x_test, y_train, y_test), where the x arrays hold
               the sentences and the y arrays hold the binary labels.
    '''
    corpus = pd.read_csv(path)
    # This line drops the rows whose type contains '0|1|0' (the
    # epistemologically biased sentences, part of an experiment in the
    # article). Comment it out to keep those sentences.
    # regex=False matches the literal text, so no escaping is needed;
    # .copy() avoids assigning a new column onto a slice of the original frame.
    corpus = corpus[~corpus['type'].str.contains('0|1|0', regex=False)].copy()
    # We change to binary classes: any '1' in the type string means biased.
    corpus['binary_class'] = corpus['type'].apply(lambda x: 1 if '1' in x else 0)
    corpus = corpus.sample(frac=1, random_state=random_state)
    # We separate and mix sentences.
    train = corpus[corpus['partition'] == 0].sample(frac=1, random_state=random_state)
    test = corpus[corpus['partition'] == 1]
    x_train, y_train = train['sentence'].values, train['binary_class'].values
    x_test, y_test = test['sentence'].values, test['binary_class'].values
    return x_train, x_test, y_train, y_test

# Load the WikiBias sentences (x_*) and their binary labels (y_*) for the train and test partitions.
x_train_en, x_test_en, y_train_en, y_test_en = get_wikibias()

In [None]:
# The classes are imbalanced, so we derive a weight for each one.
# Each weight is one minus the share of that class in the training labels,
# so the tuple is ordered (weight of class 1, weight of class 0).
n_total = len(y_train_en)
n_pos = np.count_nonzero(y_train_en)
n_neg = n_total - n_pos
class_weights = (1 - (n_pos / n_total)), (1 - (n_neg / n_total))
class_weights

(0.6581104400345126, 0.34188955996548753)

In [None]:
#We load the lexicons
def txt_to_set(url):
    '''
    It reads the lexicon files in a folder and returns a set of their entries.

    Files whose name contains 'README' or '.pdf' are skipped, as are
    sub-directories. Each line is stripped; empty lines and lines that
    still contain a space (multi-word entries) are ignored.

    Args:
        url (str): folder path

    Returns:
        set: the single-word entries found across all the files read
    '''
    lex = set()
    for file in os.listdir(url):
        # Both substrings must be tested separately: the former
        # `('README' and ".pdf") not in file` only ever checked ".pdf".
        if 'README' in file or '.pdf' in file:
            continue
        path = os.path.join(url, file)
        if not os.path.isfile(path):
            continue
        with open(path, "r") as text_file:
            for line in text_file:
                aux = line.strip()
                if aux != '' and ' ' not in aux:
                    lex.add(aux)
    return lex

# Build the English lexicon from the files in the local "lexicon/" folder and report its size.
lex_en = txt_to_set("lexicon/")
print(f"lex_en: {len(lex_en)}")

lex_en: 11156


In [None]:
def get_features(data, lex):
    '''
    It calculates, for every sentence, the ratio of lexicon words,
    adjectives and adverbs over the number of tokens in the sentence.

    Each sentence is parsed with the module-level spaCy pipeline `nlp_en`.
    A token is only counted when its lower-cased lemma is non-empty after
    removing punctuation and digits; the divisor is always the full token
    count of the parsed sentence.

    Args:
        data (array): Data partition (sequence of sentence strings)
        lex (set): lexicon

    Returns:
        tuple: three lists (bias, adj, adv) with one ratio per sentence
    '''
    pattern = re.compile(r'[-_{}(),;:"#\/.¡!¿?·\[\]\'`‘’%0123456789…—\n]')
    bias, adj, adv = [], [], []
    for sentence in data:
        doc = nlp_en(sentence)
        b, a, ad = 0, 0, 0
        for word in doc:
            # Removes the signs, converts to lower case, and keeps the lemma.
            aux = pattern.sub('', word.lemma_.lower())
            if aux != '':
                if aux in lex:
                    b += 1
                if word.pos_ == 'ADJ':
                    a += 1
                elif word.pos_ == 'ADV':
                    ad += 1
        # A sentence that yields no tokens has all counts at 0; dividing by 1
        # keeps the ratios at 0.0 instead of raising ZeroDivisionError.
        n_tokens = len(doc) or 1
        bias.append(b / n_tokens)
        adj.append(a / n_tokens)
        adv.append(ad / n_tokens)
    return bias, adj, adv

# Lexicon / adjective / adverb features for the train partition and (suffix _t) the test partition.
num_bias, adj, adv = get_features(x_train_en, lex_en)
num_bias_t, adj_t, adv_t = get_features(x_test_en, lex_en)

In [None]:
def get_sentiment(X):
    '''
    Runs the pysentimiento sentiment analyzer over a data partition.

    Args:
        X (array): Data partition (iterable of sentences)

    Returns:
        list: one `analyzer.predict` result per sentence, in input order
    '''
    return [analyzer.predict(sentence) for sentence in X]

# Sentiment predictions for every sentence of the train and test partitions.
sentiment_train  = get_sentiment(x_train_en)
sentiment_test = get_sentiment(x_test_en)

In [None]:
#Sentence BERT: load the all-mpnet-base-v2 sentence-transformers model
model_ = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Encode every sentence of each partition; these embeddings are later passed to get_probs.
train_embeddings = model_.encode(x_train_en)
test_embeddings = model_.encode(x_test_en)

#### Classification

In [None]:
def get_probs(probs, sbert, num_bias, adj, adv):
    '''
    Builds the feature matrix used by the classifiers.

    Every row is laid out as
    [P(NEG) + P(POS), adj, adv, num_bias, *sentence embedding].

    Args:
        probs (sequence): Sentiment results, one per sentence; each one
                     exposes a `probas` dict with 'NEG' and 'POS' keys
        sbert (array): Sentence BERT embedding of each sentence
        num_bias (sequence): Lexicon feature of each sentence
        adj (sequence): Adjective feature of each sentence
        adv (sequence): Adverb feature of each sentence

    Returns:
        np.ndarray: one feature row per sentence
    '''
    rows = []
    for idx, sentiment in enumerate(probs):
        scores = sentiment.probas
        # Probability mass assigned to a non-neutral polarity.
        polarity = scores['NEG'] + scores['POS']
        handcrafted = np.array([polarity, adj[idx], adv[idx], num_bias[idx]])
        rows.append(np.concatenate((handcrafted, sbert[idx])))
    return np.array(rows)

# Assemble the final feature matrices (sentiment + lexicon/POS features + embeddings) for train and test.
train_en_X = get_probs(sentiment_train, train_embeddings, num_bias, adj, adv)
test_en_X = get_probs(sentiment_test, test_embeddings, num_bias_t, adj_t, adv_t)

In [None]:
def predict_score(model, test_X, test_Y, is_cont=False):
    '''
    Runs a trained model on a test partition, prints its scores and
    returns the predictions.

    Args:
        model: Trained estimator exposing `predict`
        test_X (array): Feature matrix of the test partition
        test_Y (array): Gold labels of the test partition
        is_cont (bool): When True the predictions are rounded before
                        scoring (for models with continuous output)

    Returns:
        The (possibly rounded) predictions
    '''
    y_pred = model.predict(test_X)
    if is_cont:
        y_pred = y_pred.round()
    # Precision and recall share the same macro-averaging options.
    macro_opts = dict(pos_label=None, average='macro', zero_division=0.0)
    scores = (
        accuracy_score(test_Y, y_pred),
        precision_score(test_Y, y_pred, **macro_opts),
        recall_score(test_Y, y_pred, **macro_opts),
        f1_score(test_Y, y_pred, average="macro"),
    )
    print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % scores)
    return y_pred

def classify(model, train_X, train_Y, test_X, test_Y, is_cont=False):
    '''
    Fits a model on the training partition, then scores it on the test
    partition through `predict_score`.

    Args:
        model: Estimator exposing `fit` and `predict`
        train_X, train_Y: Features and labels used for fitting
        test_X, test_Y: Features and labels used for scoring
        is_cont (bool): Forwarded to `predict_score`

    Returns:
        The predictions returned by `predict_score`
    '''
    model.fit(train_X, train_Y)
    predictions = predict_score(model, test_X, test_Y, is_cont)
    return predictions

In [None]:
# Class weights used by the weighted SVC further below.
# Alternative taken from the training distribution:
#   {0: 0.34188955996548753, 1: 0.6581104400345126}
class_wei = {0: 0.39295663, 1: 0.60704337}

clf = LogisticRegression()
clf_svm = svm.SVC()
clf_svr = svm.SVR()
mo = GaussianNB()

# (printed name, estimator, round the predictions before scoring?)
for label, estimator, round_output in (
    ('LogisticRegression', clf, False),
    ('SVC', clf_svm, False),
    ('SVR', clf_svr, True),
    ('GaussianNB', mo, True),
):
    print(label)
    # After the loop `m` holds the GaussianNB predictions.
    m = classify(estimator, train_en_X, y_train_en, test_en_X, y_test_en, round_output)

LogisticRegression
accuracy = 0.694, precision = 0.646, recall = 0.590, f1 = 0.588
SVC
accuracy = 0.692, precision = 0.650, recall = 0.570, f1 = 0.557
SVR
accuracy = 0.687, precision = 0.633, recall = 0.593, f1 = 0.595
GaussianNB
accuracy = 0.617, precision = 0.588, recall = 0.596, f1 = 0.588


In [None]:
# Class-weighted SVC. The label used to read 'SVR', although the estimator
# is an svm.SVC; it already predicts class labels, so no rounding is needed.
# This fitted model is the one reused on the SG2 and Checkthat data below.
print('SVC (class-weighted)')
clf_svm = svm.SVC(class_weight=class_wei)
m = classify(clf_svm, train_en_X, y_train_en, test_en_X, y_test_en)

SVR
accuracy = 0.669, precision = 0.616, recall = 0.603, f1 = 0.607


### SG2

In [None]:
sg2_corpus = pd.read_csv('data/final_labels_SG2.csv', delimiter=';')
# Binary label: 1 when the bias label contains 'Biased', 0 otherwise.
sg2_corpus['binary_class'] = sg2_corpus['label_bias'].apply(lambda x: 1 if 'Biased' in x else 0)
sg2_corpus = sg2_corpus[["text","binary_class"]]
#Partition: only the 20% test split is used; the train split is discarded.
_, x_sg2_test, _, y_sg2_test = train_test_split(sg2_corpus.text.values,
                                                    sg2_corpus.binary_class.values,
                                                    train_size=.8,
                                                    random_state=0)
#Features
# get_features takes (data, lex) and get_probs takes five arguments and
# returns a single matrix; the former calls passed extra arguments and
# unpacked two values, which raises a TypeError.
num_bias_t_sg2, adj_t_sg2, adv_t_sg2 = get_features(x_sg2_test, lex_en)
test_embeddings_sg2 = model_.encode(x_sg2_test)
sentiment_sg2  = get_sentiment(x_sg2_test)
test_sg2_X = get_probs(sentiment_sg2, test_embeddings_sg2, num_bias_t_sg2, adj_t_sg2, adv_t_sg2)

#Predict with the model trained on WikiBias
predict_score(clf_svm, test_sg2_X , y_sg2_test)

accuracy = 0.639, precision = 0.644, recall = 0.641, f1 = 0.638


array([[0.75721098, 0.24278902],
       [0.75226968, 0.24773032],
       [0.60477472, 0.39522528],
       ...,
       [0.56535004, 0.43464996],
       [0.65021644, 0.34978356],
       [0.74770155, 0.25229845]])

### Checkthat

In [None]:
corpus_Check = pd.read_csv('data/dev_en.tsv', sep='\t')
# Binary label: 1 for 'SUBJ', 0 for everything else.
corpus_Check['label'] = np.where(corpus_Check['label']=='SUBJ',1,0)

#Features
# get_features takes (data, lex) and get_probs takes five arguments and
# returns a single matrix; the former calls passed extra arguments and
# unpacked two values, which raises a TypeError.
num_bias_t_Check, adj_t_Check, adv_t_Check = get_features(corpus_Check['sentence'].values, lex_en)
test_embeddings_Check = model_.encode(corpus_Check['sentence'].values)
sentiment_Check  = get_sentiment(corpus_Check['sentence'].values)
test_Check_X = get_probs(sentiment_Check, test_embeddings_Check, num_bias_t_Check, adj_t_Check, adv_t_Check)

#Predict with the model trained on WikiBias
predict_score(clf_svm, test_Check_X , corpus_Check['label'].values)

accuracy = 0.597, precision = 0.596, recall = 0.596, f1 = 0.596


array([[0.57069692, 0.42930308],
       [0.56004556, 0.43995444],
       [0.54996382, 0.45003618],
       [0.53756265, 0.46243735],
       [0.62919135, 0.37080865],
       [0.65280538, 0.34719462],
       [0.46582593, 0.53417407],
       [0.64688458, 0.35311542],
       [0.68987905, 0.31012095],
       [0.54449739, 0.45550261],
       [0.47559691, 0.52440309],
       [0.59624766, 0.40375234],
       [0.67148172, 0.32851828],
       [0.60023142, 0.39976858],
       [0.57112916, 0.42887084],
       [0.76404451, 0.23595549],
       [0.61595417, 0.38404583],
       [0.69836603, 0.30163397],
       [0.55694424, 0.44305576],
       [0.6267005 , 0.3732995 ],
       [0.59757154, 0.40242846],
       [0.62796411, 0.37203589],
       [0.60011293, 0.39988707],
       [0.6110431 , 0.3889569 ],
       [0.66945384, 0.33054616],
       [0.5       , 0.5       ],
       [0.72113548, 0.27886452],
       [0.42048974, 0.57951026],
       [0.73503664, 0.26496336],
       [0.46441123, 0.53558877],
       [0.