# CheckThat! Task 2: Subjectivity Classification

## Introduction
This notebook is a part of the CheckThat! 2024 Task 2: Subjectivity Classification. The task is to classify the tweets into subjective and objective categories.

## Preprocessing

In [1]:
import pandas as pd
import numpy as np
import spacy
import re

import spacy.symbols

# load spacy model
nlp = spacy.load('en_core_web_sm')

# customize the tokenizer
special_tokens = ['<quote>', '<num>']
for token in special_tokens:
    nlp.tokenizer.add_special_case(token, [{spacy.symbols.ORTH: token}])

# load data
train = pd.read_csv('data/train_en.tsv', sep='\t')
validation = pd.read_csv('data/dev_en.tsv', sep='\t')
test = pd.read_csv('data/dev_test_en.tsv', sep='\t')
final_test = pd.read_csv('data/test_en.tsv', sep='\t')
final_test_labels = pd.read_csv('data/test_en_gold.tsv', sep='\t')

# split into X and y
X_train = train['sentence']
y_train = train['label']

X_validation = validation['sentence']
y_validation = validation['label']

X_test = test['sentence']
y_test = test['label']

X_final_test = final_test['sentence']
y_final_test = final_test_labels['label']

# preprocessing
def preprocess(sentence):
    # lowercase
    sentence = sentence.lower()
    # replace everything between “ and ” or between " and " with <QUOTE>
    sentence = re.sub(r'“.*?”|".*?"', '<quote>', sentence)
    # remove links
    sentence = sentence.replace(r'http\S+', '')
    # remove usernames
    sentence = sentence.replace(r'@\S+', '')
    # tokenize
    tokenized = nlp(sentence)
    # remove special characters except for ? !
    preserved = ['?', '!']
    tokens = []
    # filter out stopwords
    articles = {'a', 'an', 'the'}
    prepositions = {'in', 'on', 'at', 'to', 'from', 'by', 'with', 'about', 'against', 'between', 'during', 'without', 'within', 'among', 'upon'}
    conjunctions = {'and', 'but', 'or', 'nor', 'so', 'yet'}
    determiners = {'this', 'that', 'these', 'those', 'some', 'any', 'each', 'every', 'all', 'both', 'few', 'many', 'much', 'most', 'other', 'another'}
    stropwords = articles | prepositions | conjunctions | determiners

    for t in tokenized:
        if (not t.is_punct or t.text in preserved) and not t.is_space and t.text not in stropwords:
            if t.like_num: # replace numbers with <NUM>
                tokens.append('<num>')
            else:
                tokens.append(t.lemma_) # lemmatize
    return ' '.join(tokens)

## Feaure Extraction

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2
import gensim.downloader as api


class FeauturesExtractor:
    def __init__(self, chi2_features=2000):
        # Initialize the TF-IDF vectorizer
        self.tfidf_vectorizer = TfidfVectorizer()
        # Initialize Word2Vec
        self.word2vec = api.load('word2vec-google-news-300')
        # Load the pre-trained SBERT model
        self.model = 'paraphrase-multilingual-MiniLM-L12-v2'
        self.sbert = SentenceTransformer(self.model)
        # Initialize StandardScaler
        self.scaler = StandardScaler()
        # Initialize MinMaxScaler SBERT
        self.min_max_scaler_sbert = MinMaxScaler()
        # Initialize MinMaxScaler Word2Vec
        self.min_max_scaler_word2vec = MinMaxScaler()
        # Initialize Chi2
        k_best_features = chi2_features
        self.chi2_selector = SelectKBest(chi2, k=k_best_features)


    def fit_transform(self, X, y):
        self.tfidf_vectorizer.fit(X)
        tfidf_features = self.tfidf_vectorizer.fit_transform(X).toarray()
        sbert_features = self.sbert.encode(X)
        sbert_features = self.scaler.fit_transform(sbert_features)
        sbert_features_non_negative = self.min_max_scaler_sbert.fit_transform(sbert_features)
        word2vec_feautres = self.applyWord2Vec(X)
        word2vec_feautres = self.min_max_scaler_word2vec.fit_transform(word2vec_feautres)
        combined_features = np.concatenate([tfidf_features, sbert_features_non_negative, word2vec_feautres], axis=1)
        chi2_features = self.chi2_selector.fit_transform(combined_features, y)
        return chi2_features

    def transform(self, X):
        tfidf_features = self.tfidf_vectorizer.transform(X).toarray()
        sbert_features = self.sbert.encode(X)
        sbert_features = self.scaler.transform(sbert_features)
        sbert_features_non_negative = self.min_max_scaler_sbert.transform(sbert_features)
        word2vec_feautres = self.applyWord2Vec(X)
        word2vec_feautres = self.min_max_scaler_word2vec.transform(word2vec_feautres)
        combined_features = np.concatenate([tfidf_features, sbert_features_non_negative, word2vec_feautres], axis=1)
        chi2_features = self.chi2_selector.transform(combined_features)
        return chi2_features
    
    def applyWord2Vec(self, X):
        word2vec_features = []
        for sentence in X:
            word_vectors = [self.word2vec[word] for word in sentence if word in self.word2vec]
            if word_vectors:
                word_vectors = np.mean(word_vectors, axis=0)
            else:
                word_vectors = np.zeros(self.word2vec.vector_size)
            word2vec_features.append(word_vectors)
        return np.array(word2vec_features)
    
    def __str__(self) -> str:
        return str(self.tfidf_vectorizer) + ', ' + str(self.model) + ', StandardScaler then MinMaxScaler on sbert, ' + str(self.applyWord2Vec) + ', then MinMaxScaler on word2vec' + ', ' + str(self.chi2_selector)
    

  from tqdm.autonotebook import tqdm, trange


## Model

In [3]:

# logistic regression model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=1000)



## Evaluation

In [4]:
from sklearn.metrics import classification_report, f1_score, confusion_matrix

def evaluate(y_pred, y_true):
    # f1 macro
    f1_score_macro = f1_score(y_true, y_pred, average='macro')
    # classification report
    report = classification_report(y_true, y_pred)
    # relative confusion matrix
    confusion_mat = confusion_matrix(y_true, y_pred)
    confusion_mat = confusion_mat / confusion_mat.sum(axis=1, keepdims=True)

    print('F1 macro:', f1_score_macro, '\n', 'Classfication report:\n', report, '\n', 'Relative confusion matrix:\n', confusion_mat)



## Development Helper Functions

In [5]:
import os
from datetime import datetime

def logEvaluation(y_true, y_pred, model, feature_extractor, preprocessing, additional_info='', filepath='evaluation_log.tsv'):
    # Calculate F1 macro score
    f1_macro = f1_score(y_true, y_pred, average='macro')
    
    # Get relative confusion matrix
    confusion_mat = confusion_matrix(y_true, y_pred)
    confusion_mat = confusion_mat / confusion_mat.sum(axis=1, keepdims=True)
    
    # Get model and feature extractor details
    model_info = str(model)
    feature_extractor_info = str(feature_extractor)
    
    # Get preprocessing details
    preprocessing_info = preprocessing
    
    # Get current date and time
    current_datetime = datetime.now().strftime("%Y-%m-%d %H:%M")
    
    # Create a DataFrame for the new log entry
    new_entry = pd.DataFrame([{
        'f1_macro': f1_macro,
        'rel_confusion_matrix': confusion_mat,
        'model_info': model_info,
        'feature_extractor_info': feature_extractor_info,
        'preprocessing_info': preprocessing_info,
        'additional_info': additional_info,
        'datetime': current_datetime
    }])
    
    # Check if the log file exists
    if os.path.exists(filepath):
        # Read the existing log file
        log_df = pd.read_csv(filepath, sep='\t')
        
        # Check if the model or feature extractor has changed
        last_entry = log_df.iloc[-1]
        if any(last_entry[field] != info for field, info in 
               [('model_info', model_info), ('feature_extractor_info', feature_extractor_info), 
                ('preprocessing_info', preprocessing_info), ('additional_info', additional_info)]):
            # Append the new entry
            log_df = pd.concat([log_df, new_entry], ignore_index=True)
    else:
        # If the file doesn't exist, create a new DataFrame
        log_df = new_entry
    
    # Write the log to the TSV file
    log_df.to_csv(filepath, sep='\t', index=False)

## Main

In [6]:
fe = FeauturesExtractor()

# training
X_train = X_train.apply(preprocess)
X_train = fe.fit_transform(X_train, y_train)
model.fit(X_train, y_train)

# validation
X_validation = X_validation.apply(preprocess)
X_validation = fe.transform(X_validation)
y_pred = model.predict(X_validation)
evaluate(y_pred, y_validation)

# test
X_test = X_test.apply(preprocess)
X_test = fe.transform(X_test)
y_pred = model.predict(X_test)
evaluate(y_pred, y_test)

# final test
X_final_test = X_final_test.apply(preprocess)
X_final_test = fe.transform(X_final_test)
y_pred = model.predict(X_final_test)
evaluate(y_pred, y_final_test)

# log evaluation
# preprocessing = 'preprocessing: lowercase, replace quotes with <quote>, remove links, usernames and stopwords, lemmatize, remove special characters except for ? and !, replace numbers with <num>'
# additional_info = 'erased auxilariy verbs from stopwords'
# logEvaluation(y_validation, y_pred, model, fe, preprocessing, additional_info=additional_info, filepath='evaluation_log.tsv')



F1 macro: 0.6983219235264653 
 Classfication report:
               precision    recall  f1-score   support

         OBJ       0.67      0.75      0.71       106
        SUBJ       0.74      0.65      0.69       113

    accuracy                           0.70       219
   macro avg       0.70      0.70      0.70       219
weighted avg       0.70      0.70      0.70       219
 
 Relative confusion matrix:
 [[0.75471698 0.24528302]
 [0.3539823  0.6460177 ]]
F1 macro: 0.6789634146341463 
 Classfication report:
               precision    recall  f1-score   support

         OBJ       0.65      0.72      0.68       116
        SUBJ       0.72      0.64      0.67       127

    accuracy                           0.68       243
   macro avg       0.68      0.68      0.68       243
weighted avg       0.68      0.68      0.68       243
 
 Relative confusion matrix:
 [[0.72413793 0.27586207]
 [0.36220472 0.63779528]]
F1 macro: 0.6388501529051988 
 Classfication report:
               precisio