# Laboratory work #4 (text classification)

In [None]:
import os
import time

import pandas as pd
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

In [None]:
# Load the pre-computed test-set document embeddings.
# Each non-empty line is expected to look like:
#     <doc_id> TAB <component 1> TAB <component 2> ...
test_vectors_read = {}
expected_dim = None

with open('../assets/annotated-corpus/test-embeddings.tsv', 'r', encoding='utf-8') as file:
    for line_number, line in enumerate(file, start=1):
        line = line.strip()
        if not line:
            # A blank (typically trailing) line would otherwise be stored as an
            # empty vector under the id '' and break the DataFrame construction.
            continue

        doc_id, *components = line.split('\t')
        vector = [float(value) for value in components]

        # Fail early, with the offending line number, if the rows are ragged.
        if expected_dim is None:
            expected_dim = len(vector)
        elif len(vector) != expected_dim:
            raise ValueError(
                f'test-embeddings.tsv line {line_number}: expected '
                f'{expected_dim} components, got {len(vector)}'
            )

        test_vectors_read[doc_id] = vector

# One row per document (indexed by doc_id), one column per embedding component.
test_embeddings = pd.DataFrame.from_dict(test_vectors_read, orient='index')

In [None]:
# Notebook cell output: show the embeddings table built in the previous cell.
test_embeddings

In [None]:
def collect_corpus_labels(corpus_root):
    """Collect label rows for every ``.tsv`` document found under *corpus_root*.

    The part and class of a document are read from the two innermost
    directories that contain it, e.g. ``<corpus_root>/train/fake/123.tsv``
    gives ``['123', 'train', 'fake']``.

    Args:
        corpus_root: Directory to walk recursively.

    Returns:
        A list of ``[document_index, part, class]`` lists in ``os.walk``
        order. Files that are not nested at least two directories below
        *corpus_root* are skipped, because they have no part/class pair.
    """
    rows = []
    for root, _dirs, files in os.walk(corpus_root):
        # Work with the path relative to the corpus root so the result does
        # not depend on how the root itself is spelled ('../assets/...').
        parts = os.path.relpath(root, corpus_root).split(os.sep)
        if len(parts) < 2:
            continue

        train_test_val = parts[-2]  # train/test/val part
        fake_true = parts[-1]       # fake/true class

        for file in files:
            if file.endswith('.tsv'):
                document_index = file.split('.')[0]  # document index
                rows.append([document_index, train_test_val, fake_true])
    return rows


data = collect_corpus_labels('../assets/annotated-corpus')

In [None]:
# Label table keyed by document id, so it can be aligned with the embeddings by index.
df = pd.DataFrame(
    data,
    columns=['document_index', 'part', 'class'],
).set_index('document_index')

In [None]:
# Notebook cell output: preview the first rows of the label table.
df.head()

In [None]:
# Attach the 'part' and 'class' columns to each embedding row by matching on
# the document id index; embeddings without a label row are kept (left join).
test_embeddings = pd.merge(
    test_embeddings,
    df,
    how='left',
    left_index=True,
    right_index=True,
)

In [None]:
# Notebook cell output: preview the embeddings after the label columns were merged in.
test_embeddings.head()

In [None]:
# Target: the class label; features: every remaining (embedding) column.
y = test_embeddings['class']
X = test_embeddings.drop(columns=['part', 'class'])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


def _safe_divide(numerator, denominator):
    """Divide element-wise, yielding 0.0 wherever the denominator is 0."""
    numerator = np.asarray(numerator, dtype=float)
    denominator = np.asarray(denominator, dtype=float)
    return np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )


def calculate_metrics(y_true, y_pred):
    """Compute macro-averaged precision, recall, F1 and accuracy.

    All values are derived per class from the confusion matrix (one-vs-rest)
    and then averaged with equal weight per class.

    A ratio that is undefined for a class (for example the precision of a
    class that was never predicted) counts as 0 in the average. Previously
    such classes produced NaN and were dropped from the mean, which inflated
    the reported score.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        Tuple ``(precision, recall, f1_score, accuracy)`` of floats.
        ``accuracy`` is the mean of the per-class one-vs-rest accuracies; with
        exactly two classes this equals the ordinary overall accuracy.
    """
    cm = confusion_matrix(y_true, y_pred)

    # Rows of cm are true classes, columns are predicted classes.
    TP = np.diag(cm)
    FP = cm.sum(axis=0) - TP
    FN = cm.sum(axis=1) - TP
    TN = cm.sum() - (FP + FN + TP)

    precision = _safe_divide(TP, TP + FP)
    recall = _safe_divide(TP, TP + FN)
    f1_score = _safe_divide(2 * (precision * recall), precision + recall)
    accuracy = _safe_divide(TP + TN, TP + FP + FN + TN)

    return np.mean(precision), np.mean(recall), np.mean(f1_score), np.mean(accuracy)

In [None]:
# Compare SVM kernels on the same train/test split: quality metrics plus
# the time spent fitting each model.
kernels = [
    'linear',
    'poly',
    'rbf',
    'sigmoid',
]
results = []

for kernel in kernels:
    model = SVC(kernel=kernel)

    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring durations; time.time() can jump when the system clock changes.
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    training_time = time.perf_counter() - start_time

    y_pred = model.predict(X_test)
    precision, recall, f1_score, accuracy = calculate_metrics(y_test, y_pred)

    results.append({
        'kernel': kernel,
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score,
        'accuracy': accuracy,
        'training_time': training_time,
    })

In [None]:
# Collect the per-kernel metrics into a table; the trailing bare name makes
# the notebook render it as the cell output.
results_df = pd.DataFrame(results)
results_df

Best kernel: poly, because it is accurate and comparably fast.

In [None]:
def no_transform(x):
    """Identity transformation; serves as the "no extra features" baseline."""
    return x

def add_transformed_features(X, func):
    """Append ``func``-transformed copies of the features to ``X``.

    ``func`` is applied to each row of ``X``. Non-finite results (NaN, +inf,
    -inf) are replaced with 0.0 before the transformed columns are appended
    to the right of the original ones.

    Args:
        X: 2-D feature matrix (rows are samples).
        func: Callable applied to every row; ``no_transform`` is a sentinel
            meaning "leave X untouched".

    Returns:
        ``X`` itself when ``func`` is ``no_transform``; otherwise a NumPy
        array with the original columns followed by the transformed ones.
    """
    # Guard clause: the baseline returns the very same object, no copy.
    if func is no_transform:
        return X

    derived = np.apply_along_axis(func, 1, X)
    derived = np.nan_to_num(derived, nan=0.0, posinf=0.0, neginf=0.0)
    return np.concatenate([X, derived], axis=1)

def safe_sqrt(x):
    """Return the square root of the absolute value of ``x``.

    Taking the magnitude first keeps the result real and finite for
    negative inputs.
    """
    magnitude = np.abs(x)
    return np.sqrt(magnitude)

# Evaluate how extra, transformation-derived features affect the poly-kernel SVM.
transformations = [no_transform, safe_sqrt, np.abs, np.log1p, np.cos, np.sin]
results = []

# Work on copies so the original split stays untouched.
X_train_transformed, X_test_transformed = X_train.copy(), X_test.copy()

for transform in transformations:
    # NOTE: the feature sets accumulate. Each pass transforms the matrix left
    # by the previous pass, so a result row reflects every transformation
    # applied so far, not only the one it is labelled with.
    X_train_transformed, X_test_transformed = (
        add_transformed_features(X_train_transformed, transform),
        add_transformed_features(X_test_transformed, transform),
    )

    model = SVC(kernel='poly').fit(X_train_transformed, y_train)
    y_pred = model.predict(X_test_transformed)
    precision, recall, f1_score, accuracy = calculate_metrics(y_test, y_pred)

    results.append({
        'transformation': transform.__name__,
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score,
        'accuracy': accuracy
    })

In [None]:
# Notebook cell output: shape of the test feature matrix after all transformations were appended.
X_test_transformed.shape

In [None]:
# Tabulate the per-transformation metrics and print the table as plain text.
results_df = pd.DataFrame(results)
print(results_df)

Adding the safe_sqrt features is helpful, but the other transformations bring no further improvement.