In [8]:
import numpy as np
import time
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [9]:
import os

def get_file_class_map(root_dir):
    """Map each file's key to the name of the directory that contains it.

    Walks ``root_dir`` recursively. Files located directly in ``root_dir``
    are ignored; every other file contributes one entry whose key is the part
    of the file name before the first ``'.'`` and whose value is the base name
    of its parent directory.

    Args:
        root_dir: Path (``str`` or ``os.PathLike``) of the directory tree.

    Returns:
        dict mapping file key (str) to directory name (str). If the same key
        occurs in several directories, the directory visited last wins.
    """
    # os.walk reports paths as plain strings, so normalise PathLike input
    # before comparing against what it yields.
    root_dir = os.fspath(root_dir)
    file_class_map = {}

    for root, _dirs, files in os.walk(root_dir):
        # The first directory yielded is the top directory itself, spelled
        # exactly as passed in. Comparing full paths (not base names) keeps a
        # nested directory that merely shares the root's name from being
        # skipped by mistake.
        if root == root_dir:
            continue

        class_name = os.path.basename(root)
        for file_name in files:
            key = file_name.split('.')[0]
            file_class_map[key] = class_name

    return file_class_map

# Dataset roots; get_file_class_map uses each sub-directory name as the label.
train_directory = '../../dataset/20news-bydate-train'
test_directory = '../../dataset/20news-bydate-test'

file_class_train_map = get_file_class_map(train_directory)
file_class_test_map = get_file_class_map(test_directory)

# Merge both splits into one lookup; on a duplicate key the train entry wins.
file_class_map = {**file_class_test_map, **file_class_train_map}

# Spot-check a few document ids.
for sample_doc_id in ('53068', '38761', '49960'):
    print(file_class_map[sample_doc_id])

alt.atheism
comp.graphics
alt.atheism


In [10]:
def load_embeddings(file_path):
    """Load document embeddings from a tab-separated text file.

    Every non-blank line is split on tabs: the first field is taken as the
    document id and all remaining fields are parsed as floats.

    Args:
        file_path: Path to the TSV file.

    Returns:
        Tuple ``(embeddings, doc_ids)``: a NumPy array with one row per
        document, and the list of document ids (str) in file order.

    Raises:
        ValueError: If an embedding component cannot be parsed as a float.
    """
    embeddings = []
    doc_ids = []
    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line (e.g. a trailing empty line) would otherwise
                # add an empty id with a zero-length vector and make the
                # resulting array ragged.
                continue
            doc_id, *components = line.split('\t')
            doc_ids.append(doc_id)
            embeddings.append([float(value) for value in components])
    return np.array(embeddings), doc_ids

def get_labels(doc_ids, file_class_map):
    """Return the class label of every document id, in the given order.

    Raises:
        KeyError: If an id is not present in ``file_class_map``.
    """
    labels = []
    for doc_id in doc_ids:
        labels.append(file_class_map[doc_id])
    return labels

In [11]:
from sklearn.preprocessing import LabelEncoder

train_embedding_file_path = '../../nechkasova-vectorization/assets/annotated-corpus/train.tsv'
test_embedding_file_path = '../../nechkasova-vectorization/assets/annotated-corpus/test.tsv'

# Load features and look up the string label of every document.
X_train, train_doc_ids = load_embeddings(train_embedding_file_path)
y_train = get_labels(train_doc_ids, file_class_map)

X_test, test_doc_ids = load_embeddings(test_embedding_file_path)
y_test = get_labels(test_doc_ids, file_class_map)

# Fit the encoder once, on the training labels, and reuse that mapping for
# the test labels. Re-fitting a second encoder on the test split only yields
# the same integer codes when both splits contain exactly the same classes;
# otherwise train and test codes silently disagree. With a shared encoder an
# unseen test label raises ValueError instead.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Backward compatibility: this cell used to leave the test ids in `doc_ids`.
doc_ids = test_doc_ids

In [None]:
# Quick visual check of the loaded feature matrices and the encoded label vectors.
print(X_train, X_test, y_train_encoded, y_test_encoded)

In [13]:
def calculate_metrics(y_true, y_pred):
    """Compute accuracy and macro-averaged precision, recall and F1.

    Per-class precision/recall/F1 are computed for every distinct label that
    occurs in ``y_true`` and then averaged with equal weight per class. A
    ratio whose denominator is zero is counted as 0.

    Args:
        y_true: Ground-truth labels (any array-like, flattened to 1-D).
        y_pred: Predicted labels, same number of elements as ``y_true``.

    Returns:
        dict with keys ``'precision'``, ``'recall'``, ``'f1-score'`` and
        ``'accuracy'``. For empty input every metric is ``0.0``.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` differ in length.
    """
    # Coerce to arrays so ``==`` is always element-wise; with plain lists
    # ``y_true == y_pred`` is a single bool and accuracy would be wrong.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {y_true.size} and {y_pred.size}"
        )

    if y_true.size == 0:
        # Nothing to score; follow the "undefined ratio counts as 0" rule
        # used for the per-class metrics instead of returning NaN.
        return {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'accuracy': 0.0}

    precision_per_class = []
    recall_per_class = []
    f1_score_per_class = []

    for label in np.unique(y_true):
        is_true = y_true == label
        is_pred = y_pred == label

        tp = np.count_nonzero(is_true & is_pred)
        fp = np.count_nonzero(~is_true & is_pred)
        fn = np.count_nonzero(is_true & ~is_pred)

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        precision_per_class.append(precision)
        recall_per_class.append(recall)
        f1_score_per_class.append(f1_score)

    accuracy = np.mean(y_true == y_pred)

    return {
        'precision': np.mean(precision_per_class),
        'recall': np.mean(recall_per_class),
        'f1-score': np.mean(f1_score_per_class),
        'accuracy': accuracy
    }

In [14]:
# Hyperparameter grid for each SVM kernel: parameter name -> candidate values.
kernel_params = {
    'linear': dict(C=[0.1, 1, 10]),
    'poly': dict(C=[0.1, 1], degree=[2, 3], coef0=[0.1, 1]),
    'rbf': dict(C=[0.1, 1], gamma=['scale', 0.1]),
    'sigmoid': dict(C=[0.1, 1], gamma=['scale', 0.1], coef0=[0, 0.5]),
}

# Kernels to evaluate, in the insertion order of the grid above.
kernels = list(kernel_params)

In [15]:
def experiment_svm(X_train, y_train, X_test, y_test, kernels, kernel_params):
    """Train and evaluate an SVC for every kernel / hyperparameter combination.

    For each kernel, the value lists stored in ``kernel_params[kernel]`` under
    ``'C'``, ``'degree'``, ``'gamma'`` and ``'coef0'`` are combined
    exhaustively. A missing entry falls back to a single default value
    (``1``, ``3``, ``'scale'`` and ``0`` respectively). Progress is printed
    after every fit.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Evaluation features and labels.
        kernels: Iterable of SVC kernel names to try.
        kernel_params: dict mapping kernel name to its parameter grid.

    Returns:
        dict keyed by ``(kernel, C, degree, gamma, coef0)``; each value is
        ``{'metrics': <calculate_metrics result>, 'training_time': <seconds>}``.
    """
    results = {}

    for kernel in kernels:
        param_values = kernel_params[kernel]

        for C in param_values.get('C', [1]):
            for degree in param_values.get('degree', [3]):
                for gamma in param_values.get('gamma', ['scale']):
                    for coef0 in param_values.get('coef0', [0]):
                        model_params = {'kernel': kernel, 'C': C}
                        if kernel == 'poly':
                            model_params['degree'] = degree
                            # The polynomial kernel also depends on gamma;
                            # without passing it, a gamma grid for 'poly'
                            # would produce identical fits reported under
                            # different gamma values.
                            model_params['gamma'] = gamma
                            model_params['coef0'] = coef0
                        elif kernel in ['rbf', 'sigmoid']:
                            model_params['gamma'] = gamma
                            model_params['coef0'] = coef0

                        model = SVC(**model_params, random_state=42)
                        # perf_counter is monotonic, so the measured interval
                        # is unaffected by system clock adjustments.
                        start_time = time.perf_counter()
                        model.fit(X_train, y_train)
                        train_time = time.perf_counter() - start_time

                        y_pred = model.predict(X_test)

                        metrics = calculate_metrics(y_test, y_pred)

                        results[(kernel, C, degree, gamma, coef0)] = {
                            'metrics': metrics,
                            'training_time': train_time
                        }

                        print(f"Kernel: {kernel}, C: {C}, Degree: {degree}, Gamma: {gamma}, Coef0: {coef0}")
                        print(f"Accuracy: {metrics['accuracy']:.4f}, Precision: {metrics['precision']:.4f}, Recall: {metrics['recall']:.4f}, F1-Score: {metrics['f1-score']:.4f}")
                        print(f"Training Time: {train_time:.4f} seconds\n")
    return results

In [None]:
# SVM (linear kernel)
# NOTE(review): the comment above and the header printed below mention only the
# linear kernel, but the full `kernels` list is passed in, so every listed
# kernel is trained and reported here.
results = experiment_svm(X_train, y_train_encoded, X_test, y_test_encoded, kernels, kernel_params)

print("SVM (линейное ядро) - метрики")
# Each value holds both the 'metrics' dict and the 'training_time' of one run.
for params, metrics in results.items():
    print(f"Параметры: {params}, Метрики: {metrics}")

In [17]:
# Hyperparameter grids read by experiment_mlp; every combination is trained.
hidden_layer_options = [(50,), (100,), (100, 50), (100, 100, 50)]  # passed as hidden_layer_sizes
max_iter_options = [100, 300, 500, 1000]  # passed as max_iter
learning_rate_options = [0.001, 0.01]  # passed as learning_rate_init
activation_options = ['relu', 'tanh', 'logistic']  # passed as activation

In [18]:
def experiment_mlp(X_train, y_train, X_test, y_test,
                   hidden_layer_grid=None, max_iter_grid=None,
                   learning_rate_grid=None, activation_grid=None):
    """Train and evaluate an MLPClassifier for every hyperparameter combination.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Evaluation features and labels.
        hidden_layer_grid: Candidate ``hidden_layer_sizes`` values; defaults
            to the module-level ``hidden_layer_options``.
        max_iter_grid: Candidate ``max_iter`` values; defaults to the
            module-level ``max_iter_options``.
        learning_rate_grid: Candidate ``learning_rate_init`` values; defaults
            to the module-level ``learning_rate_options``.
        activation_grid: Candidate ``activation`` values; defaults to the
            module-level ``activation_options``.

    Returns:
        dict keyed by ``(hidden_layers, max_iter, learning_rate, activation)``;
        each value is
        ``{'metrics': <calculate_metrics result>, 'training_time': <seconds>}``.
    """
    # Fall back to the notebook-level grids so existing four-argument calls
    # behave as before, while letting callers pass a grid explicitly.
    if hidden_layer_grid is None:
        hidden_layer_grid = hidden_layer_options
    if max_iter_grid is None:
        max_iter_grid = max_iter_options
    if learning_rate_grid is None:
        learning_rate_grid = learning_rate_options
    if activation_grid is None:
        activation_grid = activation_options

    results = {}

    for hidden_layers in hidden_layer_grid:
        for max_iter in max_iter_grid:
            for learning_rate in learning_rate_grid:
                for activation in activation_grid:
                    model = MLPClassifier(
                        hidden_layer_sizes=hidden_layers,
                        max_iter=max_iter,
                        learning_rate_init=learning_rate,
                        activation=activation,
                        random_state=42
                    )

                    # perf_counter is monotonic, so the measured interval is
                    # unaffected by system clock adjustments.
                    start_time = time.perf_counter()
                    model.fit(X_train, y_train)
                    training_time = time.perf_counter() - start_time

                    y_pred = model.predict(X_test)
                    metrics = calculate_metrics(y_test, y_pred)

                    params = (hidden_layers, max_iter, learning_rate, activation)
                    results[params] = {
                        'metrics': metrics,
                        'training_time': training_time
                    }

                    print(f"Params: {params}")
                    print(f"Metrics: {metrics}")
                    print(f"Training Time: {training_time:.4f} seconds\n")

    return results

In [None]:
# MLP: run the full hyperparameter grid.
# Note: this rebinds `results`, replacing whatever an earlier cell stored there.
results = experiment_mlp(X_train, y_train_encoded, X_test, y_test_encoded)

print("MLP - метрики:")
# Summary line per parameter tuple: metrics dict plus training time in seconds.
for params, result in results.items():
    print(f"Params: {params}, Metrics: {result['metrics']}, Training Time: {result['training_time']:.4f} seconds")

In [20]:
def safe_log1p(embeddings, shift=True):
    """Apply ``log1p`` element-wise while keeping the argument in its domain.

    ``log1p(x)`` is only finite for ``x > -1``. With ``shift=True`` the whole
    array is translated by ``abs(min) + 1`` whenever its minimum is ``<= -1``,
    so the smallest value becomes 1. With ``shift=False`` the absolute value
    is taken instead.

    Note that the shift is derived from the minimum of the array passed in,
    so two arrays transformed separately can receive different shifts.

    Args:
        embeddings: Array-like of numeric values.
        shift: Translate the values (True) or use ``abs`` (False).

    Returns:
        NumPy array of the same shape as ``embeddings``.
    """
    embeddings = np.asarray(embeddings)
    if not shift:
        return np.log1p(np.abs(embeddings))

    if embeddings.size == 0:
        # ``min`` of an empty array raises; there is nothing to shift.
        return np.log1p(embeddings)

    min_value = embeddings.min()
    # ``<=`` rather than ``<``: a minimum of exactly -1 must be shifted too,
    # otherwise log1p(-1) evaluates to -inf.
    if min_value <= -1:
        embeddings = embeddings + abs(min_value) + 1
    return np.log1p(embeddings)

# Element-wise functions whose outputs are appended to the original embeddings
# as additional feature columns (see the extend_embeddings calls below).
transformations = [
    safe_log1p,
    np.sin,
    np.cos,
    np.square
]

def extend_embeddings(embeddings, functions):
    """Append transformed copies of the embeddings as extra columns.

    Each function in ``functions`` is applied to the original ``embeddings``
    and its result is concatenated along axis 1, in order, after a copy of
    the original columns. With no functions, a copy of the input is returned.
    """
    base = embeddings.copy()
    extra_blocks = [transform(embeddings) for transform in functions]
    if not extra_blocks:
        return base
    return np.concatenate([base, *extra_blocks], axis=1)

In [21]:
# Build feature matrices extended with every function in `transformations`.
# NOTE(review): safe_log1p derives its shift from the minimum of the array it
# is given, so the train and test matrices may be shifted by different amounts.
X_train_extended = extend_embeddings(X_train, transformations)
X_test_extended = extend_embeddings(X_test, transformations)

In [22]:
def experiment_mlp_extended(X_train, y_train, X_test, y_test, random_state=42):
    """Train one fixed-configuration MLP and evaluate it on the test data.

    The network uses hidden layers ``(100, 100, 50)``, ``max_iter=300``,
    ``learning_rate_init=0.01`` and the ``'logistic'`` activation.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Evaluation features and labels.
        random_state: Seed passed to ``MLPClassifier`` so repeated runs on
            different feature sets are reproducible and comparable; matches
            the seed used by ``experiment_mlp``.

    Returns:
        Tuple ``(metrics, training_time)``: the ``calculate_metrics`` result
        and the fit duration in seconds.
    """
    model = MLPClassifier(
        hidden_layer_sizes=(100, 100, 50),
        max_iter=300,
        learning_rate_init=0.01,
        activation='logistic',
        random_state=random_state,
    )

    # perf_counter is monotonic, so the measured interval is unaffected by
    # system clock adjustments.
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    training_time = time.perf_counter() - start_time

    y_pred = model.predict(X_test)

    metrics = calculate_metrics(y_test, y_pred)

    return metrics, training_time

In [None]:
# Evaluate the fixed MLP on whatever X_train_extended / X_test_extended
# currently hold; when cells run top to bottom, that is the version built
# from the full `transformations` list.
metrics_mlp, training_time_mlp = experiment_mlp_extended(X_train_extended, y_train_encoded, X_test_extended, y_test_encoded)
print(metrics_mlp)
print("Training Time:", training_time_mlp)

In [None]:
# Extension with sin features only. Rebinds X_train_extended / X_test_extended.
X_train_extended = extend_embeddings(X_train, [np.sin])
X_test_extended = extend_embeddings(X_test, [np.sin])

metrics_mlp, training_time_mlp = experiment_mlp_extended(X_train_extended, y_train_encoded, X_test_extended, y_test_encoded)
print(metrics_mlp)
print("Training Time:", training_time_mlp)

In [None]:
# Extension with squared features only. Rebinds X_train_extended / X_test_extended.
X_train_extended = extend_embeddings(X_train, [np.square])
X_test_extended = extend_embeddings(X_test, [np.square])

metrics_mlp, training_time_mlp = experiment_mlp_extended(X_train_extended, y_train_encoded, X_test_extended, y_test_encoded)
print(metrics_mlp)
print("Training Time:", training_time_mlp)

In [None]:
# Extension with safe_log1p features only. Rebinds X_train_extended / X_test_extended.
# NOTE(review): safe_log1p derives its shift from the minimum of the array it
# is given, so the train and test matrices may be shifted by different amounts.
X_train_extended = extend_embeddings(X_train, [safe_log1p])
X_test_extended = extend_embeddings(X_test, [safe_log1p])

metrics_mlp, training_time_mlp = experiment_mlp_extended(X_train_extended, y_train_encoded, X_test_extended, y_test_encoded)
print(metrics_mlp)
print("Training Time:", training_time_mlp)

In [None]:
# Extension with sin and squared features together. Rebinds X_train_extended / X_test_extended.
X_train_extended = extend_embeddings(X_train, [np.sin, np.square])
X_test_extended = extend_embeddings(X_test, [np.sin, np.square])

metrics_mlp, training_time_mlp = experiment_mlp_extended(X_train_extended, y_train_encoded, X_test_extended, y_test_encoded)
print(metrics_mlp)
print("Training Time:", training_time_mlp)