In [8]:
import numpy as np
import time
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [9]:
import os

def get_file_class_map(root_dir):
    file_class_map = {}

    for root, dirs, files in os.walk(root_dir):
        class_name = os.path.basename(root)

        if class_name == os.path.basename(root_dir):
            continue

        for file_name in files:
            key = file_name.split('.')[0]
            file_class_map[key] = class_name

    return file_class_map

test_directory = '../../dataset/20news-bydate-test'
file_class_test_map = get_file_class_map(test_directory)

train_directory = '../../dataset/20news-bydate-train'
file_class_train_map = get_file_class_map(train_directory)

file_class_map = file_class_test_map | file_class_train_map

print(file_class_map['53068'])
print(file_class_map['38761'])
print(file_class_map['49960'])

alt.atheism
comp.graphics
alt.atheism


In [10]:
def load_embeddings(file_path):
    embeddings = []
    doc_ids = []
    with open(file_path, 'r') as f:
        for line in f:
            data = line.strip().split('\t')
            doc_id = data[0]
            embedding = list(map(float, data[1:]))
            doc_ids.append(doc_id)
            embeddings.append(embedding)
    return np.array(embeddings), doc_ids

def get_labels(doc_ids, file_class_map):
    labels = [file_class_map[doc_id] for doc_id in doc_ids]
    return labels

In [11]:
from sklearn.preprocessing import LabelEncoder

train_embedding_file_path = '../../nechkasova-vectorization/assets/annotated-corpus/train.tsv'
test_embedding_file_path = '../../nechkasova-vectorization/assets/annotated-corpus/test.tsv'

X_train, doc_ids = load_embeddings(train_embedding_file_path)
y_train = get_labels(doc_ids, file_class_map)

label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

X_test, doc_ids = load_embeddings(test_embedding_file_path)
y_test = get_labels(doc_ids, file_class_map)

label_encoder = LabelEncoder()
y_test_encoded = label_encoder.fit_transform(y_test)

In [12]:
print(X_train, X_test, y_train_encoded, y_test_encoded)

[[ 0.20773591  0.72988412 -0.11352285 ... -0.29935595 -0.1885143
   0.286544  ]
 [ 0.14539602  0.61061025  0.04516278 ... -0.24808198 -0.27161203
   0.3643503 ]
 [ 0.18635802  0.51708716 -0.02916354 ... -0.17222796 -0.28976381
   0.04486433]
 ...
 [ 0.19553041  0.5397074  -0.04884807 ... -0.34322487 -0.2403506
  -0.01544757]
 [ 0.26425181  0.69465227 -0.04891185 ... -0.29389333 -0.26407785
   0.18813769]
 [ 0.20166849  0.6093968  -0.02778901 ... -0.29972352 -0.31026409
   0.2870507 ]] [[ 0.08262633  0.39071409 -0.00992401 ... -0.21398073 -0.2028388
   0.24255998]
 [ 0.08318808  0.51211188 -0.16336193 ... -0.20390385 -0.2129409
   0.11551128]
 [ 0.11453234  0.67152243 -0.02109246 ... -0.1015598  -0.25121764
   0.19826109]
 ...
 [ 0.18724539  0.53250695  0.00307958 ... -0.22663249 -0.2919248
   0.29681634]
 [ 0.27671094  0.50519299  0.01900373 ... -0.30224663 -0.279837
   0.16828765]
 [ 0.04503434  0.5982903  -0.10858543 ... -0.21932402 -0.2808188
   0.16079474]] [ 6 17  6 ... 19 19 19] 

In [13]:
def calculate_metrics(y_true, y_pred):
    labels = np.unique(y_true)
    precision_per_class = []
    recall_per_class = []
    f1_score_per_class = []
    
    total_samples = len(y_true)
    
    for label in labels:
        tp = sum((y_true == label) & (y_pred == label))
        fp = sum((y_true != label) & (y_pred == label))
        fn = sum((y_true == label) & (y_pred != label))
        tn = sum((y_true != label) & (y_pred != label))
        
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        
        precision_per_class.append(precision)
        recall_per_class.append(recall)
        f1_score_per_class.append(f1_score)
    
    accuracy = np.sum(y_true == y_pred) / total_samples
    
    return {
        'precision': np.mean(precision_per_class),
        'recall': np.mean(recall_per_class),
        'f1-score': np.mean(f1_score_per_class),
        'accuracy': accuracy
    }

In [14]:
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
kernel_params = {
    'linear': {'C': [0.1, 1, 10]},
    'poly': {'C': [0.1, 1], 'degree': [2, 3], 'coef0': [0.1, 1]},
    'rbf': {'C': [0.1, 1], 'gamma': ['scale', 0.1]},
    'sigmoid': {'C': [0.1, 1], 'gamma': ['scale', 0.1], 'coef0': [0, 0.5]}
}

In [15]:
def experiment_svm(X_train, y_train, X_test, y_test, kernels, kernel_params):
    results = {}

    for kernel in kernels:
        param_values = kernel_params[kernel]
        
        for C in param_values.get('C', [1]):
            for degree in param_values.get('degree', [3]):
                for gamma in param_values.get('gamma', ['scale']):
                    for coef0 in param_values.get('coef0', [0]):
                        model_params = {'kernel': kernel, 'C': C}
                        if kernel == 'poly':
                            model_params['degree'] = degree
                            model_params['coef0'] = coef0
                        elif kernel in ['rbf', 'sigmoid']:
                            model_params['gamma'] = gamma
                            model_params['coef0'] = coef0

                        model = SVC(**model_params, random_state=42)
                        start_time = time.time()
                        model.fit(X_train, y_train)
                        train_time = time.time() - start_time

                        y_pred = model.predict(X_test)

                        metrics = calculate_metrics(y_test, y_pred)

                        results[(kernel, C, degree, gamma, coef0)] = {
                            'metrics': metrics,
                            'training_time': train_time
                        }

                        print(f"Kernel: {kernel}, C: {C}, Degree: {degree}, Gamma: {gamma}, Coef0: {coef0}")
                        print(f"Accuracy: {metrics['accuracy']:.4f}, Precision: {metrics['precision']:.4f}, Recall: {metrics['recall']:.4f}, F1-Score: {metrics['f1-score']:.4f}")
                        print(f"Training Time: {train_time:.4f} seconds\n")
    return results

In [16]:
# SVM (линейное ядро)
results = experiment_svm(X_train, y_train_encoded, X_test, y_test_encoded, kernels, kernel_params)

print("SVM (линейное ядро) - метрики")
for params, metrics in results.items():
    print(f"Параметры: {params}, Метрики: {metrics}")

Kernel: linear, C: 0.1, Degree: 3, Gamma: scale, Coef0: 0
Accuracy: 0.2387, Precision: 0.2066, Recall: 0.2075, F1-Score: 0.1723
Training Time: 6.2211 seconds

Kernel: linear, C: 1, Degree: 3, Gamma: scale, Coef0: 0
Accuracy: 0.2788, Precision: 0.2597, Recall: 0.2475, F1-Score: 0.2235
Training Time: 5.4420 seconds

Kernel: linear, C: 10, Degree: 3, Gamma: scale, Coef0: 0
Accuracy: 0.3139, Precision: 0.3063, Recall: 0.2829, F1-Score: 0.2636
Training Time: 6.9223 seconds

Kernel: poly, C: 0.1, Degree: 2, Gamma: scale, Coef0: 0.1
Accuracy: 0.1782, Precision: 0.1875, Recall: 0.1479, F1-Score: 0.1211
Training Time: 7.3157 seconds

Kernel: poly, C: 0.1, Degree: 2, Gamma: scale, Coef0: 1
Accuracy: 0.2063, Precision: 0.1485, Recall: 0.1732, F1-Score: 0.1366
Training Time: 6.8237 seconds

Kernel: poly, C: 0.1, Degree: 3, Gamma: scale, Coef0: 0.1
Accuracy: 0.1831, Precision: 0.2144, Recall: 0.1514, F1-Score: 0.1239
Training Time: 7.1892 seconds

Kernel: poly, C: 0.1, Degree: 3, Gamma: scale, Coef

Параметры: ('poly', 1, 3, 'scale', 1), Метрики: {'metrics': {'precision': 0.521016335174504, 'recall': 0.363963840727409, 'f1-score': 0.36057352794214004, 'accuracy': 0.4193919038359554}, 'training_time': 5.166396856307983}

In [17]:
hidden_layer_options = [(50,), (100,), (100, 50), (100, 100, 50)]
max_iter_options = [100, 300, 500, 1000]
learning_rate_options = [0.001, 0.01]
activation_options = ['relu', 'tanh', 'logistic']

In [18]:
def experiment_mlp(X_train, y_train, X_test, y_test):
    results = {}
    
    for hidden_layers in hidden_layer_options:
        for max_iter in max_iter_options:
            for learning_rate in learning_rate_options:
                for activation in activation_options:
                    model = MLPClassifier(
                        hidden_layer_sizes=hidden_layers,
                        max_iter=max_iter,
                        learning_rate_init=learning_rate,
                        activation=activation,
                        random_state=42
                    )

                    start_time = time.time()
                    model.fit(X_train, y_train)
                    training_time = time.time() - start_time
                    
                    y_pred = model.predict(X_test)
                    metrics = calculate_metrics(y_test, y_pred)

                    params = (hidden_layers, max_iter, learning_rate, activation)
                    results[params] = {
                        'metrics': metrics,
                        'training_time': training_time
                    }

                    print(f"Params: {params}")
                    print(f"Metrics: {metrics}")
                    print(f"Training Time: {training_time:.4f} seconds\n")
    
    return results

In [19]:
# MLP
results = experiment_mlp(X_train, y_train_encoded, X_test, y_test_encoded)

print("MLP - метрики:")
for params, result in results.items():
    print(f"Params: {params}, Metrics: {result['metrics']}, Training Time: {result['training_time']:.4f} seconds")



Params: ((50,), 100, 0.001, 'relu')
Metrics: {'precision': 0.2887943548147237, 'recall': 0.2835544638488966, 'f1-score': 0.25665263582757414, 'accuracy': 0.31240042485395647}
Training Time: 2.8510 seconds





Params: ((50,), 100, 0.001, 'tanh')
Metrics: {'precision': 0.2861109979379039, 'recall': 0.28341303677576357, 'f1-score': 0.25402041764228234, 'accuracy': 0.3053637812002124}
Training Time: 2.7025 seconds





Params: ((50,), 100, 0.001, 'logistic')
Metrics: {'precision': 0.24661646708841065, 'recall': 0.24471938697400403, 'f1-score': 0.21524574989051723, 'accuracy': 0.2711099309612321}
Training Time: 3.0110 seconds





Params: ((50,), 100, 0.01, 'relu')
Metrics: {'precision': 0.31339012416468937, 'recall': 0.29494191567578965, 'f1-score': 0.27322354654271447, 'accuracy': 0.3167817312798725}
Training Time: 2.6946 seconds





Params: ((50,), 100, 0.01, 'tanh')
Metrics: {'precision': 0.3186698008689377, 'recall': 0.310135183628648, 'f1-score': 0.2950365115903789, 'accuracy': 0.3312533191715348}
Training Time: 2.7135 seconds





Params: ((50,), 100, 0.01, 'logistic')
Metrics: {'precision': 0.2993331274796712, 'recall': 0.28717445115342904, 'f1-score': 0.2709782583441668, 'accuracy': 0.3133297928836962}
Training Time: 2.8817 seconds





Params: ((50,), 300, 0.001, 'relu')
Metrics: {'precision': 0.3015312190173235, 'recall': 0.29020864259754264, 'f1-score': 0.2657409730310184, 'accuracy': 0.31625066383430694}
Training Time: 7.9727 seconds





Params: ((50,), 300, 0.001, 'tanh')
Metrics: {'precision': 0.3093811066195343, 'recall': 0.2910100494671335, 'f1-score': 0.2700193362932811, 'accuracy': 0.3236856080722252}
Training Time: 14.1763 seconds





Params: ((50,), 300, 0.001, 'logistic')
Metrics: {'precision': 0.27999299719565096, 'recall': 0.2820117858468103, 'f1-score': 0.24995513284818024, 'accuracy': 0.3068242166755178}
Training Time: 14.7366 seconds

Params: ((50,), 300, 0.01, 'relu')
Metrics: {'precision': 0.3057453746364285, 'recall': 0.28384188551661343, 'f1-score': 0.2590760676067395, 'accuracy': 0.30576208178438663}
Training Time: 4.7121 seconds

Params: ((50,), 300, 0.01, 'tanh')
Metrics: {'precision': 0.3035034009876988, 'recall': 0.29300648660737727, 'f1-score': 0.27745080248907594, 'accuracy': 0.3131970260223048}
Training Time: 9.4542 seconds

Params: ((50,), 300, 0.01, 'logistic')
Metrics: {'precision': 0.31409499673137675, 'recall': 0.30108699917100756, 'f1-score': 0.2796441816615953, 'accuracy': 0.3259426447158789}
Training Time: 12.5368 seconds





Params: ((50,), 500, 0.001, 'relu')
Metrics: {'precision': 0.31215280941392043, 'recall': 0.2995570610883971, 'f1-score': 0.2767272039202898, 'accuracy': 0.32461497610196494}
Training Time: 20.9627 seconds

Params: ((50,), 500, 0.001, 'tanh')
Metrics: {'precision': 0.3123186505297578, 'recall': 0.3011874579367784, 'f1-score': 0.27816739283737607, 'accuracy': 0.3175783324482209}
Training Time: 21.9764 seconds





Params: ((50,), 500, 0.001, 'logistic')
Metrics: {'precision': 0.30171756225460533, 'recall': 0.29345699684997084, 'f1-score': 0.26712937082865595, 'accuracy': 0.31651619755708976}
Training Time: 18.4201 seconds

Params: ((50,), 500, 0.01, 'relu')
Metrics: {'precision': 0.3057453746364285, 'recall': 0.28384188551661343, 'f1-score': 0.2590760676067395, 'accuracy': 0.30576208178438663}
Training Time: 7.6609 seconds

Params: ((50,), 500, 0.01, 'tanh')
Metrics: {'precision': 0.3035034009876988, 'recall': 0.29300648660737727, 'f1-score': 0.27745080248907594, 'accuracy': 0.3131970260223048}
Training Time: 9.2516 seconds

Params: ((50,), 500, 0.01, 'logistic')
Metrics: {'precision': 0.31409499673137675, 'recall': 0.30108699917100756, 'f1-score': 0.2796441816615953, 'accuracy': 0.3259426447158789}
Training Time: 7.0738 seconds

Params: ((50,), 1000, 0.001, 'relu')
Metrics: {'precision': 0.32379479669905387, 'recall': 0.3143554679219026, 'f1-score': 0.2865575208754112, 'accuracy': 0.32952734997



Params: ((100,), 100, 0.001, 'relu')
Metrics: {'precision': 0.29925120747061296, 'recall': 0.2862422090687536, 'f1-score': 0.26248906449846193, 'accuracy': 0.3201009028146575}
Training Time: 5.1288 seconds





Params: ((100,), 100, 0.001, 'tanh')
Metrics: {'precision': 0.30293669042232324, 'recall': 0.289321047445549, 'f1-score': 0.27013975570488513, 'accuracy': 0.32501327668613916}
Training Time: 5.1084 seconds





Params: ((100,), 100, 0.001, 'logistic')
Metrics: {'precision': 0.2649534299119716, 'recall': 0.25950317989025445, 'f1-score': 0.23178541636000433, 'accuracy': 0.29168879447689855}
Training Time: 6.9840 seconds





Params: ((100,), 100, 0.01, 'relu')
Metrics: {'precision': 0.29725703990048813, 'recall': 0.30318079695527406, 'f1-score': 0.27923241710836916, 'accuracy': 0.32116303770578863}
Training Time: 4.6075 seconds





Params: ((100,), 100, 0.01, 'tanh')
Metrics: {'precision': 0.29005924874337263, 'recall': 0.2915249572020232, 'f1-score': 0.2654980223261196, 'accuracy': 0.3138608603292618}
Training Time: 5.7260 seconds





Params: ((100,), 100, 0.01, 'logistic')
Metrics: {'precision': 0.3064979440962091, 'recall': 0.2974948753263488, 'f1-score': 0.279872100166005, 'accuracy': 0.3190387679235263}
Training Time: 5.6854 seconds





Params: ((100,), 300, 0.001, 'relu')
Metrics: {'precision': 0.3112832617410908, 'recall': 0.3054991680680366, 'f1-score': 0.28368443500264573, 'accuracy': 0.32886351566648964}
Training Time: 22.4425 seconds





Params: ((100,), 300, 0.001, 'tanh')
Metrics: {'precision': 0.316933582985438, 'recall': 0.3030399367187726, 'f1-score': 0.28725957393448315, 'accuracy': 0.3307222517259692}
Training Time: 17.5371 seconds





Params: ((100,), 300, 0.001, 'logistic')
Metrics: {'precision': 0.3108247908466656, 'recall': 0.29059214454645554, 'f1-score': 0.27164089905739425, 'accuracy': 0.3244822092405735}
Training Time: 23.1114 seconds

Params: ((100,), 300, 0.01, 'relu')
Metrics: {'precision': 0.2943366227198877, 'recall': 0.3029800361868518, 'f1-score': 0.2703598354652749, 'accuracy': 0.3098778544875199}
Training Time: 4.6261 seconds

Params: ((100,), 300, 0.01, 'tanh')
Metrics: {'precision': 0.33064355050642935, 'recall': 0.28731562307943925, 'f1-score': 0.2704725888335774, 'accuracy': 0.29540626659585767}
Training Time: 11.5031 seconds

Params: ((100,), 300, 0.01, 'logistic')
Metrics: {'precision': 0.3110316758188104, 'recall': 0.29638547185216957, 'f1-score': 0.27205296921498323, 'accuracy': 0.3193043016463091}
Training Time: 15.5134 seconds





Params: ((100,), 500, 0.001, 'relu')
Metrics: {'precision': 0.3103633402581203, 'recall': 0.31394167787584326, 'f1-score': 0.2851346089534436, 'accuracy': 0.3272703133297929}
Training Time: 34.1345 seconds





Params: ((100,), 500, 0.001, 'tanh')
Metrics: {'precision': 0.32273018237094575, 'recall': 0.31663228672281296, 'f1-score': 0.28527527305346145, 'accuracy': 0.3279341476367499}
Training Time: 27.3184 seconds





Params: ((100,), 500, 0.001, 'logistic')
Metrics: {'precision': 0.304188815684954, 'recall': 0.29542868530748934, 'f1-score': 0.27722662985824365, 'accuracy': 0.32620817843866173}
Training Time: 37.9590 seconds

Params: ((100,), 500, 0.01, 'relu')
Metrics: {'precision': 0.2943366227198877, 'recall': 0.3029800361868518, 'f1-score': 0.2703598354652749, 'accuracy': 0.3098778544875199}
Training Time: 6.5029 seconds

Params: ((100,), 500, 0.01, 'tanh')
Metrics: {'precision': 0.33064355050642935, 'recall': 0.28731562307943925, 'f1-score': 0.2704725888335774, 'accuracy': 0.29540626659585767}
Training Time: 15.2128 seconds

Params: ((100,), 500, 0.01, 'logistic')
Metrics: {'precision': 0.3110316758188104, 'recall': 0.29638547185216957, 'f1-score': 0.27205296921498323, 'accuracy': 0.3193043016463091}
Training Time: 20.6853 seconds

Params: ((100,), 1000, 0.001, 'relu')
Metrics: {'precision': 0.3094396106537294, 'recall': 0.30880769817941317, 'f1-score': 0.28600069211859197, 'accuracy': 0.325278



Params: ((100,), 1000, 0.001, 'logistic')
Metrics: {'precision': 0.3079621274261727, 'recall': 0.3002380534788616, 'f1-score': 0.28025355869749985, 'accuracy': 0.3230217737652682}
Training Time: 45.9840 seconds

Params: ((100,), 1000, 0.01, 'relu')
Metrics: {'precision': 0.2943366227198877, 'recall': 0.3029800361868518, 'f1-score': 0.2703598354652749, 'accuracy': 0.3098778544875199}
Training Time: 5.7324 seconds

Params: ((100,), 1000, 0.01, 'tanh')
Metrics: {'precision': 0.33064355050642935, 'recall': 0.28731562307943925, 'f1-score': 0.2704725888335774, 'accuracy': 0.29540626659585767}
Training Time: 13.8999 seconds

Params: ((100,), 1000, 0.01, 'logistic')
Metrics: {'precision': 0.3110316758188104, 'recall': 0.29638547185216957, 'f1-score': 0.27205296921498323, 'accuracy': 0.3193043016463091}
Training Time: 16.5312 seconds





Params: ((100, 50), 100, 0.001, 'relu')
Metrics: {'precision': 0.30669106833642823, 'recall': 0.28324088115688967, 'f1-score': 0.2622626590061439, 'accuracy': 0.3153212958045672}
Training Time: 5.5869 seconds





Params: ((100, 50), 100, 0.001, 'tanh')
Metrics: {'precision': 0.3198251030374534, 'recall': 0.28632366282051713, 'f1-score': 0.26416019749513353, 'accuracy': 0.31917153478491767}
Training Time: 5.8138 seconds





Params: ((100, 50), 100, 0.001, 'logistic')
Metrics: {'precision': 0.22241425446695703, 'recall': 0.2434557804117637, 'f1-score': 0.20793427301547332, 'accuracy': 0.2786776420605417}
Training Time: 6.5465 seconds





Params: ((100, 50), 100, 0.01, 'relu')
Metrics: {'precision': 0.282374723306375, 'recall': 0.28567655333218417, 'f1-score': 0.25714667848380607, 'accuracy': 0.3028412108337759}
Training Time: 6.3791 seconds





Params: ((100, 50), 100, 0.01, 'tanh')
Metrics: {'precision': 0.29052217273207265, 'recall': 0.2692043930592288, 'f1-score': 0.24964043472001096, 'accuracy': 0.2902283590015932}
Training Time: 5.7251 seconds





Params: ((100, 50), 100, 0.01, 'logistic')
Metrics: {'precision': 0.2841358114324271, 'recall': 0.2884219590897093, 'f1-score': 0.26187493231379777, 'accuracy': 0.30921402018056293}
Training Time: 6.0921 seconds





Params: ((100, 50), 300, 0.001, 'relu')
Metrics: {'precision': 0.30803625721325384, 'recall': 0.29864115396968466, 'f1-score': 0.2772759665724144, 'accuracy': 0.3146574614976102}
Training Time: 24.0242 seconds





Params: ((100, 50), 300, 0.001, 'tanh')
Metrics: {'precision': 0.31062921274265176, 'recall': 0.3018939027795114, 'f1-score': 0.2784655434919891, 'accuracy': 0.3158523632501328}
Training Time: 25.6093 seconds





Params: ((100, 50), 300, 0.001, 'logistic')
Metrics: {'precision': 0.2773676721848185, 'recall': 0.2791865944601873, 'f1-score': 0.2490847085011064, 'accuracy': 0.29620286776420607}
Training Time: 23.1911 seconds

Params: ((100, 50), 300, 0.01, 'relu')
Metrics: {'precision': 0.27531884031987863, 'recall': 0.2883943381093754, 'f1-score': 0.25838795721756724, 'accuracy': 0.3040361125862985}
Training Time: 13.6974 seconds

Params: ((100, 50), 300, 0.01, 'tanh')
Metrics: {'precision': 0.27058296104858115, 'recall': 0.27553716987728566, 'f1-score': 0.24850742348128668, 'accuracy': 0.28000531067445567}
Training Time: 14.8357 seconds

Params: ((100, 50), 300, 0.01, 'logistic')
Metrics: {'precision': 0.2738151786166074, 'recall': 0.28619749288088814, 'f1-score': 0.25923875550872705, 'accuracy': 0.2879713223579395}
Training Time: 25.0207 seconds

Params: ((100, 50), 500, 0.001, 'relu')
Metrics: {'precision': 0.30315002800980173, 'recall': 0.3003603051119324, 'f1-score': 0.2789714623987263, 'acc



Params: ((100, 50), 500, 0.001, 'tanh')
Metrics: {'precision': 0.3114269276808848, 'recall': 0.30325168830740734, 'f1-score': 0.2838710150775539, 'accuracy': 0.3185077004779607}
Training Time: 32.2945 seconds





Params: ((100, 50), 500, 0.001, 'logistic')
Metrics: {'precision': 0.3042711839441039, 'recall': 0.2975926611956507, 'f1-score': 0.27253555979014643, 'accuracy': 0.31691449814126393}
Training Time: 36.9666 seconds

Params: ((100, 50), 500, 0.01, 'relu')
Metrics: {'precision': 0.27531884031987863, 'recall': 0.2883943381093754, 'f1-score': 0.25838795721756724, 'accuracy': 0.3040361125862985}
Training Time: 6.8823 seconds

Params: ((100, 50), 500, 0.01, 'tanh')
Metrics: {'precision': 0.27058296104858115, 'recall': 0.27553716987728566, 'f1-score': 0.24850742348128668, 'accuracy': 0.28000531067445567}
Training Time: 12.8159 seconds

Params: ((100, 50), 500, 0.01, 'logistic')
Metrics: {'precision': 0.2738151786166074, 'recall': 0.28619749288088814, 'f1-score': 0.25923875550872705, 'accuracy': 0.2879713223579395}
Training Time: 28.8604 seconds

Params: ((100, 50), 1000, 0.001, 'relu')
Metrics: {'precision': 0.30315002800980173, 'recall': 0.3003603051119324, 'f1-score': 0.2789714623987263, 'ac



Params: ((100, 50), 1000, 0.001, 'logistic')
Metrics: {'precision': 0.30993546117395476, 'recall': 0.3052603307164128, 'f1-score': 0.27930007057010103, 'accuracy': 0.3175783324482209}
Training Time: 181.8324 seconds

Params: ((100, 50), 1000, 0.01, 'relu')
Metrics: {'precision': 0.27531884031987863, 'recall': 0.2883943381093754, 'f1-score': 0.25838795721756724, 'accuracy': 0.3040361125862985}
Training Time: 55.5850 seconds

Params: ((100, 50), 1000, 0.01, 'tanh')
Metrics: {'precision': 0.27058296104858115, 'recall': 0.27553716987728566, 'f1-score': 0.24850742348128668, 'accuracy': 0.28000531067445567}
Training Time: 14.3892 seconds

Params: ((100, 50), 1000, 0.01, 'logistic')
Metrics: {'precision': 0.2738151786166074, 'recall': 0.28619749288088814, 'f1-score': 0.25923875550872705, 'accuracy': 0.2879713223579395}
Training Time: 24.7436 seconds





Params: ((100, 100, 50), 100, 0.001, 'relu')
Metrics: {'precision': 0.31220216564812087, 'recall': 0.2890190639571804, 'f1-score': 0.26005415495931594, 'accuracy': 0.306558682952735}
Training Time: 11.9725 seconds





Params: ((100, 100, 50), 100, 0.001, 'tanh')
Metrics: {'precision': 0.29301873904790254, 'recall': 0.29509219478605214, 'f1-score': 0.2676488169797688, 'accuracy': 0.3139936271906532}
Training Time: 12.5276 seconds





Params: ((100, 100, 50), 100, 0.001, 'logistic')
Metrics: {'precision': 0.18939528972454117, 'recall': 0.23035472388898198, 'f1-score': 0.19473433431354464, 'accuracy': 0.25955921402018056}
Training Time: 13.0572 seconds





Params: ((100, 100, 50), 100, 0.01, 'relu')
Metrics: {'precision': 0.2646431198564355, 'recall': 0.26379482288316003, 'f1-score': 0.2406036976532948, 'accuracy': 0.2818640467339352}
Training Time: 11.8393 seconds





Params: ((100, 100, 50), 100, 0.01, 'tanh')
Metrics: {'precision': 0.2533458982060774, 'recall': 0.253840166094461, 'f1-score': 0.2309664894163188, 'accuracy': 0.26858736059479554}
Training Time: 12.1535 seconds





Params: ((100, 100, 50), 100, 0.01, 'logistic')
Metrics: {'precision': 0.2699183029151622, 'recall': 0.27646076636027306, 'f1-score': 0.24936619379869235, 'accuracy': 0.29235262878385554}
Training Time: 12.6237 seconds





Params: ((100, 100, 50), 300, 0.001, 'relu')
Metrics: {'precision': 0.2997608972882175, 'recall': 0.29546103227626624, 'f1-score': 0.27681378742334917, 'accuracy': 0.3125331917153478}
Training Time: 39.4265 seconds





Params: ((100, 100, 50), 300, 0.001, 'tanh')
Metrics: {'precision': 0.2857332943086019, 'recall': 0.2894047108389409, 'f1-score': 0.2696273429133225, 'accuracy': 0.2987254381306426}
Training Time: 50.1466 seconds





Params: ((100, 100, 50), 300, 0.001, 'logistic')
Metrics: {'precision': 0.2507974415646908, 'recall': 0.25871444322555837, 'f1-score': 0.23345281298748338, 'accuracy': 0.29328199681359535}
Training Time: 54.1008 seconds

Params: ((100, 100, 50), 300, 0.01, 'relu')
Metrics: {'precision': 0.2626198375794607, 'recall': 0.2658610687687185, 'f1-score': 0.24739517650905535, 'accuracy': 0.28690918746680827}
Training Time: 28.5076 seconds

Params: ((100, 100, 50), 300, 0.01, 'tanh')
Metrics: {'precision': 0.2579859690595285, 'recall': 0.24741675703736346, 'f1-score': 0.23513458079782312, 'accuracy': 0.26394052044609667}
Training Time: 30.4669 seconds

Params: ((100, 100, 50), 300, 0.01, 'logistic')
Metrics: {'precision': 0.23742889233707967, 'recall': 0.2403316862210297, 'f1-score': 0.2259100875794676, 'accuracy': 0.24787573021773765}
Training Time: 51.8297 seconds





Params: ((100, 100, 50), 500, 0.001, 'relu')
Metrics: {'precision': 0.2852622425722148, 'recall': 0.28017638335142925, 'f1-score': 0.2617840614647514, 'accuracy': 0.2862453531598513}
Training Time: 67.1767 seconds





Params: ((100, 100, 50), 500, 0.001, 'tanh')
Metrics: {'precision': 0.2648569640089752, 'recall': 0.27021102591243074, 'f1-score': 0.25204377390008187, 'accuracy': 0.2773499734466277}
Training Time: 68.8698 seconds





Params: ((100, 100, 50), 500, 0.001, 'logistic')
Metrics: {'precision': 0.27470184843809786, 'recall': 0.287920272803822, 'f1-score': 0.2551199155768756, 'accuracy': 0.3032395114179501}
Training Time: 71.5932 seconds

Params: ((100, 100, 50), 500, 0.01, 'relu')
Metrics: {'precision': 0.2626198375794607, 'recall': 0.2658610687687185, 'f1-score': 0.24739517650905535, 'accuracy': 0.28690918746680827}
Training Time: 22.8926 seconds

Params: ((100, 100, 50), 500, 0.01, 'tanh')
Metrics: {'precision': 0.2579859690595285, 'recall': 0.24741675703736346, 'f1-score': 0.23513458079782312, 'accuracy': 0.26394052044609667}
Training Time: 18.0362 seconds

Params: ((100, 100, 50), 500, 0.01, 'logistic')
Metrics: {'precision': 0.23742889233707967, 'recall': 0.2403316862210297, 'f1-score': 0.2259100875794676, 'accuracy': 0.24787573021773765}
Training Time: 38.1466 seconds

Params: ((100, 100, 50), 1000, 0.001, 'relu')
Metrics: {'precision': 0.2784795613331424, 'recall': 0.281212607573238, 'f1-score': 0.



Params: ((100, 100, 50), 1000, 0.001, 'logistic')
Metrics: {'precision': 0.2844620753807278, 'recall': 0.28914832191050766, 'f1-score': 0.2629028688513816, 'accuracy': 0.30231014338821033}
Training Time: 171.7048 seconds

Params: ((100, 100, 50), 1000, 0.01, 'relu')
Metrics: {'precision': 0.2626198375794607, 'recall': 0.2658610687687185, 'f1-score': 0.24739517650905535, 'accuracy': 0.28690918746680827}
Training Time: 22.9529 seconds

Params: ((100, 100, 50), 1000, 0.01, 'tanh')
Metrics: {'precision': 0.2579859690595285, 'recall': 0.24741675703736346, 'f1-score': 0.23513458079782312, 'accuracy': 0.26394052044609667}
Training Time: 17.2888 seconds

Params: ((100, 100, 50), 1000, 0.01, 'logistic')
Metrics: {'precision': 0.23742889233707967, 'recall': 0.2403316862210297, 'f1-score': 0.2259100875794676, 'accuracy': 0.24787573021773765}
Training Time: 39.2774 seconds

MLP - метрики:
Params: ((50,), 100, 0.001, 'relu'), Metrics: {'precision': 0.2887943548147237, 'recall': 0.2835544638488966, 

Params: ((100, 100, 50), [300, 500, 1000], 0.01, 'logistic'), Metrics: {'precision': 0.9321507597441017, 'recall': 0.9333474888506172, 'f1-score': 0.932394304773428, 'accuracy': 0.9355665547109776}, Training Time: 33.0178 seconds

In [20]:
def safe_log1p(embeddings, shift=True):
    if shift:
        min_value = embeddings.min()
        if min_value < -1:
            embeddings_shifted = embeddings + abs(min_value) + 1
        else:
            embeddings_shifted = embeddings
        return np.log1p(embeddings_shifted)
    else:
        return np.log1p(np.abs(embeddings))

transformations = [
    safe_log1p,
    np.sin,
    np.cos,
    np.square
]

def extend_embeddings(embeddings, functions):
    extended_embeddings = embeddings.copy()
    for func in functions:
        transformed = func(embeddings)
        extended_embeddings = np.concatenate((extended_embeddings, transformed), axis=1)
    return extended_embeddings

In [21]:
X_train_extended = extend_embeddings(X_train, transformations)
X_test_extended = extend_embeddings(X_test, transformations)

In [22]:
def experiment_mlp_extended(X_train, y_train, X_test, y_test):
    model = MLPClassifier(hidden_layer_sizes=(100, 100, 50), max_iter=300, learning_rate_init=0.01, activation='logistic')
    
    start_time = time.time()
    model.fit(X_train, y_train)
    training_time = time.time() - start_time
    
    y_pred = model.predict(X_test)
    
    metrics = calculate_metrics(y_test, y_pred)
    
    return metrics, training_time

In [23]:
metrics_mlp, training_time_mlp = experiment_mlp_extended(X_train_extended, y_train_encoded, X_test_extended, y_test_encoded)
print(metrics_mlp)
print("Training Time:", training_time_mlp)

{'precision': 0.24003695862342753, 'recall': 0.2399057659308343, 'f1-score': 0.21673998982477238, 'accuracy': 0.26433882103027084}
Training Time: 49.51618528366089


In [24]:
X_train_extended = extend_embeddings(X_train, [np.sin])
X_test_extended = extend_embeddings(X_test, [np.sin])

metrics_mlp, training_time_mlp = experiment_mlp_extended(X_train_extended, y_train_encoded, X_test_extended, y_test_encoded)
print(metrics_mlp)
print("Training Time:", training_time_mlp)

{'precision': 0.24035001203467177, 'recall': 0.24735044200114595, 'f1-score': 0.2268078620400887, 'accuracy': 0.25185873605947956}
Training Time: 42.9942889213562


In [25]:
X_train_extended = extend_embeddings(X_train, [np.square])
X_test_extended = extend_embeddings(X_test, [np.square])

metrics_mlp, training_time_mlp = experiment_mlp_extended(X_train_extended, y_train_encoded, X_test_extended, y_test_encoded)
print(metrics_mlp)
print("Training Time:", training_time_mlp)

{'precision': 0.24301578285239017, 'recall': 0.24410509110262782, 'f1-score': 0.23023554968705393, 'accuracy': 0.2569038767923526}
Training Time: 69.53520083427429


In [26]:
X_train_extended = extend_embeddings(X_train, [safe_log1p])
X_test_extended = extend_embeddings(X_test, [safe_log1p])

metrics_mlp, training_time_mlp = experiment_mlp_extended(X_train_extended, y_train_encoded, X_test_extended, y_test_encoded)
print(metrics_mlp)
print("Training Time:", training_time_mlp)

{'precision': 0.26035749634242544, 'recall': 0.24750608656332745, 'f1-score': 0.22119166861716977, 'accuracy': 0.2741635687732342}
Training Time: 68.3126540184021


In [27]:
X_train_extended = extend_embeddings(X_train, [np.sin, np.square])
X_test_extended = extend_embeddings(X_test, [np.sin, np.square])

metrics_mlp, training_time_mlp = experiment_mlp_extended(X_train_extended, y_train_encoded, X_test_extended, y_test_encoded)
print(metrics_mlp)
print("Training Time:", training_time_mlp)

{'precision': 0.24431156260498935, 'recall': 0.257920322506607, 'f1-score': 0.23254087803275972, 'accuracy': 0.25677110993096125}
Training Time: 67.94425988197327
