In [14]:
def get_train_test():
    x_train = []
    y_train = []
    with open("assets/annotated-corpus/train-embeddings.tsv", "r") as file:
        for line in file:
            if len(line) < 100:
                continue
            train = line.split('\t')
            # убрать \n
            train = train[:-1] + [train[-1][:-1]]
            x_train.append(train[1:])
            y_train.append(train[0].split('/')[0])
    x_test = []
    y_test = []
    with open("assets/annotated-corpus/test-embeddings.tsv", "r") as file:
        for line in file:
            if len(line) < 100:
                continue
            test = line.split('\t')
            # убрать \n
            test = test[:-1] + [test[-1][:-1]]
            x_test.append(test[1:])
            y_test.append(test[0].split('/')[0])
    return x_train, y_train, x_test, y_test

In [4]:
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from time import time
import numpy as np
import pandas as pd

In [6]:
def confusion_matrix(test, pred):
    df = pd.crosstab(test, pred, rownames=['Actual'], colnames=['Predicted'])
    for index, row in df.iterrows():
        if index not in df.columns:
            df[index] = 0
    return df

def calc_accuracy(conf_matrix_df):
    sum_all = conf_matrix_df.values.sum()
    pos = 0
    for index, row in conf_matrix_df.iterrows():
        try:
            pos += conf_matrix_df[index].loc[index]
        except:
            pass
    return round(pos/sum_all, 2)

def calc_precision(conf_matrix_df):
    # Нахождение precision для каждого класса, затем усреднение найденных precision
    return round((np.diag(conf_matrix_df) / np.sum(conf_matrix_df, axis=0)).mean(axis=0), 2)

def calc_recall(conf_matrix_df):
    # Нахождение recall для каждого класса, затем усреднение найденных recall
    return round((np.diag(conf_matrix_df) / np.sum(conf_matrix_df, axis=1)).mean(axis=0), 2)

def calc_f1(precision, recall):
    return round((2 * precision * recall / (precision + recall)), 2)


def score(test, pred):
    conf_matrix = confusion_matrix(test, pred)
    accuracy = calc_accuracy(conf_matrix)
    precision = calc_precision(conf_matrix)
    recall = calc_recall(conf_matrix)
    f1 = calc_f1(precision, recall)
    return conf_matrix, accuracy, precision, recall, f1

In [7]:
def svc_linear(x_train, y_train, x_test, y_test):
    svc_lin = SVC(kernel="linear", random_state=10)
    time_start = time()
    svc_lin.fit(x_train, y_train)
    time_end = time()
    pred_lin = svc_lin.predict(x_test)
    conf_matrix, accuracy, precision, recall, f1 = score(y_test, pred_lin)
    print(f'Model: {svc_lin}, training time: {time_end-time_start}\n\n', f'{conf_matrix.to_string()}\n\n',
          f'accuracy: {accuracy}\n', f'precision: {precision}\n', f'recall: {recall}\n', f'f1: {f1}\n\n')

In [8]:
def svc_poly(x_train, y_train, x_test, y_test):
    svc_poly = SVC(kernel="poly", random_state=10)
    time_start = time()
    svc_poly.fit(x_train, y_train)
    time_end = time()
    pred_poly = svc_poly.predict(x_test)
    conf_matrix, accuracy, precision, recall, f1 = score(y_test, pred_poly)
    print(f'Model: {svc_poly}, training time: {time_end-time_start}\n\n', f'{conf_matrix.to_string()}\n\n',
          f'accuracy: {accuracy}\n', f'precision: {precision}\n', f'recall: {recall}\n', f'f1: {f1}\n\n')

In [9]:
def svc_rbf(x_train, y_train, x_test, y_test):
    svc_rbf = SVC(kernel="rbf", class_weight="balanced",
                  decision_function_shape="ovr", random_state=10)
    time_start = time()
    svc_rbf.fit(x_train, y_train)
    time_end = time()
    pred_rbf = svc_rbf.predict(x_test)
    conf_matrix, accuracy, precision, recall, f1 = score(y_test, pred_rbf)
    print(f'Model: {svc_rbf}, training time: {time_end-time_start}\n\n', f'{conf_matrix.to_string()}\n\n',
          f'accuracy: {accuracy}\n', f'precision: {precision}\n', f'recall: {recall}\n', f'f1: {f1}\n\n')

In [16]:
def mlp_classify(x_train, y_train, x_test, y_test, max_iter):
    mlp = MLPClassifier(max_iter=max_iter)
    encoder = LabelEncoder()
    x_train = np.array(x_train, dtype=float)
    x_test = np.array(x_test, dtype=float)
    y_train = encoder.fit_transform(y_train)
    y_test = encoder.transform(y_test)
    time_start = time()
    mlp.fit(x_train, y_train)
    time_end = time()
    pred_mlp = mlp.predict(x_test)
    conf_matrix, accuracy, precision, recall, f1 = score(y_test, pred_mlp)
    print(f'Model: {mlp}, training time: {time_end-time_start}\n\n', f'{conf_matrix.to_string()}\n\n',
            f'accuracy: {accuracy}\n', f'precision: {precision}\n', f'recall: {recall}\n', f'f1: {f1}\n\n')

In [17]:
def svc_by_components(n_components):

    x_train, y_train, x_test, y_test = get_train_test()

    if n_components:
        pca = PCA(n_components=n_components)
        x_train = pca.fit_transform(x_train)
        x_test = pca.transform(x_test)

    svc_linear(x_train, y_train, x_test, y_test)

    svc_poly(x_train, y_train, x_test, y_test)

    svc_rbf(x_train, y_train, x_test, y_test)

    mlp_classify(x_train, y_train, x_test, y_test, 500)
    mlp_classify(x_train, y_train, x_test, y_test, 1000)
    mlp_classify(x_train, y_train, x_test, y_test, 1500)

In [18]:
svc_by_components(None)

Model: SVC(kernel='linear', random_state=10), training time: 4.212909460067749

 Predicted                 alt.atheism  comp.graphics  comp.os.ms-windows.misc  comp.sys.ibm.pc.hardware  comp.sys.mac.hardware  comp.windows.x  misc.forsale  rec.autos  rec.motorcycles  rec.sport.baseball  rec.sport.hockey  sci.crypt  sci.electronics  sci.med  sci.space  soc.religion.christian  talk.politics.guns  talk.politics.mideast  talk.politics.misc  talk.religion.misc
Actual                                                                                                                                                                                                                                                                                                                                                                                   
alt.atheism                       132              4                        1                         0                      0               0             3       

In [19]:
svc_by_components(50)

Model: SVC(kernel='linear', random_state=10), training time: 3.3081634044647217

 Predicted                 alt.atheism  comp.graphics  comp.os.ms-windows.misc  comp.sys.ibm.pc.hardware  comp.sys.mac.hardware  comp.windows.x  misc.forsale  rec.autos  rec.motorcycles  rec.sport.baseball  rec.sport.hockey  sci.crypt  sci.electronics  sci.med  sci.space  soc.religion.christian  talk.politics.guns  talk.politics.mideast  talk.politics.misc  talk.religion.misc
Actual                                                                                                                                                                                                                                                                                                                                                                                   
alt.atheism                       129              3                        1                         0                      0               0             3      



Model: MLPClassifier(max_iter=500), training time: 20.607722997665405

 Predicted   0    1    2    3    4    5    6    7    8    9    10   11   12   13   14   15   16   17  18  19
Actual                                                                                                     
0          110    1    1    0    0    1    0    2   15   13    8    3    0   20    6   56    8   23  10  41
1            0  164   43   11   21   44   14    7    3    8    4    7   27   10   20    1    0    1   1   3
2            2   41  188   42   17   56    9    3    4    6    0    6    8    1    3    3    2    0   1   1
3            0   18   42  147   84   12   31   10    6    1    1    7   27    1    1    2    1    0   0   1
4            0    7   26   68  137   12   28   23    7   11    3    1   46    4    8    0    1    1   1   0
5            0   41   55    6    5  230   12    1    4    2    1   12   10    4    8    2    0    0   1   1
6            0    3    1   11   11    8  295    2    6   14    5

In [20]:
svc_by_components(10)

Model: SVC(kernel='linear', random_state=10), training time: 3.3856770992279053

 Predicted                 alt.atheism  comp.graphics  comp.os.ms-windows.misc  comp.sys.ibm.pc.hardware  comp.sys.mac.hardware  comp.windows.x  misc.forsale  rec.autos  rec.motorcycles  rec.sport.baseball  rec.sport.hockey  sci.crypt  sci.electronics  sci.med  sci.space  soc.religion.christian  talk.politics.guns  talk.politics.mideast  talk.politics.misc  talk.religion.misc
Actual                                                                                                                                                                                                                                                                                                                                                                                   
alt.atheism                       113              2                        0                         0                      0               2             6      