In [29]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.decomposition import PCA

In [2]:
# Newsgroup topics; a topic's position in this list is used as its numeric class label.
topics = ['baseball', 'electronics', 'hockey', 'med', 'motorcycles', 'space']
# Directory holding one "<topic>.tsv" file per topic.
# Built with os.path.join so the relative path resolves on POSIX as well as on Windows
# (the value is unchanged on Windows).
data_path = os.path.join('..', '..', 'assets', 'vectorized')

def read_and_split(test_size=0.2, random_state=None):
    """Load every topic's table, label it, and split each topic into train/test parts.

    For each name in ``topics`` the file ``<data_path>/<topic>.tsv`` is read as a
    header-less, tab-separated table. Its first column is dropped and a ``target``
    column holding the topic's position in ``topics`` (as a float) is added.
    Every topic is split on its own, so each class contributes the same
    train/test proportion.

    Args:
        test_size: Forwarded to ``train_test_split`` for every topic.
        random_state: Seed forwarded to ``train_test_split``. The default
            ``None`` keeps the split random on every call; pass an int to make
            the split reproducible.

    Returns:
        ``(train_df, test_df)``: two DataFrames with a fresh 0..n-1 index whose
        rows are grouped by topic in the order of ``topics``.
    """
    train_parts, test_parts = [], []
    for label, topic in enumerate(topics):
        topic_file = os.path.join(data_path, topic + '.tsv')
        topic_df = (
            pd.read_csv(topic_file, delimiter='\t', header=None)
            .iloc[:, 1:]
            .assign(target=float(label))
        )
        topic_train, topic_test = train_test_split(
            topic_df, test_size=test_size, random_state=random_state
        )
        train_parts.append(topic_train)
        test_parts.append(topic_test)

    # Concatenate once instead of re-copying a growing DataFrame on every iteration.
    # An empty topic list still yields two empty frames.
    train_df = pd.concat(train_parts, ignore_index=True) if train_parts else pd.DataFrame()
    test_df = pd.concat(test_parts, ignore_index=True) if test_parts else pd.DataFrame()
    return train_df, test_df

# Build the train/test frames used by the rest of the notebook (default test_size of 0.2 per topic).
train_df, test_df = read_and_split()



In [None]:
# Display the training split returned by read_and_split().
train_df

In [None]:
# Display the test split returned by read_and_split().
test_df

In [51]:

import matplotlib.pyplot as plt
import tqdm

def RBF(X, gamma=None):
    """Compute the pairwise RBF (Gaussian) kernel matrix of the rows of ``X``.

    ``K[i, j] = exp(-gamma * ||X[i] - X[j]||^2)``

    Args:
        X: 2-D NumPy array with one sample per row.
        gamma: Kernel coefficient. When ``None`` it defaults to
            ``1 / X.shape[1]`` (one over the number of columns).

    Returns:
        Square array with one row and one column per sample.

    Note:
        The broadcast difference materialises an
        ``(n_rows, n_rows, n_columns)`` temporary, so memory use grows
        quadratically with the number of rows.
    """
    # Identity check: ``gamma == None`` compares element-wise when gamma is an
    # array, which makes the ``if`` raise instead of detecting "not given".
    if gamma is None:
        gamma = 1.0 / X.shape[1]

    # Squared Euclidean distance between every pair of rows.
    sq_dists = np.sum((X - X[:, np.newaxis]) ** 2, axis=-1)
    return np.exp(-gamma * sq_dists)

def visualize_clusters(train_df):
    """Scatter-plot the rows of ``train_df`` projected onto two principal components.

    Every column except the last is treated as a feature; the last column is
    cast to int and used as the class label that selects the point colour.

    Args:
        train_df: DataFrame whose last column holds class labels that index
            into the six-colour palette below, and whose remaining columns
            are numeric features.
    """
    features = train_df.iloc[:, :-1].values
    labels = train_df.iloc[:, -1].values.astype(int)

    points = PCA(n_components=2).fit_transform(features)

    colors = ['red', 'yellow', 'green', 'black', 'blue', 'purple']
    point_colors = [colors[label] for label in labels]

    plt.figure(figsize=(10, 10))
    # One vectorised scatter call instead of one call (and one artist) per point.
    plt.scatter(points[:, 0], points[:, 1], c=point_colors)
    plt.xlabel('PC 1')
    plt.ylabel('PC 2')
    plt.show()

    

In [None]:

# Plot the test split in its first two principal components, coloured by class label.
visualize_clusters(test_df)

In [28]:
def precision_score(conf_matrix):
    """Precision ``TP / (TP + FP)`` from binary confusion matrices.

    Args:
        conf_matrix: A single 2x2 confusion matrix laid out as
            ``[[TN, FP], [FN, TP]]``, or a stack of such matrices with shape
            ``(n, 2, 2)``. Nested lists are accepted as well as arrays.

    Returns:
        ``np.ndarray`` of precisions: 0-d for a single matrix, one value per
        matrix for a stack. A matrix with no predicted positives
        (``TP + FP == 0``) yields ``0.0`` instead of NaN, so an average over
        classes stays finite.
    """
    conf = np.asarray(conf_matrix)
    true_pos = conf[..., 1, 1]
    false_pos = conf[..., 0, 1]
    predicted_pos = true_pos + false_pos

    undefined = predicted_pos == 0
    # Divide by 1 where the ratio is undefined to avoid 0/0 warnings; those
    # entries are overwritten with 0.0 by the outer where().
    safe_denominator = np.where(undefined, 1, predicted_pos)
    return np.where(undefined, 0.0, true_pos / safe_denominator)

def recall_score(conf_matrix):
    """Recall ``TP / (TP + FN)`` from binary confusion matrices.

    Args:
        conf_matrix: A single 2x2 confusion matrix laid out as
            ``[[TN, FP], [FN, TP]]``, or a stack of such matrices with shape
            ``(n, 2, 2)``. Nested lists are accepted as well as arrays.

    Returns:
        ``np.ndarray`` of recalls: 0-d for a single matrix, one value per
        matrix for a stack. A matrix with no actual positives
        (``TP + FN == 0``) yields ``0.0`` instead of NaN, so an average over
        classes stays finite.
    """
    conf = np.asarray(conf_matrix)
    true_pos = conf[..., 1, 1]
    false_neg = conf[..., 1, 0]
    actual_pos = true_pos + false_neg

    undefined = actual_pos == 0
    # Divide by 1 where the ratio is undefined to avoid 0/0 warnings; those
    # entries are overwritten with 0.0 by the outer where().
    safe_denominator = np.where(undefined, 1, actual_pos)
    return np.where(undefined, 0.0, true_pos / safe_denominator)

def f1_score(conf_matrix):
    """F1 score, the harmonic mean of precision and recall, per confusion matrix.

    Args:
        conf_matrix: Anything accepted by ``precision_score`` and
            ``recall_score`` (a 2x2 matrix or a stack of them).

    Returns:
        ``np.ndarray`` of F1 values with the same shape as the precision
        result. Where precision + recall is exactly 0 the score is ``0.0``
        rather than the NaN produced by 0/0.
    """
    precision = precision_score(conf_matrix)
    recall = recall_score(conf_matrix)
    total = precision + recall

    undefined = total == 0
    # Divide by 1 where the ratio is undefined to avoid 0/0 warnings; those
    # entries are overwritten with 0.0 by the outer where().
    safe_total = np.where(undefined, 1.0, total)
    return np.where(undefined, 0.0, 2 * precision * recall / safe_total)

def accuracy_score(conf_matrix):
    """Accuracy ``(TP + TN) / (TP + TN + FP + FN)`` from binary confusion matrices.

    Args:
        conf_matrix: A single 2x2 confusion matrix laid out as
            ``[[TN, FP], [FN, TP]]``, or a stack of such matrices with shape
            ``(n, 2, 2)``. Nested lists are accepted as well as arrays.

    Returns:
        ``np.ndarray`` of accuracies: 0-d for a single matrix, one value per
        matrix for a stack. An all-zero matrix yields ``0.0``.

    Note:
        The stacked case uses the same formula as the single-matrix case. For
        a stack of one-vs-rest matrices the values are per-class binary
        accuracies, which is not the same thing as overall multiclass accuracy.
    """
    conf = np.asarray(conf_matrix)
    true_neg, false_pos = conf[..., 0, 0], conf[..., 0, 1]
    false_neg, true_pos = conf[..., 1, 0], conf[..., 1, 1]

    correct = true_pos + true_neg
    total = correct + false_pos + false_neg

    empty = total == 0
    # Divide by 1 where the matrix is empty to avoid 0/0 warnings; those
    # entries are overwritten with 0.0 by the outer where().
    safe_total = np.where(empty, 1, total)
    return np.where(empty, 0.0, correct / safe_total)


def calculate_metrics(y_test, y_pred) -> dict:
    """Average the per-class precision, recall, F1 and accuracy of a prediction.

    One confusion matrix per class is built with
    ``multilabel_confusion_matrix``; each metric is evaluated on that stack
    and reduced with ``np.mean``.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.

    Returns:
        Dict with the keys ``'precision'``, ``'recall'``, ``'f1'`` and
        ``'accuracy'``, in that order.
    """
    per_class_cm = multilabel_confusion_matrix(y_test, y_pred)
    scorers = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
        ('accuracy', accuracy_score),
    )
    return {name: np.mean(scorer(per_class_cm)) for name, scorer in scorers}

In [39]:
# from sklearn.metrics import precision_score, recall_score, f1_score
import time

def train_and_test_svm(train_df: pd.DataFrame, test_df: pd.DataFrame, kernels=('linear','poly', 'rbf', 'sigmoid'), max_iter=(100, 300, 600, 1000)):
    """Fit one ``svm.SVC`` per (max_iter, kernel) pair and collect its test metrics.

    Args:
        train_df: Training frame; the ``target`` column holds the labels and
            every other column is a feature.
        test_df: Test frame with the same layout as ``train_df``.
        kernels: Kernel names passed to ``svm.SVC(kernel=...)``.
        max_iter: Iteration caps passed to ``svm.SVC(max_iter=...)``.

    Returns:
        DataFrame indexed by ``kernel`` with one row per (max_iter, kernel)
        pair and the columns ``precision``, ``recall``, ``f1``, ``accuracy``,
        ``time`` (seconds spent in ``fit``) and ``iter``.
    """
    X_train, X_test = train_df.drop(columns='target').values, test_df.drop(columns='target').values
    y_train, y_test = train_df['target'].values, test_df['target'].values

    results = []
    for it in max_iter:
        for ker in kernels:
            # Forward the iteration cap to the solver. Without it every value
            # of `it` trains an identical model and the `iter` column carries
            # no information.
            svm_ = svm.SVC(kernel=ker, max_iter=it)

            # perf_counter is a monotonic clock intended for measuring intervals.
            t_start = time.perf_counter()
            svm_.fit(X_train, y_train)
            t_stop = time.perf_counter()
            y_pred = svm_.predict(X_test)

            res = calculate_metrics(y_test, y_pred)
            res.update({
                'kernel': ker,
                'time': t_stop - t_start,
                'iter': it
            })
            results.append(res)

    # set_index moves `kernel` into the index (keeping its name) without
    # mutating a frame in place.
    return pd.DataFrame(results).set_index('kernel')

In [None]:
# Fit one SVC per (max_iter, kernel) pair using the default grids and show the resulting table.
train_results = train_and_test_svm(train_df, test_df)
train_results


### Задание №2

In [None]:
def highlight_max_green(s):
    """Return one CSS string per element of ``s``: green background on the maximum, ``''`` elsewhere."""
    peak = s.max()
    return ['background-color: green' if cell == peak else '' for cell in s]

def highlight_max_red(s):
    """Return one CSS string per element of ``s``: red background on the maximum, ``''`` elsewhere."""
    peak = s.max()
    return ['background-color: red' if cell == peak else '' for cell in s]

def highlight_min_green(s):
    """Return one CSS string per element of ``s``: green background on the minimum, ``''`` elsewhere."""
    lowest = s.min()
    return ['background-color: green' if cell == lowest else '' for cell in s]

def highlight_min_red(s):
    """Return one CSS string per element of ``s``: red background on the minimum, ``''`` elsewhere."""
    lowest = s.min()
    return ['background-color: red' if cell == lowest else '' for cell in s]

# Kernel comparison for the rows with iter == 600.
# Quality metrics: maximum in green, minimum in red. Fit time: maximum in red, minimum in green.
(
    train_results.loc[lambda df: df['iter'] == 600]
    .drop(columns='iter')
    .style
    .apply(highlight_max_green, subset=['precision', 'recall', 'f1', 'accuracy'], axis=0)
    .apply(highlight_min_red, subset=['precision', 'recall', 'f1', 'accuracy'], axis=0)
    .apply(highlight_max_red, subset=['time'], axis=0)
    .apply(highlight_min_green, subset=['time'], axis=0)
)



In [None]:
# All rows of the 'rbf' kernel; `kernel` is moved from the index back into a column first.
# Quality metrics: minimum in red, maximum in green. Fit time: maximum in red, minimum in green.
(
    train_results.reset_index()
    .loc[lambda df: df['kernel'] == 'rbf']
    .style
    .apply(highlight_min_red, subset=['precision', 'recall', 'f1', 'accuracy'], axis=0)
    .apply(highlight_max_green, subset=['precision', 'recall', 'f1', 'accuracy'], axis=0)
    .apply(highlight_max_red, subset=['time'], axis=0)
    .apply(highlight_min_green, subset=['time'], axis=0)
)

### Задание №3

In [40]:
def decrease_dementions(num_components, train_x, test_x):
    """Project both feature matrices onto principal components learned from the training one.

    Args:
        num_components: First argument handed to ``PCA``.
        train_x: Training features; the PCA is fitted on these.
        test_x: Test features; transformed with the already fitted PCA.

    Returns:
        ``(train_x, test_x)`` after the projection.
    """
    reducer = PCA(num_components)
    reduced_train = reducer.fit_transform(train_x)
    return reduced_train, reducer.transform(test_x)

def train_test_svm_with_pca(arr_num_components, train_df: pd.DataFrame, test_df: pd.DataFrame, kernels=('linear','poly', 'rbf', 'sigmoid')):
    """Fit one ``svm.SVC`` per (num_components, kernel) pair on PCA-reduced features.

    Args:
        arr_num_components: Iterable of component counts handed to
            ``decrease_dementions``.
        train_df: Training frame; the ``target`` column holds the labels and
            every other column is a feature.
        test_df: Test frame with the same layout as ``train_df``.
        kernels: Kernel names passed to ``svm.SVC(kernel=...)``.

    Returns:
        DataFrame with one row per (num_components, kernel) pair: the metrics
        from ``calculate_metrics`` followed by ``num_components``, ``kernel``
        and ``time`` (seconds spent in ``fit``).
    """
    feature_train = train_df.drop(columns='target').values
    feature_test = test_df.drop(columns='target').values
    label_train = train_df['target'].values
    label_test = test_df['target'].values

    rows = []
    for n_comp in arr_num_components:
        # The projection is computed once per component count and shared by all kernels.
        reduced_train, reduced_test = decrease_dementions(n_comp, feature_train, feature_test)
        for kernel_name in kernels:
            classifier = svm.SVC(kernel=kernel_name)

            fit_started = time.time()
            classifier.fit(reduced_train, label_train)
            fit_finished = time.time()
            predictions = classifier.predict(reduced_test)

            rows.append({
                **calculate_metrics(label_test, predictions),
                'num_components': n_comp,
                'kernel': kernel_name,
                'time': fit_finished - fit_started,
            })
    return pd.DataFrame(rows)


In [None]:
# Evaluate every default kernel after PCA reduction to 2, 20, 40 and 60 components, then show the table.
train_results_with_pca = train_test_svm_with_pca((2, 20, 40, 60), train_df, test_df)
train_results_with_pca

In [None]:
# 'rbf' kernel across the PCA component counts.
# Quality metrics: minimum in red, maximum in green. Fit time: maximum in red, minimum in green.
(
    train_results_with_pca.loc[lambda df: df['kernel'] == 'rbf']
    .style
    .apply(highlight_min_red, subset=['precision', 'recall', 'f1', 'accuracy'], axis=0)
    .apply(highlight_max_green, subset=['precision', 'recall', 'f1', 'accuracy'], axis=0)
    .apply(highlight_max_red, subset=['time'], axis=0)
    .apply(highlight_min_green, subset=['time'], axis=0)
)

In [None]:
# 'linear' kernel across the PCA component counts.
# Quality metrics: minimum in red, maximum in green. Fit time: maximum in red, minimum in green.
(
    train_results_with_pca.loc[lambda df: df['kernel'] == 'linear']
    .style
    .apply(highlight_min_red, subset=['precision', 'recall', 'f1', 'accuracy'], axis=0)
    .apply(highlight_max_green, subset=['precision', 'recall', 'f1', 'accuracy'], axis=0)
    .apply(highlight_max_red, subset=['time'], axis=0)
    .apply(highlight_min_green, subset=['time'], axis=0)
)

In [None]:
# 'sigmoid' kernel across the PCA component counts.
# Quality metrics: minimum in red, maximum in green. Fit time: maximum in red, minimum in green.
(
    train_results_with_pca.loc[lambda df: df['kernel'] == 'sigmoid']
    .style
    .apply(highlight_min_red, subset=['precision', 'recall', 'f1', 'accuracy'], axis=0)
    .apply(highlight_max_green, subset=['precision', 'recall', 'f1', 'accuracy'], axis=0)
    .apply(highlight_max_red, subset=['time'], axis=0)
    .apply(highlight_min_green, subset=['time'], axis=0)
)

In [None]:
# 'poly' kernel across the PCA component counts.
# Quality metrics: minimum in red, maximum in green. Fit time: maximum in red, minimum in green.
(
    train_results_with_pca.loc[lambda df: df['kernel'] == 'poly']
    .style
    .apply(highlight_min_red, subset=['precision', 'recall', 'f1', 'accuracy'], axis=0)
    .apply(highlight_max_green, subset=['precision', 'recall', 'f1', 'accuracy'], axis=0)
    .apply(highlight_max_red, subset=['time'], axis=0)
    .apply(highlight_min_green, subset=['time'], axis=0)
)

In [None]:
# One hand-picked (kernel, num_components) row per kernel, stacked in the listed order.
selected_rows = pd.concat(
    [
        train_results_with_pca[
            (train_results_with_pca['kernel'] == kernel_name)
            & (train_results_with_pca['num_components'] == n_comp)
        ]
        for kernel_name, n_comp in (('poly', 60), ('sigmoid', 60), ('linear', 60), ('rbf', 40))
    ]
)
# Quality metrics: minimum in red, maximum in green. Fit time: maximum in red, minimum in green.
(
    selected_rows.style
    .apply(highlight_min_red, subset=['precision', 'recall', 'f1', 'accuracy'], axis=0)
    .apply(highlight_max_green, subset=['precision', 'recall', 'f1', 'accuracy'], axis=0)
    .apply(highlight_max_red, subset=['time'], axis=0)
    .apply(highlight_min_green, subset=['time'], axis=0)
)