#**Optimization & OvO**

In [1]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


###L-BFGS-B (Limited-memory Broyden-Fletcher-Goldfarb-Shanno with Box constraints)

In [167]:
import numpy as np
from scipy.optimize import minimize
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from itertools import combinations
from ucimlrepo import fetch_ucirepo

np.random.seed(42)  # 시드 설정

class LBFGS_SVM:
    def __init__(self, C=0.01, svm_type='L2', c=1.0):
        self.C = C  # 정규화 상수
        self.svm_type = svm_type  # L1 또는 L2
        self.w_best = None
        self.b_best = None
        self.c = c

    def compute_slack_term(self, xi):
        """ SVM 타입별 슬랙 변수 비용 계산 """
        if self.svm_type == 'L1':
            return np.abs(xi)
        elif self.svm_type == 'L2':
            return xi ** 2
        elif self.svm_type == 'Fair':
            return np.sum(self.c**2 * ((xi / self.c) - np.log(1 + xi / self.c)))
        elif self.svm_type == 'Cauchy':
            return np.sum((self.c**2 / 2) * np.log(1 + (xi / self.c)**2))
        elif self.svm_type == 'Welsch':
            return np.sum((self.c**2 / 2) * (1 - np.exp(-(xi / self.c)**2)))
        elif self.svm_type == 'Geman-McClure':
            return np.sum(((xi**2)/2) / (1 + xi**2))
        else:
            raise ValueError("Unknown SVM type. Choose from ['L1', 'L2', 'Cauchy', 'Fair', 'Welsch', 'Geman-McClure'].")

    def fitness(self, params, X, y):
        """ 손실 함수 계산 (Hinge Loss 기반) """
        n_features = X.shape[1]
        w = params[:n_features]
        b = params[n_features]
        margins = y * (np.dot(X, w) + b)
        slack = np.maximum(0, 1 - margins)  # Hinge Loss 적용
        slack_term = self.compute_slack_term(slack)
        regularization = 0.5 * np.dot(w, w)  # L2 Regularization
        return regularization + self.C * np.sum(slack_term)  # 최소화할 손실 값

    def fit(self, X, y):
        n_samples, n_features = X.shape
        initial_params = np.zeros(n_features + 1)
        bounds = [(None, None)] * n_features + [(None, None)]  # 경계 조건 추가
        options = {'disp': True, 'maxiter': 5000, 'ftol': 1e-9}  # 최적화 옵션 추가: maxiter 증가, ftol 추가
        result = minimize(self.fitness, initial_params, args=(X, y), method='L-BFGS-B', bounds=bounds, options=options)
        # If optimization fails, try SLSQP solver
        if not result.success:
            result = minimize(self.fitness, initial_params, args=(X, y), method='SLSQP', bounds=bounds, options=options)

        if result.success:
            self.w_best = result.x[:n_features]
            self.b_best = result.x[n_features]
        else:
            raise ValueError("Optimization failed: " + result.message)

    def predict(self, X):
        return np.sign(np.dot(X, self.w_best) + self.b_best)

In [168]:
def train_ovo(X, y, C, svm_type):
    classes = np.unique(y)
    models = []
    class_pairs = list(combinations(classes, 2))
    for (cl1, cl2) in class_pairs:
        mask = (y == cl1) | (y == cl2)
        X_bin = X[mask]
        y_bin = y[mask]
        y_bin = (y_bin == cl1).astype(int) * 2 - 1
        model = LBFGS_SVM(C=C, svm_type=svm_type)
        model.fit(X_bin, y_bin)
        w = model.w_best
        b = model.b_best
        models.append(((cl1, cl2), (w, b)))
    return models

def predict_ovo(X, models):
    votes = np.zeros((X.shape[0], len(models)))
    classes = np.unique(np.hstack([pair for pair, _ in models]))
    for i, ((cl1, cl2), (w, b)) in enumerate(models):
        preds = np.sign(X.dot(w) + b)
        votes[:, i] = np.where(preds == 1, cl1, cl2)
    final_predictions = []
    for j in range(votes.shape[0]):
        valid_votes = votes[j][votes[j] >= 0]
        if len(valid_votes) > 0:
            bincount = np.bincount(valid_votes.astype(int))
            final_predictions.append(bincount.argmax())
        else:
            final_predictions.append(np.random.choice(classes))
    return np.array(final_predictions)


###1. Genetic Algorithm (GA)

In [60]:
class GA_SVM:
    def __init__(self, C=1.0, pop_size=20, max_iter=100, mutation_rate=0.1, crossover_rate=0.7, svm_type='L2', c=1.0):
        self.C = C  # 정규화 상수
        self.pop_size = pop_size  # 개체 수
        self.max_iter = max_iter  # 세대 수
        self.mutation_rate = mutation_rate  # 돌연변이 확률
        self.crossover_rate = crossover_rate  # 교차 확률
        self.svm_type = svm_type  # 'L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure'
        self.w_best = None
        self.b_best = None
        self.c = c

    def compute_slack_term(self, xi):
        if self.svm_type == 'L1':
            return np.abs(xi)
        elif self.svm_type == 'L2':
            return xi ** 2
        elif self.svm_type == 'Fair':
            return np.sum(self.c**2 * ((xi / self.c) - np.log(1 + xi / self.c)))
        elif self.svm_type == 'Cauchy':
            return np.sum((self.c**2 / 2) * np.log(1 + (xi / self.c)**2))
        elif self.svm_type == 'Welsch':
            return np.sum((self.c**2 / 2) * (1 - np.exp(-(xi / self.c)**2)))
        elif self.svm_type == 'Geman-McClure':
            return np.sum(((xi**2)/2) / (1 + xi**2))
        else:
            raise ValueError("Unknown SVM type. Choose from ['L1', 'L2', 'Cauchy', 'Fair', 'Welsch', 'Geman-McClure'].")

    def fitness(self, w, b, X, y):
        margins = y * (np.dot(X, w) + b)
        slack = np.maximum(0, 1 - margins)
        slack_term = self.compute_slack_term(slack)
        regularization = 0.5 * np.dot(w, w)
        return regularization + self.C * np.sum(slack_term)

    def initialize_population(self, n_features):
        population = [(np.random.randn(n_features), np.random.randn()) for _ in range(self.pop_size)]
        return population

    def selection(self, population, fitness_values):
        probabilities = 1 / (fitness_values + 1e-6)
        probabilities /= probabilities.sum()
        selected_indices = np.random.choice(len(population), size=len(population), p=probabilities)
        return [population[i] for i in selected_indices]

    def crossover(self, parent1, parent2):
        w1, b1 = parent1
        w2, b2 = parent2
        if np.random.rand() < self.crossover_rate:
            point = np.random.randint(len(w1))
            new_w = np.concatenate((w1[:point], w2[point:]))
            new_b = (b1 + b2) / 2
        else:
            new_w, new_b = w1.copy(), b1
        return new_w, new_b

    def mutation(self, w, b):
        if np.random.rand() < self.mutation_rate:
            mutation_vector = np.random.randn(*w.shape) * 0.1
            w += mutation_vector
            b += np.random.randn() * 0.1
        return w, b

    def fit(self, X, y):
        n_samples, n_features = X.shape
        population = self.initialize_population(n_features)
        fitness_values = np.array([self.fitness(w, b, X, y) for w, b in population])
        np.random.seed(42)  # 시드 설정
        for _ in range(self.max_iter):
            selected_population = self.selection(population, fitness_values)
            new_population = []
            for i in range(0, len(selected_population), 2):
                p1, p2 = selected_population[i], selected_population[(i + 1) % len(selected_population)]
                offspring1 = self.crossover(p1, p2)
                offspring2 = self.crossover(p2, p1)
                new_population.append(self.mutation(*offspring1))
                new_population.append(self.mutation(*offspring2))
            population = new_population
            fitness_values = np.array([self.fitness(w, b, X, y) for w, b in population])
        best_idx = np.argmin(fitness_values)
        self.w_best, self.b_best = population[best_idx]

    def predict(self, X):
        return np.sign(np.dot(X, self.w_best) + self.b_best)

In [61]:
def train_ovo(X, y, C, svm_type):
    classes = np.unique(y)
    models = []
    class_pairs = list(combinations(classes, 2))
    for (cl1, cl2) in class_pairs:
        mask = (y == cl1) | (y == cl2)
        X_bin = X[mask]
        y_bin = y[mask]
        y_bin = (y_bin == cl1).astype(int) * 2 - 1
        model = GA_SVM(C=C, svm_type=svm_type)
        model.fit(X_bin, y_bin)
        w = model.w_best
        b = model.b_best
        models.append(((cl1, cl2), (w, b)))
    return models

def predict_ovo(X, models):
    votes = np.zeros((X.shape[0], len(models)))
    classes = np.unique(np.hstack([pair for pair, _ in models]))
    for i, ((cl1, cl2), (w, b)) in enumerate(models):
        preds = np.sign(X.dot(w) + b)
        votes[:, i] = np.where(preds == 1, cl1, cl2)
    final_predictions = []
    for j in range(votes.shape[0]):
        valid_votes = votes[j][votes[j] >= 0]
        if len(valid_votes) > 0:
            bincount = np.bincount(valid_votes.astype(int))
            final_predictions.append(bincount.argmax())
        else:
            final_predictions.append(np.random.choice(classes))
    return np.array(final_predictions)

###SMO (Sequential Minial Optimization)

In [119]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pandas as pd
from itertools import combinations

class SMO_SVM:
    def __init__(self, C=1.0, kernel='linear', tol=1e-3, max_iter=1000, svm_type='L2', c=1.0, lr=0.01):
        self.C = C
        self.kernel = kernel
        self.tol = tol
        self.max_iter = max_iter
        self.svm_type = svm_type
        self.c = c
        self.lr = lr
        self.w_best = None  # 최적 가중치 벡터 추가
        self.b_best = None  # 최적 편향 값 추가

    def kernel_function(self, X, Y):
        if self.kernel == 'linear':
            return np.dot(X, Y.T)
        else:
            raise ValueError("Only linear kernel is supported in this implementation.")

    def compute_slack_term(self, xi):
        xi = np.maximum(xi, 1e-6)

        if self.svm_type == 'Fair':
            return np.sum(self.c**2 * ((xi / self.c) - np.log(1 + xi / self.c)))
        elif self.svm_type == 'Cauchy':
            return np.sum((self.c**2 / 2) * np.log(1 + (xi / self.c)**2))
        elif self.svm_type == 'Welsch':
            return np.sum((self.c**2 / 2) * (1 - np.exp(-(xi / self.c)**2)))
        elif self.svm_type == 'Geman-McClure':
            return np.sum(((xi**2)/2) / (1 + xi**2))
        elif self.svm_type == 'L1':
            return np.abs(xi)
        elif self.svm_type == 'L2':
            return xi ** 2
        else:
            raise ValueError("Unknown SVM type. Choose from ['L1', 'L2', 'Cauchy', 'Fair', 'Welsch', 'Geman-McClure'].")

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.alpha = np.zeros(n_samples)
        self.b = 0
        self.w = np.zeros(n_features)

        for _ in range(self.max_iter):
            for i in range(n_samples):
                xi, yi = X[i], y[i]
                margin = yi * (np.dot(self.w, xi) + self.b)
                slack = max(0, 1 - margin)
                slack_term = self.compute_slack_term(slack)

                if slack > 0:
                    delta_alpha = self.C * (1 - margin) - slack_term
                    self.alpha[i] = np.clip(self.alpha[i] + self.lr * delta_alpha, 0, self.C)
                    self.w += self.lr * self.alpha[i] * yi * xi
                    self.b += self.lr * self.alpha[i] * yi

        self.w_best = self.w  # 최적의 가중치 저장
        self.b_best = self.b  # 최적의 편향 값 저장

    def predict(self, X):
        return np.sign(np.dot(X, self.w_best) + self.b_best)  # b 대신 b_best 사용


In [120]:
def train_ovo(X, y, C, svm_type):
    classes = np.unique(y)
    models = []
    class_pairs = list(combinations(classes, 2))
    for (cl1, cl2) in class_pairs:
        mask = (y == cl1) | (y == cl2)
        X_bin = X[mask]
        y_bin = y[mask]
        y_bin = (y_bin == cl1).astype(int) * 2 - 1
        # Convert X_bin and y_bin to NumPy arrays if they are pandas Series
        X_bin = X_bin.to_numpy() if isinstance(X_bin, pd.Series) else X_bin
        y_bin = y_bin.to_numpy() if isinstance(y_bin, pd.Series) else y_bin
        model = SMO_SVM(C=C, svm_type=svm_type)
        model.fit(X_bin, y_bin)
        w = model.w
        b = model.b
        models.append(((cl1, cl2), (w, b)))
    return models

def predict_ovo(X, models):
    votes = np.zeros((X.shape[0], len(models)))
    classes = np.unique(np.hstack([pair for pair, _ in models]))
    for i, ((cl1, cl2), (w, b)) in enumerate(models):
        preds = np.sign(X.dot(w) + b)
        votes[:, i] = np.where(preds == 1, cl1, cl2)
    final_predictions = []
    for j in range(votes.shape[0]):
        valid_votes = votes[j][votes[j] >= 0]
        if len(valid_votes) > 0:
            bincount = np.bincount(valid_votes.astype(int))
            final_predictions.append(bincount.argmax())
        else:
            final_predictions.append(np.random.choice(classes))
    return np.array(final_predictions)


###PSO (Particle Swarm Optimization)

In [177]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from itertools import combinations
from ucimlrepo import fetch_ucirepo

class PSO_SVM:
    def __init__(self, C=1.0, num_particles=30, max_iter=100, svm_type='L2', c=1.0):
        self.C = C  # 정규화 상수
        self.num_particles = num_particles  # PSO 입자 개수
        self.max_iter = max_iter  # PSO 반복 횟수
        self.svm_type = svm_type  # L1 또는 L2
        self.w_best = None
        self.b_best = None
        self.c = c

    def compute_slack_term(self, xi):
        """ SVM 타입별 슬랙 변수 비용 계산 """
        if self.svm_type == 'L1':
            return np.abs(xi)
        elif self.svm_type == 'L2':
            return xi ** 2
        elif self.svm_type == 'Fair':
            return np.sum(self.c**2 * ((xi / self.c) - np.log(1 + xi / self.c)))
        elif self.svm_type == 'Cauchy':
            return np.sum((self.c**2 / 2) * np.log(1 + (xi / self.c)**2))
        elif self.svm_type == 'Welsch':
            return np.sum((self.c**2 / 2) * (1 - np.exp(-(xi / self.c)**2)))
        elif self.svm_type == 'Geman-McClure':
            return np.sum(((xi**2)/2) / (1 + xi**2))
        else:
            raise ValueError("Unknown SVM type. Choose from ['L1', 'L2', 'Cauchy', 'Fair', 'Welsch', 'Geman-McClure'].")

    def fitness(self, w, b, X, y):
        """ 손실 함수 계산 (Hinge Loss 기반) """
        margins = y * (np.dot(X, w) + b)
        slack = np.maximum(0, 1 - margins)  # Hinge Loss 적용
        slack_term = self.compute_slack_term(slack)
        regularization = 0.5 * np.dot(w, w)  # L2 Regularization
        return regularization + self.C * np.sum(slack_term)  # 최소화할 손실 값

    def fit(self, X, y):
        n_samples, n_features = X.shape

        # PSO 초기화
        w_particles = np.random.randn(self.num_particles, n_features)  # 초기 w 값
        b_particles = np.random.randn(self.num_particles)  # 초기 b 값
        velocities_w = np.random.randn(self.num_particles, n_features) * 0.1  # 속도 초기화
        velocities_b = np.random.randn(self.num_particles) * 0.1

        # 개별 최적 및 전역 최적 초기화
        p_best_w = np.copy(w_particles)
        p_best_b = np.copy(b_particles)
        p_best_scores = np.array([self.fitness(w, b, X, y) for w, b in zip(w_particles, b_particles)])

        g_best_index = np.argmin(p_best_scores)
        g_best_w = p_best_w[g_best_index]
        g_best_b = p_best_b[g_best_index]

        # PSO 학습 진행
        w_inertia = 0.7  # 관성 계수
        c1 = 1.5  # 개인 최적화 계수
        c2 = 1.5  # 글로벌 최적화 계수

        for _ in range(self.max_iter):
            for i in range(self.num_particles):
                r1, r2 = np.random.rand(), np.random.rand()
                velocities_w[i] = (w_inertia * velocities_w[i] +
                                   c1 * r1 * (p_best_w[i] - w_particles[i]) +
                                   c2 * r2 * (g_best_w - w_particles[i]))
                velocities_b[i] = (w_inertia * velocities_b[i] +
                                   c1 * r1 * (p_best_b[i] - b_particles[i]) +
                                   c2 * r2 * (g_best_b - b_particles[i]))

                # 업데이트된 위치
                w_particles[i] += velocities_w[i]
                b_particles[i] += velocities_b[i]

                # 새로운 피트니스 값 계산
                new_fitness = self.fitness(w_particles[i], b_particles[i], X, y)

                # 최적값 갱신
                if new_fitness < p_best_scores[i]:
                    p_best_w[i] = w_particles[i]
                    p_best_b[i] = b_particles[i]
                    p_best_scores[i] = new_fitness

            # 전체 최적 갱신
            g_best_index = np.argmin(p_best_scores)
            g_best_w = p_best_w[g_best_index]
            g_best_b = p_best_b[g_best_index]

        self.w_best = g_best_w
        self.b_best = g_best_b

    def predict(self, X):
        return np.sign(np.dot(X, self.w_best) + self.b_best)
def train_ovo(X, y, C, svm_type):
    classes = np.unique(y)
    models = []
    class_pairs = list(combinations(classes, 2))
    for (cl1, cl2) in class_pairs:
        mask = (y == cl1) | (y == cl2)
        X_bin = X[mask]
        y_bin = y[mask]
        y_bin = (y_bin == cl1).astype(int) * 2 - 1
        model = PSO_SVM(C=C, svm_type=svm_type)
        model.fit(X_bin, y_bin)
        w = model.w_best
        b = model.b_best
        models.append(((cl1, cl2), (w, b)))
    return models

def predict_ovo(X, models):
    votes = np.zeros((X.shape[0], len(models)))
    classes = np.unique(np.hstack([pair for pair, _ in models]))
    for i, ((cl1, cl2), (w, b)) in enumerate(models):
        preds = np.sign(X.dot(w) + b)
        votes[:, i] = np.where(preds == 1, cl1, cl2)
    final_predictions = []
    for j in range(votes.shape[0]):
        valid_votes = votes[j][votes[j] >= 0]
        if len(valid_votes) > 0:
            bincount = np.bincount(valid_votes.astype(int))
            final_predictions.append(bincount.argmax())
        else:
            final_predictions.append(np.random.choice(classes))
    return np.array(final_predictions)


In [178]:
def train_ovo(X, y, C, svm_type):
    classes = np.unique(y)
    models = []
    class_pairs = list(combinations(classes, 2))
    for (cl1, cl2) in class_pairs:
        mask = (y == cl1) | (y == cl2)
        X_bin = X[mask]
        y_bin = y[mask]
        y_bin = (y_bin == cl1).astype(int) * 2 - 1
        model = PSO_SVM(C=C, svm_type=svm_type)
        model.fit(X_bin, y_bin)
        w = model.w_best
        b = model.b_best
        models.append(((cl1, cl2), (w, b)))
    return models

def predict_ovo(X, models):
    votes = np.zeros((X.shape[0], len(models)))
    classes = np.unique(np.hstack([pair for pair, _ in models]))
    for i, ((cl1, cl2), (w, b)) in enumerate(models):
        preds = np.sign(X.dot(w) + b)
        votes[:, i] = np.where(preds == 1, cl1, cl2)
    final_predictions = []
    for j in range(votes.shape[0]):
        valid_votes = votes[j][votes[j] >= 0]
        if len(valid_votes) > 0:
            bincount = np.bincount(valid_votes.astype(int))
            final_predictions.append(bincount.argmax())
        else:
            final_predictions.append(np.random.choice(classes))
    return np.array(final_predictions)


###ACO (Ant Colony Optimization)

In [98]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from itertools import combinations
from ucimlrepo import fetch_ucirepo

class ACO_SVM:
    def __init__(self, C=1.0, num_ants=30, max_iter=100, decay=0.5, alpha=1, beta=2, svm_type='L2', c=1.0):
        self.C = C  # 정규화 상수
        self.num_ants = num_ants  # 개미 개수
        self.max_iter = max_iter  # 반복 횟수
        self.decay = decay  # 페로몬 증발 계수
        self.alpha = alpha  # 페로몬 영향도
        self.beta = beta  # 휴리스틱 정보 영향도
        self.svm_type = svm_type  # L1 또는 L2
        self.w_best = None
        self.b_best = None
        self.c = c

    def compute_slack_term(self, xi):
        """ SVM 타입별 슬랙 변수 비용 계산 """
        if self.svm_type == 'L1':
            return np.abs(xi)
        elif self.svm_type == 'L2':
            return xi ** 2
        elif self.svm_type == 'Fair':
            return np.sum(self.c**2 * ((xi / self.c) - np.log(1 + xi / self.c)))
        elif self.svm_type == 'Cauchy':
            return np.sum((self.c**2 / 2) * np.log(1 + (xi / self.c)**2))
        elif self.svm_type == 'Welsch':
            return np.sum((self.c**2 / 2) * (1 - np.exp(-(xi / self.c)**2)))
        elif self.svm_type == 'Geman-McClure':
            return np.sum(((xi**2)/2) / (1 + xi**2))
        else:
            raise ValueError("Unknown SVM type. Choose from ['L1', 'L2', 'Cauchy', 'Fair', 'Welsch', 'Geman-McClure'].")

    def fitness(self, w, b, X, y):
        """ 손실 함수 계산 (Hinge Loss 기반) """
        margins = y * (np.dot(X, w) + b)
        slack = np.maximum(0, 1 - margins)  # Hinge Loss 적용
        slack_term = self.compute_slack_term(slack)
        regularization = 0.5 * np.dot(w, w)  # L2 Regularization
        return regularization + self.C * np.sum(slack_term)  # 최소화할 손실 값

    def fit(self, X, y):
        n_samples, n_features = X.shape

        # 개미들의 초기 해 (랜덤 가중치 및 바이어스)
        w_ants = np.random.randn(self.num_ants, n_features)
        b_ants = np.random.randn(self.num_ants)
        pheromones = np.ones(self.num_ants)  # 초기 페로몬 값 동일

        best_fitness = float('inf')
        g_best_w, g_best_b = None, None

        for _ in range(self.max_iter):
            fitness_values = np.array([self.fitness(w, b, X, y) for w, b in zip(w_ants, b_ants)])

            # 가장 좋은 개미 선택
            best_index = np.argmin(fitness_values)
            if fitness_values[best_index] < best_fitness:
                best_fitness = fitness_values[best_index]
                g_best_w, g_best_b = w_ants[best_index], b_ants[best_index]

            # 페로몬 업데이트 (좋은 해에 페로몬 증가)
            pheromones = (1 - self.decay) * pheromones  # 증발 적용
            pheromones[best_index] += 1 / (1 + best_fitness)  # 좋은 해 강화

            # 개미들의 새로운 탐색 방향 선택
            probabilities = (pheromones ** self.alpha) * ((1 / (1 + fitness_values)) ** self.beta)
            probabilities /= np.sum(probabilities)

            selected_indices = np.random.choice(self.num_ants, size=self.num_ants, p=probabilities)
            w_ants = w_ants[selected_indices] + np.random.randn(self.num_ants, n_features) * 0.1
            b_ants = b_ants[selected_indices] + np.random.randn(self.num_ants) * 0.1

        self.w_best = g_best_w
        self.b_best = g_best_b

    def predict(self, X):
        return np.sign(np.dot(X, self.w_best) + self.b_best)


In [99]:
def train_ovo(X, y, C, svm_type):
    classes = np.unique(y)
    models = []
    class_pairs = list(combinations(classes, 2))
    for (cl1, cl2) in class_pairs:
        mask = (y == cl1) | (y == cl2)
        X_bin = X[mask]
        y_bin = y[mask]
        y_bin = (y_bin == cl1).astype(int) * 2 - 1
        model = ACO_SVM(C=C, svm_type=svm_type)
        model.fit(X_bin, y_bin)
        w = model.w_best
        b = model.b_best
        models.append(((cl1, cl2), (w, b)))
    return models

def predict_ovo(X, models):
    votes = np.zeros((X.shape[0], len(models)))
    classes = np.unique(np.hstack([pair for pair, _ in models]))
    for i, ((cl1, cl2), (w, b)) in enumerate(models):
        preds = np.sign(X.dot(w) + b)
        votes[:, i] = np.where(preds == 1, cl1, cl2)
    final_predictions = []
    for j in range(votes.shape[0]):
        valid_votes = votes[j][votes[j] >= 0]
        if len(valid_votes) > 0:
            bincount = np.bincount(valid_votes.astype(int))
            final_predictions.append(bincount.argmax())
        else:
            final_predictions.append(np.random.choice(classes))
    return np.array(final_predictions)


###HS (Harmony Search)

In [75]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from itertools import combinations
from ucimlrepo import fetch_ucirepo

class HS_SVM:
    def __init__(self, C=1.0, hm_size=20, max_iter=100, hmcr=0.9, par=0.3, bw=0.1, svm_type='L2', c=1.0):
        self.C = C  # 정규화 상수
        self.hm_size = hm_size  # 하모니 메모리 크기
        self.max_iter = max_iter  # 반복 횟수
        self.hmcr = hmcr  # 하모니 메모리 고려율 (기존 해를 선택할 확률)
        self.par = par  # 피치 조정 비율 (기존 해를 변형할 확률)
        self.bw = bw  # 변형 크기
        self.svm_type = svm_type  # 'L1' 또는 'L2'
        self.w_best = None
        self.b_best = None
        self.c = c

    def compute_slack_term(self, xi):
        """ SVM 타입별 슬랙 변수 비용 계산 """
        if self.svm_type == 'L1':
            return np.abs(xi)
        elif self.svm_type == 'L2':
            return xi ** 2
        elif self.svm_type == 'Fair':
            return np.sum(self.c**2 * ((xi / self.c) - np.log(1 + xi / self.c)))
        elif self.svm_type == 'Cauchy':
            return np.sum((self.c**2 / 2) * np.log(1 + (xi / self.c)**2))
        elif self.svm_type == 'Welsch':
            return np.sum((self.c**2 / 2) * (1 - np.exp(-(xi / self.c)**2)))
        elif self.svm_type == 'Geman-McClure':
            return np.sum(((xi**2)/2) / (1 + xi**2))
        else:
            raise ValueError("Unknown SVM type. Choose from ['L1', 'L2', 'Cauchy', 'Fair', 'Welsch', 'Geman-McClure'].")

    def fitness(self, w, b, X, y):
        """ 손실 함수 계산 (Hinge Loss 기반) """
        margins = y * (np.dot(X, w) + b)
        slack = np.maximum(0, 1 - margins)  # Hinge Loss 적용
        slack_term = self.compute_slack_term(slack)

        regularization = 0.5 * np.dot(w, w)  # L2 Regularization
        return regularization + self.C * np.sum(slack_term)  # 최소화할 손실 값

    def fit(self, X, y):
        n_samples, n_features = X.shape

        # 초기 하모니 메모리 (랜덤 가중치 및 바이어스)
        harmony_memory = [(np.random.randn(n_features), np.random.randn()) for _ in range(self.hm_size)]
        fitness_values = np.array([self.fitness(w, b, X, y) for w, b in harmony_memory])

        for _ in range(self.max_iter):
            # 새로운 해 생성
            new_w, new_b = np.zeros(n_features), 0

            for j in range(n_features):
                if np.random.rand() < self.hmcr:  # 기존 해에서 선택할 확률
                    new_w[j] = harmony_memory[np.random.randint(self.hm_size)][0][j]
                    if np.random.rand() < self.par:  # 피치 조정
                        new_w[j] += np.random.uniform(-self.bw, self.bw)
                else:  # 랜덤 탐색
                    new_w[j] = np.random.randn()

            if np.random.rand() < self.hmcr:
                new_b = harmony_memory[np.random.randint(self.hm_size)][1]
                if np.random.rand() < self.par:
                    new_b += np.random.uniform(-self.bw, self.bw)
            else:
                new_b = np.random.randn()

            # 새로운 해의 적합도 평가
            new_fitness = self.fitness(new_w, new_b, X, y)

            # 기존 해 중 최악의 해와 비교 후 교체
            worst_idx = np.argmax(fitness_values)
            if new_fitness < fitness_values[worst_idx]:
                harmony_memory[worst_idx] = (new_w, new_b)
                fitness_values[worst_idx] = new_fitness

        # 최적 해 선택
        best_idx = np.argmin(fitness_values)
        self.w_best, self.b_best = harmony_memory[best_idx]

    def predict(self, X):
        return np.sign(np.dot(X, self.w_best) + self.b_best)


In [76]:
def train_ovo(X, y, C, svm_type):
    classes = np.unique(y)
    models = []
    class_pairs = list(combinations(classes, 2))
    for (cl1, cl2) in class_pairs:
        mask = (y == cl1) | (y == cl2)
        X_bin = X[mask]
        y_bin = y[mask]
        y_bin = (y_bin == cl1).astype(int) * 2 - 1
        model = HS_SVM(C=C, hm_size=20, max_iter=100, hmcr=0.9, par=0.3, bw=0.1, svm_type=svm_type)
        model.fit(X_bin, y_bin)
        w = model.w_best
        b = model.b_best
        models.append(((cl1, cl2), (w, b)))
    return models

def predict_ovo(X, models):
    votes = np.zeros((X.shape[0], len(models)))
    classes = np.unique(np.hstack([pair for pair, _ in models]))
    for i, ((cl1, cl2), (w, b)) in enumerate(models):
        preds = np.sign(X.dot(w) + b)
        votes[:, i] = np.where(preds == 1, cl1, cl2)
    final_predictions = []
    for j in range(votes.shape[0]):
        valid_votes = votes[j][votes[j] >= 0]
        if len(valid_votes) > 0:
            bincount = np.bincount(valid_votes.astype(int))
            final_predictions.append(bincount.argmax())
        else:
            final_predictions.append(np.random.choice(classes))
    return np.array(final_predictions)


#**Optimization & OvR**

###L-BFGS-B (Limited-memory Broyden-Fletcher-Goldfarb-Shanno with Box constraints)

In [128]:
import numpy as np
from scipy.optimize import minimize
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from itertools import combinations
from ucimlrepo import fetch_ucirepo

np.random.seed(42)  # 시드 설정

class LBFGS_SVM:
    def __init__(self, C=0.01, svm_type='L2', c=1.0):
        self.C = C  # 정규화 상수
        self.svm_type = svm_type  # L1 또는 L2
        self.w_best = None
        self.b_best = None
        self.c = c

    def compute_slack_term(self, xi):
        """ SVM 타입별 슬랙 변수 비용 계산 """
        if self.svm_type == 'L1':
            return np.abs(xi)
        elif self.svm_type == 'L2':
            return xi ** 2
        elif self.svm_type == 'Fair':
            return np.sum(self.c**2 * ((xi / self.c) - np.log(1 + xi / self.c)))
        elif self.svm_type == 'Cauchy':
            return np.sum((self.c**2 / 2) * np.log(1 + (xi / self.c)**2))
        elif self.svm_type == 'Welsch':
            return np.sum((self.c**2 / 2) * (1 - np.exp(-(xi / self.c)**2)))
        elif self.svm_type == 'Geman-McClure':
            return np.sum(((xi**2)/2) / (1 + xi**2))
        else:
            raise ValueError("Unknown SVM type. Choose from ['L1', 'L2', 'Cauchy', 'Fair', 'Welsch', 'Geman-McClure'].")

    def fitness(self, params, X, y):
        """ 손실 함수 계산 (Hinge Loss 기반) """
        n_features = X.shape[1]
        w = params[:n_features]
        b = params[n_features]
        margins = y * (np.dot(X, w) + b)
        slack = np.maximum(0, 1 - margins)  # Hinge Loss 적용
        slack_term = self.compute_slack_term(slack)
        regularization = 0.5 * np.dot(w, w)  # L2 Regularization
        return regularization + self.C * np.sum(slack_term)  # 최소화할 손실 값

    def fit(self, X, y):
        n_samples, n_features = X.shape
        initial_params = np.zeros(n_features + 1)
        bounds = [(None, None)] * n_features + [(None, None)]  # 경계 조건 추가
        options = {'disp': True, 'maxiter': 5000, 'ftol': 1e-9}  # 최적화 옵션 추가: maxiter 증가, ftol 추가
        result = minimize(self.fitness, initial_params, args=(X, y), method='L-BFGS-B', bounds=bounds, options=options)
        # If optimization fails, try SLSQP solver
        if not result.success:
            result = minimize(self.fitness, initial_params, args=(X, y), method='SLSQP', bounds=bounds, options=options)

        if result.success:
            self.w_best = result.x[:n_features]
            self.b_best = result.x[n_features]
        else:
            raise ValueError("Optimization failed: " + result.message)

    def predict(self, X):
        return np.sign(np.dot(X, self.w_best) + self.b_best)

In [129]:
# OvR training function using LBFGS_SVM
def train_ovr(X, y, C, svm_type):
    classes = np.unique(y)  # 모든 클래스 목록을 가져옴
    models = []  # 학습된 모델을 저장할 리스트

    for cl in classes:
        y_bin = (y == cl).astype(int) * 2 - 1  # 현재 클래스는 +1, 나머지는 -1로 변환
        model = LBFGS_SVM(C=C, svm_type=svm_type)  # SVM 모델 생성
        model.fit(X, y_bin)  # 모델 학습

        w = model.w_best  # 최적화된 가중치 벡터
        b = model.b_best  # 최적화된 편향

        models.append((w, b))  # 학습된 모델을 리스트에 저장

    return models  # 모든 클래스별 학습된 모델 반환

def predict_ovr(X, models):
    predictions = [X.dot(w) + b for w, b in models]  # 각 클래스별 마진 계산
    return np.argmax(predictions, axis=0)  # 가장 높은 마진 값을 가진 클래스를 예측


###1. Genetic Algorithm (GA)

In [None]:
class GA_SVM:
    def __init__(self, C=1.0, pop_size=20, max_iter=100, mutation_rate=0.1, crossover_rate=0.7, svm_type='L2', c=1.0):
        self.C = C  # 정규화 상수
        self.pop_size = pop_size  # 개체 수
        self.max_iter = max_iter  # 세대 수
        self.mutation_rate = mutation_rate  # 돌연변이 확률
        self.crossover_rate = crossover_rate  # 교차 확률
        self.svm_type = svm_type  # 'L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure'
        self.w_best = None
        self.b_best = None
        self.c = c

    def compute_slack_term(self, xi):
        if self.svm_type == 'L1':
            return np.abs(xi)
        elif self.svm_type == 'L2':
            return xi ** 2
        elif self.svm_type == 'Fair':
            return np.sum(self.c**2 * ((xi / self.c) - np.log(1 + xi / self.c)))
        elif self.svm_type == 'Cauchy':
            return np.sum((self.c**2 / 2) * np.log(1 + (xi / self.c)**2))
        elif self.svm_type == 'Welsch':
            return np.sum((self.c**2 / 2) * (1 - np.exp(-(xi / self.c)**2)))
        elif self.svm_type == 'Geman-McClure':
            return np.sum(((xi**2)/2) / (1 + xi**2))
        else:
            raise ValueError("Unknown SVM type. Choose from ['L1', 'L2', 'Cauchy', 'Fair', 'Welsch', 'Geman-McClure'].")

    def fitness(self, w, b, X, y):
        margins = y * (np.dot(X, w) + b)
        slack = np.maximum(0, 1 - margins)
        slack_term = self.compute_slack_term(slack)
        regularization = 0.5 * np.dot(w, w)
        return regularization + self.C * np.sum(slack_term)

    def initialize_population(self, n_features):
        population = [(np.random.randn(n_features), np.random.randn()) for _ in range(self.pop_size)]
        return population

    def selection(self, population, fitness_values):
        probabilities = 1 / (fitness_values + 1e-6)
        probabilities /= probabilities.sum()
        selected_indices = np.random.choice(len(population), size=len(population), p=probabilities)
        return [population[i] for i in selected_indices]

    def crossover(self, parent1, parent2):
        w1, b1 = parent1
        w2, b2 = parent2
        if np.random.rand() < self.crossover_rate:
            point = np.random.randint(len(w1))
            new_w = np.concatenate((w1[:point], w2[point:]))
            new_b = (b1 + b2) / 2
        else:
            new_w, new_b = w1.copy(), b1
        return new_w, new_b

    def mutation(self, w, b):
        if np.random.rand() < self.mutation_rate:
            mutation_vector = np.random.randn(*w.shape) * 0.1
            w += mutation_vector
            b += np.random.randn() * 0.1
        return w, b

    def fit(self, X, y):
        n_samples, n_features = X.shape
        population = self.initialize_population(n_features)
        fitness_values = np.array([self.fitness(w, b, X, y) for w, b in population])
        np.random.seed(42)  # 시드 설정
        for _ in range(self.max_iter):
            selected_population = self.selection(population, fitness_values)
            new_population = []
            for i in range(0, len(selected_population), 2):
                p1, p2 = selected_population[i], selected_population[(i + 1) % len(selected_population)]
                offspring1 = self.crossover(p1, p2)
                offspring2 = self.crossover(p2, p1)
                new_population.append(self.mutation(*offspring1))
                new_population.append(self.mutation(*offspring2))
            population = new_population
            fitness_values = np.array([self.fitness(w, b, X, y) for w, b in population])
        best_idx = np.argmin(fitness_values)
        self.w_best, self.b_best = population[best_idx]

    def predict(self, X):
        return np.sign(np.dot(X, self.w_best) + self.b_best)

In [None]:
# OvR training function
def train_ovr(X, y, C, svm_type):
    classes = np.unique(y)  # 모든 클래스 목록을 가져옴
    models = []  # 학습된 모델을 저장할 리스트

    for cl in classes:
        y_bin = (y == cl).astype(int) * 2 - 1  # 현재 클래스는 +1, 나머지는 -1로 변환
        model = GA_SVM(C=C, svm_type=svm_type)  # SVM 모델 생성
        model.fit(X, y_bin)  # 모델 학습

        w = model.w_best  # 최적화된 가중치 벡터
        b = model.b_best  # 최적화된 편향

        models.append((w, b))  # 학습된 모델을 리스트에 저장

    return models  # 모든 클래스별 학습된 모델 반환

def predict_ovr(X, models):
    predictions = [X.dot(w) + b for w, b in models]  # 각 클래스별 마진 계산
    return np.argmax(predictions, axis=0)  # 가장 높은 마진 값을 가진 클래스를 예측


###SMO (Sequential Minial Optimization)

In [3]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pandas as pd
from itertools import combinations

class SMO_SVM:
    def __init__(self, C=1.0, kernel='linear', tol=1e-3, max_iter=1000, svm_type='L2', c=1.0, lr=0.01):
        self.C = C
        self.kernel = kernel
        self.tol = tol
        self.max_iter = max_iter
        self.svm_type = svm_type
        self.c = c
        self.lr = lr
        self.w_best = None  # 최적 가중치 벡터 추가
        self.b_best = None  # 최적 편향 값 추가

    def kernel_function(self, X, Y):
        if self.kernel == 'linear':
            return np.dot(X, Y.T)
        else:
            raise ValueError("Only linear kernel is supported in this implementation.")

    def compute_slack_term(self, xi):
        xi = np.maximum(xi, 1e-6)

        if self.svm_type == 'Fair':
            return np.sum(self.c**2 * ((xi / self.c) - np.log(1 + xi / self.c)))
        elif self.svm_type == 'Cauchy':
            return np.sum((self.c**2 / 2) * np.log(1 + (xi / self.c)**2))
        elif self.svm_type == 'Welsch':
            return np.sum((self.c**2 / 2) * (1 - np.exp(-(xi / self.c)**2)))
        elif self.svm_type == 'Geman-McClure':
            return np.sum(((xi**2)/2) / (1 + xi**2))
        elif self.svm_type == 'L1':
            return np.abs(xi)
        elif self.svm_type == 'L2':
            return xi ** 2
        else:
            raise ValueError("Unknown SVM type. Choose from ['L1', 'L2', 'Cauchy', 'Fair', 'Welsch', 'Geman-McClure'].")

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.alpha = np.zeros(n_samples)
        self.b = 0
        self.w = np.zeros(n_features)

        for _ in range(self.max_iter):
            for i in range(n_samples):
                xi, yi = X[i], y[i]
                margin = yi * (np.dot(self.w, xi) + self.b)
                slack = max(0, 1 - margin)
                slack_term = self.compute_slack_term(slack)

                if slack > 0:
                    delta_alpha = self.C * (1 - margin) - slack_term
                    self.alpha[i] = np.clip(self.alpha[i] + self.lr * delta_alpha, 0, self.C)
                    self.w += self.lr * self.alpha[i] * yi * xi
                    self.b += self.lr * self.alpha[i] * yi

        self.w_best = self.w  # 최적의 가중치 저장
        self.b_best = self.b  # 최적의 편향 값 저장

    def predict(self, X):
        return np.sign(np.dot(X, self.w_best) + self.b_best)  # b 대신 b_best 사용


In [4]:
# OvR training function using LBFGS_SVM
def train_ovr(X, y, C, svm_type):
    classes = np.unique(y)  # 모든 클래스 목록을 가져옴
    models = []  # 학습된 모델을 저장할 리스트

    for cl in classes:
        y_bin = (y == cl).astype(int) * 2 - 1  # 현재 클래스는 +1, 나머지는 -1로 변환
        model = SMO_SVM(C=C, svm_type=svm_type)  # SVM 모델 생성
        model.fit(X, y_bin)  # 모델 학습

        w = model.w_best  # 최적화된 가중치 벡터
        b = model.b_best  # 최적화된 편향

        models.append((w, b))  # 학습된 모델을 리스트에 저장

    return models  # 모든 클래스별 학습된 모델 반환

def predict_ovr(X, models):
    predictions = [X.dot(w) + b for w, b in models]  # 각 클래스별 마진 계산
    return np.argmax(predictions, axis=0)  # 가장 높은 마진 값을 가진 클래스를 예측


###PSO (Particle Swarm Optimization)

In [136]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from itertools import combinations
from ucimlrepo import fetch_ucirepo

class PSO_SVM:
    def __init__(self, C=1.0, num_particles=30, max_iter=100, svm_type='L2', c=1.0):
        self.C = C  # 정규화 상수
        self.num_particles = num_particles  # PSO 입자 개수
        self.max_iter = max_iter  # PSO 반복 횟수
        self.svm_type = svm_type  # L1 또는 L2
        self.w_best = None
        self.b_best = None
        self.c = c

    def compute_slack_term(self, xi):
        """ SVM 타입별 슬랙 변수 비용 계산 """
        if self.svm_type == 'L1':
            return np.abs(xi)
        elif self.svm_type == 'L2':
            return xi ** 2
        elif self.svm_type == 'Fair':
            return np.sum(self.c**2 * ((xi / self.c) - np.log(1 + xi / self.c)))
        elif self.svm_type == 'Cauchy':
            return np.sum((self.c**2 / 2) * np.log(1 + (xi / self.c)**2))
        elif self.svm_type == 'Welsch':
            return np.sum((self.c**2 / 2) * (1 - np.exp(-(xi / self.c)**2)))
        elif self.svm_type == 'Geman-McClure':
            return np.sum(((xi**2)/2) / (1 + xi**2))
        else:
            raise ValueError("Unknown SVM type. Choose from ['L1', 'L2', 'Cauchy', 'Fair', 'Welsch', 'Geman-McClure'].")

    def fitness(self, w, b, X, y):
        """ 손실 함수 계산 (Hinge Loss 기반) """
        margins = y * (np.dot(X, w) + b)
        slack = np.maximum(0, 1 - margins)  # Hinge Loss 적용
        slack_term = self.compute_slack_term(slack)
        regularization = 0.5 * np.dot(w, w)  # L2 Regularization
        return regularization + self.C * np.sum(slack_term)  # 최소화할 손실 값

    def fit(self, X, y):
        n_samples, n_features = X.shape

        # PSO 초기화
        w_particles = np.random.randn(self.num_particles, n_features)  # 초기 w 값
        b_particles = np.random.randn(self.num_particles)  # 초기 b 값
        velocities_w = np.random.randn(self.num_particles, n_features) * 0.1  # 속도 초기화
        velocities_b = np.random.randn(self.num_particles) * 0.1

        # 개별 최적 및 전역 최적 초기화
        p_best_w = np.copy(w_particles)
        p_best_b = np.copy(b_particles)
        p_best_scores = np.array([self.fitness(w, b, X, y) for w, b in zip(w_particles, b_particles)])

        g_best_index = np.argmin(p_best_scores)
        g_best_w = p_best_w[g_best_index]
        g_best_b = p_best_b[g_best_index]

        # PSO 학습 진행
        w_inertia = 0.7  # 관성 계수
        c1 = 1.5  # 개인 최적화 계수
        c2 = 1.5  # 글로벌 최적화 계수

        for _ in range(self.max_iter):
            for i in range(self.num_particles):
                r1, r2 = np.random.rand(), np.random.rand()
                velocities_w[i] = (w_inertia * velocities_w[i] +
                                   c1 * r1 * (p_best_w[i] - w_particles[i]) +
                                   c2 * r2 * (g_best_w - w_particles[i]))
                velocities_b[i] = (w_inertia * velocities_b[i] +
                                   c1 * r1 * (p_best_b[i] - b_particles[i]) +
                                   c2 * r2 * (g_best_b - b_particles[i]))

                # 업데이트된 위치
                w_particles[i] += velocities_w[i]
                b_particles[i] += velocities_b[i]

                # 새로운 피트니스 값 계산
                new_fitness = self.fitness(w_particles[i], b_particles[i], X, y)

                # 최적값 갱신
                if new_fitness < p_best_scores[i]:
                    p_best_w[i] = w_particles[i]
                    p_best_b[i] = b_particles[i]
                    p_best_scores[i] = new_fitness

            # 전체 최적 갱신
            g_best_index = np.argmin(p_best_scores)
            g_best_w = p_best_w[g_best_index]
            g_best_b = p_best_b[g_best_index]

        self.w_best = g_best_w
        self.b_best = g_best_b

    def predict(self, X):
        return np.sign(np.dot(X, self.w_best) + self.b_best)
def train_ovo(X, y, C, svm_type):
    classes = np.unique(y)
    models = []
    class_pairs = list(combinations(classes, 2))
    for (cl1, cl2) in class_pairs:
        mask = (y == cl1) | (y == cl2)
        X_bin = X[mask]
        y_bin = y[mask]
        y_bin = (y_bin == cl1).astype(int) * 2 - 1
        model = PSO_SVM(C=C, svm_type=svm_type)
        model.fit(X_bin, y_bin)
        w = model.w_best
        b = model.b_best
        models.append(((cl1, cl2), (w, b)))
    return models

def predict_ovo(X, models):
    votes = np.zeros((X.shape[0], len(models)))
    classes = np.unique(np.hstack([pair for pair, _ in models]))
    for i, ((cl1, cl2), (w, b)) in enumerate(models):
        preds = np.sign(X.dot(w) + b)
        votes[:, i] = np.where(preds == 1, cl1, cl2)
    final_predictions = []
    for j in range(votes.shape[0]):
        valid_votes = votes[j][votes[j] >= 0]
        if len(valid_votes) > 0:
            bincount = np.bincount(valid_votes.astype(int))
            final_predictions.append(bincount.argmax())
        else:
            final_predictions.append(np.random.choice(classes))
    return np.array(final_predictions)


In [137]:
# OvR training function
def train_ovr(X, y, C, svm_type):
    classes = np.unique(y)  # 모든 클래스 목록을 가져옴
    models = []  # 학습된 모델을 저장할 리스트

    for cl in classes:
        y_bin = (y == cl).astype(int) * 2 - 1  # 현재 클래스는 +1, 나머지는 -1로 변환
        model = PSO_SVM(C=C, svm_type=svm_type)  # SVM 모델 생성
        model.fit(X, y_bin)  # 모델 학습

        w = model.w_best  # 최적화된 가중치 벡터
        b = model.b_best  # 최적화된 편향

        models.append((w, b))  # 학습된 모델을 리스트에 저장

    return models  # 모든 클래스별 학습된 모델 반환

def predict_ovr(X, models):
    predictions = [X.dot(w) + b for w, b in models]  # 각 클래스별 마진 계산
    return np.argmax(predictions, axis=0)  # 가장 높은 마진 값을 가진 클래스를 예측


###ACO (Ant Colony Optimization)

In [158]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from itertools import combinations
from ucimlrepo import fetch_ucirepo

np.random.seed(42)

class ACO_SVM:
    def __init__(self, C=1.0, num_ants=30, max_iter=100, decay=0.5, alpha=1, beta=2, svm_type='L2', c=1.0):
        self.C = C  # 정규화 상수
        self.num_ants = num_ants  # 개미 개수
        self.max_iter = max_iter  # 반복 횟수
        self.decay = decay  # 페로몬 증발 계수
        self.alpha = alpha  # 페로몬 영향도
        self.beta = beta  # 휴리스틱 정보 영향도
        self.svm_type = svm_type  # L1 또는 L2
        self.w_best = None
        self.b_best = None
        self.c = c

    def compute_slack_term(self, xi):
        """ SVM 타입별 슬랙 변수 비용 계산 """
        if self.svm_type == 'L1':
            return np.abs(xi)
        elif self.svm_type == 'L2':
            return xi ** 2
        elif self.svm_type == 'Fair':
            return np.sum(self.c**2 * ((xi / self.c) - np.log(1 + xi / self.c)))
        elif self.svm_type == 'Cauchy':
            return np.sum((self.c**2 / 2) * np.log(1 + (xi / self.c)**2))
        elif self.svm_type == 'Welsch':
            return np.sum((self.c**2 / 2) * (1 - np.exp(-(xi / self.c)**2)))
        elif self.svm_type == 'Geman-McClure':
            return np.sum(((xi**2)/2) / (1 + xi**2))
        else:
            raise ValueError("Unknown SVM type. Choose from ['L1', 'L2', 'Cauchy', 'Fair', 'Welsch', 'Geman-McClure'].")

    def fitness(self, w, b, X, y):
        """ 손실 함수 계산 (Hinge Loss 기반) """
        margins = y * (np.dot(X, w) + b)
        slack = np.maximum(0, 1 - margins)  # Hinge Loss 적용
        slack_term = self.compute_slack_term(slack)
        regularization = 0.5 * np.dot(w, w)  # L2 Regularization
        return regularization + self.C * np.sum(slack_term)  # 최소화할 손실 값

    def fit(self, X, y):
        n_samples, n_features = X.shape

        # 개미들의 초기 해 (랜덤 가중치 및 바이어스)
        w_ants = np.random.randn(self.num_ants, n_features)
        b_ants = np.random.randn(self.num_ants)
        pheromones = np.ones(self.num_ants)  # 초기 페로몬 값 동일

        best_fitness = float('inf')
        g_best_w, g_best_b = None, None

        for _ in range(self.max_iter):
            fitness_values = np.array([self.fitness(w, b, X, y) for w, b in zip(w_ants, b_ants)])

            # 가장 좋은 개미 선택
            best_index = np.argmin(fitness_values)
            if fitness_values[best_index] < best_fitness:
                best_fitness = fitness_values[best_index]
                g_best_w, g_best_b = w_ants[best_index], b_ants[best_index]

            # 페로몬 업데이트 (좋은 해에 페로몬 증가)
            pheromones = (1 - self.decay) * pheromones  # 증발 적용
            pheromones[best_index] += 1 / (1 + best_fitness)  # 좋은 해 강화

            # 개미들의 새로운 탐색 방향 선택
            probabilities = (pheromones ** self.alpha) * ((1 / (1 + fitness_values)) ** self.beta)
            probabilities /= np.sum(probabilities)

            selected_indices = np.random.choice(self.num_ants, size=self.num_ants, p=probabilities)
            w_ants = w_ants[selected_indices] + np.random.randn(self.num_ants, n_features) * 0.1
            b_ants = b_ants[selected_indices] + np.random.randn(self.num_ants) * 0.1

        self.w_best = g_best_w
        self.b_best = g_best_b

    def predict(self, X):
        return np.sign(np.dot(X, self.w_best) + self.b_best)


In [159]:
# OvR training function
def train_ovr(X, y, C, svm_type):
    classes = np.unique(y)  # 모든 클래스 목록을 가져옴
    models = []  # 학습된 모델을 저장할 리스트

    for cl in classes:
        y_bin = (y == cl).astype(int) * 2 - 1  # 현재 클래스는 +1, 나머지는 -1로 변환
        model = ACO_SVM(C=C, svm_type=svm_type)  # SVM 모델 생성
        model.fit(X, y_bin)  # 모델 학습

        w = model.w_best  # 최적화된 가중치 벡터
        b = model.b_best  # 최적화된 편향

        models.append((w, b))  # 학습된 모델을 리스트에 저장

    return models  # 모든 클래스별 학습된 모델 반환

def predict_ovr(X, models):
    predictions = [X.dot(w) + b for w, b in models]  # 각 클래스별 마진 계산
    return np.argmax(predictions, axis=0)  # 가장 높은 마진 값을 가진 클래스를 예측


###HS (Harmony Search)

In [191]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from itertools import combinations
from ucimlrepo import fetch_ucirepo

class HS_SVM:
    def __init__(self, C=1.0, hm_size=20, max_iter=100, hmcr=0.9, par=0.3, bw=0.1, svm_type='L2', c=1.0):
        self.C = C  # 정규화 상수
        self.hm_size = hm_size  # 하모니 메모리 크기
        self.max_iter = max_iter  # 반복 횟수
        self.hmcr = hmcr  # 하모니 메모리 고려율 (기존 해를 선택할 확률)
        self.par = par  # 피치 조정 비율 (기존 해를 변형할 확률)
        self.bw = bw  # 변형 크기
        self.svm_type = svm_type  # 'L1' 또는 'L2'
        self.w_best = None
        self.b_best = None
        self.c = c

    def compute_slack_term(self, xi):
        """ SVM 타입별 슬랙 변수 비용 계산 """
        if self.svm_type == 'L1':
            return np.abs(xi)
        elif self.svm_type == 'L2':
            return xi ** 2
        elif self.svm_type == 'Fair':
            return np.sum(self.c**2 * ((xi / self.c) - np.log(1 + xi / self.c)))
        elif self.svm_type == 'Cauchy':
            return np.sum((self.c**2 / 2) * np.log(1 + (xi / self.c)**2))
        elif self.svm_type == 'Welsch':
            return np.sum((self.c**2 / 2) * (1 - np.exp(-(xi / self.c)**2)))
        elif self.svm_type == 'Geman-McClure':
            return np.sum(((xi**2)/2) / (1 + xi**2))
        else:
            raise ValueError("Unknown SVM type. Choose from ['L1', 'L2', 'Cauchy', 'Fair', 'Welsch', 'Geman-McClure'].")

    def fitness(self, w, b, X, y):
        """ 손실 함수 계산 (Hinge Loss 기반) """
        margins = y * (np.dot(X, w) + b)
        slack = np.maximum(0, 1 - margins)  # Hinge Loss 적용
        slack_term = self.compute_slack_term(slack)

        regularization = 0.5 * np.dot(w, w)  # L2 Regularization
        return regularization + self.C * np.sum(slack_term)  # 최소화할 손실 값

    def fit(self, X, y):
        n_samples, n_features = X.shape

        # 초기 하모니 메모리 (랜덤 가중치 및 바이어스)
        harmony_memory = [(np.random.randn(n_features), np.random.randn()) for _ in range(self.hm_size)]
        fitness_values = np.array([self.fitness(w, b, X, y) for w, b in harmony_memory])

        for _ in range(self.max_iter):
            # 새로운 해 생성
            new_w, new_b = np.zeros(n_features), 0

            for j in range(n_features):
                if np.random.rand() < self.hmcr:  # 기존 해에서 선택할 확률
                    new_w[j] = harmony_memory[np.random.randint(self.hm_size)][0][j]
                    if np.random.rand() < self.par:  # 피치 조정
                        new_w[j] += np.random.uniform(-self.bw, self.bw)
                else:  # 랜덤 탐색
                    new_w[j] = np.random.randn()

            if np.random.rand() < self.hmcr:
                new_b = harmony_memory[np.random.randint(self.hm_size)][1]
                if np.random.rand() < self.par:
                    new_b += np.random.uniform(-self.bw, self.bw)
            else:
                new_b = np.random.randn()

            # 새로운 해의 적합도 평가
            new_fitness = self.fitness(new_w, new_b, X, y)

            # 기존 해 중 최악의 해와 비교 후 교체
            worst_idx = np.argmax(fitness_values)
            if new_fitness < fitness_values[worst_idx]:
                harmony_memory[worst_idx] = (new_w, new_b)
                fitness_values[worst_idx] = new_fitness

        # 최적 해 선택
        best_idx = np.argmin(fitness_values)
        self.w_best, self.b_best = harmony_memory[best_idx]

    def predict(self, X):
        return np.sign(np.dot(X, self.w_best) + self.b_best)


In [192]:
# OvR training function
def train_ovr(X, y, C, svm_type):
    classes = np.unique(y)  # 모든 클래스 목록을 가져옴
    models = []  # 학습된 모델을 저장할 리스트

    for cl in classes:
        y_bin = (y == cl).astype(int) * 2 - 1  # 현재 클래스는 +1, 나머지는 -1로 변환
        model = HS_SVM(C=C, svm_type=svm_type)  # SVM 모델 생성
        model.fit(X, y_bin)  # 모델 학습

        w = model.w_best  # 최적화된 가중치 벡터
        b = model.b_best  # 최적화된 편향

        models.append((w, b))  # 학습된 모델을 리스트에 저장

    return models  # 모든 클래스별 학습된 모델 반환

def predict_ovr(X, models):
    predictions = [X.dot(w) + b for w, b in models]  # 각 클래스별 마진 계산
    return np.argmax(predictions, axis=0)  # 가장 높은 마진 값을 가진 클래스를 예측


#**Optimization & Direct**

###L-BFGS-B (Limited-memory Broyden-Fletcher-Goldfarb-Shanno with Box constraints)

In [None]:
import numpy as np
from scipy.optimize import minimize
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from itertools import combinations
from ucimlrepo import fetch_ucirepo

np.random.seed(42)  # 시드 설정

class LBFGS_SVM:
    def __init__(self, C=0.01, svm_type='L2', c=1.0, num_classes=3):
        self.C = C  # 정규화 상수
        self.svm_type = svm_type  # L1 또는 L2
        self.num_classes = num_classes  # 다중 클래스 수
        self.W_best = None
        self.b_best = None
        self.c = c

    def compute_slack_term(self, xi):
        """ SVM 타입별 슬랙 변수 비용 계산 """
        if self.svm_type == 'L1':
            return np.abs(xi)
        elif self.svm_type == 'L2':
            return xi ** 2
        elif self.svm_type == 'Fair':
            return np.sum(self.c**2 * ((xi / self.c) - np.log(1 + xi / self.c)))
        elif self.svm_type == 'Cauchy':
            return np.sum((self.c**2 / 2) * np.log(1 + (xi / self.c)**2))
        elif self.svm_type == 'Welsch':
            return np.sum((self.c**2 / 2) * (1 - np.exp(-(xi / self.c)**2)))
        elif self.svm_type == 'Geman-McClure':
            return np.sum(((xi**2)/2) / (1 + xi**2))
        else:
            raise ValueError("Unknown SVM type. Choose from ['L1', 'L2', 'Cauchy', 'Fair', 'Welsch', 'Geman-McClure'].")

    def fitness(self, params, X, y):
        """ 다중 클래스 SVM 손실 함수 """
        n_features = X.shape[1]
        W = params[:-self.num_classes].reshape((self.num_classes, n_features))
        b = params[-self.num_classes:]

        scores = X.dot(W.T) + b
        margins = np.maximum(0, 1 - (scores[np.arange(X.shape[0]), y] - scores.T).T)
        margins[np.arange(X.shape[0]), y] = 0  # 정답 클래스의 손실 제거

        slack_term = self.compute_slack_term(margins)
        regularization = np.sum(W**2) / 2  # L2 Regularization
        return regularization + self.C * np.sum(slack_term)  # 최소화할 손실 값

    def fitness(self, params, X, y):
        """ 손실 함수 계산 (Hinge Loss 기반) """
        n_features = X.shape[1]
        w = params[:n_features]
        b = params[n_features]
        margins = y * (np.dot(X, w) + b)
        slack = np.maximum(0, 1 - margins)  # Hinge Loss 적용
        slack_term = self.compute_slack_term(slack)
        regularization = 0.5 * np.dot(w, w)  # L2 Regularization
        return regularization + self.C * np.sum(slack_term)  # 최소화할 손실 값

    def fit(self, X, y):
        n_samples, n_features = X.shape
        initial_params = np.zeros(n_features + 1)
        bounds = [(None, None)] * n_features + [(None, None)]  # 경계 조건 추가
        options = {'disp': True, 'maxiter': 5000, 'ftol': 1e-9}  # 최적화 옵션 추가: maxiter 증가, ftol 추가
        result = minimize(self.fitness, initial_params, args=(X, y), method='L-BFGS-B', bounds=bounds, options=options)
        # If optimization fails, try SLSQP solver
        if not result.success:
            result = minimize(self.fitness, initial_params, args=(X, y), method='SLSQP', bounds=bounds, options=options)

        if result.success:
            self.w_best = result.x[:n_features]
            self.b_best = result.x[n_features]
        else:
            raise ValueError("Optimization failed: " + result.message)

    def fit(self, X, y):
        n_samples, n_features = X.shape
        initial_params = np.zeros((self.num_classes * n_features) + self.num_classes)
        bounds = [(None, None)] * len(initial_params)
        options = {'disp': True, 'maxiter': 5000, 'ftol': 1e-9}  # 최적화 옵션 추가
        result = minimize(self.fitness, initial_params, args=(X, y), method='L-BFGS-B', bounds=bounds, options=options)

        if result.success:
            self.W_best = result.x[:-self.num_classes].reshape((self.num_classes, n_features))
            self.b_best = result.x[-self.num_classes:]
        else:
            raise ValueError("Optimization failed: " + result.message)

    def predict(self, X):
        scores = X.dot(self.W_best.T) + self.b_best
        return np.argmax(scores, axis=1)


In [None]:
# Direct SVM multiclass training function
def train_direct(X, y, C, num_classes, svm_type):
    model = LBFGS_SVM(C=C, svm_type=svm_type, num_classes=num_classes)
    model.fit(X, y)
    return model.W_best, model.b_best  # 모델의 가중치와 바이어스를 반환

# Multiclass prediction function
def predict_direct(X, weights, biases):
    scores = X.dot(weights.T) + biases
    return np.argmax(scores, axis=1)

###1. Genetic Algorithm (GA)

In [185]:
class GA_SVM:
  def __init__(self, C=1.0, pop_size=20, max_iter=100, mutation_rate=0.1, crossover_rate=0.7, svm_type='L2', c=1.0, num_classes=3):
      self.C = C  # 정규화 상수
      self.pop_size = pop_size  # 개체 수
      self.max_iter = max_iter  # 세대 수
      self.mutation_rate = mutation_rate  # 돌연변이 확률
      self.crossover_rate = crossover_rate  # 교차 확률
      self.svm_type = svm_type  # 'L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure'
      self.W_best = None
      self.b_best = None
      self.c = c
      self.num_classes = num_classes

  def compute_slack_term(self, xi):
      if self.svm_type == 'L1':
          return np.abs(xi)
      elif self.svm_type == 'L2':
          return xi ** 2
      elif self.svm_type == 'Fair':
          return np.sum(self.c**2 * ((xi / self.c) - np.log(1 + xi / self.c)))
      elif self.svm_type == 'Cauchy':
          return np.sum((self.c**2 / 2) * np.log(1 + (xi / self.c)**2))
      elif self.svm_type == 'Welsch':
          return np.sum((self.c**2 / 2) * (1 - np.exp(-(xi / self.c)**2)))
      elif self.svm_type == 'Geman-McClure':
          return np.sum(((xi**2)/2) / (1 + xi**2))
      else:
          raise ValueError("Unknown SVM type. Choose from ['L1', 'L2', 'Cauchy', 'Fair', 'Welsch', 'Geman-McClure'].")

  def fitness(self, W, b, X, y):
      scores = X.dot(W.T) + b
      margins = np.maximum(0, 1 - (scores[np.arange(X.shape[0]), y] - scores.T).T)
      margins[np.arange(X.shape[0]), y] = 0  # 정답 클래스의 손실 제거
      slack_term = self.compute_slack_term(margins)
      regularization = np.sum(W**2) / 2  # L2 Regularization
      return regularization + self.C * np.sum(slack_term)

  def initialize_population(self, n_features):
      population = [(np.random.randn(self.num_classes, n_features), np.random.randn(self.num_classes)) for _ in range(self.pop_size)]
      return population

  def selection(self, population, fitness_values):
      probabilities = 1 / (fitness_values + 1e-6)
      probabilities /= probabilities.sum()
      selected_indices = np.random.choice(len(population), size=len(population), p=probabilities)
      return [population[i] for i in selected_indices]

  def crossover(self, parent1, parent2):
      W1, b1 = parent1
      W2, b2 = parent2
      if np.random.rand() < self.crossover_rate:
          point = np.random.randint(W1.shape[1])
          new_W = np.hstack((W1[:, :point], W2[:, point:]))
          new_b = (b1 + b2) / 2
      else:
          new_W, new_b = W1.copy(), b1
      return new_W, new_b

  def mutation(self, W, b):
      if np.random.rand() < self.mutation_rate:
          mutation_matrix = np.random.randn(*W.shape) * 0.1
          W += mutation_matrix
          b += np.random.randn(*b.shape) * 0.1
      return W, b

  def fit(self, X, y):
      n_samples, n_features = X.shape
      population = self.initialize_population(n_features)
      fitness_values = np.array([self.fitness(W, b, X, y) for W, b in population])
      np.random.seed(42)  # 시드 설정
      for _ in range(self.max_iter):
          selected_population = self.selection(population, fitness_values)
          new_population = []
          for i in range(0, len(selected_population), 2):
              p1, p2 = selected_population[i], selected_population[(i + 1) % len(selected_population)]
              offspring1 = self.crossover(p1, p2)
              offspring2 = self.crossover(p2, p1)
              new_population.append(self.mutation(*offspring1))
              new_population.append(self.mutation(*offspring2))
          population = new_population
          fitness_values = np.array([self.fitness(W, b, X, y) for W, b in population])
      best_idx = np.argmin(fitness_values)
      self.W_best, self.b_best = population[best_idx]

  def predict(self, X):
      scores = X.dot(self.W_best.T) + self.b_best
      return np.argmax(scores, axis=1)

In [186]:
# Direct SVM multiclass training function
def train_direct(X, y, C, num_classes, svm_type):
    model = GA_SVM(C=C, svm_type=svm_type, num_classes=num_classes)
    model.fit(X, y)
    return model.W_best, model.b_best  # 모델의 가중치와 바이어스를 반환

# Multiclass prediction function
def predict_direct(X, weights, biases):
    scores = X.dot(weights.T) + biases
    return np.argmax(scores, axis=1)


###SMO (Sequential Minial Optimization)

In [7]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pandas as pd
from itertools import combinations

class SMO_SVM:
    def __init__(self, C=1.0, kernel='linear', tol=1e-3, max_iter=1000, svm_type='L2', c=1.0, lr=0.01, num_classes=3):
      self.C = C
      self.kernel = kernel
      self.tol = tol
      self.max_iter = max_iter
      self.svm_type = svm_type
      self.c = c
      self.lr = lr
      self.num_classes = num_classes
      self.W_best = None  # 최적 가중치 행렬 추가
      self.b_best = None  # 최적 편향 벡터 추가
    def kernel_function(self, X, Y):
      if self.kernel == 'linear':
          return np.dot(X, Y.T)
      else:
          raise ValueError("Only linear kernel is supported in this implementation.")

    def compute_slack_term(self, xi):
        xi = np.maximum(xi, 1e-6)
        if self.svm_type == 'Fair':
            return np.sum(self.c**2 * ((xi / self.c) - np.log(1 + xi / self.c)))
        elif self.svm_type == 'Cauchy':
            return np.sum((self.c**2 / 2) * np.log(1 + (xi / self.c)**2))
        elif self.svm_type == 'Welsch':
            return np.sum((self.c**2 / 2) * (1 - np.exp(-(xi / self.c)**2)))
        elif self.svm_type == 'Geman-McClure':
            return np.sum(((xi**2)/2) / (1 + xi**2))
        elif self.svm_type == 'L1':
            return np.abs(xi)
        elif self.svm_type == 'L2':
            return xi ** 2
        else:
            raise ValueError("Unknown SVM type. Choose from ['L1', 'L2', 'Cauchy', 'Fair', 'Welsch', 'Geman-McClure'].")

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.alpha = np.zeros((n_samples, self.num_classes))
        self.b = np.zeros(self.num_classes)
        self.W = np.zeros((self.num_classes, n_features))

        for _ in range(self.max_iter):
            for i in range(n_samples):
                xi, yi = X[i], y[i]
                margins = self.W.dot(xi) + self.b
                correct_class_margin = margins[yi]
                margin_diff = 1 - (correct_class_margin - margins)
                margin_diff[yi] = 0  # 정답 클래스의 손실 제거
                slack = np.maximum(0, margin_diff)
                slack_term = self.compute_slack_term(slack)

                if np.any(slack > 0):
                    delta_alpha = self.C * (1 - margin_diff) - slack_term
                    self.alpha[i, yi] = np.clip(self.alpha[i, yi] + self.lr * delta_alpha[yi], 0, self.C)
                    self.W[yi] += self.lr * self.alpha[i, yi] * xi
                    self.b[yi] += self.lr * self.alpha[i, yi]

        self.W_best = self.W  # 최적의 가중치 행렬 저장
        self.b_best = self.b  # 최적의 편향 벡터 저장

    def predict(self, X):
        scores = X.dot(self.W_best.T) + self.b_best
        return np.argmax(scores, axis=1)

In [8]:
# Direct SVM multiclass training function
def train_direct(X, y, C, num_classes, svm_type):
    model = SMO_SVM(C=C, svm_type=svm_type, num_classes=num_classes)
    model.fit(X, y)
    return model.W_best, model.b_best  # 모델의 가중치와 바이어스를 반환

# Multiclass prediction function
def predict_direct(X, weights, biases):
    scores = X.dot(weights.T) + biases
    return np.argmax(scores, axis=1)


###PSO (Particle Swarm Optimization)

In [None]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from itertools import combinations
from ucimlrepo import fetch_ucirepo

class PSO_SVM:
    def __init__(self, C=1.0, num_particles=30, max_iter=100, svm_type='L2', c=1.0, num_classes=3):
        self.C = C  # 정규화 상수
        self.num_particles = num_particles  # PSO 입자 개수
        self.max_iter = max_iter  # PSO 반복 횟수
        self.svm_type = svm_type  # L1 또는 L2
        self.num_classes = num_classes
        self.W_best = None
        self.b_best = None
        self.c = c
    def compute_slack_term(self, xi):
        if self.svm_type == 'L1':
            return np.abs(xi)
        elif self.svm_type == 'L2':
            return xi ** 2
        elif self.svm_type == 'Fair':
            return np.sum(self.c**2 * ((xi / self.c) - np.log(1 + xi / self.c)))
        elif self.svm_type == 'Cauchy':
            return np.sum((self.c**2 / 2) * np.log(1 + (xi / self.c)**2))
        elif self.svm_type == 'Welsch':
            return np.sum((self.c**2 / 2) * (1 - np.exp(-(xi / self.c)**2)))
        elif self.svm_type == 'Geman-McClure':
            return np.sum(((xi**2)/2) / (1 + xi**2))
        else:
            raise ValueError("Unknown SVM type. Choose from ['L1', 'L2', 'Cauchy', 'Fair', 'Welsch', 'Geman-McClure'].")

    def fitness(self, W, b, X, y):
        scores = X.dot(W.T) + b
        margins = np.maximum(0, 1 - (scores[np.arange(X.shape[0]), y] - scores.T).T)
        margins[np.arange(X.shape[0]), y] = 0
        slack_term = self.compute_slack_term(margins)
        regularization = np.sum(W**2) / 2
        return regularization + self.C * np.sum(slack_term)

    def fit(self, X, y):
        n_samples, n_features = X.shape
        W_particles = np.random.randn(self.num_particles, self.num_classes, n_features)
        b_particles = np.random.randn(self.num_particles, self.num_classes)
        velocities_W = np.random.randn(self.num_particles, self.num_classes, n_features) * 0.1
        velocities_b = np.random.randn(self.num_particles, self.num_classes) * 0.1

        p_best_W = np.copy(W_particles)
        p_best_b = np.copy(b_particles)
        p_best_scores = np.array([self.fitness(W, b, X, y) for W, b in zip(W_particles, b_particles)])

        g_best_index = np.argmin(p_best_scores)
        g_best_W = p_best_W[g_best_index]
        g_best_b = p_best_b[g_best_index]

        w_inertia = 0.7
        c1, c2 = 1.5, 1.5

        for _ in range(self.max_iter):
            for i in range(self.num_particles):
                r1, r2 = np.random.rand(), np.random.rand()
                velocities_W[i] = (w_inertia * velocities_W[i] +
                                  c1 * r1 * (p_best_W[i] - W_particles[i]) +
                                  c2 * r2 * (g_best_W - W_particles[i]))
                velocities_b[i] = (w_inertia * velocities_b[i] +
                                  c1 * r1 * (p_best_b[i] - b_particles[i]) +
                                  c2 * r2 * (g_best_b - b_particles[i]))

                W_particles[i] += velocities_W[i]
                b_particles[i] += velocities_b[i]
                new_fitness = self.fitness(W_particles[i], b_particles[i], X, y)

                if new_fitness < p_best_scores[i]:
                    p_best_W[i] = W_particles[i]
                    p_best_b[i] = b_particles[i]
                    p_best_scores[i] = new_fitness

            g_best_index = np.argmin(p_best_scores)
            g_best_W = p_best_W[g_best_index]
            g_best_b = p_best_b[g_best_index]

        self.W_best = g_best_W
        self.b_best = g_best_b

    def predict(self, X):
        scores = X.dot(self.W_best.T) + self.b_best
        return np.argmax(scores, axis=1)

In [None]:
# Direct SVM multiclass training function
def train_direct(X, y, C, num_classes, svm_type):
    model = PSO_SVM(C=C, svm_type=svm_type, num_classes=num_classes)
    model.fit(X, y)
    return model.W_best, model.b_best  # 모델의 가중치와 바이어스를 반환

# Multiclass prediction function
def predict_direct(X, weights, biases):
    scores = X.dot(weights.T) + biases
    return np.argmax(scores, axis=1)


###ACO (Ant Colony Optimization)

In [134]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from itertools import combinations
from ucimlrepo import fetch_ucirepo

class ACO_SVM:
    def __init__(self, C=1.0, num_ants=30, max_iter=100, decay=0.5, alpha=1, beta=2, svm_type='L2', c=1.0, num_classes=3):
        self.C = C  # 정규화 상수
        self.num_ants = num_ants  # 개미 개수
        self.max_iter = max_iter  # 반복 횟수
        self.decay = decay  # 페로몬 증발 계수
        self.alpha = alpha  # 페로몬 영향도
        self.beta = beta  # 휴리스틱 정보 영향도
        self.svm_type = svm_type  # L1 또는 L2
        self.num_classes = num_classes
        self.W_best = None
        self.b_best = None
        self.c = c
    def compute_slack_term(self, xi):
        if self.svm_type == 'L1':
            return np.abs(xi)
        elif self.svm_type == 'L2':
            return xi ** 2
        elif self.svm_type == 'Fair':
            return np.sum(self.c**2 * ((xi / self.c) - np.log(1 + xi / self.c)))
        elif self.svm_type == 'Cauchy':
            return np.sum((self.c**2 / 2) * np.log(1 + (xi / self.c)**2))
        elif self.svm_type == 'Welsch':
            return np.sum((self.c**2 / 2) * (1 - np.exp(-(xi / self.c)**2)))
        elif self.svm_type == 'Geman-McClure':
            return np.sum(((xi**2)/2) / (1 + xi**2))
        else:
            raise ValueError("Unknown SVM type. Choose from ['L1', 'L2', 'Cauchy', 'Fair', 'Welsch', 'Geman-McClure'].")

    def fitness(self, W, b, X, y):
        scores = X.dot(W.T) + b
        margins = np.maximum(0, 1 - (scores[np.arange(X.shape[0]), y] - scores.T).T)
        margins[np.arange(X.shape[0]), y] = 0
        slack_term = self.compute_slack_term(margins)
        regularization = np.sum(W**2) / 2
        return regularization + self.C * np.sum(slack_term)

    def fit(self, X, y):
        n_samples, n_features = X.shape
        W_ants = np.random.randn(self.num_ants, self.num_classes, n_features)
        b_ants = np.random.randn(self.num_ants, self.num_classes)
        pheromones = np.ones(self.num_ants)

        best_fitness = float('inf')
        g_best_W, g_best_b = None, None

        for _ in range(self.max_iter):
            fitness_values = np.array([self.fitness(W, b, X, y) for W, b in zip(W_ants, b_ants)])
            best_index = np.argmin(fitness_values)
            if fitness_values[best_index] < best_fitness:
                best_fitness = fitness_values[best_index]
                g_best_W, g_best_b = W_ants[best_index], b_ants[best_index]

            pheromones = (1 - self.decay) * pheromones
            pheromones[best_index] += 1 / (1 + best_fitness)

            probabilities = (pheromones ** self.alpha) * ((1 / (1 + fitness_values)) ** self.beta)
            probabilities /= np.sum(probabilities)

            selected_indices = np.random.choice(self.num_ants, size=self.num_ants, p=probabilities)
            W_ants = W_ants[selected_indices] + np.random.randn(self.num_ants, self.num_classes, n_features) * 0.1
            b_ants = b_ants[selected_indices] + np.random.randn(self.num_ants, self.num_classes) * 0.1

        self.W_best = g_best_W
        self.b_best = g_best_b

    def predict(self, X):
        scores = X.dot(self.W_best.T) + self.b_best
        return np.argmax(scores, axis=1)

In [135]:
# Direct SVM multiclass training function
def train_direct(X, y, C, num_classes, svm_type):
    model = ACO_SVM(C=C, svm_type=svm_type, num_classes=num_classes)
    model.fit(X, y)
    return model.W_best, model.b_best  # 모델의 가중치와 바이어스를 반환

# Multiclass prediction function
def predict_direct(X, weights, biases):
    scores = X.dot(weights.T) + biases
    return np.argmax(scores, axis=1)


###HS (Harmony Search)

In [73]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from itertools import combinations
from ucimlrepo import fetch_ucirepo

class HS_SVM:
    def __init__(self, C=1.0, hm_size=20, max_iter=100, hmcr=0.9, par=0.3, bw=0.1, svm_type='L2', c=1.0, num_classes=3):
        self.C = C  # 정규화 상수
        self.hm_size = hm_size  # 하모니 메모리 크기
        self.max_iter = max_iter  # 반복 횟수
        self.hmcr = hmcr  # 하모니 메모리 고려율 (기존 해를 선택할 확률)
        self.par = par  # 피치 조정 비율 (기존 해를 변형할 확률)
        self.bw = bw  # 변형 크기
        self.svm_type = svm_type  # 'L1' 또는 'L2'
        self.num_classes = num_classes
        self.W_best = None
        self.b_best = None
        self.c = c

    def compute_slack_term(self, xi):
        if self.svm_type == 'L1':
            return np.abs(xi)
        elif self.svm_type == 'L2':
            return xi ** 2
        elif self.svm_type == 'Fair':
            return np.sum(self.c**2 * ((xi / self.c) - np.log(1 + xi / self.c)))
        elif self.svm_type == 'Cauchy':
            return np.sum((self.c**2 / 2) * np.log(1 + (xi / self.c)**2))
        elif self.svm_type == 'Welsch':
            return np.sum((self.c**2 / 2) * (1 - np.exp(-(xi / self.c)**2)))
        elif self.svm_type == 'Geman-McClure':
            return np.sum(((xi**2)/2) / (1 + xi**2))
        else:
            raise ValueError("Unknown SVM type. Choose from ['L1', 'L2', 'Cauchy', 'Fair', 'Welsch', 'Geman-McClure'].")

    def fitness(self, W, b, X, y):
        scores = X.dot(W.T) + b
        margins = np.maximum(0, 1 - (scores[np.arange(X.shape[0]), y] - scores.T).T)
        margins[np.arange(X.shape[0]), y] = 0
        slack_term = self.compute_slack_term(margins)
        regularization = np.sum(W**2) / 2
        return regularization + self.C * np.sum(slack_term)

    def fit(self, X, y):
        n_samples, n_features = X.shape
        harmony_memory = [(np.random.randn(self.num_classes, n_features), np.random.randn(self.num_classes)) for _ in range(self.hm_size)]
        fitness_values = np.array([self.fitness(W, b, X, y) for W, b in harmony_memory])

        for _ in range(self.max_iter):
            new_W, new_b = np.zeros((self.num_classes, n_features)), np.zeros(self.num_classes)
            for j in range(n_features):
                if np.random.rand() < self.hmcr:
                    new_W[:, j] = harmony_memory[np.random.randint(self.hm_size)][0][:, j]
                    if np.random.rand() < self.par:
                        new_W[:, j] += np.random.uniform(-self.bw, self.bw, self.num_classes)
                else:
                    new_W[:, j] = np.random.randn(self.num_classes)

            if np.random.rand() < self.hmcr:
                new_b = harmony_memory[np.random.randint(self.hm_size)][1]
                if np.random.rand() < self.par:
                    new_b += np.random.uniform(-self.bw, self.bw, self.num_classes)
            else:
                new_b = np.random.randn(self.num_classes)

            new_fitness = self.fitness(new_W, new_b, X, y)
            worst_idx = np.argmax(fitness_values)
            if new_fitness < fitness_values[worst_idx]:
                harmony_memory[worst_idx] = (new_W, new_b)
                fitness_values[worst_idx] = new_fitness

        best_idx = np.argmin(fitness_values)
        self.W_best, self.b_best = harmony_memory[best_idx]

    def predict(self, X):
        scores = X.dot(self.W_best.T) + self.b_best
        return np.argmax(scores, axis=1)

In [74]:
# Direct SVM multiclass training function
def train_direct(X, y, C, num_classes, svm_type):
    model = HS_SVM(C=C, svm_type=svm_type, num_classes=num_classes)
    model.fit(X, y)
    return model.W_best, model.b_best  # 모델의 가중치와 바이어스를 반환

# Multiclass prediction function
def predict_direct(X, weights, biases):
    scores = X.dot(weights.T) + biases
    return np.argmax(scores, axis=1)


#**OvO with various outliers**

###iris data

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from scipy.stats import zscore

np.random.seed(42)

# 데이터셋 로딩
data = load_iris()
X = data.data
y = data.target
df = pd.DataFrame(X, columns=data.feature_names)
df['target'] = y  # 타겟 변수 추가

# 데이터 정규화
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
df_scaled = pd.DataFrame(X_scaled, columns=data.feature_names)

# 기존 이상치 식별
df_zscores = df_scaled.apply(zscore)  # Z-score 계산
is_existing_outlier = (df_zscores.abs() > 3).any(axis=1)  # 절대값이 3 이상인 경우 이상치로 판단

# 기존 이상치 수 계산
total_existing_outliers = is_existing_outlier.sum()

# 기존 이상치 비율 계산
total_samples = len(df)
existing_outlier_percentage = (total_existing_outliers / total_samples) * 100

print(f"기존 이상치 수: {total_existing_outliers}")
print(f"기존 이상치 비율: {existing_outlier_percentage:.2f}%")

기존 이상치 수: 1
기존 이상치 비율: 0.67%


In [176]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from scipy.stats import zscore

np.random.seed(42)

# 데이터셋 로딩
iris = load_iris()
X = iris.data
y = iris.target
df = pd.DataFrame(X, columns=iris.feature_names)
df['target'] = y  # 타겟 변수 추가

# 데이터 정규화
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
df_scaled = pd.DataFrame(X_scaled, columns=iris.feature_names)

# 이상치 생성
num_outliers = int(0.03 * len(df))  # 이상치를 설정
outliers = []

for _ in range(num_outliers):
    outlier_sample = df_scaled.sample(n=1).copy()  # 임의의 샘플 선택
    for col in df_scaled.columns:
        if np.random.rand() > 0.5:
            # 최대값보다 큰 이상치 생성
            outlier_sample[col] = df_scaled[col].max() + np.random.rand() * 3
        else:
            # 최소값보다 작은 이상치 생성
            outlier_sample[col] = df_scaled[col].min() - np.random.rand() * 3
    outliers.append(outlier_sample)

outliers_df = pd.concat(outliers, ignore_index=True)

# 이상치의 타겟 값 랜덤 할당
outliers_df['target'] = np.random.choice(y, num_outliers)

# 원본 데이터에 이상치 추가
df_extended = pd.concat([df, outliers_df], ignore_index=True)

# 이상치 추가 후 다시 정규화
X_extended_scaled = scaler.fit_transform(df_extended.drop(columns=['target']))
df_extended_scaled = pd.DataFrame(X_extended_scaled, columns=iris.feature_names)

# 새로운 X, y 정의
X = df_extended_scaled.values  # NumPy 배열로 변환
y = df_extended['target'].values  # 타겟 값 추출

# 기존 이상치 식별
df_zscores = df_scaled.apply(zscore)  # Z-score 계산
is_existing_outlier = (df_zscores.abs() > 3).any(axis=1)  # 절대값이 3 이상인 경우 이상치로 판단

total_existing_outliers = is_existing_outlier.sum()  # 기존 이상치 수
total_outliers = total_existing_outliers + num_outliers  # 총 이상치 수
total_samples = len(df_extended)  # 전체 샘플 수
total_outlier_percentage = (total_outliers / total_samples) * 100  # 이상치 비율 계산

# 데이터 확인
print(f"확장된 데이터셋 크기: {X.shape}")
print(f"확장된 타겟 크기: {y.shape}")
print(f"이상치가 포함된 데이터셋의 총 샘플 수: {len(X)}")
print(f"전체 이상치 비율: {total_outlier_percentage:.2f}%")


확장된 데이터셋 크기: (154, 4)
확장된 타겟 크기: (154,)
이상치가 포함된 데이터셋의 총 샘플 수: 154
전체 이상치 비율: 3.25%


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from scipy.stats import zscore

np.random.seed(42)

# 데이터셋 로딩
iris = load_iris()
X = iris.data
y = iris.target
df = pd.DataFrame(X, columns=iris.feature_names)
df['target'] = y  # 타겟 변수 추가

# 데이터 정규화
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
df_scaled = pd.DataFrame(X_scaled, columns=iris.feature_names)

# 이상치 생성
num_outliers = int(0.05 * len(df))  # 이상치를 설정
outliers = []

for _ in range(num_outliers):
    outlier_sample = df_scaled.sample(n=1).copy()  # 임의의 샘플 선택
    for col in df_scaled.columns:
        if np.random.rand() > 0.5:
            # 최대값보다 큰 이상치 생성
            outlier_sample[col] = df_scaled[col].max() + np.random.rand() * 3
        else:
            # 최소값보다 작은 이상치 생성
            outlier_sample[col] = df_scaled[col].min() - np.random.rand() * 3
    outliers.append(outlier_sample)

outliers_df = pd.concat(outliers, ignore_index=True)

# 이상치의 타겟 값 랜덤 할당
outliers_df['target'] = np.random.choice(y, num_outliers)

# 원본 데이터에 이상치 추가
df_extended = pd.concat([df, outliers_df], ignore_index=True)

# 이상치 추가 후 다시 정규화
X_extended_scaled = scaler.fit_transform(df_extended.drop(columns=['target']))
df_extended_scaled = pd.DataFrame(X_extended_scaled, columns=iris.feature_names)

# 새로운 X, y 정의
X = df_extended_scaled.values  # NumPy 배열로 변환
y = df_extended['target'].values  # 타겟 값 추출

# 기존 이상치 식별
df_zscores = df_scaled.apply(zscore)  # Z-score 계산
is_existing_outlier = (df_zscores.abs() > 3).any(axis=1)  # 절대값이 3 이상인 경우 이상치로 판단

total_existing_outliers = is_existing_outlier.sum()  # 기존 이상치 수
total_outliers = total_existing_outliers + num_outliers  # 총 이상치 수
total_samples = len(df_extended)  # 전체 샘플 수
total_outlier_percentage = (total_outliers / total_samples) * 100  # 이상치 비율 계산

# 데이터 확인
print(f"확장된 데이터셋 크기: {X.shape}")
print(f"확장된 타겟 크기: {y.shape}")
print(f"이상치가 포함된 데이터셋의 총 샘플 수: {len(X)}")
print(f"전체 이상치 비율: {total_outlier_percentage:.2f}%")


확장된 데이터셋 크기: (157, 4)
확장된 타겟 크기: (157,)
이상치가 포함된 데이터셋의 총 샘플 수: 157
전체 이상치 비율: 5.10%


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from scipy.stats import zscore

np.random.seed(42)

# 데이터셋 로딩
iris = load_iris()
X = iris.data
y = iris.target
df = pd.DataFrame(X, columns=iris.feature_names)
df['target'] = y  # 타겟 변수 추가

# 데이터 정규화
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
df_scaled = pd.DataFrame(X_scaled, columns=iris.feature_names)

# 이상치 생성
num_outliers = int(0.11 * len(df))  # 이상치를 설정
outliers = []

for _ in range(num_outliers):
    outlier_sample = df_scaled.sample(n=1).copy()  # 임의의 샘플 선택
    for col in df_scaled.columns:
        if np.random.rand() > 0.5:
            # 최대값보다 큰 이상치 생성
            outlier_sample[col] = df_scaled[col].max() + np.random.rand() * 3
        else:
            # 최소값보다 작은 이상치 생성
            outlier_sample[col] = df_scaled[col].min() - np.random.rand() * 3
    outliers.append(outlier_sample)

outliers_df = pd.concat(outliers, ignore_index=True)

# 이상치의 타겟 값 랜덤 할당
outliers_df['target'] = np.random.choice(y, num_outliers)

# 원본 데이터에 이상치 추가
df_extended = pd.concat([df, outliers_df], ignore_index=True)

# 이상치 추가 후 다시 정규화
X_extended_scaled = scaler.fit_transform(df_extended.drop(columns=['target']))
df_extended_scaled = pd.DataFrame(X_extended_scaled, columns=iris.feature_names)

# 새로운 X, y 정의
X = df_extended_scaled.values  # NumPy 배열로 변환
y = df_extended['target'].values  # 타겟 값 추출

# 기존 이상치 식별
df_zscores = df_scaled.apply(zscore)  # Z-score 계산
is_existing_outlier = (df_zscores.abs() > 3).any(axis=1)  # 절대값이 3 이상인 경우 이상치로 판단

total_existing_outliers = is_existing_outlier.sum()  # 기존 이상치 수
total_outliers = total_existing_outliers + num_outliers  # 총 이상치 수
total_samples = len(df_extended)  # 전체 샘플 수
total_outlier_percentage = (total_outliers / total_samples) * 100  # 이상치 비율 계산

# 데이터 확인
print(f"확장된 데이터셋 크기: {X.shape}")
print(f"확장된 타겟 크기: {y.shape}")
print(f"이상치가 포함된 데이터셋의 총 샘플 수: {len(X)}")
print(f"전체 이상치 비율: {total_outlier_percentage:.2f}%")


확장된 데이터셋 크기: (166, 4)
확장된 타겟 크기: (166,)
이상치가 포함된 데이터셋의 총 샘플 수: 166
전체 이상치 비율: 10.24%


In [180]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
from itertools import combinations

results = []

# 데이터 전처리 및 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# OvO 방식을 사용하여 각 SVM 유형에 대해 학습 및 평가
for svm_type in ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']:
    models_ovo = train_ovo(X_train, y_train, C=1.0, svm_type=svm_type)
    y_pred_ovo = predict_ovo(X_test, models_ovo)

    # y_pred_ovo 변수를 정의한 후에 데이터 타입 확인 및 변환
    if y_test.dtype != np.int64:
        y_test = y_test.astype(int)

    if y_pred_ovo.dtype != np.int64:
        y_pred_ovo = y_pred_ovo.astype(int)

    accuracy_ovo = accuracy_score(y_test, y_pred_ovo)
    results.append((svm_type, round(accuracy_ovo * 100, 2)))

# 결과를 DataFrame으로 변환 및 출력
results_df = pd.DataFrame(results, columns=['SVM Type', 'Accuracy'])
print(results_df)

        SVM Type  Accuracy
0             L1     97.87
1             L2     97.87
2           Fair     95.74
3         Cauchy     97.87
4         Welsch     95.74
5  Geman-McClure     95.74


In [None]:
import pandas as pd

svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']
accuracy_data = {
    '3.25%' : [97.87, 97.87, 95.74, 97.87, 95.74, 95.74],
    '5.10%' : [95.83, 93.75, 93.75, 93.75, 91.67, 91.67],
    '10.24%' : [92.0, 76.0, 84.0, 94.0, 94.0, 94.0]
}

# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Iris data (L-BFGS-B)'

# 데이터프레임 출력
print(df)

                      3.25%  5.10%  10.24%
Iris data (L-BFGS-B)                      
L1                    97.87  95.83    92.0
L2                    97.87  93.75    76.0
Fair                  95.74  93.75    84.0
Cauchy                97.87  93.75    94.0
Welsch                95.74  91.67    94.0
Geman-McClure         95.74  91.67    94.0


In [None]:
import pandas as pd

outlier = ['3%', '5%', '10%']
svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']

accuracy_data = {
    '3.25%' : [85.11, 91.49, 80.85, 87.23, 80.85, 85.11],
    '5.10%' : [85.42, 93.75, 75.00, 89.58, 77.08, 81.25],
    '10.24%' : [74.0, 62.0, 84.0, 66.0, 70.0, 70.0]
}

# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Iris data (GA)'

# 데이터프레임 출력
print(df)

                3.25%  5.10%  10.24%
Iris data (GA)                      
L1              85.11  85.42    74.0
L2              91.49  93.75    62.0
Fair            80.85  75.00    84.0
Cauchy          87.23  89.58    66.0
Welsch          80.85  77.08    70.0
Geman-McClure   85.11  81.25    70.0


In [None]:
import pandas as pd

outlier = ['3%', '5%', '10%']
svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']

accuracy_data = {
    '3.25%' : [36.17, 36.17, 97.87, 97.87, 97.87, 97.87],
    '5.10%' : [37.50, 37.50, 91.67, 91.67, 91.67, 91.67],
    '10.24%' : [34.0, 34.0, 90.0, 90.0, 90.0, 90.0]
}

# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Iris data (SMO)'

# 데이터프레임 출력
print(df)

                 3.25%  5.10%  10.24%
Iris data (SMO)                      
L1               36.17  37.50    34.0
L2               36.17  37.50    34.0
Fair             97.87  91.67    90.0
Cauchy           97.87  91.67    90.0
Welsch           97.87  91.67    90.0
Geman-McClure    97.87  91.67    90.0


In [None]:
import pandas as pd

outlier = ['3%', '5%', '10%']
svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']

accuracy_data = {
    '3.25%' : [97.87, 97.87, 95.74, 97.87, 95.74, 95.74],
    '5.10%' : [95.83, 93.75, 93.75, 93.75, 91.67, 91.67],
    '10.24%' : [92.0, 76.0, 84.0, 94.0, 94.0, 94.0]
}

# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Iris data (PSO)'

# 데이터프레임 출력
print(df)

                 3.25%  5.10%  10.24%
Iris data (PSO)                      
L1               97.87  95.83    92.0
L2               97.87  93.75    76.0
Fair             95.74  93.75    84.0
Cauchy           97.87  93.75    94.0
Welsch           95.74  91.67    94.0
Geman-McClure    95.74  91.67    94.0


In [None]:
import pandas as pd

outlier = ['3%', '5%', '10%']
svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']

accuracy_data = {
    '3.25%' : [97.78, 97.78, 97.78, 97.78, 97.78, 97.78],
    '5.10%' : [97.87, 97.87, 97.87, 97.87, 95.74, 95.74],
    '10.24%' : [95.83, 93.75, 93.75, 93.75, 91.67, 91.67]
}

# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Iris data (ACO)'

# 데이터프레임 출력
print(df)

                 3.25%  5.10%  10.24%
Iris data (ACO)                      
L1               97.78  97.87   95.83
L2               97.78  97.87   93.75
Fair             97.78  97.87   93.75
Cauchy           97.78  97.87   93.75
Welsch           97.78  95.74   91.67
Geman-McClure    97.78  95.74   91.67


In [None]:
import pandas as pd

svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']
accuracy_data = {
    '3.25%' : [91.49, 87.23, 95.74, 95.74, 91.49, 89.36],
    '5.10%' : [93.75, 91.67, 91.67, 95.83, 91.67, 85.42],
    '10.24%' : [84.0, 72.0, 66.0, 92.0, 92.0, 88.0]
}

# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Iris data (HS)'

# 데이터프레임 출력
print(df)

                3.25%  5.10%  10.24%
Iris data (HS)                      
L1              91.49  93.75    84.0
L2              87.23  91.67    72.0
Fair            95.74  91.67    66.0
Cauchy          95.74  95.83    92.0
Welsch          91.49  91.67    92.0
Geman-McClure   89.36  85.42    88.0


### segment data

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler, LabelEncoder
from scipy.stats import zscore

np.random.seed(42)

# 데이터셋 로딩
data = fetch_openml(name='segment', version=1)
X = data.data
# Use LabelEncoder to convert categorical target to numerical
encoder = LabelEncoder()
y = encoder.fit_transform(data.target)
df = pd.DataFrame(X, columns=data.feature_names)
df['target'] = y  # 타겟 변수 추가

# 데이터 정규화
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
df_scaled = pd.DataFrame(X_scaled, columns=data.feature_names)

# 기존 이상치 식별
df_zscores = df_scaled.apply(zscore)  # Z-score 계산
is_existing_outlier = (df_zscores.abs() > 3).any(axis=1)  # 절대값이 3 이상인 경우 이상치로 판단

# 기존 이상치 수 계산
total_existing_outliers = is_existing_outlier.sum()

# 기존 이상치 비율 계산
total_samples = len(df)
existing_outlier_percentage = (total_existing_outliers / total_samples) * 100

print(f"기존 이상치 수: {total_existing_outliers}")
print(f"기존 이상치 비율: {existing_outlier_percentage:.2f}%")

기존 이상치 수: 175
기존 이상치 비율: 7.58%


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from scipy.stats import zscore

np.random.seed(42)

# 데이터셋 로딩
data = fetch_openml(name='segment', version=1)
X = data.data
# Use LabelEncoder to convert categorical target to numerical
encoder = LabelEncoder()
y = encoder.fit_transform(data.target)
df = pd.DataFrame(X, columns=data.feature_names)
df['target'] = y  # 타겟 변수 추가


# 데이터 정규화
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
df_scaled = pd.DataFrame(X_scaled, columns=data.feature_names)

# 이상치 생성
num_outliers = int(0.028 * len(df))  # 이상치를 설정
outliers = []

for _ in range(num_outliers):
    outlier_sample = df_scaled.sample(n=1).copy()  # 임의의 샘플 선택
    for col in df_scaled.columns:
        if np.random.rand() > 0.5:
            # 최대값보다 큰 이상치 생성
            outlier_sample[col] = df_scaled[col].max() + np.random.rand() * 3
        else:
            # 최소값보다 작은 이상치 생성
            outlier_sample[col] = df_scaled[col].min() - np.random.rand() * 3
    outliers.append(outlier_sample)

outliers_df = pd.concat(outliers, ignore_index=True)

# 이상치의 타겟 값 랜덤 할당
outliers_df['target'] = np.random.choice(y, num_outliers)

# 원본 데이터에 이상치 추가
df_extended = pd.concat([df, outliers_df], ignore_index=True)

# 이상치 추가 후 다시 정규화
X_extended_scaled = scaler.fit_transform(df_extended.drop(columns=['target']))
df_extended_scaled = pd.DataFrame(X_extended_scaled, columns=data.feature_names)

# 새로운 X, y 정의
X = df_extended_scaled.values  # NumPy 배열로 변환
y = df_extended['target'].values  # 타겟 값 추출

# 기존 이상치 식별
df_zscores = df_scaled.apply(zscore)  # Z-score 계산
is_existing_outlier = (df_zscores.abs() > 3).any(axis=1)  # 절대값이 3 이상인 경우 이상치로 판단

total_existing_outliers = is_existing_outlier.sum()  # 기존 이상치 수
total_outliers = total_existing_outliers + num_outliers  # 총 이상치 수
total_samples = len(df_extended)  # 전체 샘플 수
total_outlier_percentage = (total_outliers / total_samples) * 100  # 이상치 비율 계산

# 데이터 확인
print(f"확장된 데이터셋 크기: {X.shape}")
print(f"확장된 타겟 크기: {y.shape}")
print(f"이상치가 포함된 데이터셋의 총 샘플 수: {len(X)}")
print(f"전체 이상치 비율: {total_outlier_percentage:.2f}%")


확장된 데이터셋 크기: (2374, 19)
확장된 타겟 크기: (2374,)
이상치가 포함된 데이터셋의 총 샘플 수: 2374
전체 이상치 비율: 10.07%


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
from itertools import combinations

results = []

# 데이터 전처리 및 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# OvO 방식을 사용하여 각 SVM 유형에 대해 학습 및 평가
for svm_type in ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']:
    models_ovo = train_ovo(X_train, y_train, C=1.0, svm_type=svm_type)
    y_pred_ovo = predict_ovo(X_test, models_ovo)

    # y_pred_ovo 변수를 정의한 후에 데이터 타입 확인 및 변환
    if y_test.dtype != np.int64:
        y_test = y_test.astype(int)

    if y_pred_ovo.dtype != np.int64:
        y_pred_ovo = y_pred_ovo.astype(int)

    accuracy_ovo = accuracy_score(y_test, y_pred_ovo)
    results.append((svm_type, round(accuracy_ovo * 100, 2)))

# 결과를 DataFrame으로 변환 및 출력
results_df = pd.DataFrame(results, columns=['SVM Type', 'Accuracy'])
print(results_df)

        SVM Type  Accuracy
0             L1     14.87
1             L2     14.87
2           Fair     94.39
3         Cauchy     94.25
4         Welsch     94.53
5  Geman-McClure     94.67


In [None]:
import pandas as pd

svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']
accuracy_data = {
    '7.58%' : [93.65, 94.52, 93.51, 93.51, 93.80, 92.64],
    '10.07%' : [92.71, 93.27, 92.15, 92.29, 93.41, 92.99]
}



# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Segment data (L-BFGS-B)'

# 데이터프레임 출력
print(df)


                         7.58%  10.07%
Segment data (L-BFGS-B)               
L1                       93.65   92.71
L2                       94.52   93.27
Fair                     93.51   92.15
Cauchy                   93.51   92.29
Welsch                   93.80   93.41
Geman-McClure            92.64   92.99


In [None]:
import pandas as pd

outlier = ['3%', '5%', '10%']
svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']

accuracy_data = {
    '7.58%' : [85.57, 89.61, 87.59, 86.72, 84.85, 86.72],
    '10.07%' : [85.41, 85.55, 85.97, 87.80, 85.97, 85.41
]
}
# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Segment data (GA)'

# 데이터프레임 출력
print(df)

                   7.58%  10.07%
Segment data (GA)               
L1                 85.57   85.41
L2                 89.61   85.55
Fair               87.59   85.97
Cauchy             86.72   87.80
Welsch             84.85   85.97
Geman-McClure      86.72   85.41


In [None]:
import pandas as pd

outlier = ['3%', '5%', '10%']
svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']

accuracy_data = {
    '7.58%' : [15.87, 15.87, 94.37, 94.52, 94.52, 94.52],
    '10.07%' : [14.87, 14.87, 94.39, 94.25, 94.53, 94.67]
}

# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Segment data (SMO)'

# 데이터프레임 출력
print(df)

                    7.58%  10.07%
Segment data (SMO)               
L1                  15.87   14.87
L2                  15.87   14.87
Fair                94.37   94.39
Cauchy              94.52   94.25
Welsch              94.52   94.53
Geman-McClure       94.52   94.67


In [None]:
import pandas as pd

outlier = ['3%', '5%', '10%']
svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']

accuracy_data = {
    '7.58%' : [90.04, 90.91, 90.91, 89.47, 92.35, 90.91],
    '10.07%' : [91.87, 92.15, 91.73, 92.57, 91.73, 92.85]
}


# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Segment data (PSO)'

# 데이터프레임 출력
print(df)


                    7.58%  10.07%
Segment data (PSO)               
L1                  90.04   91.87
L2                  90.91   92.15
Fair                90.91   91.73
Cauchy              89.47   92.57
Welsch              92.35   91.73
Geman-McClure       90.91   92.85


In [None]:
import pandas as pd

outlier = ['3%', '5%', '10%']
svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']

accuracy_data = {
    '7.58%' : [93.07, 92.93, 93.51, 93.36, 93.22, 91.92],
    '10.07%' : [91.87, 92.15, 91.73, 92.57, 91.73, 92.85]
}

# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Segment data (ACO)'

# 데이터프레임 출력
print(df)

                    7.58%  10.07%
Segment data (ACO)               
L1                  93.07   92.15
L2                  92.93   89.76
Fair                93.51   92.29
Cauchy              93.36   93.13
Welsch              93.22   93.41
Geman-McClure       91.92   92.43


In [None]:
import pandas as pd

outlier = ['3%', '5%', '10%']
svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']

accuracy_data = {
    '7.58%' : [85.86, 78.64, 82.25, 84.42, 82.83, 81.10],
    '10.07%' : [84.71, 72.65, 85.55, 83.73, 86.12, 86.12]
}

# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Segment data (HS)'

# 데이터프레임 출력
print(df)

                   7.58%  10.07%
Segment data (HS)               
L1                 85.86   84.71
L2                 78.64   72.65
Fair               82.25   85.55
Cauchy             84.42   83.73
Welsch             82.83   86.12
Geman-McClure      81.10   86.12


### vehicle data

In [165]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler, LabelEncoder
from scipy.stats import zscore
import random

random.seed(42)
np.random.seed(42)

# 데이터셋 로딩
data = fetch_openml(name='vehicle', version=1)
X = data.data
encoder = LabelEncoder()
y = encoder.fit_transform(data.target)
df = pd.DataFrame(X, columns=data.feature_names)
df['target'] = y  # 타겟 변수 추가

# 데이터 정규화
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
df_scaled = pd.DataFrame(X_scaled, columns=data.feature_names)

# 기존 이상치 식별
df_zscores = df_scaled.apply(zscore)  # Z-score 계산
is_existing_outlier = (df_zscores.abs() > 3).any(axis=1)  # 절대값이 3 이상인 경우 이상치로 판단

# 기존 이상치 수 계산
total_existing_outliers = is_existing_outlier.sum()

# 기존 이상치 비율 계산
total_samples = len(df)
existing_outlier_percentage = (total_existing_outliers / total_samples) * 100

print(f"기존 이상치 수: {total_existing_outliers}")
print(f"기존 이상치 비율: {existing_outlier_percentage:.2f}%")

기존 이상치 수: 22
기존 이상치 비율: 2.60%


In [166]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from scipy.stats import zscore
import random

random.seed(42)
np.random.seed(42)

# 데이터셋 로딩
data = fetch_openml(name='vehicle', version=1)
X = data.data
# Use LabelEncoder to convert categorical target to numerical
encoder = LabelEncoder()
y = encoder.fit_transform(data.target)
df = pd.DataFrame(X, columns=data.feature_names)
df['target'] = y  # 타겟 변수 추가


# 데이터 정규화
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
df_scaled = pd.DataFrame(X_scaled, columns=data.feature_names)

# 이상치 생성
num_outliers = int(0.0048 * len(df))  # 이상치를 설정
outliers = []

for _ in range(num_outliers):
    outlier_sample = df_scaled.sample(n=1).copy()  # 임의의 샘플 선택
    for col in df_scaled.columns:
        if np.random.rand() > 0.5:
            # 최대값보다 큰 이상치 생성
            outlier_sample[col] = df_scaled[col].max() + np.random.rand() * 3
        else:
            # 최소값보다 작은 이상치 생성
            outlier_sample[col] = df_scaled[col].min() - np.random.rand() * 3
    outliers.append(outlier_sample)

outliers_df = pd.concat(outliers, ignore_index=True)

# 이상치의 타겟 값 랜덤 할당
outliers_df['target'] = np.random.choice(y, num_outliers)

# 원본 데이터에 이상치 추가
df_extended = pd.concat([df, outliers_df], ignore_index=True)

# 이상치 추가 후 다시 정규화
X_extended_scaled = scaler.fit_transform(df_extended.drop(columns=['target']))
df_extended_scaled = pd.DataFrame(X_extended_scaled, columns=data.feature_names)

# 새로운 X, y 정의
X = df_extended_scaled.values  # NumPy 배열로 변환
y = df_extended['target'].values  # 타겟 값 추출

# 기존 이상치 식별
df_zscores = df_scaled.apply(zscore)  # Z-score 계산
is_existing_outlier = (df_zscores.abs() > 3).any(axis=1)  # 절대값이 3 이상인 경우 이상치로 판단

total_existing_outliers = is_existing_outlier.sum()  # 기존 이상치 수
total_outliers = total_existing_outliers + num_outliers  # 총 이상치 수
total_samples = len(df_extended)  # 전체 샘플 수
total_outlier_percentage = (total_outliers / total_samples) * 100  # 이상치 비율 계산

# 데이터 확인
print(f"확장된 데이터셋 크기: {X.shape}")
print(f"확장된 타겟 크기: {y.shape}")
print(f"이상치가 포함된 데이터셋의 총 샘플 수: {len(X)}")
print(f"전체 이상치 비율: {total_outlier_percentage:.2f}%")


확장된 데이터셋 크기: (850, 18)
확장된 타겟 크기: (850,)
이상치가 포함된 데이터셋의 총 샘플 수: 850
전체 이상치 비율: 3.06%


In [114]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from scipy.stats import zscore
import random

random.seed(42)
np.random.seed(42)

# 데이터셋 로딩
data = fetch_openml(name='vehicle', version=1)
X = data.data
# Use LabelEncoder to convert categorical target to numerical
encoder = LabelEncoder()
y = encoder.fit_transform(data.target)
df = pd.DataFrame(X, columns=data.feature_names)
df['target'] = y  # 타겟 변수 추가


# 데이터 정규화
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
df_scaled = pd.DataFrame(X_scaled, columns=data.feature_names)

# 이상치 생성
num_outliers = int(0.027 * len(df))  # 이상치를 설정
outliers = []

for _ in range(num_outliers):
    outlier_sample = df_scaled.sample(n=1).copy()  # 임의의 샘플 선택
    for col in df_scaled.columns:
        if np.random.rand() > 0.5:
            # 최대값보다 큰 이상치 생성
            outlier_sample[col] = df_scaled[col].max() + np.random.rand() * 3
        else:
            # 최소값보다 작은 이상치 생성
            outlier_sample[col] = df_scaled[col].min() - np.random.rand() * 3
    outliers.append(outlier_sample)

outliers_df = pd.concat(outliers, ignore_index=True)

# 이상치의 타겟 값 랜덤 할당
outliers_df['target'] = np.random.choice(y, num_outliers)

# 원본 데이터에 이상치 추가
df_extended = pd.concat([df, outliers_df], ignore_index=True)

# 이상치 추가 후 다시 정규화
X_extended_scaled = scaler.fit_transform(df_extended.drop(columns=['target']))
df_extended_scaled = pd.DataFrame(X_extended_scaled, columns=data.feature_names)

# 새로운 X, y 정의
X = df_extended_scaled.values  # NumPy 배열로 변환
y = df_extended['target'].values  # 타겟 값 추출

# 기존 이상치 식별
df_zscores = df_scaled.apply(zscore)  # Z-score 계산
is_existing_outlier = (df_zscores.abs() > 3).any(axis=1)  # 절대값이 3 이상인 경우 이상치로 판단

total_existing_outliers = is_existing_outlier.sum()  # 기존 이상치 수
total_outliers = total_existing_outliers + num_outliers  # 총 이상치 수
total_samples = len(df_extended)  # 전체 샘플 수
total_outlier_percentage = (total_outliers / total_samples) * 100  # 이상치 비율 계산

# 데이터 확인
print(f"확장된 데이터셋 크기: {X.shape}")
print(f"확장된 타겟 크기: {y.shape}")
print(f"이상치가 포함된 데이터셋의 총 샘플 수: {len(X)}")
print(f"전체 이상치 비율: {total_outlier_percentage:.2f}%")


확장된 데이터셋 크기: (868, 18)
확장된 타겟 크기: (868,)
이상치가 포함된 데이터셋의 총 샘플 수: 868
전체 이상치 비율: 5.07%


In [170]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from scipy.stats import zscore
import random

random.seed(42)
np.random.seed(42)

# 데이터셋 로딩
data = fetch_openml(name='vehicle', version=1)
X = data.data
# Use LabelEncoder to convert categorical target to numerical
encoder = LabelEncoder()
y = encoder.fit_transform(data.target)
df = pd.DataFrame(X, columns=data.feature_names)
df['target'] = y  # 타겟 변수 추가


# 데이터 정규화
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
df_scaled = pd.DataFrame(X_scaled, columns=data.feature_names)

# 이상치 생성
num_outliers = int(0.083 * len(df))  # 이상치를 설정
outliers = []

for _ in range(num_outliers):
    outlier_sample = df_scaled.sample(n=1).copy()  # 임의의 샘플 선택
    for col in df_scaled.columns:
        if np.random.rand() > 0.5:
            # 최대값보다 큰 이상치 생성
            outlier_sample[col] = df_scaled[col].max() + np.random.rand() * 3
        else:
            # 최소값보다 작은 이상치 생성
            outlier_sample[col] = df_scaled[col].min() - np.random.rand() * 3
    outliers.append(outlier_sample)

outliers_df = pd.concat(outliers, ignore_index=True)

# 이상치의 타겟 값 랜덤 할당
outliers_df['target'] = np.random.choice(y, num_outliers)

# 원본 데이터에 이상치 추가
df_extended = pd.concat([df, outliers_df], ignore_index=True)

# 이상치 추가 후 다시 정규화
X_extended_scaled = scaler.fit_transform(df_extended.drop(columns=['target']))
df_extended_scaled = pd.DataFrame(X_extended_scaled, columns=data.feature_names)

# 새로운 X, y 정의
X = df_extended_scaled.values  # NumPy 배열로 변환
y = df_extended['target'].values  # 타겟 값 추출

# 기존 이상치 식별
df_zscores = df_scaled.apply(zscore)  # Z-score 계산
is_existing_outlier = (df_zscores.abs() > 3).any(axis=1)  # 절대값이 3 이상인 경우 이상치로 판단

total_existing_outliers = is_existing_outlier.sum()  # 기존 이상치 수
total_outliers = total_existing_outliers + num_outliers  # 총 이상치 수
total_samples = len(df_extended)  # 전체 샘플 수
total_outlier_percentage = (total_outliers / total_samples) * 100  # 이상치 비율 계산

# 데이터 확인
print(f"확장된 데이터셋 크기: {X.shape}")
print(f"확장된 타겟 크기: {y.shape}")
print(f"이상치가 포함된 데이터셋의 총 샘플 수: {len(X)}")
print(f"전체 이상치 비율: {total_outlier_percentage:.2f}%")


확장된 데이터셋 크기: (916, 18)
확장된 타겟 크기: (916,)
이상치가 포함된 데이터셋의 총 샘플 수: 916
전체 이상치 비율: 10.04%


In [189]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
from itertools import combinations

results = []

# 데이터 전처리 및 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=40, stratify=y )
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# OvO 방식을 사용하여 각 SVM 유형에 대해 학습 및 평가
for svm_type in ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']:
    models_ovo = train_ovo(X_train, y_train, C=1.0, svm_type=svm_type)
    y_pred_ovo = predict_ovo(X_test, models_ovo)

    # y_pred_ovo 변수를 정의한 후에 데이터 타입 확인 및 변환
    if y_test.dtype != np.int64:
        y_test = y_test.astype(int)

    if y_pred_ovo.dtype != np.int64:
        y_pred_ovo = y_pred_ovo.astype(int)

    accuracy_ovo = accuracy_score(y_test, y_pred_ovo)
    results.append((svm_type, round(accuracy_ovo * 100, 2)))

# 결과를 DataFrame으로 변환 및 출력
results_df = pd.DataFrame(results, columns=['SVM Type', 'Accuracy'])
print(results_df['Accuracy'].tolist())
[87.66, 86.68, 88.5, 87.24, 88.92, 88.78]

[88.64, 88.08, 89.62, 89.48, 89.34, 87.66]


In [None]:
import pandas as pd

svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']
accuracy_data = {
    '3.06%' : [25.49, 25.49, 78.43, 78.04, 78.82, 78.43],
    '5.07%' : [73.56, 77.01, 73.95, 73.56, 73.18, 72.80], #해야함
    '10.04%' : [70.18, 72.36, 70.91, 70.91, 65.82, 69.45] #해야함
}
# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Vehicle data (SMO)'

# 데이터프레임 출력
print(df)


In [70]:
import pandas as pd

svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']
accuracy_data = {
    '3.06%' : [76.86, 78.43, 77.65, 76.86, 72.55, 74.51],
    '5.07%' : [73.56, 77.01, 73.95, 73.56, 73.18, 72.80],
    '10.04%' : [70.18, 72.36, 70.91, 70.91, 65.82, 69.45]
}
# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Vehicle data (L-BFGS-B)'

# 데이터프레임 출력
print(df)


                         3.06%  5.07%  10.04%
Vehicle data (L-BFGS-B)                      
L1                       76.86  73.56   70.18
L2                       78.43  77.01   72.36
Fair                     77.65  73.95   70.91
Cauchy                   76.86  73.56   70.91
Welsch                   72.55  73.18   65.82
Geman-McClure            74.51  72.80   69.45


In [71]:
import pandas as pd

svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']
accuracy_data = {
    '3.06%' : [50.98, 47.84, 56.08, 47.06, 46.67, 61.57],
    '5.07%' : [49.04, 55.56, 51.72, 51.34, 49.04, 56.32],
    '10.04%' : [51.27, 42.91, 45.09, 53.45, 51.27, 45.09]
}

# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Vehicle data (GA)'

# 데이터프레임 출력
print(df)


                   3.06%  5.07%  10.04%
Vehicle data (GA)                      
L1                 50.98  49.04   51.27
L2                 47.84  55.56   42.91
Fair               56.08  51.72   45.09
Cauchy             47.06  51.34   53.45
Welsch             46.67  49.04   51.27
Geman-McClure      61.57  56.32   45.09


In [122]:
import pandas as pd

svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']
accuracy_data = {
    '3.06%' : [71.37, 69.02, 70.2, 72.55, 71.37, 70.59],
    '5.07%' : [67.82, 65.52, 67.82, 66.28, 68.97, 73.95],
    '10.04%' : [63.64, 65.09, 63.64, 67.64, 66.91, 65.82]
}
# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Vehicle data (PSO)'

# 데이터프레임 출력
print(df)


                    3.06%  5.07%  10.04%
Vehicle data (PSO)                      
L1                  71.37  67.82   63.64
L2                  69.02  65.52   65.09
Fair                70.20  67.82   63.64
Cauchy              72.55  66.28   67.64
Welsch              71.37  68.97   66.91
Geman-McClure       70.59  73.95   65.82


In [105]:
import pandas as pd

svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']
accuracy_data = {
    '3.06%' : [76.86, 76.08, 75.69, 77.65, 74.90, 74.12],
    '5.07%' : [73.56, 74.71, 72.8, 71.26, 66.28, 69.73],
    '10.04%' : [68.73, 70.18, 69.45, 70.55, 69.09, 69.82]
}

# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Vehicle data (ACO)'

# 데이터프레임 출력
print(df)


                    3.06%  5.07%  10.04%
Vehicle data (ACO)                      
L1                  76.86  73.56   68.73
L2                  76.08  74.71   70.18
Fair                75.69  72.80   69.45
Cauchy              77.65  71.26   70.55
Welsch              74.90  66.28   69.09
Geman-McClure       74.12  69.73   69.82


In [83]:
import pandas as pd

svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']
accuracy_data = {
    '3.06%' : [64.31, 49.41, 47.06, 59.61, 64.71, 64.31],
    '5.07%' : [50.57, 41.00, 55.56, 63.60, 59.00, 62.45],
    '10.04%' : [41.45, 43.27, 42.18, 53.82, 51.64, 56.00]
}

# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Vehicle data (HS)'
# 데이터프레임 출력
print(df)


                   3.06%  5.07%  10.04%
Vehicle data (HS)                      
L1                 64.31  50.57   41.45
L2                 49.41  41.00   43.27
Fair               47.06  55.56   42.18
Cauchy             59.61  63.60   53.82
Welsch             64.71  59.00   51.64
Geman-McClure      64.31  62.45   56.00


#**OvR with various outliers**

###iris data

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from scipy.stats import zscore

# 데이터셋 로딩
data = load_iris()
X = data.data
y = data.target
df = pd.DataFrame(X, columns=data.feature_names)
df['target'] = y  # 타겟 변수 추가

# 데이터 정규화
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
df_scaled = pd.DataFrame(X_scaled, columns=data.feature_names)

# 기존 이상치 식별
df_zscores = df_scaled.apply(zscore)  # Z-score 계산
is_existing_outlier = (df_zscores.abs() > 3).any(axis=1)  # 절대값이 3 이상인 경우 이상치로 판단

# 기존 이상치 수 계산
total_existing_outliers = is_existing_outlier.sum()

# 기존 이상치 비율 계산
total_samples = len(df)
existing_outlier_percentage = (total_existing_outliers / total_samples) * 100

print(f"기존 이상치 수: {total_existing_outliers}")
print(f"기존 이상치 비율: {existing_outlier_percentage:.2f}%")

기존 이상치 수: 1
기존 이상치 비율: 0.67%


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from scipy.stats import zscore

np.random.seed(42)

# 데이터셋 로딩
iris = load_iris()
X = iris.data
y = iris.target
df = pd.DataFrame(X, columns=iris.feature_names)
df['target'] = y  # 타겟 변수 추가

# 데이터 정규화
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
df_scaled = pd.DataFrame(X_scaled, columns=iris.feature_names)

# 이상치 생성
num_outliers = int(0.03 * len(df))  # 이상치를 설정
outliers = []

for _ in range(num_outliers):
    outlier_sample = df_scaled.sample(n=1).copy()  # 임의의 샘플 선택
    for col in df_scaled.columns:
        if np.random.rand() > 0.5:
            # 최대값보다 큰 이상치 생성
            outlier_sample[col] = df_scaled[col].max() + np.random.rand() * 3
        else:
            # 최소값보다 작은 이상치 생성
            outlier_sample[col] = df_scaled[col].min() - np.random.rand() * 3
    outliers.append(outlier_sample)

outliers_df = pd.concat(outliers, ignore_index=True)

# 이상치의 타겟 값 랜덤 할당
outliers_df['target'] = np.random.choice(y, num_outliers)

# 원본 데이터에 이상치 추가
df_extended = pd.concat([df, outliers_df], ignore_index=True)

# 이상치 추가 후 다시 정규화
X_extended_scaled = scaler.fit_transform(df_extended.drop(columns=['target']))
df_extended_scaled = pd.DataFrame(X_extended_scaled, columns=iris.feature_names)

# 새로운 X, y 정의
X = df_extended_scaled.values  # NumPy 배열로 변환
y = df_extended['target'].values  # 타겟 값 추출

# 기존 이상치 식별
df_zscores = df_scaled.apply(zscore)  # Z-score 계산
is_existing_outlier = (df_zscores.abs() > 3).any(axis=1)  # 절대값이 3 이상인 경우 이상치로 판단

total_existing_outliers = is_existing_outlier.sum()  # 기존 이상치 수
total_outliers = total_existing_outliers + num_outliers  # 총 이상치 수
total_samples = len(df_extended)  # 전체 샘플 수
total_outlier_percentage = (total_outliers / total_samples) * 100  # 이상치 비율 계산

# 데이터 확인
print(f"확장된 데이터셋 크기: {X.shape}")
print(f"확장된 타겟 크기: {y.shape}")
print(f"이상치가 포함된 데이터셋의 총 샘플 수: {len(X)}")
print(f"전체 이상치 비율: {total_outlier_percentage:.2f}%")


확장된 데이터셋 크기: (154, 4)
확장된 타겟 크기: (154,)
이상치가 포함된 데이터셋의 총 샘플 수: 154
전체 이상치 비율: 3.25%


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from scipy.stats import zscore

np.random.seed(42)

# 데이터셋 로딩
iris = load_iris()
X = iris.data
y = iris.target
df = pd.DataFrame(X, columns=iris.feature_names)
df['target'] = y  # 타겟 변수 추가

# 데이터 정규화
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
df_scaled = pd.DataFrame(X_scaled, columns=iris.feature_names)

# 이상치 생성
num_outliers = int(0.05 * len(df))  # 이상치를 설정
outliers = []

for _ in range(num_outliers):
    outlier_sample = df_scaled.sample(n=1).copy()  # 임의의 샘플 선택
    for col in df_scaled.columns:
        if np.random.rand() > 0.5:
            # 최대값보다 큰 이상치 생성
            outlier_sample[col] = df_scaled[col].max() + np.random.rand() * 3
        else:
            # 최소값보다 작은 이상치 생성
            outlier_sample[col] = df_scaled[col].min() - np.random.rand() * 3
    outliers.append(outlier_sample)

outliers_df = pd.concat(outliers, ignore_index=True)

# 이상치의 타겟 값 랜덤 할당
outliers_df['target'] = np.random.choice(y, num_outliers)

# 원본 데이터에 이상치 추가
df_extended = pd.concat([df, outliers_df], ignore_index=True)

# 이상치 추가 후 다시 정규화
X_extended_scaled = scaler.fit_transform(df_extended.drop(columns=['target']))
df_extended_scaled = pd.DataFrame(X_extended_scaled, columns=iris.feature_names)

# 새로운 X, y 정의
X = df_extended_scaled.values  # NumPy 배열로 변환
y = df_extended['target'].values  # 타겟 값 추출

# 기존 이상치 식별
df_zscores = df_scaled.apply(zscore)  # Z-score 계산
is_existing_outlier = (df_zscores.abs() > 3).any(axis=1)  # 절대값이 3 이상인 경우 이상치로 판단

total_existing_outliers = is_existing_outlier.sum()  # 기존 이상치 수
total_outliers = total_existing_outliers + num_outliers  # 총 이상치 수
total_samples = len(df_extended)  # 전체 샘플 수
total_outlier_percentage = (total_outliers / total_samples) * 100  # 이상치 비율 계산

# 데이터 확인
print(f"확장된 데이터셋 크기: {X.shape}")
print(f"확장된 타겟 크기: {y.shape}")
print(f"이상치가 포함된 데이터셋의 총 샘플 수: {len(X)}")
print(f"전체 이상치 비율: {total_outlier_percentage:.2f}%")


확장된 데이터셋 크기: (157, 4)
확장된 타겟 크기: (157,)
이상치가 포함된 데이터셋의 총 샘플 수: 157
전체 이상치 비율: 5.10%


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from scipy.stats import zscore

np.random.seed(42)

# 데이터셋 로딩
iris = load_iris()
X = iris.data
y = iris.target
df = pd.DataFrame(X, columns=iris.feature_names)
df['target'] = y  # 타겟 변수 추가

# 데이터 정규화
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
df_scaled = pd.DataFrame(X_scaled, columns=iris.feature_names)

# 이상치 생성
num_outliers = int(0.11 * len(df))  # 이상치를 설정
outliers = []

for _ in range(num_outliers):
    outlier_sample = df_scaled.sample(n=1).copy()  # 임의의 샘플 선택
    for col in df_scaled.columns:
        if np.random.rand() > 0.5:
            # 최대값보다 큰 이상치 생성
            outlier_sample[col] = df_scaled[col].max() + np.random.rand() * 3
        else:
            # 최소값보다 작은 이상치 생성
            outlier_sample[col] = df_scaled[col].min() - np.random.rand() * 3
    outliers.append(outlier_sample)

outliers_df = pd.concat(outliers, ignore_index=True)

# 이상치의 타겟 값 랜덤 할당
outliers_df['target'] = np.random.choice(y, num_outliers)

# 원본 데이터에 이상치 추가
df_extended = pd.concat([df, outliers_df], ignore_index=True)

# 이상치 추가 후 다시 정규화
X_extended_scaled = scaler.fit_transform(df_extended.drop(columns=['target']))
df_extended_scaled = pd.DataFrame(X_extended_scaled, columns=iris.feature_names)

# 새로운 X, y 정의
X = df_extended_scaled.values  # NumPy 배열로 변환
y = df_extended['target'].values  # 타겟 값 추출

# 기존 이상치 식별
df_zscores = df_scaled.apply(zscore)  # Z-score 계산
is_existing_outlier = (df_zscores.abs() > 3).any(axis=1)  # 절대값이 3 이상인 경우 이상치로 판단

total_existing_outliers = is_existing_outlier.sum()  # 기존 이상치 수
total_outliers = total_existing_outliers + num_outliers  # 총 이상치 수
total_samples = len(df_extended)  # 전체 샘플 수
total_outlier_percentage = (total_outliers / total_samples) * 100  # 이상치 비율 계산

# 데이터 확인
print(f"확장된 데이터셋 크기: {X.shape}")
print(f"확장된 타겟 크기: {y.shape}")
print(f"이상치가 포함된 데이터셋의 총 샘플 수: {len(X)}")
print(f"전체 이상치 비율: {total_outlier_percentage:.2f}%")


확장된 데이터셋 크기: (166, 4)
확장된 타겟 크기: (166,)
이상치가 포함된 데이터셋의 총 샘플 수: 166
전체 이상치 비율: 10.24%


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
from itertools import combinations

results = []

# 데이터 전처리 및 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# OvR 방식을 사용하여 각 SVM 유형에 대해 학습 및 평가
for svm_type in ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']:
    models_ovr = train_ovr(X_train, y_train, C=1.0, svm_type=svm_type)
    y_pred_ovr = predict_ovr(X_test, models_ovr)

    # y_pred_ovr 변수를 정의한 후에 데이터 타입 확인 및 변환
    if y_test.dtype != np.int64:
        y_test = y_test.astype(int)

    if y_pred_ovr.dtype != np.int64:
        y_pred_ovr = y_pred_ovr.astype(int)

    accuracy_ovr = accuracy_score(y_test, y_pred_ovr)
    results.append((svm_type, round(accuracy_ovr * 100, 2)))

# 결과를 DataFrame으로 변환 및 출력
results_df = pd.DataFrame(results, columns=['SVM Type', 'Accuracy'])
print(results_df)

        SVM Type  Accuracy
0             L1      64.0
1             L2      66.0
2           Fair      64.0
3         Cauchy      68.0
4         Welsch      64.0
5  Geman-McClure      64.0


In [None]:
import pandas as pd

svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']
accuracy_data = {
    '3.25%' : [82.98, 91.49, 89.36, 85.11, 72.34, 78.72],
    '5.10%' : [87.50, 93.75, 91.67, 89.58, 68.75, 68.75],
    '10.24%' : [66.0, 64.0, 64.0, 76.0, 70.0, 68.0]
}

# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Iris data (L-BFGS-B)'

# 데이터프레임 출력
print(df)

                      3.25%  5.10%  10.24%
Iris data (L-BFGS-B)                      
L1                    82.98  87.50    66.0
L2                    91.49  93.75    64.0
Fair                  89.36  91.67    64.0
Cauchy                85.11  89.58    76.0
Welsch                72.34  68.75    70.0
Geman-McClure         78.72  68.75    68.0


In [None]:
import pandas as pd

outlier = ['3%', '5%', '10%']
svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']

accuracy_data = {
    '3.25%' : [74.47, 80.85, 76.60, 76.60, 74.47, 74.47],
    '5.10%' : [70.83, 93.75, 79.17, 72.92, 70.83, 64.58],
    '10.24%' : [64.0, 76.0, 74.0, 64.0, 64.0, 64.0]
}

# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Iris data (GA)'

# 데이터프레임 출력
print(df)

                3.25%  5.10%  10.24%
Iris data (GA)                      
L1              74.47  70.83    64.0
L2              80.85  93.75    76.0
Fair            76.60  79.17    74.0
Cauchy          76.60  72.92    64.0
Welsch          74.47  70.83    64.0
Geman-McClure   74.47  64.58    64.0


In [None]:
import pandas as pd

outlier = ['3%', '5%', '10%']
svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']

accuracy_data = {
    '3.25%' : [31.91,31.91, 87.23, 87.23, 87.23, 87.23],
    '5.10%' : [29.17, 29.17, 89.58, 89.58, 89.58, 89.58],
    '10.24%' : [34.0, 34.0, 70.0, 70.0, 70.0, 70.0]
}

# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Iris data (SMO)'

# 데이터프레임 출력
print(df)

                 3.25%  5.10%  10.24%
Iris data (SMO)                      
L1               31.91  29.17    34.0
L2               31.91  29.17    34.0
Fair             87.23  89.58    70.0
Cauchy           87.23  89.58    70.0
Welsch           87.23  89.58    70.0
Geman-McClure    87.23  89.58    70.0


In [None]:
import pandas as pd

outlier = ['3%', '5%', '10%']
svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']

accuracy_data = {
    '3.25%' : [82.98, 91.49, 89.36, 85.11, 74.47, 78.72],
    '5.10%' : [85.42, 93.75, 91.67, 89.58, 60.42, 68.75],
    '10.24%' : [66.0, 64.0, 64.0, 76.0, 64.0, 66.0]
}


# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Iris data (PSO)'

# 데이터프레임 출력
print(df)

                 3.25%  5.10%  10.24%
Iris data (PSO)                      
L1               82.98  85.42    66.0
L2               91.49  93.75    64.0
Fair             89.36  91.67    64.0
Cauchy           85.11  89.58    76.0
Welsch           74.47  60.42    64.0
Geman-McClure    78.72  68.75    66.0


In [None]:
import pandas as pd

outlier = ['3%', '5%', '10%']
svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']

accuracy_data = {
    '3.25%' : [82.98, 91.49, 87.23, 85.11, 74.47, 78.72],
    '5.10%' : [87.50, 93.75, 91.67, 89.58, 66.67, 72.92],
    '10.24%' : [66.0, 64.0, 64.0, 80.0, 64.0, 66.0]
}

# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Iris data (ACO)'

# 데이터프레임 출력
print(df)

                 3.25%  5.10%  10.24%
Iris data (ACO)                      
L1               82.98  87.50    66.0
L2               91.49  93.75    64.0
Fair             87.23  91.67    64.0
Cauchy           85.11  89.58    80.0
Welsch           74.47  66.67    64.0
Geman-McClure    78.72  72.92    66.0


In [None]:
import pandas as pd

svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']
accuracy_data = {
    '3.25%' : [91.49, 87.23, 95.74, 95.74, 91.49, 89.36],
    '5.10%' : [78.72, 85.11, 80.85, 80.85, 74.47, 65.96],
    '10.24%' : [64.0, 66.0, 64.0, 68.0, 64.0, 64.0]
}

# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Iris data (HS)'


# 데이터프레임 출력
print(df)

                3.25%  5.10%  10.24%
Iris data (HS)                      
L1              91.49  78.72    64.0
L2              87.23  85.11    66.0
Fair            95.74  80.85    64.0
Cauchy          95.74  80.85    68.0
Welsch          91.49  74.47    64.0
Geman-McClure   89.36  65.96    64.0


###segment data

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler, LabelEncoder
from scipy.stats import zscore

np.random.seed(42)

# 데이터셋 로딩
data = fetch_openml(name='segment', version=1)
X = data.data
# Use LabelEncoder to convert categorical target to numerical
encoder = LabelEncoder()
y = encoder.fit_transform(data.target)
df = pd.DataFrame(X, columns=data.feature_names)
df['target'] = y  # 타겟 변수 추가

# 데이터 정규화
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
df_scaled = pd.DataFrame(X_scaled, columns=data.feature_names)

# 기존 이상치 식별
df_zscores = df_scaled.apply(zscore)  # Z-score 계산
is_existing_outlier = (df_zscores.abs() > 3).any(axis=1)  # 절대값이 3 이상인 경우 이상치로 판단

# 기존 이상치 수 계산
total_existing_outliers = is_existing_outlier.sum()

# 기존 이상치 비율 계산
total_samples = len(df)
existing_outlier_percentage = (total_existing_outliers / total_samples) * 100

print(f"기존 이상치 수: {total_existing_outliers}")
print(f"기존 이상치 비율: {existing_outlier_percentage:.2f}%")

기존 이상치 수: 175
기존 이상치 비율: 7.58%


In [190]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler, LabelEncoder
from scipy.stats import zscore

np.random.seed(42)

# 데이터셋 로딩
data = fetch_openml(name='segment', version=1)
X = data.data
# Use LabelEncoder to convert categorical target to numerical
encoder = LabelEncoder()
y = encoder.fit_transform(data.target)
df = pd.DataFrame(X, columns=data.feature_names)
df['target'] = y  # 타겟 변수 추가

# 데이터 정규화
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
df_scaled = pd.DataFrame(X_scaled, columns=data.feature_names)

# 이상치 생성
num_outliers = int(0.028 * len(df))  # 이상치를 설정
outliers = []

for _ in range(num_outliers):
    outlier_sample = df_scaled.sample(n=1).copy()  # 임의의 샘플 선택
    for col in df_scaled.columns:
        if np.random.rand() > 0.5:
            # 최대값보다 큰 이상치 생성
            outlier_sample[col] = df_scaled[col].max() + np.random.rand() * 3
        else:
            # 최소값보다 작은 이상치 생성
            outlier_sample[col] = df_scaled[col].min() - np.random.rand() * 3
    outliers.append(outlier_sample)

outliers_df = pd.concat(outliers, ignore_index=True)

# 이상치의 타겟 값 랜덤 할당
outliers_df['target'] = np.random.choice(y, num_outliers)

# 원본 데이터에 이상치 추가
df_extended = pd.concat([df, outliers_df], ignore_index=True)

# 이상치 추가 후 다시 정규화
X_extended_scaled = scaler.fit_transform(df_extended.drop(columns=['target']))
df_extended_scaled = pd.DataFrame(X_extended_scaled, columns=data.feature_names)

# 새로운 X, y 정의
X = df_extended_scaled.values  # NumPy 배열로 변환
y = df_extended['target'].values  # 타겟 값 추출

# 기존 이상치 식별
df_zscores = df_scaled.apply(zscore)  # Z-score 계산
is_existing_outlier = (df_zscores.abs() > 3).any(axis=1)  # 절대값이 3 이상인 경우 이상치로 판단

total_existing_outliers = is_existing_outlier.sum()  # 기존 이상치 수
total_outliers = total_existing_outliers + num_outliers  # 총 이상치 수
total_samples = len(df_extended)  # 전체 샘플 수
total_outlier_percentage = (total_outliers / total_samples) * 100  # 이상치 비율 계산

# 데이터 확인
print(f"확장된 데이터셋 크기: {X.shape}")
print(f"확장된 타겟 크기: {y.shape}")
print(f"이상치가 포함된 데이터셋의 총 샘플 수: {len(X)}")
print(f"전체 이상치 비율: {total_outlier_percentage:.2f}%")


확장된 데이터셋 크기: (2374, 19)
확장된 타겟 크기: (2374,)
이상치가 포함된 데이터셋의 총 샘플 수: 2374
전체 이상치 비율: 10.07%


In [193]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
from itertools import combinations

results = []

# 데이터 전처리 및 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# OvR 방식을 사용하여 각 SVM 유형에 대해 학습 및 평가
for svm_type in ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']:
    models_ovr = train_ovr(X_train, y_train, C=1.0, svm_type=svm_type)
    y_pred_ovr = predict_ovr(X_test, models_ovr)

    # y_pred_ovr 변수를 정의한 후에 데이터 타입 확인 및 변환
    if y_test.dtype != np.int64:
        y_test = y_test.astype(int)

    if y_pred_ovr.dtype != np.int64:
        y_pred_ovr = y_pred_ovr.astype(int)

    accuracy_ovr = accuracy_score(y_test, y_pred_ovr)
    results.append((svm_type, round(accuracy_ovr * 100, 2)))

# 결과를 DataFrame으로 변환 및 출력
results_df = pd.DataFrame(results, columns=['SVM Type', 'Accuracy'])
print(results_df)

        SVM Type  Accuracy
0             L1     67.18
1             L2     50.91
2           Fair     60.59
3         Cauchy     68.86
4         Welsch     63.25
5  Geman-McClure     64.66


In [None]:
import pandas as pd

svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']
accuracy_data = {
    '7.58%' : [90.91, 90.76, 90.19, 90.76, 89.47, 89.47],
    '10.07%' : [89.06, 89.76, 89.48, 90.32, 84.85, 88.22]
}

# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Segment data (L-BFGS-B)'

# 데이터프레임 출력
print(df)

                         7.58%  10.07%
Segment data (L-BFGS-B)               
L1                       90.91   89.06
L2                       90.76   89.76
Fair                     90.19   89.48
Cauchy                   90.76   90.32
Welsch                   89.47   84.85
Geman-McClure            89.47   88.22


In [None]:
import pandas as pd

svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']
accuracy_data = {
    '7.58%' : [69.99, 70.56, 76.19, 57.14, 36.36, 52.67],
    '10.07%' : [75.60, 68.44, 66.06, 64.38, 53.58, 67.32]
}


# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Segment data (GA)'

# 데이터프레임 출력
print(df)

                   7.58%  10.07%
Segment data (GA)               
L1                 69.99   75.60
L2                 70.56   68.44
Fair               76.19   66.06
Cauchy             57.14   64.38
Welsch             36.36   53.58
Geman-McClure      52.67   67.32


In [10]:
import pandas as pd

svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']
accuracy_data = {
    '7.58%' : [12.41, 12.41, 91.05, 90.62, 90.76, 91.05],
    '10.07%' : [14.87, 14.87, 91.02, 90.46, 91.16, 90.74]
}

# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Segment data (SMO)'

# 데이터프레임 출력
print(df)

                    7.58%  10.07%
Segment data (SMO)               
L1                  12.41   14.87
L2                  12.41   14.87
Fair                91.05   91.02
Cauchy              90.62   90.46
Welsch              90.76   91.16
Geman-McClure       91.05   90.74


In [None]:
import pandas as pd

svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']
accuracy_data = {
    '7.58%' : [85.71, 87.88, 87.59, 83.12, 71.57, 83.84],
    '10.07%' : [82.75, 83.31, 85.41, 86.96, 76.72, 74.19]
}

# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Segment data (PSO)'

# 데이터프레임 출력
print(df)

                    7.58%  10.07%
Segment data (PSO)               
L1                  85.71   82.75
L2                  87.88   83.31
Fair                87.59   85.41
Cauchy              83.12   86.96
Welsch              71.57   76.72
Geman-McClure       83.84   74.19


In [None]:
import pandas as pd

svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']
accuracy_data = {
    '7.58%' : [89.75, 90.04, 90.62, 89.18, 89.03, 86.72],
    '10.07%' : [88.36, 89.48, 88.50, 88.78, 84.29, 86.82]
}

# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Segment data (ACO)'

# 데이터프레임 출력
print(df)

                    7.58%  10.07%
Segment data (ACO)               
L1                  89.75   88.36
L2                  90.04   89.48
Fair                90.62   88.50
Cauchy              89.18   88.78
Welsch              89.03   84.29
Geman-McClure       86.72   86.82


In [None]:
import pandas as pd

svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']
accuracy_data = {
    '7.58%' : [68.25, 54.83, 61.04, 55.56, 55.12, 53.54],
    '10.07%' : [67.18, 50.91, 60.59, 68.86, 63.25, 64.66]
}

# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Segment data (HS)'

# 데이터프레임 출력
print(df)

                   7.58%  10.07%
Segment data (HS)               
L1                 68.25   67.18
L2                 54.83   50.91
Fair               61.04   60.59
Cauchy             55.56   68.86
Welsch             55.12   63.25
Geman-McClure      53.54   64.66


### vehicle data

In [132]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler, LabelEncoder
from scipy.stats import zscore

np.random.seed(42)

# 데이터셋 로딩
data = fetch_openml(name='vehicle', version=1)
X = data.data
encoder = LabelEncoder()
y = encoder.fit_transform(data.target)
df = pd.DataFrame(X, columns=data.feature_names)
df['target'] = y  # 타겟 변수 추가

# 데이터 정규화
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
df_scaled = pd.DataFrame(X_scaled, columns=data.feature_names)

# 기존 이상치 식별
df_zscores = df_scaled.apply(zscore)  # Z-score 계산
is_existing_outlier = (df_zscores.abs() > 3).any(axis=1)  # 절대값이 3 이상인 경우 이상치로 판단

# 기존 이상치 수 계산
total_existing_outliers = is_existing_outlier.sum()

# 기존 이상치 비율 계산
total_samples = len(df)
existing_outlier_percentage = (total_existing_outliers / total_samples) * 100

print(f"기존 이상치 수: {total_existing_outliers}")
print(f"기존 이상치 비율: {existing_outlier_percentage:.2f}%")

기존 이상치 수: 22
기존 이상치 비율: 2.60%


In [153]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from scipy.stats import zscore

np.random.seed(42)

# 데이터셋 로딩
data = fetch_openml(name='vehicle', version=1)
X = data.data
# Use LabelEncoder to convert categorical target to numerical
encoder = LabelEncoder()
y = encoder.fit_transform(data.target)
df = pd.DataFrame(X, columns=data.feature_names)
df['target'] = y  # 타겟 변수 추가


# 데이터 정규화
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
df_scaled = pd.DataFrame(X_scaled, columns=data.feature_names)

# 이상치 생성
num_outliers = int(0.0048 * len(df))  # 이상치를 설정
outliers = []

for _ in range(num_outliers):
    outlier_sample = df_scaled.sample(n=1).copy()  # 임의의 샘플 선택
    for col in df_scaled.columns:
        if np.random.rand() > 0.5:
            # 최대값보다 큰 이상치 생성
            outlier_sample[col] = df_scaled[col].max() + np.random.rand() * 3
        else:
            # 최소값보다 작은 이상치 생성
            outlier_sample[col] = df_scaled[col].min() - np.random.rand() * 3
    outliers.append(outlier_sample)

outliers_df = pd.concat(outliers, ignore_index=True)

# 이상치의 타겟 값 랜덤 할당
outliers_df['target'] = np.random.choice(y, num_outliers)

# 원본 데이터에 이상치 추가
df_extended = pd.concat([df, outliers_df], ignore_index=True)

# 이상치 추가 후 다시 정규화
X_extended_scaled = scaler.fit_transform(df_extended.drop(columns=['target']))
df_extended_scaled = pd.DataFrame(X_extended_scaled, columns=data.feature_names)

# 새로운 X, y 정의
X = df_extended_scaled.values  # NumPy 배열로 변환
y = df_extended['target'].values  # 타겟 값 추출

# 기존 이상치 식별
df_zscores = df_scaled.apply(zscore)  # Z-score 계산
is_existing_outlier = (df_zscores.abs() > 3).any(axis=1)  # 절대값이 3 이상인 경우 이상치로 판단

total_existing_outliers = is_existing_outlier.sum()  # 기존 이상치 수
total_outliers = total_existing_outliers + num_outliers  # 총 이상치 수
total_samples = len(df_extended)  # 전체 샘플 수
total_outlier_percentage = (total_outliers / total_samples) * 100  # 이상치 비율 계산

# 데이터 확인
print(f"확장된 데이터셋 크기: {X.shape}")
print(f"확장된 타겟 크기: {y.shape}")
print(f"이상치가 포함된 데이터셋의 총 샘플 수: {len(X)}")
print(f"전체 이상치 비율: {total_outlier_percentage:.2f}%")


확장된 데이터셋 크기: (850, 18)
확장된 타겟 크기: (850,)
이상치가 포함된 데이터셋의 총 샘플 수: 850
전체 이상치 비율: 3.06%


In [148]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from scipy.stats import zscore

np.random.seed(42)

# 데이터셋 로딩
data = fetch_openml(name='vehicle', version=1)
X = data.data
# Use LabelEncoder to convert categorical target to numerical
encoder = LabelEncoder()
y = encoder.fit_transform(data.target)
df = pd.DataFrame(X, columns=data.feature_names)
df['target'] = y  # 타겟 변수 추가


# 데이터 정규화
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
df_scaled = pd.DataFrame(X_scaled, columns=data.feature_names)

# 이상치 생성
num_outliers = int(0.027 * len(df))  # 이상치를 설정
outliers = []

for _ in range(num_outliers):
    outlier_sample = df_scaled.sample(n=1).copy()  # 임의의 샘플 선택
    for col in df_scaled.columns:
        if np.random.rand() > 0.5:
            # 최대값보다 큰 이상치 생성
            outlier_sample[col] = df_scaled[col].max() + np.random.rand() * 3
        else:
            # 최소값보다 작은 이상치 생성
            outlier_sample[col] = df_scaled[col].min() - np.random.rand() * 3
    outliers.append(outlier_sample)

outliers_df = pd.concat(outliers, ignore_index=True)

# 이상치의 타겟 값 랜덤 할당
outliers_df['target'] = np.random.choice(y, num_outliers)

# 원본 데이터에 이상치 추가
df_extended = pd.concat([df, outliers_df], ignore_index=True)

# 이상치 추가 후 다시 정규화
X_extended_scaled = scaler.fit_transform(df_extended.drop(columns=['target']))
df_extended_scaled = pd.DataFrame(X_extended_scaled, columns=data.feature_names)

# 새로운 X, y 정의
X = df_extended_scaled.values  # NumPy 배열로 변환
y = df_extended['target'].values  # 타겟 값 추출

# 기존 이상치 식별
df_zscores = df_scaled.apply(zscore)  # Z-score 계산
is_existing_outlier = (df_zscores.abs() > 3).any(axis=1)  # 절대값이 3 이상인 경우 이상치로 판단

total_existing_outliers = is_existing_outlier.sum()  # 기존 이상치 수
total_outliers = total_existing_outliers + num_outliers  # 총 이상치 수
total_samples = len(df_extended)  # 전체 샘플 수
total_outlier_percentage = (total_outliers / total_samples) * 100  # 이상치 비율 계산

# 데이터 확인
print(f"확장된 데이터셋 크기: {X.shape}")
print(f"확장된 타겟 크기: {y.shape}")
print(f"이상치가 포함된 데이터셋의 총 샘플 수: {len(X)}")
print(f"전체 이상치 비율: {total_outlier_percentage:.2f}%")


확장된 데이터셋 크기: (868, 18)
확장된 타겟 크기: (868,)
이상치가 포함된 데이터셋의 총 샘플 수: 868
전체 이상치 비율: 5.07%


In [141]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from scipy.stats import zscore

np.random.seed(42)

# 데이터셋 로딩
data = fetch_openml(name='vehicle', version=1)
X = data.data
# Use LabelEncoder to convert categorical target to numerical
encoder = LabelEncoder()
y = encoder.fit_transform(data.target)
df = pd.DataFrame(X, columns=data.feature_names)
df['target'] = y  # 타겟 변수 추가


# 데이터 정규화
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
df_scaled = pd.DataFrame(X_scaled, columns=data.feature_names)

# 이상치 생성
num_outliers = int(0.083 * len(df))  # 이상치를 설정
outliers = []

for _ in range(num_outliers):
    outlier_sample = df_scaled.sample(n=1).copy()  # 임의의 샘플 선택
    for col in df_scaled.columns:
        if np.random.rand() > 0.5:
            # 최대값보다 큰 이상치 생성
            outlier_sample[col] = df_scaled[col].max() + np.random.rand() * 3
        else:
            # 최소값보다 작은 이상치 생성
            outlier_sample[col] = df_scaled[col].min() - np.random.rand() * 3
    outliers.append(outlier_sample)

outliers_df = pd.concat(outliers, ignore_index=True)

# 이상치의 타겟 값 랜덤 할당
outliers_df['target'] = np.random.choice(y, num_outliers)

# 원본 데이터에 이상치 추가
df_extended = pd.concat([df, outliers_df], ignore_index=True)

# 이상치 추가 후 다시 정규화
X_extended_scaled = scaler.fit_transform(df_extended.drop(columns=['target']))
df_extended_scaled = pd.DataFrame(X_extended_scaled, columns=data.feature_names)

# 새로운 X, y 정의
X = df_extended_scaled.values  # NumPy 배열로 변환
y = df_extended['target'].values  # 타겟 값 추출

# 기존 이상치 식별
df_zscores = df_scaled.apply(zscore)  # Z-score 계산
is_existing_outlier = (df_zscores.abs() > 3).any(axis=1)  # 절대값이 3 이상인 경우 이상치로 판단

total_existing_outliers = is_existing_outlier.sum()  # 기존 이상치 수
total_outliers = total_existing_outliers + num_outliers  # 총 이상치 수
total_samples = len(df_extended)  # 전체 샘플 수
total_outlier_percentage = (total_outliers / total_samples) * 100  # 이상치 비율 계산

# 데이터 확인
print(f"확장된 데이터셋 크기: {X.shape}")
print(f"확장된 타겟 크기: {y.shape}")
print(f"이상치가 포함된 데이터셋의 총 샘플 수: {len(X)}")
print(f"전체 이상치 비율: {total_outlier_percentage:.2f}%")


확장된 데이터셋 크기: (916, 18)
확장된 타겟 크기: (916,)
이상치가 포함된 데이터셋의 총 샘플 수: 916
전체 이상치 비율: 10.04%


In [161]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
from itertools import combinations

results = []

# 데이터 전처리 및 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# OvO 방식을 사용하여 각 SVM 유형에 대해 학습 및 평가
for svm_type in ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']:
    models_ovo = train_ovo(X_train, y_train, C=1.0, svm_type=svm_type)
    y_pred_ovo = predict_ovo(X_test, models_ovo)

    # y_pred_ovo 변수를 정의한 후에 데이터 타입 확인 및 변환
    if y_test.dtype != np.int64:
        y_test = y_test.astype(int)

    if y_pred_ovo.dtype != np.int64:
        y_pred_ovo = y_pred_ovo.astype(int)

    accuracy_ovo = accuracy_score(y_test, y_pred_ovo)
    results.append((svm_type, round(accuracy_ovo * 100, 2)))

# 결과를 DataFrame으로 변환 및 출력
results_df = pd.DataFrame(results, columns=['SVM Type', 'Accuracy'])
print(results_df['Accuracy'].tolist())

[76.47, 72.16, 72.94, 71.76, 67.45, 68.24]


In [None]:
import pandas as pd

svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']
accuracy_data = {
    '3.06%' : [71.37, 69.02, 70.2, 72.55, 71.37, 70.59],
    '5.07%' : [73.56, 77.01, 73.95, 73.56, 73.18, 72.80],
    '10.04%' : [70.18, 72.36, 70.91, 70.91, 65.82, 69.45]
}
# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Vehicle data (ACO)'

# 데이터프레임 출력
print(df)


In [None]:
import pandas as pd

svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']
accuracy_data = {
    '3.06%' : [25.49, 25.49, 78.43, 78.04, 78.82, 78.43],
    '5.07%' : [21.07, 21.07, 78.54, 77.78, 77.78, 78.93],
    '10.04%' : [70.18, 72.36, 70.91, 70.91, 65.82, 69.45]#하기
}
# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Vehicle data (L-BFGS-B)'

# 데이터프레임 출력
print(df)


In [None]:
import pandas as pd

svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']
accuracy_data = {
    '3.06%' : [76.86, 78.43, 77.65, 76.86, 72.55, 74.51],
    '5.07%' : [73.56, 77.01, 73.95, 73.56, 73.18, 72.80],
    '10.04%' : [70.18, 72.36, 70.91, 70.91, 65.82, 69.45]
}
# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Vehicle data (L-BFGS-B)'

# 데이터프레임 출력
print(df)


In [None]:
import pandas as pd

svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']
accuracy_data = {
    '3.06%' : [76.86, 78.43, 77.65, 76.86, 72.55, 74.51],
    '5.07%' : [73.56, 77.01, 73.95, 73.56, 73.18, 72.80],
    '10.04%' : [70.18, 72.36, 70.91, 70.91, 65.82, 69.45]
}
# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Vehicle data (HS)'

# 데이터프레임 출력
print(df)


In [None]:
import pandas as pd

svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']
accuracy_data = {
    '3.06%' : [76.86, 78.43, 77.65, 76.86, 72.55, 74.51],
    '5.07%' : [73.56, 77.01, 73.95, 73.56, 73.18, 72.80],
    '10.04%' : [70.18, 72.36, 70.91, 70.91, 65.82, 69.45]
}
# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Vehicle data (SMO)'

# 데이터프레임 출력
print(df)


In [None]:
import pandas as pd

svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']
accuracy_data = {
    '3.06%' : [76.86, 78.43, 77.65, 76.86, 72.55, 74.51],
    '5.07%' : [73.56, 77.01, 73.95, 73.56, 73.18, 72.80],
    '10.04%' : [70.18, 72.36, 70.91, 70.91, 65.82, 69.45]
}
# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Vehicle data (GA)'

# 데이터프레임 출력
print(df)


In [143]:
import pandas as pd

svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']
accuracy_data = {
    '3.06%' : [71.37, 69.02, 70.2, 72.55, 71.37, 70.59],
    '5.07%' : [67.82, 65.52, 67.82, 66.28, 68.97, 73.95],
    '10.04%' : [63.64, 65.09, 63.64, 67.64, 66.91, 65.82]
}
# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Vehicle data (PSO)'

# 데이터프레임 출력
print(df)


                    3.06%  5.07%  10.04%
Vehicle data (PSO)                      
L1                  71.37  67.82   63.64
L2                  69.02  65.52   65.09
Fair                70.20  67.82   63.64
Cauchy              72.55  66.28   67.64
Welsch              71.37  68.97   66.91
Geman-McClure       70.59  73.95   65.82


#**Direct with various outliers**

###iris data

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from scipy.stats import zscore

# 데이터셋 로딩
data = load_iris()
X = data.data
y = data.target
df = pd.DataFrame(X, columns=data.feature_names)
df['target'] = y  # 타겟 변수 추가

# 데이터 정규화
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
df_scaled = pd.DataFrame(X_scaled, columns=data.feature_names)

# 기존 이상치 식별
df_zscores = df_scaled.apply(zscore)  # Z-score 계산
is_existing_outlier = (df_zscores.abs() > 3).any(axis=1)  # 절대값이 3 이상인 경우 이상치로 판단

# 기존 이상치 수 계산
total_existing_outliers = is_existing_outlier.sum()

# 기존 이상치 비율 계산
total_samples = len(df)
existing_outlier_percentage = (total_existing_outliers / total_samples) * 100

print(f"기존 이상치 수: {total_existing_outliers}")
print(f"기존 이상치 비율: {existing_outlier_percentage:.2f}%")

기존 이상치 수: 1
기존 이상치 비율: 0.67%


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from scipy.stats import zscore

np.random.seed(42)

# 데이터셋 로딩
iris = load_iris()
X = iris.data
y = iris.target
df = pd.DataFrame(X, columns=iris.feature_names)
df['target'] = y  # 타겟 변수 추가

# 데이터 정규화
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
df_scaled = pd.DataFrame(X_scaled, columns=iris.feature_names)

# 이상치 생성
num_outliers = int(0.03 * len(df))  # 이상치를 설정
outliers = []

for _ in range(num_outliers):
    outlier_sample = df_scaled.sample(n=1).copy()  # 임의의 샘플 선택
    for col in df_scaled.columns:
        if np.random.rand() > 0.5:
            # 최대값보다 큰 이상치 생성
            outlier_sample[col] = df_scaled[col].max() + np.random.rand() * 3
        else:
            # 최소값보다 작은 이상치 생성
            outlier_sample[col] = df_scaled[col].min() - np.random.rand() * 3
    outliers.append(outlier_sample)

outliers_df = pd.concat(outliers, ignore_index=True)

# 이상치의 타겟 값 랜덤 할당
outliers_df['target'] = np.random.choice(y, num_outliers)

# 원본 데이터에 이상치 추가
df_extended = pd.concat([df, outliers_df], ignore_index=True)

# 이상치 추가 후 다시 정규화
X_extended_scaled = scaler.fit_transform(df_extended.drop(columns=['target']))
df_extended_scaled = pd.DataFrame(X_extended_scaled, columns=iris.feature_names)

# 새로운 X, y 정의
X = df_extended_scaled.values  # NumPy 배열로 변환
y = df_extended['target'].values  # 타겟 값 추출

# 기존 이상치 식별
df_zscores = df_scaled.apply(zscore)  # Z-score 계산
is_existing_outlier = (df_zscores.abs() > 3).any(axis=1)  # 절대값이 3 이상인 경우 이상치로 판단

total_existing_outliers = is_existing_outlier.sum()  # 기존 이상치 수
total_outliers = total_existing_outliers + num_outliers  # 총 이상치 수
total_samples = len(df_extended)  # 전체 샘플 수
total_outlier_percentage = (total_outliers / total_samples) * 100  # 이상치 비율 계산

# 데이터 확인
print(f"확장된 데이터셋 크기: {X.shape}")
print(f"확장된 타겟 크기: {y.shape}")
print(f"이상치가 포함된 데이터셋의 총 샘플 수: {len(X)}")
print(f"전체 이상치 비율: {total_outlier_percentage:.2f}%")


확장된 데이터셋 크기: (154, 4)
확장된 타겟 크기: (154,)
이상치가 포함된 데이터셋의 총 샘플 수: 154
전체 이상치 비율: 3.25%


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from scipy.stats import zscore

np.random.seed(42)

# 데이터셋 로딩
iris = load_iris()
X = iris.data
y = iris.target
df = pd.DataFrame(X, columns=iris.feature_names)
df['target'] = y  # 타겟 변수 추가

# 데이터 정규화
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
df_scaled = pd.DataFrame(X_scaled, columns=iris.feature_names)

# 이상치 생성
num_outliers = int(0.05 * len(df))  # 이상치를 설정
outliers = []

for _ in range(num_outliers):
    outlier_sample = df_scaled.sample(n=1).copy()  # 임의의 샘플 선택
    for col in df_scaled.columns:
        if np.random.rand() > 0.5:
            # 최대값보다 큰 이상치 생성
            outlier_sample[col] = df_scaled[col].max() + np.random.rand() * 3
        else:
            # 최소값보다 작은 이상치 생성
            outlier_sample[col] = df_scaled[col].min() - np.random.rand() * 3
    outliers.append(outlier_sample)

outliers_df = pd.concat(outliers, ignore_index=True)

# 이상치의 타겟 값 랜덤 할당
outliers_df['target'] = np.random.choice(y, num_outliers)

# 원본 데이터에 이상치 추가
df_extended = pd.concat([df, outliers_df], ignore_index=True)

# 이상치 추가 후 다시 정규화
X_extended_scaled = scaler.fit_transform(df_extended.drop(columns=['target']))
df_extended_scaled = pd.DataFrame(X_extended_scaled, columns=iris.feature_names)

# 새로운 X, y 정의
X = df_extended_scaled.values  # NumPy 배열로 변환
y = df_extended['target'].values  # 타겟 값 추출

# 기존 이상치 식별
df_zscores = df_scaled.apply(zscore)  # Z-score 계산
is_existing_outlier = (df_zscores.abs() > 3).any(axis=1)  # 절대값이 3 이상인 경우 이상치로 판단

total_existing_outliers = is_existing_outlier.sum()  # 기존 이상치 수
total_outliers = total_existing_outliers + num_outliers  # 총 이상치 수
total_samples = len(df_extended)  # 전체 샘플 수
total_outlier_percentage = (total_outliers / total_samples) * 100  # 이상치 비율 계산

# 데이터 확인
print(f"확장된 데이터셋 크기: {X.shape}")
print(f"확장된 타겟 크기: {y.shape}")
print(f"이상치가 포함된 데이터셋의 총 샘플 수: {len(X)}")
print(f"전체 이상치 비율: {total_outlier_percentage:.2f}%")


확장된 데이터셋 크기: (157, 4)
확장된 타겟 크기: (157,)
이상치가 포함된 데이터셋의 총 샘플 수: 157
전체 이상치 비율: 5.10%


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from scipy.stats import zscore

np.random.seed(42)

# 데이터셋 로딩
iris = load_iris()
X = iris.data
y = iris.target
df = pd.DataFrame(X, columns=iris.feature_names)
df['target'] = y  # 타겟 변수 추가

# 데이터 정규화
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
df_scaled = pd.DataFrame(X_scaled, columns=iris.feature_names)

# 이상치 생성
num_outliers = int(0.11 * len(df))  # 이상치를 설정
outliers = []

for _ in range(num_outliers):
    outlier_sample = df_scaled.sample(n=1).copy()  # 임의의 샘플 선택
    for col in df_scaled.columns:
        if np.random.rand() > 0.5:
            # 최대값보다 큰 이상치 생성
            outlier_sample[col] = df_scaled[col].max() + np.random.rand() * 3
        else:
            # 최소값보다 작은 이상치 생성
            outlier_sample[col] = df_scaled[col].min() - np.random.rand() * 3
    outliers.append(outlier_sample)

outliers_df = pd.concat(outliers, ignore_index=True)

# 이상치의 타겟 값 랜덤 할당
outliers_df['target'] = np.random.choice(y, num_outliers)

# 원본 데이터에 이상치 추가
df_extended = pd.concat([df, outliers_df], ignore_index=True)

# 이상치 추가 후 다시 정규화
X_extended_scaled = scaler.fit_transform(df_extended.drop(columns=['target']))
df_extended_scaled = pd.DataFrame(X_extended_scaled, columns=iris.feature_names)

# 새로운 X, y 정의
X = df_extended_scaled.values  # NumPy 배열로 변환
y = df_extended['target'].values  # 타겟 값 추출

# 기존 이상치 식별
df_zscores = df_scaled.apply(zscore)  # Z-score 계산
is_existing_outlier = (df_zscores.abs() > 3).any(axis=1)  # 절대값이 3 이상인 경우 이상치로 판단

total_existing_outliers = is_existing_outlier.sum()  # 기존 이상치 수
total_outliers = total_existing_outliers + num_outliers  # 총 이상치 수
total_samples = len(df_extended)  # 전체 샘플 수
total_outlier_percentage = (total_outliers / total_samples) * 100  # 이상치 비율 계산

# 데이터 확인
print(f"확장된 데이터셋 크기: {X.shape}")
print(f"확장된 타겟 크기: {y.shape}")
print(f"이상치가 포함된 데이터셋의 총 샘플 수: {len(X)}")
print(f"전체 이상치 비율: {total_outlier_percentage:.2f}%")


확장된 데이터셋 크기: (166, 4)
확장된 타겟 크기: (166,)
이상치가 포함된 데이터셋의 총 샘플 수: 166
전체 이상치 비율: 10.24%


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
from itertools import combinations

results = []

# 데이터 전처리 및 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Direct 방식을 사용하여 각 SVM 유형에 대해 학습 및 평가
for svm_type in ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']:
    num_classes = len(np.unique(y))
    weights, biases = train_direct(X_train, y_train, C=1.0, svm_type=svm_type, num_classes=num_classes)
    y_pred_direct = predict_direct(X_test, weights, biases)


    # y_pred_direct 변수를 정의한 후에 데이터 타입 확인 및 변환
    if y_test.dtype != np.int64:
        y_test = y_test.astype(int)

    if y_pred_direct.dtype != np.int64:
        y_pred_direct = y_pred_direct.astype(int)

    accuracy_direct = accuracy_score(y_test, y_pred_direct)
    results.append((svm_type, round(accuracy_direct * 100, 2)))

# 결과를 DataFrame으로 변환 및 출력
results_df = pd.DataFrame(results, columns=['SVM Type', 'Accuracy'])
print(results_df)

        SVM Type  Accuracy
0             L1      84.0
1             L2      18.0
2           Fair      70.0
3         Cauchy      74.0
4         Welsch      66.0
5  Geman-McClure      78.0


In [None]:
import pandas as pd

svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']
accuracy_data = {
    '3.25%' : [100.00, 97.87, 95.74, 95.74, 95.74, 95.74],
    '5.10%' : [91.67, 93.75, 91.67, 91.67, 91.67, 91.67],
    '10.24%' : [64.0, 76.0, 76.0, 94.0, 92.0, 92.0]
}

# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Iris data (L-BFGS-B)'

# 데이터프레임 출력
print(df)

                       3.25%  5.10%  10.24%
Iris data (L-BFGS-B)                       
L1                    100.00  91.67    64.0
L2                     97.87  93.75    76.0
Fair                   95.74  91.67    76.0
Cauchy                 95.74  91.67    94.0
Welsch                 95.74  91.67    92.0
Geman-McClure          95.74  91.67    92.0


In [None]:
import pandas as pd

outlier = ['3%', '5%', '10%']
svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']

accuracy_data = {
    '3.25%' : [87.23, 78.72, 78.72, 78.72, 74.47, 74.47],
    '5.10%' : [70.83, 41.67, 72.92, 81.25, 70.83, 60.42],
    '10.24%' : [70.0, 64.0, 80.0, 68.0, 72.0, 90.0]
}

# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Iris data (GA)'

# 데이터프레임 출력
print(df)

                3.25%  5.10%  10.24%
Iris data (GA)                      
L1              87.23  70.83    70.0
L2              78.72  41.67    64.0
Fair            78.72  72.92    80.0
Cauchy          78.72  81.25    68.0
Welsch          74.47  70.83    72.0
Geman-McClure   74.47  60.42    90.0


In [None]:
import pandas as pd

outlier = ['3%', '5%', '10%']
svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']

accuracy_data = {
    '3.25%' : [89.36, 89.36, 87.23, 87.23, 89.36, 89.36],
    '5.10%' : [93.75, 93.75, 89.58, 87.50, 89.58, 89.58],
    '10.24%' : [88.0, 88.0, 94.0, 94.0, 94.0, 94.0]
}

# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Iris data (SMO)'

# 데이터프레임 출력
print(df)

                 3.25%  5.10%  10.24%
Iris data (SMO)                      
L1               89.36  93.75    88.0
L2               89.36  93.75    88.0
Fair             87.23  89.58    94.0
Cauchy           87.23  87.50    94.0
Welsch           89.36  89.58    94.0
Geman-McClure    89.36  89.58    94.0


In [None]:
import pandas as pd

outlier = ['3%', '5%', '10%']
svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']

accuracy_data = {
    '3.25%' : [91.49, 95.74, 91.49, 97.87, 89.36, 95.74],
    '5.10%' : [91.67, 93.75, 93.75, 89.58, 91.67, 91.67],
    '10.24%' : [68.0, 64.0, 80.0, 92.0, 64.0, 90.0]
}

# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Iris data (PSO)'

# 데이터프레임 출력
print(df)

                 3.25%  5.10%  10.24%
Iris data (PSO)                      
L1               91.49  91.67    68.0
L2               95.74  93.75    64.0
Fair             91.49  93.75    80.0
Cauchy           97.87  89.58    92.0
Welsch           89.36  91.67    64.0
Geman-McClure    95.74  91.67    90.0


In [None]:
import pandas as pd

outlier = ['3%', '5%', '10%']
svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']

accuracy_data = {
    '3.25%' : [95.74, 97.87, 95.74, 97.87, 95.74, 95.74],
    '5.10%' : [93.75, 93.75, 93.75, 91.67, 91.67, 91.67],
    '10.24%' : [64.0, 86.0, 82.0, 94.0, 92.0, 92.0]
}

# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Iris data (ACO)'

# 데이터프레임 출력
print(df)

                 3.25%  5.10%  10.24%
Iris data (ACO)                      
L1               82.98  87.50    66.0
L2               91.49  93.75    64.0
Fair             87.23  91.67    64.0
Cauchy           85.11  89.58    80.0
Welsch           74.47  66.67    64.0
Geman-McClure    78.72  72.92    66.0


In [None]:
import pandas as pd

svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']
accuracy_data = {
    '3.25%' : [78.72, 87.23, 76.60, 89.36, 74.47, 80.85],
    '5.10%' : [66.67, 68.75, 70.83, 75.00, 81.25, 83.33],
    '10.24%' : [84.0, 18.0, 70.0, 74.0, 66.0, 78.0]
}

# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Iris data (HS)'


# 데이터프레임 출력
print(df)

                3.25%  5.10%  10.24%
Iris data (HS)                      
L1              78.72  66.67    84.0
L2              87.23  68.75    18.0
Fair            76.60  70.83    70.0
Cauchy          89.36  75.00    74.0
Welsch          74.47  81.25    66.0
Geman-McClure   80.85  83.33    78.0


### segment data

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler, LabelEncoder
from scipy.stats import zscore

np.random.seed(42)

# 데이터셋 로딩
data = fetch_openml(name='segment', version=1)
X = data.data
# Use LabelEncoder to convert categorical target to numerical
encoder = LabelEncoder()
y = encoder.fit_transform(data.target)
df = pd.DataFrame(X, columns=data.feature_names)
df['target'] = y  # 타겟 변수 추가

# 데이터 정규화
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
df_scaled = pd.DataFrame(X_scaled, columns=data.feature_names)

# 기존 이상치 식별
df_zscores = df_scaled.apply(zscore)  # Z-score 계산
is_existing_outlier = (df_zscores.abs() > 3).any(axis=1)  # 절대값이 3 이상인 경우 이상치로 판단

# 기존 이상치 수 계산
total_existing_outliers = is_existing_outlier.sum()

# 기존 이상치 비율 계산
total_samples = len(df)
existing_outlier_percentage = (total_existing_outliers / total_samples) * 100

print(f"기존 이상치 수: {total_existing_outliers}")
print(f"기존 이상치 비율: {existing_outlier_percentage:.2f}%")

기존 이상치 수: 175
기존 이상치 비율: 7.58%


In [184]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler, LabelEncoder
from scipy.stats import zscore

np.random.seed(42)

# 데이터셋 로딩
data = fetch_openml(name='segment', version=1)
X = data.data
# Use LabelEncoder to convert categorical target to numerical
encoder = LabelEncoder()
y = encoder.fit_transform(data.target)
df = pd.DataFrame(X, columns=data.feature_names)
df['target'] = y  # 타겟 변수 추가

# 데이터 정규화
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
df_scaled = pd.DataFrame(X_scaled, columns=data.feature_names)

# 이상치 생성
num_outliers = int(0.028 * len(df))  # 이상치를 설정
outliers = []

for _ in range(num_outliers):
    outlier_sample = df_scaled.sample(n=1).copy()  # 임의의 샘플 선택
    for col in df_scaled.columns:
        if np.random.rand() > 0.5:
            # 최대값보다 큰 이상치 생성
            outlier_sample[col] = df_scaled[col].max() + np.random.rand() * 3
        else:
            # 최소값보다 작은 이상치 생성
            outlier_sample[col] = df_scaled[col].min() - np.random.rand() * 3
    outliers.append(outlier_sample)

outliers_df = pd.concat(outliers, ignore_index=True)

# 이상치의 타겟 값 랜덤 할당
outliers_df['target'] = np.random.choice(y, num_outliers)

# 원본 데이터에 이상치 추가
df_extended = pd.concat([df, outliers_df], ignore_index=True)

# 이상치 추가 후 다시 정규화
X_extended_scaled = scaler.fit_transform(df_extended.drop(columns=['target']))
df_extended_scaled = pd.DataFrame(X_extended_scaled, columns=data.feature_names)

# 새로운 X, y 정의
X = df_extended_scaled.values  # NumPy 배열로 변환
y = df_extended['target'].values  # 타겟 값 추출

# 기존 이상치 식별
df_zscores = df_scaled.apply(zscore)  # Z-score 계산
is_existing_outlier = (df_zscores.abs() > 3).any(axis=1)  # 절대값이 3 이상인 경우 이상치로 판단

total_existing_outliers = is_existing_outlier.sum()  # 기존 이상치 수
total_outliers = total_existing_outliers + num_outliers  # 총 이상치 수
total_samples = len(df_extended)  # 전체 샘플 수
total_outlier_percentage = (total_outliers / total_samples) * 100  # 이상치 비율 계산

# 데이터 확인
print(f"확장된 데이터셋 크기: {X.shape}")
print(f"확장된 타겟 크기: {y.shape}")
print(f"이상치가 포함된 데이터셋의 총 샘플 수: {len(X)}")
print(f"전체 이상치 비율: {total_outlier_percentage:.2f}%")


확장된 데이터셋 크기: (2374, 19)
확장된 타겟 크기: (2374,)
이상치가 포함된 데이터셋의 총 샘플 수: 2374
전체 이상치 비율: 10.07%


In [187]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
from itertools import combinations

results = []

# 데이터 전처리 및 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Direct 방식을 사용하여 각 SVM 유형에 대해 학습 및 평가
for svm_type in ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']:
    num_classes = len(np.unique(y))
    weights, biases = train_direct(X_train, y_train, C=1.0, svm_type=svm_type, num_classes=num_classes)
    y_pred_direct = predict_direct(X_test, weights, biases)


    # y_pred_direct 변수를 정의한 후에 데이터 타입 확인 및 변환
    if y_test.dtype != np.int64:
        y_test = y_test.astype(int)

    if y_pred_direct.dtype != np.int64:
        y_pred_direct = y_pred_direct.astype(int)

    accuracy_direct = accuracy_score(y_test, y_pred_direct)
    results.append((svm_type, round(accuracy_direct * 100, 2)))

# 결과를 DataFrame으로 변환 및 출력
results_df = pd.DataFrame(results, columns=['SVM Type', 'Accuracy'])
print(results_df)

        SVM Type  Accuracy
0             L1     37.03
1             L2     32.54
2           Fair     54.84
3         Cauchy     53.72
4         Welsch     41.51
5  Geman-McClure     40.25


In [None]:
import pandas as pd

svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']
accuracy_data = {
    '7.58%' : [21.79,17.75, 18.04, 18.47, 19.91, 18.04],
    '10.07%' : [19.07, 20.06, 19.21, 19.07, 19.50, 19.35]
}

# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Segment data (L-BFGS-B)'

# 데이터프레임 출력
print(df)

                         7.58%  10.07%
Segment data (L-BFGS-B)               
L1                       21.79   19.07
L2                       17.75   20.06
Fair                     18.04   19.21
Cauchy                   18.47   19.07
Welsch                   19.91   19.50
Geman-McClure            18.04   19.35


In [None]:
import pandas as pd

svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']
accuracy_data = {
    '7.58%' : [ 50.94,37.81, 35.93, 34.92, 40.26, 49.64],
    '10.07%' : [37.03, 32.54, 54.84, 53.72, 41.51, 40.25
]
}

# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Segment data (GA)'

# 데이터프레임 출력
print(df)

                   7.58%  10.07%
Segment data (GA)               
L1                 50.94   37.03
L2                 37.81   32.54
Fair               35.93   54.84
Cauchy             34.92   53.72
Welsch             40.26   41.51
Geman-McClure      49.64   40.25


In [11]:
import pandas as pd #해야함

svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']
accuracy_data = {
    '7.58%' : [ 85.86, 85.86, 12.41, 12.41, 12.41, 12.41],
    '10.07%' : [84.99, 84.99, 14.87, 14.87, 14.87, 14.87]
}


# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Segment data (SMO)'

# 데이터프레임 출력
print(df)

                    7.58%  10.07%
Segment data (SMO)               
L1                  85.86   84.99
L2                  85.86   84.99
Fair                12.41   14.87
Cauchy              12.41   14.87
Welsch              12.41   14.87
Geman-McClure       12.41   14.87


In [None]:
import pandas as pd

svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']
accuracy_data = {
    '7.58%' : [48.34, 33.77, 68.83, 43.87, 72.29, 49.35],
    '10.07%' : [44.74, 48.39, 60.73, 59.05, 43.90, 59.47]
}

# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Segment data (PSO)'

# 데이터프레임 출력
print(df)

                    7.58%  10.07%
Segment data (PSO)               
L1                  48.34   44.74
L2                  33.77   48.39
Fair                68.83   60.73
Cauchy              43.87   59.05
Welsch              72.29   43.90
Geman-McClure       49.35   59.47


In [None]:
import pandas as pd

svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']
accuracy_data = {
    '7.58%' : [85.71, 86.72, 85.71, 87.01, 82.97, 83.98],
    '10.07%' : [83.17, 67.04, 85.69, 85.13, 84.85, 87.52]
}

# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Segment data (ACO)'

# 데이터프레임 출력
print(df)

                    7.58%  10.07%
Segment data (ACO)               
L1                  85.71   83.17
L2                  86.72   67.04
Fair                85.71   85.69
Cauchy              87.01   85.13
Welsch              82.97   84.85
Geman-McClure       83.98   87.52


In [None]:
import pandas as pd

svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']
accuracy_data = {
    '7.58%' : [ 38.82, 31.60, 56.71, 45.31, 50.36, 40.40],
    '10.07%' : [53.16, 36.19, 38.99, 39.97, 46.28, 52.45]
}

# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Segment data (HS)'

# 데이터프레임 출력
print(df)

                   7.58%  10.07%
Segment data (HS)               
L1                 38.82   53.16
L2                 31.60   36.19
Fair               56.71   38.99
Cauchy             45.31   39.97
Welsch             50.36   46.28
Geman-McClure      40.40   52.45


### vehicle data

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler, LabelEncoder
from scipy.stats import zscore

np.random.seed(42)

# 데이터셋 로딩
data = fetch_openml(name='vehicle', version=1)
X = data.data
encoder = LabelEncoder()
y = encoder.fit_transform(data.target)
df = pd.DataFrame(X, columns=data.feature_names)
df['target'] = y  # 타겟 변수 추가

# 데이터 정규화
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
df_scaled = pd.DataFrame(X_scaled, columns=data.feature_names)

# 기존 이상치 식별
df_zscores = df_scaled.apply(zscore)  # Z-score 계산
is_existing_outlier = (df_zscores.abs() > 3).any(axis=1)  # 절대값이 3 이상인 경우 이상치로 판단

# 기존 이상치 수 계산
total_existing_outliers = is_existing_outlier.sum()

# 기존 이상치 비율 계산
total_samples = len(df)
existing_outlier_percentage = (total_existing_outliers / total_samples) * 100

print(f"기존 이상치 수: {total_existing_outliers}")
print(f"기존 이상치 비율: {existing_outlier_percentage:.2f}%")

기존 이상치 수: 22
기존 이상치 비율: 2.60%


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from scipy.stats import zscore

np.random.seed(42)

# 데이터셋 로딩
data = fetch_openml(name='vehicle', version=1)
X = data.data
# Use LabelEncoder to convert categorical target to numerical
encoder = LabelEncoder()
y = encoder.fit_transform(data.target)
df = pd.DataFrame(X, columns=data.feature_names)
df['target'] = y  # 타겟 변수 추가


# 데이터 정규화
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
df_scaled = pd.DataFrame(X_scaled, columns=data.feature_names)

# 이상치 생성
num_outliers = int(0.0048 * len(df))  # 이상치를 설정
outliers = []

for _ in range(num_outliers):
    outlier_sample = df_scaled.sample(n=1).copy()  # 임의의 샘플 선택
    for col in df_scaled.columns:
        if np.random.rand() > 0.5:
            # 최대값보다 큰 이상치 생성
            outlier_sample[col] = df_scaled[col].max() + np.random.rand() * 3
        else:
            # 최소값보다 작은 이상치 생성
            outlier_sample[col] = df_scaled[col].min() - np.random.rand() * 3
    outliers.append(outlier_sample)

outliers_df = pd.concat(outliers, ignore_index=True)

# 이상치의 타겟 값 랜덤 할당
outliers_df['target'] = np.random.choice(y, num_outliers)

# 원본 데이터에 이상치 추가
df_extended = pd.concat([df, outliers_df], ignore_index=True)

# 이상치 추가 후 다시 정규화
X_extended_scaled = scaler.fit_transform(df_extended.drop(columns=['target']))
df_extended_scaled = pd.DataFrame(X_extended_scaled, columns=data.feature_names)

# 새로운 X, y 정의
X = df_extended_scaled.values  # NumPy 배열로 변환
y = df_extended['target'].values  # 타겟 값 추출

# 기존 이상치 식별
df_zscores = df_scaled.apply(zscore)  # Z-score 계산
is_existing_outlier = (df_zscores.abs() > 3).any(axis=1)  # 절대값이 3 이상인 경우 이상치로 판단

total_existing_outliers = is_existing_outlier.sum()  # 기존 이상치 수
total_outliers = total_existing_outliers + num_outliers  # 총 이상치 수
total_samples = len(df_extended)  # 전체 샘플 수
total_outlier_percentage = (total_outliers / total_samples) * 100  # 이상치 비율 계산

# 데이터 확인
print(f"확장된 데이터셋 크기: {X.shape}")
print(f"확장된 타겟 크기: {y.shape}")
print(f"이상치가 포함된 데이터셋의 총 샘플 수: {len(X)}")
print(f"전체 이상치 비율: {total_outlier_percentage:.2f}%")


확장된 데이터셋 크기: (850, 18)
확장된 타겟 크기: (850,)
이상치가 포함된 데이터셋의 총 샘플 수: 850
전체 이상치 비율: 3.06%


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from scipy.stats import zscore

np.random.seed(42)

# 데이터셋 로딩
data = fetch_openml(name='vehicle', version=1)
X = data.data
# Use LabelEncoder to convert categorical target to numerical
encoder = LabelEncoder()
y = encoder.fit_transform(data.target)
df = pd.DataFrame(X, columns=data.feature_names)
df['target'] = y  # 타겟 변수 추가


# 데이터 정규화
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
df_scaled = pd.DataFrame(X_scaled, columns=data.feature_names)

# 이상치 생성
num_outliers = int(0.027 * len(df))  # 이상치를 설정
outliers = []

for _ in range(num_outliers):
    outlier_sample = df_scaled.sample(n=1).copy()  # 임의의 샘플 선택
    for col in df_scaled.columns:
        if np.random.rand() > 0.5:
            # 최대값보다 큰 이상치 생성
            outlier_sample[col] = df_scaled[col].max() + np.random.rand() * 3
        else:
            # 최소값보다 작은 이상치 생성
            outlier_sample[col] = df_scaled[col].min() - np.random.rand() * 3
    outliers.append(outlier_sample)

outliers_df = pd.concat(outliers, ignore_index=True)

# 이상치의 타겟 값 랜덤 할당
outliers_df['target'] = np.random.choice(y, num_outliers)

# 원본 데이터에 이상치 추가
df_extended = pd.concat([df, outliers_df], ignore_index=True)

# 이상치 추가 후 다시 정규화
X_extended_scaled = scaler.fit_transform(df_extended.drop(columns=['target']))
df_extended_scaled = pd.DataFrame(X_extended_scaled, columns=data.feature_names)

# 새로운 X, y 정의
X = df_extended_scaled.values  # NumPy 배열로 변환
y = df_extended['target'].values  # 타겟 값 추출

# 기존 이상치 식별
df_zscores = df_scaled.apply(zscore)  # Z-score 계산
is_existing_outlier = (df_zscores.abs() > 3).any(axis=1)  # 절대값이 3 이상인 경우 이상치로 판단

total_existing_outliers = is_existing_outlier.sum()  # 기존 이상치 수
total_outliers = total_existing_outliers + num_outliers  # 총 이상치 수
total_samples = len(df_extended)  # 전체 샘플 수
total_outlier_percentage = (total_outliers / total_samples) * 100  # 이상치 비율 계산

# 데이터 확인
print(f"확장된 데이터셋 크기: {X.shape}")
print(f"확장된 타겟 크기: {y.shape}")
print(f"이상치가 포함된 데이터셋의 총 샘플 수: {len(X)}")
print(f"전체 이상치 비율: {total_outlier_percentage:.2f}%")


확장된 데이터셋 크기: (868, 18)
확장된 타겟 크기: (868,)
이상치가 포함된 데이터셋의 총 샘플 수: 868
전체 이상치 비율: 5.07%


In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from scipy.stats import zscore

np.random.seed(42)

# 데이터셋 로딩
data = fetch_openml(name='vehicle', version=1)
X = data.data
# Use LabelEncoder to convert categorical target to numerical
encoder = LabelEncoder()
y = encoder.fit_transform(data.target)
df = pd.DataFrame(X, columns=data.feature_names)
df['target'] = y  # 타겟 변수 추가


# 데이터 정규화
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
df_scaled = pd.DataFrame(X_scaled, columns=data.feature_names)

# 이상치 생성
num_outliers = int(0.083 * len(df))  # 이상치를 설정
outliers = []

for _ in range(num_outliers):
    outlier_sample = df_scaled.sample(n=1).copy()  # 임의의 샘플 선택
    for col in df_scaled.columns:
        if np.random.rand() > 0.5:
            # 최대값보다 큰 이상치 생성
            outlier_sample[col] = df_scaled[col].max() + np.random.rand() * 3
        else:
            # 최소값보다 작은 이상치 생성
            outlier_sample[col] = df_scaled[col].min() - np.random.rand() * 3
    outliers.append(outlier_sample)

outliers_df = pd.concat(outliers, ignore_index=True)

# 이상치의 타겟 값 랜덤 할당
outliers_df['target'] = np.random.choice(y, num_outliers)

# 원본 데이터에 이상치 추가
df_extended = pd.concat([df, outliers_df], ignore_index=True)

# 이상치 추가 후 다시 정규화
X_extended_scaled = scaler.fit_transform(df_extended.drop(columns=['target']))
df_extended_scaled = pd.DataFrame(X_extended_scaled, columns=data.feature_names)

# 새로운 X, y 정의
X = df_extended_scaled.values  # NumPy 배열로 변환
y = df_extended['target'].values  # 타겟 값 추출

# 기존 이상치 식별
df_zscores = df_scaled.apply(zscore)  # Z-score 계산
is_existing_outlier = (df_zscores.abs() > 3).any(axis=1)  # 절대값이 3 이상인 경우 이상치로 판단

total_existing_outliers = is_existing_outlier.sum()  # 기존 이상치 수
total_outliers = total_existing_outliers + num_outliers  # 총 이상치 수
total_samples = len(df_extended)  # 전체 샘플 수
total_outlier_percentage = (total_outliers / total_samples) * 100  # 이상치 비율 계산

# 데이터 확인
print(f"확장된 데이터셋 크기: {X.shape}")
print(f"확장된 타겟 크기: {y.shape}")
print(f"이상치가 포함된 데이터셋의 총 샘플 수: {len(X)}")
print(f"전체 이상치 비율: {total_outlier_percentage:.2f}%")


확장된 데이터셋 크기: (916, 18)
확장된 타겟 크기: (916,)
이상치가 포함된 데이터셋의 총 샘플 수: 916
전체 이상치 비율: 10.04%


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
from itertools import combinations

results = []

# 데이터 전처리 및 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# OvO 방식을 사용하여 각 SVM 유형에 대해 학습 및 평가
for svm_type in ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']:
    models_ovo = train_ovo(X_train, y_train, C=1.0, svm_type=svm_type)
    y_pred_ovo = predict_ovo(X_test, models_ovo)

    # y_pred_ovo 변수를 정의한 후에 데이터 타입 확인 및 변환
    if y_test.dtype != np.int64:
        y_test = y_test.astype(int)

    if y_pred_ovo.dtype != np.int64:
        y_pred_ovo = y_pred_ovo.astype(int)

    accuracy_ovo = accuracy_score(y_test, y_pred_ovo)
    results.append((svm_type, round(accuracy_ovo * 100, 2)))

# 결과를 DataFrame으로 변환 및 출력
results_df = pd.DataFrame(results, columns=['SVM Type', 'Accuracy'])
print(results_df['Accuracy'].tolist())

In [None]:
import pandas as pd

svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']
accuracy_data = {
    '3.06%' : [76.86, 78.43, 77.65, 76.86, 72.55, 74.51],
    '5.07%' : [73.56, 77.01, 73.95, 73.56, 73.18, 72.80],
    '10.04%' : [70.18, 72.36, 70.91, 70.91, 65.82, 69.45]
}
# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Vehicle data (L-BFGS-B)'

# 데이터프레임 출력
print(df)


In [None]:
import pandas as pd

svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']
accuracy_data = {
    '3.06%' : [76.86, 78.43, 77.65, 76.86, 72.55, 74.51],
    '5.07%' : [73.56, 77.01, 73.95, 73.56, 73.18, 72.80],
    '10.04%' : [70.18, 72.36, 70.91, 70.91, 65.82, 69.45]
}
# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Vehicle data (HS)'

# 데이터프레임 출력
print(df)


In [None]:
import pandas as pd

svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']
accuracy_data = {
    '3.06%' : [76.86, 78.43, 77.65, 76.86, 72.55, 74.51],
    '5.07%' : [73.56, 77.01, 73.95, 73.56, 73.18, 72.80],
    '10.04%' : [70.18, 72.36, 70.91, 70.91, 65.82, 69.45]
}
# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Vehicle data (ACO)'

# 데이터프레임 출력
print(df)


In [None]:
import pandas as pd

svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']
accuracy_data = {
    '3.06%' : [76.86, 78.43, 77.65, 76.86, 72.55, 74.51],
    '5.07%' : [73.56, 77.01, 73.95, 73.56, 73.18, 72.80],
    '10.04%' : [70.18, 72.36, 70.91, 70.91, 65.82, 69.45]
}
# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Vehicle data (PSO)'

# 데이터프레임 출력
print(df)


In [None]:
import pandas as pd

svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']
accuracy_data = {
    '3.06%' : [76.86, 78.43, 77.65, 76.86, 72.55, 74.51],
    '5.07%' : [73.56, 77.01, 73.95, 73.56, 73.18, 72.80],
    '10.04%' : [70.18, 72.36, 70.91, 70.91, 65.82, 69.45]
}
# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Vehicle data (SMO)'

# 데이터프레임 출력
print(df)


In [None]:
import pandas as pd

svm_types = ['L1', 'L2', 'Fair', 'Cauchy', 'Welsch', 'Geman-McClure']
accuracy_data = {
    '3.06%' : [76.86, 78.43, 77.65, 76.86, 72.55, 74.51],
    '5.07%' : [73.56, 77.01, 73.95, 73.56, 73.18, 72.80],
    '10.04%' : [70.18, 72.36, 70.91, 70.91, 65.82, 69.45]
}
# 데이터프레임 생성
df = pd.DataFrame(accuracy_data, index=svm_types)
df.index.name = 'Vehicle data (GA)'

# 데이터프레임 출력
print(df)
