# Reconhecimento de Padrões (TIP8311) - Trabalho 2


**Professor:** Guilherme de Alencar Barreto  

<img src="https://loop.frontiersin.org/images/profile/243428/203" alt="Foto do Professor" width="150"/>


**Aluno:** Luis Felipe Carneiro de Souza    **Matrícula:** 593034

In [1]:
import numpy as np
from tqdm import tqdm
from time import time
import matplotlib.pyplot as plt

In [2]:
data_path = "vertebral+column\column_3C.dat"

In [26]:
data = np.genfromtxt(fname=data_path, delimiter=None, dtype=str, encoding='utf-8')
data

X = data[:, :-1].astype(float) 
y = data[:, -1]


In [4]:
class KNN:
    def __init__(self, k=1, m=2):
        self.X_train = None
        self.y_train = None
        self.k = k
        self.m = m

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):

        n_test = len(X)
        n_train = len(self.X_train)
        y_pred = []

        for i in range(n_test):

            dist = []

            for j in range(n_train):

                diff = np.abs(X[i, :] - self.X_train[j , :])
                dist.append(np.sum(diff ** (self.m)))

            dist = np.array(dist)
            idx = np.argsort(dist)[:self.k]
            neighbors, count = np.unique(self.y_train[idx], return_counts=True)
            y_pred.append(neighbors[np.argmax(count)])

        return np.array(y_pred).reshape(-1, 1)
        

In [5]:
class DMC:
    def __init__(self):
        self.centroides = None
        self.classes = None

    def fit(self, X, y):

        self.classes = np.sort(np.unique(y))
        self.centroides = []

        for c in self.classes:

            mask = (y == c)
            self.centroides.append(X[mask].mean(axis=0))

        self.centroides = np.array(self.centroides)

    def predict(self, X):

        n_test = len(X)
        y_pred = []

        for i in range(n_test):

            dist = []

            for c in self.centroides:

                diff = np.abs(c - X[i, :])
                dist.append(np.sum(diff ** 2))

            dist = np.array(dist)
            y_pred.append(self.classes[np.argmin(dist)])

        return np.array(y_pred).reshape(-1, 1)

In [7]:
class DMCR:
    def __init__(self):
        self.centroides = None
        self.classes = None

    def fit(self, X, y):

        self.classes = np.sort(np.unique(y))
        self.centroides = []

        for c in self.classes:

            mask = (y == c)
            self.centroides.append(np.median(X[mask], axis=0))

        self.centroides = np.array(self.centroides)

    def predict(self, X):

        n_test = len(X)
        y_pred = []

        for i in range(n_test):

            dist = []

            for c in self.centroides:

                diff = np.abs(c - X[i, :])
                dist.append(np.sum(diff))

            dist = np.array(dist)
            y_pred.append(self.classes[np.argmin(dist)])

        return np.array(y_pred).reshape(-1, 1)

In [6]:
class MaxCorr:
    def __init__(self):
        self.centroides = None
        self.classes = None

    def fit(self, X, y):

        self.classes = np.sort(np.unique(y))
        self.centroides = []

        for c in self.classes:

            mask = (y == c)
            self.centroides.append(X[mask].mean(axis=0))

        self.centroides = np.array(self.centroides)

    def predict(self, X):

        n_test = len(X)
        y_pred = []

        for i in range(n_test):

            corr_array = []

            for c in self.centroides:

                corr = np.dot(c, X[i, :])
                corr_array.append(corr)

            corr_array = np.array(corr_array)
            y_pred.append(self.classes[np.argmax(corr_array)])

        return np.array(y_pred).reshape(-1, 1)

In [8]:
class StandardScaler:
    def __init__(self):
        self.X_mu = None
        self.X_sigma = None
        self.y_mu = None
        self.y_sigma = None

    def fit(self, X, y=None):
        self.X_mu = np.mean(X, axis=0)
        self.X_sigma = np.std(X, axis=0)
        self.X_sigma = np.where(self.X_sigma == 0, 1.0, self.X_sigma)

        if y is not None:
            self.y_mu = np.mean(y, axis=0)
            self.y_sigma = np.std(y, axis=0)

    def transform(self, X, y=None):
        
        X_scaled = (X - self.X_mu) / self.X_sigma

        if y is not None:
            y_scaled = (y - self.y_mu) / self.y_sigma

        if y is None:
            return X_scaled
        else:
            return X_scaled, y_scaled

In [None]:
class MinMaxScaler:
    def __init__(self):
        pass

    def fit(self, X, y=None):
        pass

    def predict(self, X, y=None):
        pass

**Questão 1 (Classificação de Padrões)** - Objetivo: Abordar o clássico problema de classificação de patologias da coluna vertebral. O banco de dados está disponível no site abaixo:

https://archive.ics.uci.edu/dataset/212/vertebral+column     

Usando o problema com 3 classes, implementar e avaliar os classificadores estudados na disciplina, listados abaixo

Classificadores a Implementar
1. Classificador Vizinho Mais Próximo (distância de Minkowski de ordens $m \in {0,5; 2/3; 1; 3/2; 2; 5;2}$ )
2. Classificador Distância Mínima ao Centróide
3. Classificador Distância Mínima ao Centróide (versão robusta a outliers)
4. Classificador de Máxima Correlação

**OBS 1**: Computar os tempos de treinamento e teste de cada modelo para ajudar a decidir pelo melhor modelo.

**OBS 2**: Realizar, no mínimo, 100 rodadas de treinamento-teste independentes dos modelos e obter as estatísticas de desempenho (acurácia média, desvio-padrão, acurácia máxima, acurácia mínima e mediana da acurácia). Determinar a matriz de confusão para a melhor e pior rodada.

**OBS 3**:  Avaliar   o   efeito   da   porcentagem   de   separação   dos   dados   de   treino-teste   no   desempenho   dos classificadores. Sugestão: testar as seguintes possibilidades: 20/80, 30/70, 50/50, 70/30 e 80/20.

**OBS 4**: Calcular também as estatísticas de desempenho por classe, com o objetivo de entender se há classes mais fáceis de categorizar que outras. Boa sorte!

### Classificador Vizinho Mais Próximo (distância de Minkowski de ordens m ∈ {0,5; 2/3; 1; 3/2; 2; 5/2})

In [9]:
sep = {'20/80': {'train': 0.2, 'test': 0.8},
       '30/70': {'train': 0.3, 'test': 0.7},
       '50/50': {'train': 0.5, 'test': 0.5},
       '70/30': {'train': 0.7, 'test': 0.3},
       '80/20': {'train': 0.8, 'test': 0.2}}

In [13]:
resultados_knn = {}

for k, v in sep.items():
    for m in [0.5, 2/3, 1, 3/2, 2, 5/2]:
        for _ in tqdm(range(30), desc=f"Rodada sep: {s}, m: {m:.2f}"):

            idx = np.random.permutation(X.shape[0])
            X, y = X[idx], y[idx]
            split = int(v['train'] * len(X))
            X_train, X_test = X[:split], X[split:]
            y_train, y_test = y[:split], y[split:]

            scaler = StandardScaler()
            scaler.fit(X_train)
            X_train_scaled = scaler.transform(X_train)

            model = KNN(k=1, m=m)
            model.fit(X=X_train_scaled, y=y_train)

            scaler_test = StandardScaler()
            scaler_test.fit(X_test)
            X_test_scaled = scaler_test.transform(X_test)

            y_pred = model.predict(X_test_scaled)            

Rodada sep: 80/20, m: 0.50: 100%|██████████| 30/30 [00:01<00:00, 23.82it/s]
Rodada sep: 80/20, m: 0.67: 100%|██████████| 30/30 [00:01<00:00, 21.89it/s]
Rodada sep: 80/20, m: 1.00: 100%|██████████| 30/30 [00:01<00:00, 21.08it/s]
Rodada sep: 80/20, m: 1.50: 100%|██████████| 30/30 [00:01<00:00, 21.57it/s]
Rodada sep: 80/20, m: 2.00: 100%|██████████| 30/30 [00:01<00:00, 23.89it/s]
Rodada sep: 80/20, m: 2.50: 100%|██████████| 30/30 [00:01<00:00, 21.76it/s]
Rodada sep: 80/20, m: 0.50: 100%|██████████| 30/30 [00:01<00:00, 18.09it/s]
Rodada sep: 80/20, m: 0.67: 100%|██████████| 30/30 [00:01<00:00, 16.88it/s]
Rodada sep: 80/20, m: 1.00: 100%|██████████| 30/30 [00:01<00:00, 16.49it/s]
Rodada sep: 80/20, m: 1.50: 100%|██████████| 30/30 [00:01<00:00, 16.99it/s]
Rodada sep: 80/20, m: 2.00: 100%|██████████| 30/30 [00:01<00:00, 18.62it/s]
Rodada sep: 80/20, m: 2.50: 100%|██████████| 30/30 [00:01<00:00, 17.08it/s]
Rodada sep: 80/20, m: 0.50: 100%|██████████| 30/30 [00:01<00:00, 15.41it/s]
Rodada sep: 

### Classificador Distância Mínima ao Centróide

In [33]:
resultados_mdc = {}

for k, v in sep.items():
    for _ in tqdm(range(100), desc=f"Rodada sep: {k}"):

        idx = np.random.permutation(X.shape[0])
        X, y = X[idx], y[idx]

        split = int(v['train'] * len(X))
        X_train, X_test = X[:split], X[split:]
        y_train, y_test = y[:split], y[split:]

        scaler = StandardScaler()
        scaler.fit(X_train)
        X_train_scaled = scaler.transform(X_train)

        model = DMC()
        model.fit(X=X_train_scaled, y=y_train)

        scaler_test = StandardScaler()
        scaler_test.fit(X_test)
        X_test_scaled = scaler_test.transform(X_test)

        y_pred = model.predict(X_test_scaled)            

Rodada sep: 20/80: 100%|██████████| 100/100 [00:00<00:00, 416.02it/s]
Rodada sep: 30/70: 100%|██████████| 100/100 [00:00<00:00, 472.93it/s]
Rodada sep: 50/50: 100%|██████████| 100/100 [00:00<00:00, 650.48it/s]
Rodada sep: 70/30: 100%|██████████| 100/100 [00:00<00:00, 1020.68it/s]
Rodada sep: 80/20: 100%|██████████| 100/100 [00:00<00:00, 1402.01it/s]


### Classificador Distância Mínima ao Centróide (versão robusta a outliers)

In [35]:
resultados_mdcr = {}

for k, v in sep.items():
    for _ in tqdm(range(100), desc=f"Rodada sep: {k}"):

        idx = np.random.permutation(X.shape[0])
        X, y = X[idx], y[idx]

        split = int(v['train'] * len(X))
        X_train, X_test = X[:split], X[split:]
        y_train, y_test = y[:split], y[split:]

        scaler = StandardScaler()
        scaler.fit(X_train)
        X_train_scaled = scaler.transform(X_train)

        model = DMCR()
        model.fit(X=X_train_scaled, y=y_train)

        scaler_test = StandardScaler()
        scaler_test.fit(X_test)
        X_test_scaled = scaler_test.transform(X_test)

        y_pred = model.predict(X_test_scaled)            

Rodada sep: 20/80: 100%|██████████| 100/100 [00:00<00:00, 450.06it/s]
Rodada sep: 30/70: 100%|██████████| 100/100 [00:00<00:00, 491.33it/s]
Rodada sep: 50/50: 100%|██████████| 100/100 [00:00<00:00, 618.23it/s]
Rodada sep: 70/30: 100%|██████████| 100/100 [00:00<00:00, 1044.13it/s]
Rodada sep: 80/20: 100%|██████████| 100/100 [00:00<00:00, 1458.31it/s]


### Classificador de Máxima Correlação

In [36]:
resultados_mdcr = {}

for k, v in sep.items():
    for _ in tqdm(range(100), desc=f"Rodada sep: {k}"):

        idx = np.random.permutation(X.shape[0])
        X, y = X[idx], y[idx]

        split = int(v['train'] * len(X))
        X_train, X_test = X[:split], X[split:]
        y_train, y_test = y[:split], y[split:]

        scaler = StandardScaler()
        scaler.fit(X_train)
        X_train_scaled = scaler.transform(X_train)

        model = MaxCorr()
        model.fit(X=X_train_scaled, y=y_train)

        scaler_test = StandardScaler()
        scaler_test.fit(X_test)
        X_test_scaled = scaler_test.transform(X_test)

        y_pred = model.predict(X_test_scaled)            

Rodada sep: 20/80: 100%|██████████| 100/100 [00:00<00:00, 1035.79it/s]
Rodada sep: 30/70: 100%|██████████| 100/100 [00:00<00:00, 1191.10it/s]
Rodada sep: 50/50: 100%|██████████| 100/100 [00:00<00:00, 1567.60it/s]
Rodada sep: 70/30: 100%|██████████| 100/100 [00:00<00:00, 2296.78it/s]
Rodada sep: 80/20: 100%|██████████| 100/100 [00:00<00:00, 3034.34it/s]
