<a href="https://colab.research.google.com/github/Giovannacm/machine-learning/blob/main/SVM_StochasticGradientDescent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score

In [2]:
# Remove colunas que possuem similaridade maior que um dado limiar de similaridade
# A remoção das colunas muito relacionadas implica em uma velocidade de aprendizagem melhor
# O cálculo da similaridade é com base na correlação entre as colunas
def remove_correlated_features(X):
    corr_threshold = 0.9
    corr = X.corr()
    drop_columns = np.full(corr.shape[0], False, dtype=bool)
    for i in range(corr.shape[0]):
        for j in range(i + 1, corr.shape[0]):
            if corr.iloc[i, j] >= corr_threshold:
                drop_columns[j] = True
    columns_dropped = X.columns[drop_columns]
    X.drop(columns_dropped, axis=1, inplace=True)
    return columns_dropped

In [3]:
# Remove colunas menos significantes com base em p-values e backward elimination
# Se a coluna tiver um p-value maior que um limiar, ela será descartada pois tem menos significância para explicar a variação
def remove_less_significant_features(X, Y):
    sl = 0.05
    regression_ols = None
    columns_dropped = np.array([])
    for itr in range(0, len(X.columns)):
        regression_ols = sm.OLS(Y, X).fit()
        max_col = regression_ols.pvalues.idxmax()
        max_val = regression_ols.pvalues.max()
        if max_val > sl:
            X.drop(max_col, axis='columns', inplace=True)
            columns_dropped = np.append(columns_dropped, [max_col])
        else:
            break
    regression_ols.summary()
    return columns_dropped

In [4]:
# Calculando a função de custo
def compute_cost(W, X, Y, regularization_strength):
    N = X.shape[0]
    distances = 1 - Y * (np.dot(X, W))
    distances[distances < 0] = 0 
    hinge_loss = regularization_strength * (np.sum(distances) / N)
    cost = 1 / 2 * np.dot(W, W) + hinge_loss
    return cost

In [5]:
# Calculando o gradiente da função de custo
def calculate_cost_gradient(W, X_batch, Y_batch, regularization_strength):
    if type(Y_batch) == np.float64: # Caso um exemplo seja passado (no caso do SGD)
        Y_batch = np.array([Y_batch])
        X_batch = np.array([X_batch]) 

    distance = 1 - (Y_batch * np.dot(X_batch, W))
    dw = np.zeros(len(W))

    for ind, d in enumerate(distance):
        if max(0, d) == 0:
            di = W
        else:
            di = W - (regularization_strength * Y_batch[ind] * X_batch[ind])
        dw += di

    dw = dw/len(Y_batch)  # Média
    return dw

In [6]:
# Função de treinamento usando Stochastic Gradient Descent
def sgd(features, outputs, learning_rate, regularization_strength):
    max_epochs = 5000
    weights = np.zeros(features.shape[1])
    nth = 0
    prev_cost = float("inf")
    cost_threshold = 0.01
    
    for epoch in range(1, max_epochs):
        X, Y = shuffle(features, outputs) # A cada época é feito um embaralhamento das colunas
        for ind, x in enumerate(X):
            ascent = calculate_cost_gradient(weights, x, Y[ind], regularization_strength)
            weights = weights - (learning_rate * ascent)

        # A verificacao de convergência será feita nas 2^nth epocas
        if epoch == 2 ** nth or epoch == max_epochs - 1:
            cost = compute_cost(weights, features, outputs, regularization_strength)
            print("Epoca:", epoch, "| Custo: ", cost)
            # O treinamento é parado quando o custo atual não tiver diminuído muito em comparação com o custo anterior
            if abs(prev_cost - cost) < cost_threshold * prev_cost:
                return weights
            prev_cost = cost
            nth += 1
    return weights

In [None]:
! mkdir ~/.kaggle/
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download 'uciml/breast-cancer-wisconsin-data'
! unzip breast-cancer-wisconsin-data.zip

In [8]:
data = pd.read_csv('/content/data.csv')
data.drop(data.columns[[-1, 0]], axis=1, inplace=True)

In [9]:
# Alterando os valores categóricos para inteiros
diag_map = {'M': 1.0, 'B': -1.0}
data['diagnosis'] = data['diagnosis'].map(diag_map)

Y = data.loc[:, 'diagnosis']
X = data.iloc[:, 1:]

# Selecao de características
remove_correlated_features(X)
remove_less_significant_features(X, Y)

# Nomalizando os dados para uma melhor convergencia e evitar overflow
X_normalized = MinMaxScaler().fit_transform(X.values)
X = pd.DataFrame(X_normalized)

# Inserindo uma nova coluna com valor 1
X.insert(loc=len(X.columns), column='intercept', value=1)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [10]:
regularization_strength = 10000
learning_rate = 0.000001

# Treinamento do modelo
W = sgd(X_train.to_numpy(), y_train.to_numpy(), learning_rate, regularization_strength)
print('Pesos:', W)

Epoca: 1 | Custo:  7251.523553705103
Epoca: 2 | Custo:  6797.95934072895
Epoca: 4 | Custo:  5435.661102253089
Epoca: 8 | Custo:  3851.183491320182
Epoca: 16 | Custo:  2638.2566973821026
Epoca: 32 | Custo:  1963.8834812562388
Epoca: 64 | Custo:  1587.2701306067388
Epoca: 128 | Custo:  1340.6907588181793
Epoca: 256 | Custo:  1161.9637173709496
Epoca: 512 | Custo:  1076.2486932736542
Epoca: 1024 | Custo:  1047.335331708416
Epoca: 2048 | Custo:  1044.157552742564
Pesos: [ 3.54848437 11.05517057 -2.27044786 -7.89762102 10.14666784 -1.27449024
 -6.43898788  2.26969891 -3.87758567  3.24104506  4.93641316  4.83938996
 -4.71481003]


In [11]:
# Testando o modelo
y_train_predicted = np.array([])
for i in range(X_train.shape[0]):
    yp = np.sign(np.dot(X_train.to_numpy()[i], W))
    y_train_predicted = np.append(y_train_predicted, yp)

y_test_predicted = np.array([])
for i in range(X_test.shape[0]):
    yp = np.sign(np.dot(X_test.to_numpy()[i], W))
    y_test_predicted = np.append(y_test_predicted, yp)

In [12]:
print("Acurácia:", accuracy_score(y_test, y_test_predicted))
print("Recall:", recall_score(y_test, y_test_predicted))
print("Precisão:", recall_score(y_test, y_test_predicted))

Acurácia: 0.9912280701754386
Recall: 0.9767441860465116
Precisão: 0.9767441860465116


Reference: https://towardsdatascience.com/svm-implementation-from-scratch-python-2db2fc52e5c2#:~:text=The%20SVM%20(Support%20Vector%20Machine,examples%20(x%E1%B5%A2%2C%20y%E1%B5%A2).