## Dataset

En las siguientes celdas se importa el dataset y se prepara para la implementación de los modelos de aprendizaje.

In [1]:
from pathlib import Path

import kagglehub
import pandas as pd

# Download the dataset (or reuse the local kagglehub cache) and get its folder.
path = kagglehub.dataset_download("dragonheir/logistic-regression")

print("Path to dataset files:", path)
# Build the CSV location from the folder returned above instead of a
# machine-specific absolute path, so the notebook runs for any user/OS.
dataset = pd.read_csv(Path(path) / "Social_Network_Ads.csv")

Path to dataset files: C:\Users\FLopezP\.cache\kagglehub\datasets\dragonheir\logistic-regression\versions\1


In [2]:
# Numeric gender encoding: Female = 1, anything else (Male) = 0.
# The vectorized comparison replaces the per-row loop and does not rely on the
# frame having a default 0..n-1 index.
dataset["GenderNum"] = (dataset["Gender"] == "Female").astype("int64")

# Keep only the columns used downstream, in a fixed order.
dataset = dataset.loc[:, ["User ID", "Gender", "GenderNum", "Age", "EstimatedSalary", "Purchased"]]
dataset

Unnamed: 0,User ID,Gender,GenderNum,Age,EstimatedSalary,Purchased
0,15624510,Male,0,19,19000,0
1,15810944,Male,0,35,20000,0
2,15668575,Female,1,26,43000,0
3,15603246,Female,1,27,57000,0
4,15804002,Male,0,19,76000,0
...,...,...,...,...,...,...
395,15691863,Female,1,46,41000,1
396,15706071,Male,0,51,23000,1
397,15654296,Female,1,50,20000,1
398,15755018,Male,0,36,33000,0


In [3]:
# Train/test split by position (no shuffling): the first 280 rows are used for
# training and rows 280-399 for testing.
ds_train = dataset.iloc[:280]
# Renumber the test rows from 0 and discard the old index.
ds_test = dataset.iloc[280:400].reset_index(drop=True)
labels_train = ds_train.loc[:, "Purchased"]
# reset_index() without drop turns the Series into a DataFrame with the
# columns "index" and "Purchased".
labels_test = ds_test.loc[:, "Purchased"].reset_index()

## Regresión Logística

Buscamos predecir, a partir de 3 características (género, edad y salario estimado), si un usuario compra o no el producto.

In [4]:
# Dimensiones:
# Vector de entrada: (M/F, Edad, Salario)
# Vector de pesos: (a, b, c)^T
# Sesgo: d
# z = Entrada.Pesos + Sesgo
# Proba = 1/(1+e^-z)

import numpy as np
def mix_char(ds):
    """
    Build the list of normalized feature vectors for a dataset split.

    ds = DataFrame containing at least the columns "GenderNum", "Age" and
         "EstimatedSalary". Its index may be arbitrary: rows are read by
         position, not by label.

    "Age" and "EstimatedSalary" are divided by their maximum within `ds`;
    "GenderNum" is left unchanged.

    return: list with one float array of shape (1, 3) per row, ordered as
            [GenderNum, Age, EstimatedSalary]
    """
    feature_columns = ["GenderNum", "Age", "EstimatedSalary"]
    # Selecting with a list and casting yields a new frame, so `ds` is not modified.
    features = ds[feature_columns].astype(float)
    for column in ("Age", "EstimatedSalary"):
        features[column] = features[column] / features[column].max()

    # One independent (1, 3) row vector per sample, taken positionally.
    return [np.array(row).reshape(1, 3) for row in features.to_numpy()]

In [5]:
# The train and test splits are ready; turn each one into its list of feature
# vectors, which will later be multiplied by the weights.
train_char, test_char = (mix_char(split) for split in (ds_train, ds_test))

print(train_char[0], test_char[0])

[[0.         0.31666667 0.12666667]] [[1.         0.98333333 0.61111111]]


In [6]:
def logit(train_instance, weight, bias):
    """
    Logistic (sigmoid) activation of a single sample.

    Despite its name, this returns sigmoid(z) = 1 / (1 + e^-z), where
    z = train_instance . weight + bias.

    train_instance = 2-D row vector of shape (1, n_features).
    weight = weight vector with n_features entries; any shape is accepted
             because it is flattened into a column before the product.
    bias = scalar or 1-element array added to z.

    returns the activation as a scalar
    """
    # reshape(-1, 1) instead of a fixed (3, 1) so any number of features works.
    column_weights = np.asarray(weight).reshape(-1, 1)
    z = np.dot(train_instance, column_weights) + bias
    activation = 1 / (1 + np.exp(-z))
    # z has shape (1, 1); unwrap it to a scalar.
    return activation[0][0]

def cross_entropy_pog(y, logit, eps=1e-12):
    """
    Binary cross-entropy between a label and a predicted probability.

    y = label value (0 or 1)
    logit = predicted probability for the sample (a value in [0, 1])
    eps = clipping margin; the probability is limited to [eps, 1 - eps] so
          that log(0) is never evaluated when the prediction saturates

    returns CrossEntropy(y, logit) = -(y*log(p) + (1-y)*log(1-p))
    """
    probability = np.clip(logit, eps, 1 - eps)
    cross_entropy = -((y * np.log(probability)) + ((1 - y) * np.log(1 - probability)))
    return cross_entropy

In [7]:
def logistic_regression_w_b(lr, labels, train, epochs, seed=None):
    """
    Fit logistic-regression weights and bias with per-sample gradient descent.

    lr = learning rate
    labels = training labels (Series, list or array); read by position, so
             the index of a Series does not matter
    train = training set: sequence of row vectors of shape (1, n_features)
    epochs = number of passes over the training set
    seed = optional seed for the random bias initialisation; when None the
           global NumPy random state is used

    Weights start at zero and the bias at a uniform random value in [0, 1).
    Only the gradient is needed for the update, so the cross-entropy loss is
    not evaluated here.

    returns w, bias: the weight vector of shape (1, n_features) and the bias
            array of shape (1,)
    raises ValueError if there are fewer labels than training samples
    """
    targets = np.asarray(labels)
    if len(targets) < len(train):
        raise ValueError("labels must contain one entry per training sample")

    # Infer the weight size from the data; fall back to 3 for an empty set.
    n_features = np.asarray(train[0]).shape[-1] if len(train) > 0 else 3
    w = np.zeros((1, n_features))
    if seed is None:
        bias = np.random.rand(1)
    else:
        bias = np.random.default_rng(seed).random(1)

    for _ in range(epochs):
        for sample, target in zip(train, targets):
            a = logit(sample, w, bias)  # model prediction
            error = a - target
            # Gradient of the cross-entropy w.r.t. the weights and the bias.
            grad = error * sample
            grad_bias = error
            w -= lr * grad
            bias -= lr * grad_bias
    return w, bias

In [8]:
# Training draws a random initial bias; seed the global NumPy generator so the
# fitted parameters are reproducible on Restart & Run All.
np.random.seed(0)
w_, b_ = logistic_regression_w_b(lr=0.05, labels=labels_train, train=train_char, epochs=10)
print(w_, b_)

[[0.03854298 2.31403476 3.03862118]] [-3.26575638]


In [9]:
# Model output for every test sample. Named eval_proba (not `eval`) so the
# builtin eval() is not shadowed for the rest of the notebook.
eval_proba = [logit(sample, w_, b_) for sample in test_char]
# Binarize with a 0.5 threshold: 1 when the output is >= 0.5, otherwise 0.
eval_bin = [1 if proba >= 0.5 else 0 for proba in eval_proba]

print(len(eval_bin))

120


In [10]:
# Ground-truth test labels as a plain Python list.
test_labels_perrones = list(labels_test["Purchased"])
print(len(test_labels_perrones))

120


In [11]:
import sklearn
from sklearn import metrics
# Per-class precision/recall/F1, followed by the confusion matrix, comparing
# the true test labels with the binarized predictions.
print(metrics.classification_report(y_true=test_labels_perrones, y_pred=eval_bin))
print(metrics.confusion_matrix(y_true=test_labels_perrones, y_pred=eval_bin))

              precision    recall  f1-score   support

           0       0.55      0.89      0.68        46
           1       0.89      0.55      0.68        74

    accuracy                           0.68       120
   macro avg       0.72      0.72      0.68       120
weighted avg       0.76      0.68      0.68       120

[[41  5]
 [33 41]]


## Red Neuronal

La arquitectura de esta red neuronal tendrá 3 valores de entrada, uno por cada característica del dataset (género, edad y salario estimado). Tendremos **una** capa oculta de **n** unidades y, finalmente, una salida binaria.

In [12]:
# Unpack the three feature values of the first training sample.
in1, in2, in3 = train_char[0].ravel()
print(in1, in2, in3)

0.0 0.31666666666666665 0.12666666666666668


In [26]:
# BNU = Basic Neural Unit
class BNU:
    """
    A single neuron: weighted sum of its inputs plus a bias, passed through
    the logistic (sigmoid) function.

    weight = weight vector with one entry per input (any shape; it is
             flattened into a column when the unit is evaluated)
    bias = scalar or 1-element array added to the weighted sum
    *inputs = the scalar input values, one positional argument each
    """

    # NOTE(review): class-level attribute, drawn once when the class is
    # defined and shared by every instance; no method of this class reads it.
    w = np.random.rand(1,3)

    def __init__(self, weight, bias, *inputs):
        self.inputs = inputs
        self.weight = weight
        self.bias = bias

    def logit(self):
        """
        Evaluate the unit on its stored inputs.

        Despite its name, this returns sigmoid(z) = 1 / (1 + e^-z) with
        z = inputs . weight + bias.

        returns the activation as a scalar
        """
        input_vector = np.array(self.inputs)
        # reshape(-1, 1) instead of a fixed (3, 1) so the unit accepts any
        # number of inputs.
        column_weights = np.asarray(self.weight).reshape(-1, 1)
        z = np.dot(input_vector, column_weights) + self.bias
        activation = 1 / (1 + np.exp(-z))
        # z has shape (1,); unwrap it to a scalar.
        return activation[0]

# Three units, each with its own random weights (1x3) and random bias, all
# fed with the same three input values in1, in2, in3.
# The statements are kept in this order because each np.random call consumes
# the next values of the random stream.

# Unit 1
w1 = np.random.rand(1,3)
b1 = np.random.rand(1)
celda1 = BNU(w1, b1, in1, in2, in3)

# Unit 2
w2 = np.random.rand(1,3)
b2 = np.random.rand(1)
celda2 = BNU(w2, b2, in1, in2, in3)

# Unit 3
w3 = np.random.rand(1,3)
b3 = np.random.rand(1)
celda3 = BNU(w3, b3, in1, in2, in3)

# Show the output of each of the three units.
print(celda1.logit(), celda2.logit(), celda3.logit())

0.6839923391877291 0.7531336273917066 0.6335066516071446


In [28]:
# Fourth unit: its inputs are the outputs of the three units defined before.
w4 = np.random.rand(1, 3)
b4 = np.random.rand(1)
celda4 = BNU(w4, b4, *(unit.logit() for unit in (celda1, celda2, celda3)))
print(celda4.inputs)

(0.6839923391877291, 0.7531336273917066, 0.6335066516071446)


**TODO: el resultado va por buen camino, pero hay que optimizar la forma de definir los pesos y los sesgos (por ejemplo, de forma sistemática por capa) en lugar de crear cada unidad a mano.**

## K-Means