In [4]:
from ucimlrepo import fetch_ucirepo
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import pandas as pd

In [2]:
def qp_intpoint(Q, c, F, d, iter = 100, tol = 1e-5, decreasing_factor = 0.5, starting_point = -1, verbose = True):
    """Solve an inequality-constrained QP with a primal-dual interior-point method.

    Newton's method is applied to the perturbed KKT system

        Q x + c - F^T mu                 = 0
        z_i * mu_i - sigma * (z^T mu)/m  = 0   (i = 1..m)
        -F x + z + d                     = 0

    with z, mu kept strictly positive, i.e. the optimality conditions of
    ``min 0.5 x^T Q x + c^T x  s.t.  F x >= d`` with slack ``z = F x - d``.

    Parameters
    ----------
    Q : ndarray, shape (n, n)
        Quadratic term of the objective.
    c : ndarray, shape (n, 1)
        Linear term of the objective.
    F : ndarray, shape (m, n)
        Constraint matrix.
    d : ndarray, shape (m, 1)
        Constraint right-hand side.
    iter : int
        Maximum number of Newton iterations (the name shadows the builtin
        inside this function; it is kept for backward compatibility).
    tol : float
        The loop stops once the *squared* Euclidean norm of the perturbed
        residual is <= ``tol``.
    decreasing_factor : float
        Initial centering factor ``sigma``; it is halved after every iteration.
    starting_point : ndarray, shape (n, 1), optional
        Initial guess for ``x``. Any other value (including the default ``-1``)
        selects the all-ones starting point. ``z`` and ``mu`` always start at 1.
    verbose : bool
        When true, print ``'starter given'`` if a starting point is used and
        the residual norm every 20 iterations.

    Returns
    -------
    x : ndarray, shape (n, 1)
    mu : ndarray, shape (m, 1)
    it : int
        Number of iterations performed.
    """
    m, n = F.shape

    # v stacks all unknowns as one column: [x (n rows), z (m rows), mu (m rows)].
    # The initial guess must have z, mu > 0.
    if isinstance(starting_point, np.ndarray) and starting_point.shape == (n, 1):
        if verbose:
            print('starter given')
        v = np.block([[starting_point], [np.ones((2*m, 1))]])
    else:
        v = np.ones((n + 2*m, 1))

    # Positions of the z and mu entries inside v (also the column positions of
    # their diagonal blocks inside the Jacobian).
    z_rows = np.arange(n, n + m)
    mu_rows = np.arange(n + m, n + 2*m)

    # Iteration-independent part of the Jacobian, built once. The middle block
    # row (complementarity) is the only part that depends on the iterate and
    # is filled in per iteration.
    jacobian_const = np.block(
        [[Q,                np.zeros((n, m)), -F.T],
         [np.zeros((m, n)), np.zeros((m, m)), np.zeros((m, m))],
         [-F,               np.eye(m),        np.zeros((m, m))]])

    def FNewton(v_k, sigma):
        """Perturbed KKT residual at v_k for centering factor sigma."""
        x, z, mu = np.split(v_k, [n, n + m])
        complementary_measure = ((z.T @ mu)/m)[0][0]
        disturbance = sigma*complementary_measure
        return np.block([[Q @ x + c - F.T @ mu],
                         [np.multiply(z, mu) - disturbance],
                         [-F @ x + z + d]])

    def get_delta_v(v_k, F_vk):
        """Newton direction; the complementarity rows are scaled by 1/z."""
        z = v_k[n:n+m, 0]
        mu = v_k[n+m:n+2*m, 0]
        inv_z = 1/z
        # Unscaled complementarity rows are [0, diag(mu), diag(z)]; scaling row
        # i by 1/z_i only touches two diagonals, so no dense diag matrices or
        # matrix products are needed.
        J = jacobian_const.copy()
        J[z_rows, z_rows] = inv_z*mu
        J[z_rows, mu_rows] = inv_z*z
        # Work on a copy so the caller's residual is not modified.
        rhs = F_vk.copy()
        rhs[n:n+m, 0] *= inv_z
        return np.linalg.solve(J, -rhs)

    def get_alpha(v_k, delta_v_k):
        """Largest step in (0, 1] keeping z, mu >= 0, damped by 0.95."""
        current = v_k[n:, 0]
        step = delta_v_k[n:, 0]
        shrinking = step < 0
        alpha = 1
        if np.any(shrinking):
            alpha = min(alpha, np.min(-current[shrinking]/step[shrinking]))
        return alpha*0.95

    it = 0
    current_F = FNewton(v, decreasing_factor)
    while (np.linalg.norm(current_F.flatten())**2 > tol) and it < iter:
        delta_v = get_delta_v(v, current_F)
        alpha = get_alpha(v, delta_v)
        v += alpha*delta_v
        # The residual is evaluated with the current factor; the factor is
        # halved afterwards and first used after the next step.
        current_F = FNewton(v, decreasing_factor)
        it += 1
        decreasing_factor *= 1/2
        if verbose and it % 20 == 0:
            print(it, np.linalg.norm(current_F.flatten()))
    return v[0:n, :], v[n+m:n+2*m, :], it

In [7]:
# Fetch dataset id=17 from the UCI ML repository (bound to the name
# `breast_cancer_wisconsin_diagnostic`).
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17)

# Features and targets (the library comment describes these as pandas dataframes).
x = breast_cancer_wisconsin_diagnostic.data.features
y = breast_cancer_wisconsin_diagnostic.data.targets

# Feature scaling is intentionally not applied here: the StandardScaler is
# fitted on the training split later in the notebook.

# Dataset metadata is available as `breast_cancer_wisconsin_diagnostic.metadata`
# (not printed here).

# Print the per-variable information table (name, role, type, ...).
print(breast_cancer_wisconsin_diagnostic.variables)

                  name     role         type demographic description units  \
0                   ID       ID  Categorical        None        None  None   
1            Diagnosis   Target  Categorical        None        None  None   
2              radius1  Feature   Continuous        None        None  None   
3             texture1  Feature   Continuous        None        None  None   
4           perimeter1  Feature   Continuous        None        None  None   
5                area1  Feature   Continuous        None        None  None   
6          smoothness1  Feature   Continuous        None        None  None   
7         compactness1  Feature   Continuous        None        None  None   
8           concavity1  Feature   Continuous        None        None  None   
9      concave_points1  Feature   Continuous        None        None  None   
10           symmetry1  Feature   Continuous        None        None  None   
11  fractal_dimension1  Feature   Continuous        None        

In [8]:
# Hold out 20% of the samples for testing; the fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [26]:
# Fit the scaler on the training split only and standardise the training
# features; the result is re-wrapped in a DataFrame with the feature names.
scaler = StandardScaler()
x_train_scaled = pd.DataFrame(
    scaler.fit_transform(x_train),
    columns=x.columns,
)

In [27]:
# Sanity check: scaled features and labels must have the same number of rows.
print(x_train_scaled.shape, y_train.shape)
# Display the training labels (they keep the row labels of the original dataset).
y_train

(455, 30) (455, 1)


Unnamed: 0,Diagnosis
68,B
181,M
63,B
248,B
60,B
...,...
71,B
106,B
270,B
435,M


In [29]:
# Number of training samples and number of features.
rows = x_train_scaled.shape[0]
cols = x_train_scaled.shape[1]

# x_train_scaled has a fresh 0..rows-1 index (it was rebuilt from a NumPy
# array) while y_train keeps the row labels of the original dataset.
# pd.concat(axis=1) aligns on the index, so both indices are reset first;
# concatenating the un-reset frames would pair features with the wrong labels
# and pad the result with NaN.
x_train_scaled_reset = x_train_scaled.reset_index(drop=True)
y_train_reset = y_train.reset_index(drop=True)
z = pd.concat([x_train_scaled_reset, y_train_reset], axis=1)
assert len(z) == rows and not z['Diagnosis'].isna().any(), "features/labels misaligned"

# Split the labelled training frame by class.
zB = z[z['Diagnosis'] == 'B']
zM = z[z['Diagnosis'] == 'M']

zB


Unnamed: 0,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave_points1,symmetry1,fractal_dimension1,...,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dimension3,Diagnosis
0,-1.440753,-0.435319,-1.362085,-1.139118,0.780573,0.718921,2.823135,-0.119150,1.092662,2.458173,...,-0.476309,-1.247920,-0.973968,0.722894,1.186732,4.672828,0.932012,2.097242,1.886450,B
2,-1.399982,-1.249622,-1.345209,-1.109785,-1.332645,-0.307355,-0.365558,-0.696502,1.930333,0.954379,...,-1.040811,-1.245220,-0.999715,-1.438693,-0.548564,-0.644911,-0.970239,0.597602,0.057894,B
3,-0.981797,1.416222,-0.982587,-0.866944,0.059390,-0.596788,-0.820203,-0.845115,0.313264,0.074041,...,1.593530,-0.873572,-0.742947,0.796624,-0.729392,-0.774950,-0.809483,0.798928,-0.134497,B
4,-1.117700,-1.010259,-1.125002,-0.965942,1.269511,-0.439002,-0.983341,-0.930600,3.394436,0.950213,...,-1.334616,-1.117138,-0.896549,-0.174876,-0.995079,-1.209146,-1.354582,1.033544,-0.205732,B
6,0.082778,0.127887,0.170723,-0.009526,-0.587734,1.220707,0.595888,0.466620,-0.353231,1.272355,...,-0.562140,-0.162071,-0.193435,-1.374939,-0.035790,-0.251447,-0.185156,-1.003494,0.231214,B
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
449,-0.293787,-0.815484,-0.257344,-0.379006,0.845286,0.446087,-0.544443,-0.451884,0.568207,0.743320,...,-0.832837,-0.330348,-0.434684,-0.014405,0.160537,-0.409215,-0.633709,0.442248,-0.114304,B
450,-1.480675,-1.066580,-1.362085,-1.157451,0.149987,0.944057,-0.035754,-0.514485,0.331474,3.755073,...,-1.628421,-1.336108,-1.045037,-0.469795,-0.059039,-0.627221,-1.016366,-1.032028,1.376025,B
451,-0.701497,-0.200650,-0.687880,-0.682204,1.327033,-0.036619,-0.229252,-0.353247,-0.036372,0.339253,...,0.614731,-0.647704,-0.626555,1.616328,0.085623,0.060743,0.116740,-0.156974,0.398365,B
452,0.048802,-0.555001,-0.065125,-0.061423,-2.261627,-1.466613,-1.028567,-1.105515,-1.103492,-1.249242,...,-0.806427,-0.379841,-0.339278,-1.989065,-1.307006,-1.127968,-1.239034,-0.708639,-1.271455,B


In [30]:
# Feature matrices per class: drop the label column (the last one) and convert to NumPy.
B, M = (frame.drop(columns='Diagnosis').to_numpy() for frame in (zB, zM))

In [31]:
# Constraint matrix for the unknown vector [w; b]:
#   class B rows are  [ x_i,  1]
#   class M rows are  [-x_i, -1]
F = np.vstack([
    np.hstack([B, np.ones((B.shape[0], 1))]),
    -np.hstack([M, np.ones((M.shape[0], 1))]),
])
F.shape

(455, 31)

In [32]:
# Right-hand side: one unit entry per training sample.
d = np.ones((len(z), 1))
# Quadratic term: identity over the weight entries, zero for the trailing bias entry.
Q = np.eye(cols + 1)
Q[-1, -1] = 0
# No linear term in the objective.
c = np.zeros((cols + 1, 1))

In [33]:
# Run the interior-point solver; `iter` receives the number of iterations performed.
xsol, mu, iter = qp_intpoint(
    Q, c, F, d,
    iter=10000,
    tol=1e-5,
    decreasing_factor=0.5,
)

20 523.0044108437489
40 304.8354902558555
60 231.23112851958967
80 46.35195381529414


In [34]:
# Number of iterations returned by the solver.
# NOTE(review): this global name shadows Python's builtin `iter` for the rest of the notebook.
iter

97

In [35]:
# The bias is the last entry of the single-column solution vector.
b_scaled = xsol[-1, 0]
b_scaled

-2.6128955059252146

In [36]:
# All entries except the last (the bias) form the weight vector, kept as a column.
w_scaled = xsol[:-1, :]
print(w_scaled)

[[  2.52478536]
 [ -3.26625609]
 [  4.46239564]
 [ -3.55352351]
 [  1.63292678]
 [ 21.12360606]
 [-11.49542897]
 [-16.48644161]
 [  2.97557718]
 [ -8.70917751]
 [-20.0235225 ]
 [  0.59625012]
 [  7.50013626]
 [  3.19491412]
 [ -1.17559074]
 [ -5.02791871]
 [ 10.12293201]
 [ -4.95401374]
 [  5.76766034]
 [  5.92456432]
 [-11.27917375]
 [ -2.16970024]
 [  9.4599487 ]
 [ -8.04273349]
 [  1.48284737]
 [ -3.67238306]
 [ -6.76913743]
 [ -3.10740153]
 [ -9.97213416]
 [  4.74733052]]


In [60]:
# Check that the solution satisfies the margin constraints on the training data.
import matplotlib.pyplot as plt  # NOTE(review): not used in this cell; kept in case later cells rely on it
values_M = np.dot(M, w_scaled) + b_scaled
values_B = np.dot(B, w_scaled) + b_scaled
# Every class-M sample must satisfy w.x + b <= -1 (up to a 1e-4 tolerance).
assert np.all(values_M + 1 <= 1e-4), "Not all values in values_M are less than or equal to -1."

# Every class-B sample must satisfy w.x + b >= 1 (up to a 1e-4 tolerance).
assert np.all(values_B - 1 >= -1e-4), "Not all values in values_B are greater than or equal to 1."

# If the code reaches this point, the assertions have passed
print("Assertion passed: values_M and values_B comply the restrictions")

Assertion passed: values_M and values_B comply the restrictions


# Evaluate on the test set

In [67]:
# Standardise the test features with the statistics fitted on the training split.
x_test_scaled = pd.DataFrame(
    scaler.transform(x_test),
    columns=x.columns,
)
x_test_scaled

Unnamed: 0,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave_points1,symmetry1,fractal_dimension1,...,radius3,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dimension3
0,-0.466497,-0.137289,-0.444211,-0.486465,0.280850,0.041606,-0.111465,-0.264869,0.415241,0.135137,...,-0.263235,-0.147842,-0.331548,-0.351093,0.480019,-0.096496,-0.035830,-0.194351,0.172757,0.203730
1,1.365363,0.498665,1.305511,1.341471,-0.406539,-0.013724,0.240637,0.821449,-0.833981,-1.131215,...,1.794619,0.172372,1.763661,1.744141,-0.530514,-0.123620,-0.028181,0.991779,-0.561211,-1.008389
2,0.380066,0.069220,0.404101,0.266596,0.967520,0.356414,0.726902,0.857221,0.437094,-0.666053,...,0.629403,0.076638,0.533832,0.492044,1.000466,-0.086163,0.499625,0.570350,-0.107831,-0.206293
3,-0.486317,-0.353185,-0.428570,-0.526233,0.694290,0.533852,-0.144722,-0.533686,0.000048,1.147386,...,-0.698111,-0.433394,-0.524721,-0.636959,0.584109,0.065603,-0.163957,-0.620377,-0.553285,0.545322
4,-0.729810,-1.113514,-0.709283,-0.709281,0.294512,0.159898,-0.271202,-0.587608,0.025542,0.703052,...,-0.827117,-0.966535,-0.849575,-0.739243,0.128718,-0.264407,-0.453677,-0.689644,-0.913135,-0.141789
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109,0.147898,-0.547961,0.095811,0.032780,-0.671141,-0.699053,-0.465834,-0.535790,-1.464055,-1.278401,...,0.046795,-0.015795,-0.033089,-0.081099,-0.751704,-0.295406,-0.147702,-0.550190,-0.713394,-1.004463
110,0.552776,0.109114,0.502886,0.460643,-0.292214,-0.369744,0.110756,0.476089,-0.047299,-1.228413,...,0.735521,-0.161047,0.650816,0.610200,0.800961,-0.311551,0.039707,0.579545,-0.404272,-1.121693
111,-0.735472,-0.998526,-0.741388,-0.699691,0.399489,-0.487272,-0.574677,-0.499492,0.262275,-0.149514,...,-0.745968,-0.717296,-0.786584,-0.679284,0.319548,-0.610562,-0.450330,-0.277411,-0.382078,-0.324083
112,0.028983,2.033403,0.027485,-0.085961,-0.971694,0.005355,0.269611,-0.110470,-1.708072,-0.203667,...,-0.100937,2.476596,-0.138074,-0.199255,-1.016264,-0.065497,0.152057,-0.490270,-1.609057,-0.331375


In [68]:
# Display the test labels (they keep the row labels of the original dataset).
y_test

Unnamed: 0,Diagnosis
204,B
70,M
131,M
431,B
540,B
...,...
486,B
75,M
249,B
238,B


In [76]:
# x_test_scaled has a fresh 0..N-1 index while y_test keeps the row labels of
# the original dataset. pd.concat(axis=1) aligns on the index, so both indices
# are reset first; concatenating the un-reset frames would pair features with
# the wrong labels and pad the result with NaN.
x_test_scaled_reset = x_test_scaled.reset_index(drop=True)
y_test_reset = y_test.reset_index(drop=True)
z_test = pd.concat([x_test_scaled_reset, y_test_reset], axis=1)
assert len(z_test) == len(x_test_scaled) and not z_test['Diagnosis'].isna().any(), "features/labels misaligned"

# Split the labelled test frame by class.
ztestB = z_test[z_test['Diagnosis'] == 'B']
ztestM = z_test[z_test['Diagnosis'] == 'M']

ztestB

Unnamed: 0,radius1,texture1,perimeter1,area1,smoothness1,compactness1,concavity1,concave_points1,symmetry1,fractal_dimension1,...,texture3,perimeter3,area3,smoothness3,compactness3,concavity3,concave_points3,symmetry3,fractal_dimension3,Diagnosis
0,-0.466497,-0.137289,-0.444211,-0.486465,0.280850,0.041606,-0.111465,-0.264869,0.415241,0.135137,...,-0.147842,-0.331548,-0.351093,0.480019,-0.096496,-0.035830,-0.194351,0.172757,0.203730,B
3,-0.486317,-0.353185,-0.428570,-0.526233,0.694290,0.533852,-0.144722,-0.533686,0.000048,1.147386,...,-0.433394,-0.524721,-0.636959,0.584109,0.065603,-0.163957,-0.620377,-0.553285,0.545322,B
4,-0.729810,-1.113514,-0.709283,-0.709281,0.294512,0.159898,-0.271202,-0.587608,0.025542,0.703052,...,-0.966535,-0.849575,-0.739243,0.128718,-0.264407,-0.453677,-0.689644,-0.913135,-0.141789,B
8,-0.220173,-0.780283,-0.221946,-0.379006,0.866856,0.951689,0.352755,0.567887,0.477156,0.869678,...,-0.387178,-0.313250,-0.462195,0.957095,1.461848,1.009741,0.867649,0.985987,0.994608,B
9,-0.061619,-0.601935,-0.119869,-0.155062,-1.974735,-0.962156,-0.839729,-0.917711,0.007332,-1.027074,...,-0.616610,-0.177069,-0.278789,-1.634728,-0.336738,-0.651126,-0.796304,-0.358300,-0.386344,B
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,-0.503305,0.712214,-0.495250,-0.535541,0.392299,-0.040435,-0.443410,-0.527636,-0.946884,0.673893,...,0.518996,-0.592211,-0.571357,0.592783,-0.122328,-0.505788,-0.494101,-1.009835,0.389951,B
108,0.091272,-0.939859,0.085933,-0.040269,0.284445,-0.028988,-0.050745,0.185177,-0.316810,-0.270318,...,-0.938475,0.044900,-0.194140,0.002943,0.134059,0.170702,0.700610,-0.339277,-0.394758,B
109,0.147898,-0.547961,0.095811,0.032780,-0.671141,-0.699053,-0.465834,-0.535790,-1.464055,-1.278401,...,-0.015795,-0.033089,-0.081099,-0.751704,-0.295406,-0.147702,-0.550190,-0.713394,-1.004463,B
111,-0.735472,-0.998526,-0.741388,-0.699691,0.399489,-0.487272,-0.574677,-0.499492,0.262275,-0.149514,...,-0.717296,-0.786584,-0.679284,0.319548,-0.610562,-0.450330,-0.277411,-0.382078,-0.324083,B


In [78]:
# Test feature matrices per class: drop the label column (the last one) and convert to NumPy.
B_test, M_test = (frame.drop(columns='Diagnosis').to_numpy() for frame in (ztestB, ztestM))

In [83]:
# Decision values w.x + b for every test sample, per class.
values_test_M = M_test @ w_scaled + b_scaled
values_test_B = B_test @ w_scaled + b_scaled

# NOTE(review): the counts below use the margin thresholds (<= -1 for M,
# >= 1 for B, with a 1e-4 tolerance), not the sign of the decision value, so a
# sample on the correct side but inside the margin is counted as non-compliant.

# Class M: how many satisfy / violate  w.x + b <= -1.
comply_M = np.sum((values_test_M + 1) <= 1e-4)
not_comply_M = np.sum((values_test_M + 1) > 1e-4)

# Class B: how many satisfy / violate  w.x + b >= 1.
comply_B = np.sum((values_test_B - 1) >= -1e-4)
not_comply_B = np.sum((values_test_B - 1) < -1e-4)

# Report the four counts.
print(f"Values in M that comply with the restriction: {comply_M}")
print(f"Values in M that do not comply with the restriction: {not_comply_M}")
print(f"Values in B that comply with the restriction: {comply_B}")
print(f"Values in B that do not comply with the restriction: {not_comply_B}")

Values in M that comply with the restriction: 42
Values in M that do not comply with the restriction: 1
Values in B that comply with the restriction: 64
Values in B that do not comply with the restriction: 7


In [84]:
# Confusion-matrix entries with M as the positive class. "Correct" here means
# the sample satisfies its margin constraint (<= -1 for M, >= 1 for B).
TP, FN = comply_M, not_comply_M
TN, FP = comply_B, not_comply_B

# Share of all test samples that satisfy their constraint.
accuracy = (TP + TN) / (TP + TN + FP + FN)

# Precision; defined as 0 when the denominator is empty.
if (TP + FP) > 0:
    precision = TP / (TP + FP)
else:
    precision = 0

# Recall; defined as 0 when the denominator is empty.
if (TP + FN) > 0:
    recall = TP / (TP + FN)
else:
    recall = 0

# F1 is the harmonic mean of precision and recall; 0 when both are 0.
if (precision + recall) > 0:
    F1 = 2 * (precision * recall) / (precision + recall)
else:
    F1 = 0

# Report the metrics with four decimals.
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {F1:.4f}")

Accuracy: 0.9298
Precision: 0.8571
Recall: 0.9767
F1 Score: 0.9130


### Exactitud (Accuracy): 0.9298 (92.98%)
Lo que nos dice: Esta es la proporción de resultados correctos (tanto verdaderos positivos como verdaderos negativos) sobre el total de casos del conjunto de prueba. Una exactitud del 92.98% significa que el modelo asignó correctamente la clase (ya sea M o B) al 92.98% de los casos de prueba (106 de 114). Es una exactitud alta que indica un buen rendimiento general.

### Precisión (Precision): 0.8571 (85.71%)
Lo que nos dice: La precisión mide la fiabilidad de las predicciones positivas. Una precisión del 85.71% significa que, cuando el modelo predice la clase M, acierta el 85.71% de las veces (42 de 49). Es un valor bastante alto, lo que indica que el modelo es fiable cuando predice que una observación pertenece a la clase M. Sin embargo, también sugiere que hay cierto margen de mejora para reducir los falsos positivos (casos B clasificados erróneamente como M).

### Sensibilidad (Recall): 0.9767 (97.67%)
Lo que nos dice: La sensibilidad (también llamada recall o exhaustividad) mide la capacidad del modelo para encontrar todos los casos positivos reales dentro de un conjunto de datos. Una sensibilidad del 97.67% para la clase M es excelente: indica que el modelo identifica con éxito el 97.67% de todos los casos M reales (42 de 43). Esta alta sensibilidad es particularmente importante en escenarios donde pasar por alto un caso positivo (un falso negativo) tiene consecuencias graves.

### Puntuación F1: 0.9130 (91.30%)
Lo que nos dice: La puntuación F1 es la media armónica de la precisión y la sensibilidad (recall), $F_1 = 2\,\dfrac{\text{precisión}\cdot\text{sensibilidad}}{\text{precisión}+\text{sensibilidad}}$, y es una medida de la exactitud de una prueba. Considera tanto la precisión como la sensibilidad para calcular la puntuación. Una puntuación F1 del 91.30% es muy alta y sugiere que el modelo tiene un buen equilibrio entre precisión y sensibilidad. Esto es especialmente valioso cuando se necesita un equilibrio entre identificar efectivamente los casos positivos (clase M en este contexto) y minimizar los falsos positivos.

### Interpretación
En general, el modelo muestra un fuerte rendimiento al clasificar values_M como la clase positiva y values_B como la clase negativa, con una capacidad particularmente destacada para identificar casos positivos (alta sensibilidad o recall). La precisión ligeramente inferior en comparación con la sensibilidad sugiere que hay algunas instancias en las que el modelo predice un caso como M cuando no lo es, pero este número es relativamente bajo. La alta puntuación F1 indica un equilibrio robusto entre precisión y sensibilidad, haciendo este modelo fiable para tareas donde es importante tanto identificar positivos de manera precisa como minimizar los falsos positivos. Dadas estas métricas, el modelo es muy adecuado para aplicaciones donde pasar por alto un caso positivo tiene un costo más alto que lidiar con unos pocos falsos positivos, aunque los esfuerzos para mejorar la precisión sin comprometer significativamente la sensibilidad podrían hacerlo aún más efectivo. Nótese que estas métricas cuentan como acierto solo los casos que cumplen la restricción de margen ($w^\top x + b \le -1$ para M y $w^\top x + b \ge 1$ para B); un caso situado en el lado correcto del hiperplano pero dentro del margen se cuenta como error, por lo que las cifras son una estimación conservadora del rendimiento basado en el signo de $w^\top x + b$.