In [1]:
import pandas as pd

# Import and process the dataset

In [2]:
# Header names applied to the header-less data file, in file order.
# The first entry is the target label; the remaining 22 are the features.
columns = [
    # target
    "Class",
    # cap
    "cap-shape",
    "cap-surface",
    "cap-color",
    # misc
    "bruises?",
    "odor",
    # gill
    "gill-attachment",
    "gill-spacing",
    "gill-size",
    "gill-color",
    # stalk
    "stalk-shape",
    "stalk-root",
    "stalk-surface-above-ring",
    "stalk-surface-below-ring",
    "stalk-color-above-ring",
    "stalk-color-below-ring",
    # veil / ring
    "veil-type",
    "veil-color",
    "ring-number",
    "ring-type",
    # spores and environment
    "spore-print-color",
    "population",
    "habitat",
]

# The data file has no header row, so the names come from `columns`.
# Comma is pandas' default separator; header=None is what `names=` already implies.
data_path = "./src/agaricus-lepiota.data"
dataset = pd.read_csv(data_path, names=columns, header=None)

print(dataset)

     Class cap-shape cap-surface cap-color bruises? odor gill-attachment  \
0        p         x           s         n        t    p               f   
1        e         x           s         y        t    a               f   
2        e         b           s         w        t    l               f   
3        p         x           y         w        t    p               f   
4        e         x           s         g        f    n               f   
...    ...       ...         ...       ...      ...  ...             ...   
8119     e         k           s         n        f    n               a   
8120     e         x           s         n        f    n               a   
8121     e         f           s         n        f    n               a   
8122     p         k           y         n        f    y               f   
8123     e         x           s         n        f    n               a   

     gill-spacing gill-size gill-color  ... stalk-surface-below-ring  \
0              

In [3]:
# Maps the role of each class to the label used in the "Class" column.
clases_values = dict(positive="e", negative="p")

positive_label = clases_values["positive"]
print(positive_label)

e


## Separate positive and negative classes

In [4]:
from homemaid_libs.OCAT.procesamiento_datasets import extraer_muestra, valores_observados, repeticiones_de_valor

# Count how often each class label occurs and what share of the dataset it is.
# Both results are lists of [label, value] pairs, in the order returned by
# valores_observados.
class_labels = dataset["Class"].values
n_rows = len(class_labels)

clases = valores_observados(class_labels)
total_clases_original = []
proporcion = []

for clase in clases:
    total = repeticiones_de_valor(clase, class_labels)
    total_clases_original.append([clase, total])
    proporcion.append([clase, total / n_rows])

print(total_clases_original)
print(clases)
print(proporcion)

[['p', 3916], ['e', 4208]]
['p', 'e']
[['p', 0.48202855736090594], ['e', 0.517971442639094]]


In [5]:
# Split the dataset into train and test partitions using the per-class
# proportions computed above.
# The last argument (0.7) is presumably the fraction of rows assigned to the
# training split — the printed sizes (5686 of 8124 rows) are consistent with
# that; confirm against extraer_muestra.
# NOTE(review): if extraer_muestra samples randomly, it should be seeded so
# the notebook is reproducible — confirm in the library.
train_fraction = 0.7
features = dataset.drop("Class", axis=1)
labels = dataset["Class"]

# len(dataset) is the idiomatic spelling of dataset.__len__().
train, test = extraer_muestra(features, labels, proporcion, len(dataset), train_fraction)

print(train)

5686
[['p', 2740], ['e', 2945]]
['p', 2740]
3916 3916
['e', 2945]
5384 5384
column size: 22
     cap-shape cap-surface cap-color bruises? odor gill-attachment  \
0            x           s         g        f    c               f   
1            x           y         y        f    f               f   
2            x           f         g        f    f               f   
3            f           s         e        f    f               f   
4            x           s         n        f    s               f   
...        ...         ...       ...      ...  ...             ...   
5680         k           s         w        f    n               f   
5681         x           s         n        f    n               f   
5682         x           f         n        f    n               f   
5683         x           y         n        t    n               f   
5684         f           y         y        t    a               f   

     gill-spacing gill-size gill-color stalk-shape  ...  \
0       

## Let's Binarize our dataframe!!!

In [6]:
from homemaid_libs.OCAT.Binarizacion import binarizar_categoricos

In [7]:
# Binarize every feature column of the training split.
# `columns[0]` is "Class", so it is skipped and numbering starts at 1; that
# index is forwarded to binarizar_categoricos (judging by the printed frame it
# ends up in the generated column names, e.g. "x1,1" and "valor_real 1").
binarized_dataframe: list[pd.DataFrame] = [
    binarizar_categoricos(train[feature].values.tolist(), position)
    for position, feature in enumerate(columns[1:], start=1)
]

# Append the class label so it stays available next to the binarized features
# in the following steps.
binarized_dataframe.append(train["Class"])
binary_train = pd.concat(binarized_dataframe, axis=1)

print(binary_train)

      x1,1  x1,2  x1,3  x1,4  x1,5  x1,6 valor_real 1  x2,1  x2,2  x2,3  ...  \
0        1     0     0     0     0     0            x     1     0     0  ...   
1        1     0     0     0     0     0            x     0     1     0  ...   
2        1     0     0     0     0     0            x     0     0     1  ...   
3        0     1     0     0     0     0            f     1     0     0  ...   
4        1     0     0     0     0     0            x     1     0     0  ...   
...    ...   ...   ...   ...   ...   ...          ...   ...   ...   ...  ...   
5680     0     0     1     0     0     0            k     1     0     0  ...   
5681     1     0     0     0     0     0            x     1     0     0  ...   
5682     1     0     0     0     0     0            x     0     0     1  ...   
5683     1     0     0     0     0     0            x     0     1     0  ...   
5684     0     1     0     0     0     0            f     0     1     0  ...   

      valor_real 21 x22,1  x22,2  x22,3

In [8]:
# positive_train = binary_train[binary_train["Class"] == clases_values["positive"]]
# negative_train = binary_train[binary_train["Class"] == clases_values["negative"]]

# print(positive_train)
# print(negative_train)

## Time for the candidate elimination

In [9]:
# Names of the "valor_real <i>" helper columns, one per feature (1..22).
# Later cells drop these columns before training and prediction.
# The original loop had an `if i == 22` branch identical to its `else`, so a
# single comprehension produces exactly the same list.
garbage: list[str] = [f"valor_real {i}" for i in range(1, 23)]

In [10]:
#generamos las hipotesis general y especifica

from homemaid_libs.Candidate_Elimination.hipotesis import General, Especifica as Specific, predicciones, probarHipotesis

# import homemaid_libs.Candidate_Elimination.hipotesis as hypotesis


# Reminder: "general" and "specific" hypothesis do not each denote a single
# hypothesis here, but rather a set of hypotheses — hence the lists below.
feature_frame = binary_train.drop(columns=["Class", *garbage])
columns_without_class = feature_frame.columns.tolist()

general_hypotesis = [General(columns_without_class)]
specific_hypotesis = [Specific(columns_without_class)]

# Show the number of attributes and the initial state of both hypotheses.
initial_general = general_hypotesis[0]
print(len(initial_general.aceptados))
print(initial_general)
print(specific_hypotesis[0])

117
<*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*>
<0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0>


In [11]:
from homemaid_libs.Candidate_Elimination.candidate_elimination import Candidate_Elimination

# Run candidate elimination on the binarized training features.
# NOTE(review): this rebinds `general_hypotesis` / `specific_hypotesis` to the
# results (later cells read `specific_hypotesis.aceptados` directly, without
# indexing), so re-running this cell without first re-running the cell that
# builds the initial hypotheses feeds it different inputs.
training_features = binary_train.drop(columns=["Class", *garbage])
training_labels = binary_train["Class"]

general_hypotesis, specific_hypotesis = Candidate_Elimination(
    specific_hypotesis,
    general_hypotesis,
    training_features,
    training_labels,
    clases_values,
)

In [12]:
# Inspect the learned specific hypothesis: the number of accepted values of its
# second attribute, then the accepted values of every attribute.
accepted_by_attribute = specific_hypotesis.aceptados
print(len(accepted_by_attribute[1]))
print(accepted_by_attribute)

2
[[0, 1], [1, 0], [0, 1], [0, 1], [0], [0, 1], [0, 1], [1, 0], [0, 1], [0], [0, 1], [0, 1], [0, 1], [1, 0], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [1, 0], [0], [0], [0], [0], [1, 0], [0], [0], [0, 1], [0, 1], [1, 0], [0, 1], [1, 0], [0, 1], [0, 1], [1, 0], [0, 1], [0, 1], [1, 0], [0], [0, 1], [0, 1], [0], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [1, 0], [1, 0], [0, 1], [0, 1], [0, 1], [0, 1], [1, 0], [0, 1], [0, 1], [0, 1], [1, 0], [0, 1], [0, 1], [0, 1], [1, 0], [0, 1], [0, 1], [0], [0], [0], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0], [0, 1], [0], [0], [1, 0], [0, 1], [0, 1], [1], [1, 0], [0], [0, 1], [0, 1], [1, 0], [0, 1], [0], [1, 0], [0], [0, 1], [0], [0, 1], [1, 0], [0, 1], [0, 1], [0, 1], [0], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [1, 0], [0, 1], [0, 1], [0, 1], [1, 0], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1], [0, 1]]


In [13]:
# Count how many accepted values across all attributes of the specific
# hypothesis are equal to 1.
contador = sum(
    1
    for accepted_values in specific_hypotesis.aceptados
    for value in accepted_values
    if value == 1
)

print(contador)

96


In [14]:
# Inspect the first general hypothesis after training: its attribute count and
# its printed form.
first_general = general_hypotesis[0]
print(len(first_general.aceptados))
print(first_general)

117
<*, *, *, *, [0], *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *, *>


## Now we can check our Hypothesis

In [15]:
# Binarize every feature column of the test split, the same way as the
# training split. `columns[0]` is "Class", so it is skipped and numbering
# starts at 1; that index is forwarded to binarizar_categoricos.
binarized_test_dataframe: list[pd.DataFrame] = [
    binarizar_categoricos(test[feature].values.tolist(), position)
    for position, feature in enumerate(columns[1:], start=1)
]

# Append the class label so the true value stays available next to the
# binarized features in the following steps.
binarized_test_dataframe.append(test["Class"])
binary_test = pd.concat(binarized_test_dataframe, axis=1)

print(binary_test)

      x1,1  x1,2  x1,3  x1,4  x1,5  x1,6 valor_real 1  x2,1  x2,2  x2,3  ...  \
0        1     0     0     0     0     0            f     1     0     0  ...   
1        0     1     0     0     0     0            x     0     1     0  ...   
2        0     1     0     0     0     0            x     0     1     0  ...   
3        0     1     0     0     0     0            x     1     0     0  ...   
4        0     1     0     0     0     0            x     1     0     0  ...   
...    ...   ...   ...   ...   ...   ...          ...   ...   ...   ...  ...   
2434     0     1     0     0     0     0            x     1     0     0  ...   
2435     0     0     0     1     0     0            k     1     0     0  ...   
2436     0     1     0     0     0     0            x     1     0     0  ...   
2437     1     0     0     0     0     0            f     1     0     0  ...   
2438     0     1     0     0     0     0            x     1     0     0  ...   

      valor_real 21 x22,1  x22,2  x22,3

In [16]:
# Strip the label and the "valor_real <i>" helper columns, leaving only the
# binary attributes, and turn the frame into a list of rows.
binary_test_cleaned = binary_test.drop(columns=["Class", *garbage]).values.tolist()

# Classify every test row with the learned specific hypothesis.
prediction: list[str] = [
    predicciones(sample, hipotesis_especifica=specific_hypotesis, asignacion=["e", "p"])
    for sample in binary_test_cleaned
]

print(prediction)

['e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'p', 'e', 'e', 'e', 'e', 'p', 'e', 'p', 'e', 'e', 'e', 'e', 'e', 'e', 'p', 'p', 'e', 'e', 'e', 'e', 'p', 'e', 'p', 'e', 'e', 'e', 'p', 'e', 'e', 'e', 'e', 'p', 'e', 'p', 'p', 'e', 'p', 'p', 'e', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'e', 'p', 'p', 'p', 'p', 'p', 'e', 'p', 'e', 'p', 'e', 'e', 'p', 'p', 'p', 'p', 'p', 'e', 'p', 'p', 'p', 'p', 'e', 'p', 'e', 'p', 'e', 'e', 'p', 'p', 'e', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'e', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'e', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'e', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'e', 'p', 'e', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'e', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'e', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'e', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'e', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p',

### The evaluation metrics

In [17]:
# Compare the predictions with the true test labels. The captured output shows
# the confusion-matrix counts (vp/fp/fn/vn) plus accuracy, precision, recall
# and F1. The ["e"] argument is presumably the list of labels treated as the
# positive class — confirm against probarHipotesis.
expected_labels = binary_test["Class"]
probarHipotesis(prediction, expected_labels, ["e"])

vp: 1468	fp: 314
fn: 439	 vn: 218
accuracy: 0.6912669126691267
precision: 0.8237934904601572
recall: 0.7697954902988988
f1: 0.7958796421794524


0