In [31]:
import pandas as pd

In [32]:
# from google.colab import drive
# drive.mount('/content/drive')

In [33]:
def get_domains(df):
    domains=[]
    for col in df:
      domains.append(list(df[col].unique()))
    return domains

In [34]:
def more_general(h1, h2):
    more_general_parts = []
    for x, y in zip(h1, h2):
        mg = x == "?" or (x != "0" and (x == y or y == "0"))
        more_general_parts.append(mg)
    return all(more_general_parts)

In [35]:
def min_generalizations(h, x):
    h_new = list(h)
    for i in range(len(h)):
        if not more_general( h[i:i+1],x[i:i+1]):
            h_new[i] = '?' if h[i] != '0' else x[i]
    return [tuple(h_new)]

In [36]:
def min_specializations(h, domains, x):
    results = []
    for i in range(len(h)):
        if h[i] == "?":
            for val in domains[i]:
                if x[i] != val:
                    h_new = h[:i] + (val,) + h[i+1:]
                    results.append(h_new)
        elif h[i] != "0":
            h_new = h[:i] + ('0',) + h[i+1:]
            results.append(h_new)
    return results

In [37]:
def generalize_S(x, G, S):
    S_prev = list(S)
    for s in S_prev:
        if s not in S:
            continue
        if not more_general(s, x):
            S.remove(s)
            Splus = min_generalizations(s, x)
            ## keep only generalizations that have a counterpart in G
            S.update([h for h in Splus if any([more_general(g,h) 
                                               for g in G])])
            ## remove hypotheses less specific than any other in S
            S.difference_update([h for h in S if 
                                 any([more_general(h, h1) 
                                      for h1 in S if h != h1])])
    return S

In [38]:
def specialize_G(x, domains, G, S):
    G_prev = list(G)
    for g in G_prev:
        if g not in G:
            continue
        if more_general(g, x):
            G.remove(g)
            Gminus = min_specializations(g, domains, x)
            ## keep only specializations that have a conuterpart in S
            G.update([h for h in Gminus if any([more_general(h, s)
                                                for s in S])])
            ## remove hypotheses less general than any other in G
            G.difference_update([h for h in G if 
                                 any([more_general(g1, h) 
                                      for g1 in G if h != g1])])
    return G

In [39]:
def candidate_elimination(df):
    domains = get_domains(df)[:-1]
    G = set([("?",)*len(domains)])
    S = set([('0',)*len(domains)])
    print(f"\n G[0]:{G}")
    print(f"\n S[0]:{S}")
    for index, row in df.iterrows():
        attribute,target=list(row[:-1]),row[-1] # Splitting data into attributes and decisions
        if target=='Yes': # x is positive example
            G = {g for g in G if more_general(g, attribute)}
            S = generalize_S(attribute, G, S)
        else: # x is negative example
            S = {s for s in S if not more_general(s, attribute)}
            G = specialize_G(attribute, domains, G, S)
        print("-"*100)
        print(f"\n G[{index+1}]:{G}")
        print(f"\n S[{index+1}]:{S}")
    return 

In [40]:
df=pd.read_excel('data.xlsx', sheet_name='Library')
df

Unnamed: 0,Citations,Size,InLibrary,Price,Editions,Class
0,Some,Small,No,Affordable,One,No
1,Many,Big,No,Expensive,Many,Yes
2,Many,Medium,No,Expensive,Few,Yes
3,Many,Small,No,Affordable,Many,Yes


In [41]:
get_domains(df)

[['Some', 'Many'],
 ['Small', 'Big', 'Medium'],
 ['No'],
 ['Affordable', 'Expensive'],
 ['One', 'Many', 'Few'],
 ['No', 'Yes']]

In [42]:
candidate_elimination(df)


 G[0]:{('?', '?', '?', '?', '?')}

 S[0]:{('0', '0', '0', '0', '0')}
----------------------------------------------------------------------------------------------------

 G[1]:{('?', 'Medium', '?', '?', '?'), ('?', '?', '?', '?', 'Few'), ('?', '?', '?', '?', 'Many'), ('Many', '?', '?', '?', '?'), ('?', 'Big', '?', '?', '?'), ('?', '?', '?', 'Expensive', '?')}

 S[1]:{('0', '0', '0', '0', '0')}
----------------------------------------------------------------------------------------------------

 G[2]:{('?', 'Big', '?', '?', '?'), ('Many', '?', '?', '?', '?'), ('?', '?', '?', 'Expensive', '?'), ('?', '?', '?', '?', 'Many')}

 S[2]:{('Many', 'Big', 'No', 'Expensive', 'Many')}
----------------------------------------------------------------------------------------------------

 G[3]:{('?', '?', '?', 'Expensive', '?'), ('Many', '?', '?', '?', '?')}

 S[3]:{('Many', '?', 'No', 'Expensive', '?')}
-----------------------------------------------------------------------------------------------

In [43]:
df=pd.read_excel('data.xlsx', sheet_name='Disease')
df

Unnamed: 0,Example,Shape,Size,Color,Surface,Thickness,Class
0,1,Circular,Large,Light,Smooth,Thick,Yes
1,2,Circular,Large,Light,Irregular,Thick,Yes
2,3,Oval,Large,Dark,Smooth,Thin,No
3,4,Oval,Large,Light,Irregular,Thick,Yes


In [44]:
get_domains(df)

[[1, 2, 3, 4],
 ['Circular', 'Oval'],
 ['Large'],
 ['Light', 'Dark'],
 ['Smooth', 'Irregular'],
 ['Thick', 'Thin'],
 ['Yes', 'No']]

In [45]:
candidate_elimination(df)


 G[0]:{('?', '?', '?', '?', '?', '?')}

 S[0]:{('0', '0', '0', '0', '0', '0')}
----------------------------------------------------------------------------------------------------

 G[1]:{('?', '?', '?', '?', '?', '?')}

 S[1]:{(1, 'Circular', 'Large', 'Light', 'Smooth', 'Thick')}
----------------------------------------------------------------------------------------------------

 G[2]:{('?', '?', '?', '?', '?', '?')}

 S[2]:{('?', 'Circular', 'Large', 'Light', '?', 'Thick')}
----------------------------------------------------------------------------------------------------

 G[3]:{('?', '?', '?', 'Light', '?', '?'), ('?', '?', '?', '?', '?', 'Thick'), ('?', 'Circular', '?', '?', '?', '?')}

 S[3]:{('?', 'Circular', 'Large', 'Light', '?', 'Thick')}
----------------------------------------------------------------------------------------------------

 G[4]:{('?', '?', '?', 'Light', '?', '?'), ('?', '?', '?', '?', '?', 'Thick')}

 S[4]:{('?', '?', 'Large', 'Light', '?', 'Thick')}


In [46]:
df=pd.read_excel('data.xlsx', sheet_name='Type')
df

Unnamed: 0,Size,Color,Shape,Class
0,Big,Red,Circle,No
1,Small,Red,Triangle,No
2,Small,Red,Circle,Yes
3,Big,Blue,Circle,No
4,Small,Blue,Circle,Yes


In [47]:
get_domains(df)

[['Big', 'Small'], ['Red', 'Blue'], ['Circle', 'Triangle'], ['No', 'Yes']]

In [48]:
candidate_elimination(df)


 G[0]:{('?', '?', '?')}

 S[0]:{('0', '0', '0')}
----------------------------------------------------------------------------------------------------

 G[1]:{('?', '?', 'Triangle'), ('?', 'Blue', '?'), ('Small', '?', '?')}

 S[1]:{('0', '0', '0')}
----------------------------------------------------------------------------------------------------

 G[2]:{('Small', '?', 'Circle'), ('?', 'Blue', '?'), ('Big', '?', 'Triangle')}

 S[2]:{('0', '0', '0')}
----------------------------------------------------------------------------------------------------

 G[3]:{('Small', '?', 'Circle')}

 S[3]:{('Small', 'Red', 'Circle')}
----------------------------------------------------------------------------------------------------

 G[4]:{('Small', '?', 'Circle')}

 S[4]:{('Small', 'Red', 'Circle')}
----------------------------------------------------------------------------------------------------

 G[5]:{('Small', '?', 'Circle')}

 S[5]:{('Small', '?', 'Circle')}


In [49]:
df=pd.read_excel('data.xlsx', sheet_name='Face')
df

Unnamed: 0,Eyes,Nose,Head,Fcolor,Hair,Class
0,Round,Triangle,Round,Purple,Yes,Yes
1,Square,Square,Square,Green,Yes,No
2,Square,Triangle,Round,Yellow,Yes,Yes
3,Round,Triangle,Round,Green,No,No
4,Square,Square,Round,Yellow,Yes,Yes


In [50]:
get_domains(df)

[['Round', 'Square'],
 ['Triangle', 'Square'],
 ['Round', 'Square'],
 ['Purple', 'Green', 'Yellow'],
 ['Yes', 'No'],
 ['Yes', 'No']]

In [51]:
candidate_elimination(df)


 G[0]:{('?', '?', '?', '?', '?')}

 S[0]:{('0', '0', '0', '0', '0')}
----------------------------------------------------------------------------------------------------

 G[1]:{('?', '?', '?', '?', '?')}

 S[1]:{('Round', 'Triangle', 'Round', 'Purple', 'Yes')}
----------------------------------------------------------------------------------------------------

 G[2]:{('?', 'Triangle', '?', '?', '?'), ('?', '?', 'Round', '?', '?'), ('?', '?', '?', 'Purple', '?'), ('Round', '?', '?', '?', '?')}

 S[2]:{('Round', 'Triangle', 'Round', 'Purple', 'Yes')}
----------------------------------------------------------------------------------------------------

 G[3]:{('?', '?', 'Round', '?', '?'), ('?', 'Triangle', '?', '?', '?')}

 S[3]:{('?', 'Triangle', 'Round', '?', 'Yes')}
----------------------------------------------------------------------------------------------------

 G[4]:{('?', '?', 'Round', '?', 'Yes'), ('?', 'Triangle', '?', '?', 'Yes')}

 S[4]:{('?', 'Triangle', 'Round', '?', 'Ye

In [52]:
df=pd.read_excel('data.xlsx', sheet_name='Cars')
df

Unnamed: 0,Origin,Manufacturer,Color,Decade,Type,Class
0,Japan,Honda,Blue,1980,Economy,Yes
1,Japan,Toyota,Green,1970,Sports,No
2,Japan,Toyota,Blue,1990,Economy,Yes
3,USA,Chrysler,Red,1980,Economy,No
4,Japan,Honda,White,1980,Economy,Yes
5,Japan,Toyota,Green,1980,Economy,Yes
6,Japan,Honda,Red,1990,Economy,No


In [53]:
get_domains(df)

[['Japan', 'USA'],
 ['Honda', 'Toyota', 'Chrysler'],
 ['Blue', 'Green', 'Red', 'White'],
 [1980, 1970, 1990],
 ['Economy', 'Sports'],
 ['Yes', 'No']]

In [54]:
candidate_elimination(df)


 G[0]:{('?', '?', '?', '?', '?')}

 S[0]:{('0', '0', '0', '0', '0')}
----------------------------------------------------------------------------------------------------

 G[1]:{('?', '?', '?', '?', '?')}

 S[1]:{('Japan', 'Honda', 'Blue', 1980, 'Economy')}
----------------------------------------------------------------------------------------------------

 G[2]:{('?', '?', 'Blue', '?', '?'), ('?', '?', '?', '?', 'Economy'), ('?', '?', '?', 1980, '?'), ('?', 'Honda', '?', '?', '?')}

 S[2]:{('Japan', 'Honda', 'Blue', 1980, 'Economy')}
----------------------------------------------------------------------------------------------------

 G[3]:{('?', '?', 'Blue', '?', '?'), ('?', '?', '?', '?', 'Economy')}

 S[3]:{('Japan', '?', 'Blue', '?', 'Economy')}
----------------------------------------------------------------------------------------------------

 G[4]:{('Japan', '?', '?', '?', 'Economy'), ('?', '?', 'Blue', '?', '?')}

 S[4]:{('Japan', '?', 'Blue', '?', 'Economy')}
-------------

In [55]:
df=pd.read_excel('data.xlsx', sheet_name='Day')
df

Unnamed: 0,sky,airtemp,humidity,wind,water,forcast,Class
0,sunny,warm,normal,strong,warm,same,Yes
1,sunny,warm,high,strong,warm,same,Yes
2,rainy,cold,high,strong,warm,change,No
3,sunny,warm,high,strong,cool,change,Yes


In [56]:
get_domains(df)

[['sunny', 'rainy'],
 ['warm', 'cold'],
 ['normal', 'high'],
 ['strong'],
 ['warm', 'cool'],
 ['same', 'change'],
 ['Yes', 'No']]

In [57]:
candidate_elimination(df)


 G[0]:{('?', '?', '?', '?', '?', '?')}

 S[0]:{('0', '0', '0', '0', '0', '0')}
----------------------------------------------------------------------------------------------------

 G[1]:{('?', '?', '?', '?', '?', '?')}

 S[1]:{('sunny', 'warm', 'normal', 'strong', 'warm', 'same')}
----------------------------------------------------------------------------------------------------

 G[2]:{('?', '?', '?', '?', '?', '?')}

 S[2]:{('sunny', 'warm', '?', 'strong', 'warm', 'same')}
----------------------------------------------------------------------------------------------------

 G[3]:{('?', 'warm', '?', '?', '?', '?'), ('?', '?', '?', '?', '?', 'same'), ('sunny', '?', '?', '?', '?', '?')}

 S[3]:{('sunny', 'warm', '?', 'strong', 'warm', 'same')}
----------------------------------------------------------------------------------------------------

 G[4]:{('?', 'warm', '?', '?', '?', '?'), ('sunny', '?', '?', '?', '?', '?')}

 S[4]:{('sunny', 'warm', '?', 'strong', '?', '?')}


In [58]:
import pandas as pd
df=pd.read_excel('data.xlsx', sheet_name='Cars')

print(f"\nThe total number of training instances are : {df.shape[0]}")
n_attribute = df.shape[1]-1

S = ['0']*n_attribute
print("\nThe initial Specific hypothesis is : ",S)
G = [['?' for i in range(n_attribute)] for i in range(n_attribute)]
print("\nThe initial general hypothesis is : ",  G)

for index, row in df.iterrows():
  if(row["Class"]=="No"):
    print (f"\nInstance {index+1} is {row.values} => Negative Instance")
    j=0
    for col_name,value in row.iteritems():
       if(j<n_attribute):
         if(S[j]!=value):
            G[j][j]=S[j]
         if(S[j]=='?'):
            G[j][j]='?'
       j+=1
    
  if(row["Class"]=='Yes'):
        print (f"\nInstance {index+1} is {row.values} => Positive Instance")
        j=0
        for col_name,value in row.iteritems():
            if(j<n_attribute):
              S[j]=value if (S[j] == '0' or S[j] == value) else '?'
              G[j][j]=value if (G[j][j] == value) else '?'
            j+=1
  print(f"S{index+1} = {S}\n") 
  print(f"G{index+1} = {G}\n") 

print('=' * 100)
G= list(filter(lambda x: x!=['?']*n_attribute , G))
print(f"S = {S}\n") 
print(f"G = {G}\n") 


The total number of training instances are : 7

The initial Specific hypothesis is :  ['0', '0', '0', '0', '0']

The initial general hypothesis is :  [['?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?']]

Instance 1 is ['Japan' 'Honda' 'Blue' 1980 'Economy' 'Yes'] => Positive Instance
S1 = ['Japan', 'Honda', 'Blue', 1980, 'Economy']

G1 = [['?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?']]


Instance 2 is ['Japan' 'Toyota' 'Green' 1970 'Sports' 'No'] => Negative Instance
S2 = ['Japan', 'Honda', 'Blue', 1980, 'Economy']

G2 = [['?', '?', '?', '?', '?'], ['?', 'Honda', '?', '?', '?'], ['?', '?', 'Blue', '?', '?'], ['?', '?', '?', 1980, '?'], ['?', '?', '?', '?', 'Economy']]


Instance 3 is ['Japan' 'Toyota' 'Blue' 1990 'Economy' 'Yes'] => Positive Instance
S3 = ['Japan', '?', 'Blue', '?', 'Economy']

G3 = [['?', '?', '?', '

In [59]:
import numpy as np
np.random.normal(scale=0.5, size=(4, 2))

array([[ 0.32534058,  0.51597727],
       [ 0.27721672,  0.11320288],
       [-0.0346035 ,  0.06238397],
       [-0.31888357,  0.21548955]])

In [60]:
np.random.normal(scale=0.5,size=(2,3))

array([[ 0.59820425, -0.737821  , -0.65441909],
       [ 0.17909421,  0.65801256,  0.20745868]])