In [32]:
## Imports
import numpy as np
import pandas as pd



## Functions
def load_data(datafile):
    """Load a comma-separated text file into a 2-D array of strings.

    Older NumPy releases running on Python 3 returned every field as the
    repr of a bytes object (for example ``"b'p'"``). That wrapper is removed
    when it is present; fields that are already plain text are left alone,
    so the function works on both old and new NumPy versions.

    Args:
        datafile: Path (or file-like object) accepted by ``np.loadtxt``.

    Returns:
        A 2-D ``numpy.ndarray`` of strings, one row per line of the file.
    """
    # ndmin=2 keeps a single-line file two-dimensional, so callers can
    # always rely on ``data.shape`` having two entries.
    data = np.loadtxt(datafile, dtype=str, delimiter=',', ndmin=2)
    for position, raw in np.ndenumerate(data):
        text = str(raw)
        has_bytes_wrapper = (
            len(text) >= 3
            and text[0] == 'b'
            and text[1] in ('"', "'")
            and text[-1] == text[1]
        )
        if has_bytes_wrapper:
            # Drop the leading b' (or b") and the closing quote.
            data[position] = text[2:-1]
    return data

def count_missing(data):
    """Count the '?' placeholders in every row and in every column.

    Returns a pair of lists: the first holds one count per row of `data`,
    the second one count per column.
    """
    per_row = [
        np.count_nonzero(data[r, :] == '?') for r in range(data.shape[0])
    ]
    per_column = [
        np.count_nonzero(data[:, c] == '?') for c in range(data.shape[1])
    ]
    return per_row, per_column

def remove_missing(data):
    """Return a copy of `data` without the rows that contain a '?' entry."""
    per_row, _ = count_missing(data)
    # Indices of the rows that have at least one missing value.
    incomplete = [
        index for index, n_missing in enumerate(per_row) if n_missing > 0
    ]
    return np.delete(data, incomplete, axis=0)

def unique_vals(data):
    """Return, per column of `data`, how many distinct values it contains.

    The result is a 1-D array with one count per column.
    """
    _, n_features = data.shape
    counts = []
    for col in range(n_features):
        seen = []
        for value in data[:, col]:
            # Equality-based membership test, so values need not be hashable.
            if value not in seen:
                seen.append(value)
        counts.append(len(seen))
    return np.array(counts)

# Symbol tables for the mushroom features, keyed by column index. The numeric
# code of a symbol is its position inside the string.
_FEATURE_SYMBOLS = {
    0: 'pe',             # edibility
    1: 'cbxkfs',         # cap-shape
    2: 'gfys',           # cap-surface
    3: 'epuncbywgr',     # cap-color
    4: 'tf',             # bruises
    5: 'nalcyfmps',      # odor
    6: 'danf',           # gill-attachment
    7: 'cwd',            # gill-spacing
    8: 'bn',             # gill-size
    9: 'epukhnboywgr',   # gill-color
    10: 'et',            # stalk-shape
    11: 'rzecbu',        # stalk-root
    12: 'fyks',          # stalk-surface-above-ring
    13: 'fyks',          # stalk-surface-below-ring
    14: 'pencobywg',     # stalk-color-above-ring
    15: 'pencobywg',     # stalk-color-below-ring
    16: 'pu',            # veil-type
    17: 'noyw',          # veil-color
    18: 'not',           # ring-number
    19: 'nczepfls',      # ring-type
    20: 'ukhnboywr',     # spore-print-color
    21: 'yasvnc',        # population
    22: 'lmgdpuw',       # habitat
}


def transform_data(data):
    """Transform the linguistic (letter-coded) data into numeric data.

    Every entry is looked up in the symbol table of its column. Entries that
    are not in the table (or that sit in a column without a table) must
    already be parseable as a number and are converted with ``float``.

    The result is written to a new float array. `data` itself is not
    modified, which also avoids two-digit codes being cut off by a
    narrow fixed-width string array.

    Args:
        data: 2-D array of strings with the features in the column order of
            the mushroom data set (edibility first).

    Returns:
        A float ``numpy.ndarray`` with the same shape as `data`.

    Raises:
        ValueError: If an entry is neither a known symbol for its column nor
            a number (for example the missing-value marker '?').
    """
    M, N = data.shape
    lookups = {
        column: {symbol: float(code) for code, symbol in enumerate(symbols)}
        for column, symbols in _FEATURE_SYMBOLS.items()
    }
    numeric = np.empty((M, N), dtype=float)
    for n in range(N):
        codes = lookups.get(n, {})
        for m in range(M):
            value = str(data[m, n])
            if value in codes:
                numeric[m, n] = codes[value]
                continue
            try:
                numeric[m, n] = float(value)
            except ValueError:
                raise ValueError(
                    'cannot convert value {!r} at row {}, column {} to a '
                    'number'.format(value, m, n)
                ) from None
    return numeric


## Main Program
# --- Load and inspect the raw data ---------------------------------------
mushroom = load_data('mushroom_data.txt')
M,N = mushroom.shape
print('This is what the mushroom data set looks like:')
print(mushroom)
print('The rows represent the',mushroom.shape[0],'cases and the columns represent the',mushroom.shape[1],
      'different features.')

print()
print()

# --- Drop every case that has a missing ('?') value ------------------------
print('We want to check for each features how many values there are missing. Lets see:')
_ , missing_in_columns = count_missing(mushroom)
print(missing_in_columns)
print('The data is incomplete for the 11th feature. Lets get rid of all cases with missing values.')
cleaned_mushroom = remove_missing(mushroom)
print('We are now left with',cleaned_mushroom.shape[0],'different cases and ofcourse still',cleaned_mushroom.shape[1],
     'features.')
print(cleaned_mushroom)
print()
print()

# --- Encode the letters as numbers -----------------------------------------
print('We want to find out how each feature is correlated to the edibility feature of a mushroom. In order to find out, we have to transform the linguistic data into a numeric data set. The values for each value of a feature was determined beforehand.')
# Work on a copy so cleaned_mushroom keeps its letter encoding.
copy_cleaned_mushroom = np.copy(cleaned_mushroom)
transformed_mushroom = transform_data(copy_cleaned_mushroom)
print('This is what the transformed data set looks like:')
print(transformed_mushroom)

print()
print()

# --- Kendall correlation of every feature with edibility -------------------
print('Now we can use the transformed data to generate a correlation matrix.')
# Column names in the column order of the data set. Column 9 is the
# gill-color feature (it was previously mislabelled 'gill-odor').
feature_names = [
    'edibility',
    'cap-shape',
    'cap-surface',
    'cap-color',
    'bruises',
    'odor',
    'gill-attachment',
    'gill-spacing',
    'gill-size',
    'gill-color',
    'stalk-shape',
    'stalk-root',
    'stalk-surface-above-ring',
    'stalk-surface-below-ring',
    'stalk-color-above-ring',
    'stalk-color-below-ring',
    'veil-type',
    'veil-color',
    'ring-number',
    'ring-type',
    'spore-print-color',
    'population',
    'habitat',
]
correlation_matrix = pd.DataFrame(
    transformed_mushroom[:, :len(feature_names)], columns=feature_names
).corr(method='kendall')
print('As we are only interested in the correlation of each feature with respect to the edibility, we will only display that column of the matrix:')
print(correlation_matrix[['edibility']])

print()
print()

print('Notable is the NaN value for the veil-type feature. Lets check the number of values each feature takes (after the data was cleaned and transformed).')
print(unique_vals(transformed_mushroom))
print('We observe that the 16th feature, which is the veil-type feature, takes only one value. This explains the NaN value!')

print()
print()

# --- Keep only the selected features ---------------------------------------
print('Based upon the result from the correlation matrix and what was published in the presented papers, it is decided that the following features are selected to continue:')
print('- odor')
print('- ring-type')
print('- stalk-shape')
print('- habitat')
print('- gill-size')
print('The other features are removed from the data and we are left with the following data set (edibility included in the first column):')

edibility = transformed_mushroom[:,0]
odor = transformed_mushroom[:,5]
gill_size = transformed_mushroom[:,8]
stalk_shape = transformed_mushroom[:,10]
ring_type = transformed_mushroom[:,19]
habitat = transformed_mushroom[:,22]
selected_features = np.column_stack((edibility,odor,gill_size,stalk_shape,ring_type,habitat))

print(selected_features)
print('We are looking at a set of',selected_features.shape[0],'rules. As you can see, the last two columns represent rules that are the same. We want to get rid of double rules.')

# --- Remove exact duplicates -----------------------------------------------
# lexsort uses the LAST key as the primary sort key, so the rows end up
# ordered by habitat first and edibility last. Identical rows become
# neighbours, and a row is kept only if it differs from its predecessor.
sorted_idx = np.lexsort(selected_features.T)
sorted_data =  selected_features[sorted_idx,:]
row_mask = np.append([True],np.any(np.diff(sorted_data,axis=0),1))
doubles_removed = sorted_data[row_mask]
print(doubles_removed)

print('If we remove all double rules (and ofcourse keeping one of them) we end up with',doubles_removed.shape[0],'rules.')
print('We now observe that the first two rules contradict eachother (the same features result in different edibility measures).')
print('Contradictory rules must be removed.')


# --- Check for contradictory rules -----------------------------------------
# Sort the de-duplicated rules on the feature columns only and keep one row
# per distinct feature combination. If contra_check has fewer rows than
# doubles_removed, some feature combinations occur with both edibility
# values, i.e. there are contradictory rules.
sorted_idx = np.lexsort(doubles_removed[:,1:].T)
sorted_data =  doubles_removed[sorted_idx,:]
row_mask = np.append([True],np.any(np.diff(sorted_data[:,1:],axis=0),1))
contra_check = sorted_data[row_mask]
#print(doubles_removed.shape[0] - contra_check.shape[0])

print('Apparently, only the first two rules contradict eachother so we only have to remove those two.')
# NOTE(review): dropping exactly the first two rows is specific to this data
# set; verify against contra_check whenever the input data changes.
final_rules = doubles_removed[2:,:]

# Symbols of the selected features, in the column order of the reduced rule
# set. The numeric code of a symbol is its position inside the string.
_SELECTED_FEATURE_SYMBOLS = (
    'pe',          # edibility
    'nalcyfmps',   # odor
    'bn',          # gill-size
    'et',          # stalk-shape
    'nczepfls',    # ring-type
    'lmgdpuw',     # habitat
)


def transform_back(data):
    """Convert the numeric rule set back to its letter encoding.

    The columns of `data` are expected in the order edibility, odor,
    gill-size, stalk-shape, ring-type, habitat. Values without a matching
    symbol, and any columns beyond the sixth, keep the string form of their
    number.

    Args:
        data: 2-D numeric array of encoded rules.

    Returns:
        A new array of strings with the same shape as `data`; `data` is not
        modified.
    """
    M, N = data.shape
    converted_data = data.astype(str)
    for n, symbols in enumerate(_SELECTED_FEATURE_SYMBOLS[:N]):
        column = data[:, n]
        for code, symbol in enumerate(symbols):
            # Select on the numeric input rather than the partly converted
            # output, so an already written letter can never match a code.
            converted_data[column == code, n] = symbol
    return converted_data
    
# Show the surviving rules in their letter encoding, followed by a summary.
print(transform_back(final_rules))
print('We end up with',len(transform_back(final_rules)),'final rules, which will going to be implemented in the MatLab Fuzzy Toolbox.')

# Legend for the columns of the rule table printed above.
print('Note that the column order of the features is as follows:',
      'edibility, odor, gill-size, stalk-shape, ring-type, habitat',
      sep='\n')

This is what the mushroom data set looks like:
[['p' 'x' 's' ..., 'k' 's' 'u']
 ['e' 'x' 's' ..., 'n' 'n' 'g']
 ['e' 'b' 's' ..., 'n' 'n' 'm']
 ..., 
 ['e' 'f' 's' ..., 'b' 'c' 'l']
 ['p' 'k' 'y' ..., 'w' 'v' 'l']
 ['e' 'x' 's' ..., 'o' 'c' 'l']]
The rows represent the 8124 cases and the columns represent the 23 different features.


We want to check for each features how many values there are missing. Lets see:
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2480, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
The data is incomplete for the 11th feature. Lets get rid of all cases with missing values.
We are now left with 5644 different cases and ofcourse still 23 features.
[['p' 'x' 's' ..., 'k' 's' 'u']
 ['e' 'x' 's' ..., 'n' 'n' 'g']
 ['e' 'b' 's' ..., 'n' 'n' 'm']
 ..., 
 ['e' 'x' 'y' ..., 'w' 'y' 'p']
 ['p' 'x' 'y' ..., 'w' 'c' 'd']
 ['p' 'f' 'y' ..., 'w' 'c' 'd']]


We want to find out how each feature is correlated to the edibility feature of a mushroom. In order to find out, we have to transform the lingu