# Classification Rules For COVID-19 dataset Using PRISM Algorithm

<h3>PRISM Algorithm Implementation</h3>

In [55]:
class Rule:
    def __init__(self, conditions=None, class_label=None, accuracy=0.0, coverage=0):
        self.conditions = conditions
        self.class_label = class_label
        self.accuracy = accuracy
        self.coverage = coverage
        
    def __str__(self):
        str_rep = "If "
        
        counter = 0
        for condition in self.conditions:
            if counter == (len(self.conditions) - 1):
                str_rep += (str(condition) + "=" + str(self.conditions[condition]))
            else:
                str_rep += (str(condition) + "=" + str(self.conditions[condition]) + " and ")
            
            counter += 1
        
        str_rep += (", then " + self.class_label) 
        str_rep += (". Accuracy = " + str(self.accuracy))
        str_rep += (". Coverage = " + str(self.coverage))
        
        return str_rep

In [1]:
def get_unique_class_label(rows):
    
    class_set = set()
    
    for row in rows:
        
        # get the last column
        class_label = row[-1]
        if class_label not in class_set:
            class_set.add(class_label)
            
    return list(class_set)
        

In [2]:
def copy_rows(rows):
    nested_lst = []
    for row in rows:
        inner_lst = []
        for each_data in row:
            inner_lst.append(each_data)
            
        nested_lst.append(inner_lst)
    return nested_lst

In [3]:
# get specific rows based on attribute index and value of that attribute
def get_specific_rows(rows, attr_indx, val):
    return [row for row in rows if row[attr_indx] == val]

In [63]:
# get attributes that have not been used by the rule
def get_remaining_attributes(attributes, rule_node):
    
    if rule_node is None:
        return attributes
     
    rule_dic = rule_node.conditions
    
    return [attribute for attribute in attributes if attribute not in rule_dic]

In [5]:
def get_unique_attribute_vals(rows, indx):
    attribute_val_set = set()
    lst = [row[indx] for row in rows]
    return list(set(lst))

In [6]:
def get_attribute_indx(attribute, attribute_lst):
    for indx, attr in enumerate(attribute_lst):
        if attribute == attr:
            return indx
    

In [140]:
def get_condition_based_rows(rows, rule_node, attr_lst):
    
    if rule_node is not None:
        rule_dic = rule_node.conditions

        covered_rows = copy_rows(rows)

        for key in rule_dic:
            indx = get_attribute_indx(key, attr_lst)

            covered_rows = [row for row in covered_rows if row[indx] == rule_dic[key]]

        return covered_rows

In [8]:
def calculate_accuracy(correct_rows, covered_rows):
    no_of_correct_class = len(correct_rows)
    no_of_covered_rows = len(covered_rows)
    
    # accuracy is between 0.0 and 1.0
    accuracy = no_of_correct_class / float(no_of_covered_rows)
    return accuracy, no_of_correct_class

In [134]:
def get_rule_with_highest_accuracy(possible_rules, accu_thresh, cov_thresh):
    
    max_acc_node = None
    
    highest_acc = 0
    highest_correctness = 0
    
    
    for possible_rule_node in possible_rules:
        if possible_rule_node is None:
            continue
        
        acc = possible_rule_node.accuracy
        correct_num = possible_rule_node.coverage
        
        if acc > highest_acc:
            highest_acc = acc
            highest_correctness = correct_num
            
            max_acc_node = possible_rule_node
            
        elif acc == highest_acc:
            if correct_num > highest_correctness:
                highest_correctness = correct_num
                
                max_acc_node = possible_rule_node
                
    return max_acc_node
        
    
    '''
    rule_lst = []
    
    highest_acc = 0
    highest_correctness = 0
    
    for rule in possible_rules:
        acc = rule[2]
        correct_num = rule[3]
        
         
        if acc > highest_acc:
            highest_acc = acc
            highest_correctness = correct_num
            
            if len(rule_lst) == 0:
                rule_lst.append(rule)
            else:
                rule_lst[0] = rule
                
        elif acc == highest_acc:
            if correct_num > highest_correctness:
                highest_correctness = correct_num
                
                if len(rule_lst) == 0:
                    rule_lst.append(rule)
                else:
                    rule_lst[0] = rule
                    
    return rule_lst[0]
    '''
        
        

In [122]:
# remove rows which are covered by the rules
def get_remaining_rows(rows, covered_rows):
    return [row for row in rows if row not in covered_rows]



In [31]:
def learn_one_rule_testing(rows, class_labels, attributes, accuracy, coverage):
    
    # mapping attribute to index
    attribute_indx_dic = {}
    counter = 0
    for attribute in attributes:
        attribute_indx_dic[attribute] = counter
        counter += 1
    
    # for each class label
    for class_label in class_labels:
         
        # get rows with that class label
        # -1 refers to index of value of class label
        specific_rows = get_specific_rows(rows, -1, class_label)
        
        if len(specific_rows) == 0:
            continue
        
        rows_covered_by_rule = copy_rows(rows)
            
        
        rule_lst = []
        
       
    
        while True:
            all_possible_rules = []
            
            # remaining attributes that haven't been used in the rule
            remaining_attributes = get_remaining_attributes(attributes, rule_lst)


            for attr in remaining_attributes:
                indx = attribute_indx_dic[attr]
                
                # get unique attribute values
                remaining_attr_vals = get_unique_attribute_vals(rows_covered_by_rule, indx)

                for attr_val in remaining_attr_vals:

                    # note: both attribute and values of attribute results in condition
                    # eg. If 'Outlook' = 'Sunny' 

                    # get rows based on condition
                    # Eg. if 'Outlook' = 'Sunny'
                    # these are rows that satisfy only the condition
                    only_condition_rows = get_condition_based_rows(rows_covered_by_rule, [{attr: attr_val}], attributes)

                   
                    # get rows based on condition and class label
                    # Eg. if 'Outlook' = 'Sunny', then 'Yes'
                    # these are rows that satisfy both condition and class label
                    condition_class_label_rows = [row for row in only_condition_rows if row[-1] == class_label]

                    print("condition with that class rows for {} with val={}: ".format(attr,attr_val))
                    print(only_condition_rows)
                    print("\n")
                    
    
                    
                    # calculate accuracy
                    calculated_acc, correct_num = calculate_accuracy(condition_class_label_rows, only_condition_rows)

                        
                    all_possible_rules.append([{attr: attr_val}, class_label , calculated_acc, correct_num])

            print("all possible rules")
            print(all_possible_rules)
            
            highest_accuracy_rule = get_rule_with_highest_accuracy(all_possible_rules)

            
                
            
            print("highest accuracy rule: ")
            print(highest_accuracy_rule)

            # add rule info to rule_lst
            if len(rule_lst) == 0:
                rule_lst = highest_accuracy_rule
            else:
                # if rule info has been added before, then just update

                rule_dic = rule_lst[0]

                # update condition dictionary
                another_rule_dic = highest_accuracy_rule[0]
                for key in another_rule_dic:
                    attri_value = another_rule_dic[key]
                    rule_lst[0][key] = attri_value

                # update accuracy and correctness
                rule_lst[2] = highest_accuracy_rule[2]
                rule_lst[3] = highest_accuracy_rule[3]

            print('rules list: ')
            print(rule_lst)
            print('\n')
            rows_covered_by_rule = get_condition_based_rows(rows_covered_by_rule, rule_lst, attributes)
            
            if highest_accuracy_rule[2] == 1.0 or highest_accuracy_rule[3] < 3:
                break
                
        remaining_rows = get_remaining_rows(rows, rows_covered_by_rule)
        
        return remaining_rows, rule_lst
        
        
    
    

In [160]:
def learn_one_rule(rows, class_labels, attributes, accuracy_thresh, coverage_thresh):
    
    # mapping attribute to index
    attribute_indx_dic = {}
    counter = 0
    for attribute in attributes:
        attribute_indx_dic[attribute] = counter
        counter += 1
        
    
    rows_covered_by_rule = copy_rows(rows)
    
    highest_node = None
    
    while True:
    
        best_classes_rule_lst = []

        for class_label in class_labels:
            
            if highest_node is not None and highest_node.class_label != class_label:
                continue

            # remaining attributes that haven't been used in the rule
            remaining_attributes = get_remaining_attributes(attributes, highest_node)

            # this stores all the nodes generated from considering each attribute for each class label
            possible_rules = []


            # go over all the remaining attributes
            for attr in remaining_attributes:

                indx = attribute_indx_dic[attr]

                # get unique attribute values such as 'Sunny', Rainy, ...
                remaining_attr_vals = get_unique_attribute_vals(rows_covered_by_rule, indx)


                for attr_val in remaining_attr_vals:
                    # note: both attribute and values of attribute results in condition
                    # eg. If 'Outlook' = 'Sunny' 

                    # create a rule node for each attribute values
                    potential_rule_node = Rule(conditions={attr: attr_val}, class_label=class_label)


                    # Simply, give me rows which satisfy the condition
                    # eg. give me rows that has Outlook = Sunny
                    only_condition_rows = get_condition_based_rows(rows_covered_by_rule, potential_rule_node, attributes)


                    # Simply, give me rows which satisfy both the condition and the class label
                    # Eg. give me rows that has 'Outlook' = 'Sunny' and class label = 'Yes'
                    # these are rows that satisfy both condition and class label
                    condition_class_label_rows = [row for row in only_condition_rows if row[-1] == class_label]


                    #print("for {}: {} accuracy".format(attr,attr_val))
                    #print("correct: ", len(condition_class_label_rows))
                    #print("covered: ", len(only_condition_rows))
                    
                    # calculate accuracy
                    calculated_acc, correct_num = calculate_accuracy(condition_class_label_rows, only_condition_rows)
                    
                    if correct_num >= coverage_thresh:
                        potential_rule_node.accuracy = calculated_acc
                        potential_rule_node.coverage = correct_num
                    
                        possible_rules.append(potential_rule_node)
        
            
            if len(possible_rules) == 0:
                continue
            
            
            # get the rule with highest accuracy among the same class label
            highest_accuracy_rule = get_rule_with_highest_accuracy(possible_rules, accuracy_thresh, coverage_thresh)

            best_classes_rule_lst.append(highest_accuracy_rule)
            
        

            # update the highest_node
            if highest_node is not None:

                for key in highest_accuracy_rule.conditions:
                    attri_value = highest_accuracy_rule.conditions[key]

                    highest_node.conditions[key] = attri_value

                highest_node.accuracy = highest_accuracy_rule.accuracy
                highest_node.coverage = highest_accuracy_rule.coverage
            
        if len(best_classes_rule_lst) == 0:
            return None, None
        
        if highest_node is None:
            # this node is the best among other nodes including nodes from other class labels
            highest_node = get_rule_with_highest_accuracy(best_classes_rule_lst, accuracy_thresh, coverage_thresh)
        
        rows_covered_by_rule = get_condition_based_rows(rows_covered_by_rule, highest_node, attributes)

        if highest_node.accuracy == accuracy_thresh or highest_node.coverage < coverage_thresh:
            break
            
    
    remaining_rows = get_remaining_rows(rows, rows_covered_by_rule)
        
    return remaining_rows, highest_node
    
    
   
        
        
    
    
    

In [161]:
def PRISM_algorithm(rows, col_lst, accuracy_thresh=None, coverage_thresh=None):
    
    # copy all the data to another list 
    # so that when we remove rows, it will not affect original lst
    # here, remaining rows will be equal to original rows
    remaining_rows = copy_rows(rows)
    
    # eg. ['Yes', 'No'] or ['alive', 'dead']
    class_labels = get_unique_class_label(rows)
    
    # list of rule nodes 
    rules_lst = []
    
    # exclude the class label
    attributes = col_lst[:-1]
    
    # remaining_rows is not empty yet so learn a rule
    while len(remaining_rows) != 0:
        
        
        remaining_rows, rule_node = learn_one_rule(remaining_rows, class_labels, attributes, accuracy_thresh, coverage_thresh)
        
        
        #print("Remaining: ")
        #print(remaining_rows)
        
        if rule_node is None:
            break
        
        
        
        # add tup to rules
        rules_lst.append(rule_node)
        
    return rules_lst

<h5>Testing PRISM Algorithm using small dataset: weather dataset</h5>

In [13]:
col_lst = ['Outlook', 'Temp', 'Humidity', 'Windy', 'Play']

data_rows = [
    ['Sunny', 'Hot', 'High', 'False', 'No'],
    ['Sunny', 'Hot', 'High', 'True', 'No'],
    ['Overcast', 'Hot', 'High', 'False', 'Yes'],
    ['Rainy', 'Mild', 'High', 'False', 'Yes'],
    ['Rainy', 'Cool', 'Normal', 'False', 'Yes'],
    ['Rainy', 'Cool', 'Normal', 'True', 'No'],
    ['Overcast', 'Cool', 'Normal', 'True', 'Yes'],
    ['Sunny', 'Mild', 'High', 'False', 'No'],
    ['Sunny', 'Cool', 'Normal', 'False', 'Yes'],
    ['Rainy', 'Mild', 'Normal', 'False', 'Yes'],
    ['Sunny', 'Mild', 'Normal', 'True', 'Yes'],
    ['Overcast', 'Mild', 'High', 'True', 'Yes'],
    ['Overcast', 'Hot', 'Normal', 'False', 'Yes'],
    ['Rainy', 'Mild', 'High', 'True', 'No']
]

In [41]:
# eg of rule
conditions_demo = {'Outlook': 'Overcast'}

rule_demo = Rule(conditions=conditions_demo, class_label='Yes',accuracy=1.0,coverage=4)
print(rule_demo)
    

If Outlook=Overcast, then Yes. Accuracy = 1.0. Coverage = 4


In [91]:
print("getting attributes that are not used in the rule")

test_rule_node = Rule({'Humidity': 'normal'}, 'Yes', 1.0, 0.29)
print(get_remaining_attributes(col_lst[:-1], test_rule_node))
print("\n")

getting attributes that are not used in the rule
['Outlook', 'Temp', 'Windy']




In [123]:
print("Getting remaining rows after first rule: if outlook = overcast, then yes")
test_rule_node = Rule({'Outlook': 'Overcast'}, 'Yes', 1.0, 4)
test_covered_rows = get_condition_based_rows(data_rows, test_rule_node, col_lst[:-1])

#test_yes_class_rows = get_specific_rows(data_rows, -1, 'Yes')
#test_covered_rows = get_condition_based_rows(test_yes_class_rows, test_rule, col_lst[:-1])
rem_data_testing = get_remaining_rows(data_rows, test_covered_rows)
print(rem_data_testing)
print("\n")

print("Getting remaining rows after second rule: if humidity = normal and windy = false, then yes")
test_rule_node = Rule({'Humidity': 'Normal', 'Windy': 'False'}, 'Yes', 1.0, 3)
test_covered_rows = get_condition_based_rows(rem_data_testing, test_rule_node, col_lst[:-1])
rem_data_testing = get_remaining_rows(rem_data_testing, test_covered_rows)
print(rem_data_testing)
print("\n")

print("Getting remaining rows after third rule: if humidity = high and outlook = sunny, then no")
test_rule_node = Rule({'Humidity': 'High', 'Outlook': 'Sunny'}, 'No', 1.0, 3)
test_covered_rows = get_condition_based_rows(rem_data_testing, test_rule_node, col_lst[:-1])
rem_data_testing = get_remaining_rows(rem_data_testing, test_covered_rows)
print(rem_data_testing)
print("\n")


Getting remaining rows after first rule: if outlook = overcast, then yes
[['Sunny', 'Hot', 'High', 'False', 'No'], ['Sunny', 'Hot', 'High', 'True', 'No'], ['Rainy', 'Mild', 'High', 'False', 'Yes'], ['Rainy', 'Cool', 'Normal', 'False', 'Yes'], ['Rainy', 'Cool', 'Normal', 'True', 'No'], ['Sunny', 'Mild', 'High', 'False', 'No'], ['Sunny', 'Cool', 'Normal', 'False', 'Yes'], ['Rainy', 'Mild', 'Normal', 'False', 'Yes'], ['Sunny', 'Mild', 'Normal', 'True', 'Yes'], ['Rainy', 'Mild', 'High', 'True', 'No']]


Getting remaining rows after second rule: if humidity = normal and windy = false, then yes
[['Sunny', 'Hot', 'High', 'False', 'No'], ['Sunny', 'Hot', 'High', 'True', 'No'], ['Rainy', 'Mild', 'High', 'False', 'Yes'], ['Rainy', 'Cool', 'Normal', 'True', 'No'], ['Sunny', 'Mild', 'High', 'False', 'No'], ['Sunny', 'Mild', 'Normal', 'True', 'Yes'], ['Rainy', 'Mild', 'High', 'True', 'No']]


Getting remaining rows after third rule: if humidity = high and outlook = sunny, then no
[['Rainy', 'Mild',

In [162]:
print("----------Testing PRISM Algorithm on weather dataset----------")
test_weather_rules = PRISM_algorithm(rows=data_rows, col_lst=col_lst, accuracy_thresh=1.0, coverage_thresh=3)

for rule in test_weather_rules:
    print(rule)

----------Testing PRISM Algorithm on weather dataset----------
If Outlook=Overcast, then Yes. Accuracy = 1.0. Coverage = 4
If Humidity=Normal and Windy=False, then Yes. Accuracy = 1.0. Coverage = 3
If Humidity=High and Outlook=Sunny, then No. Accuracy = 1.0. Coverage = 3


<h3>Applying the PRISM Algorithm on COVID-19 dataset</h3>

In this dataset we have the following attributes:
1. sex: 1 -woman, 2-man
2. age: numeric
3. diabetes: yes/no
4. copd (chronic obstructive pulmonary disease): yes/no
5. asthma: yes/no
6. imm_supr (suppressed immune system): yes/no
7. hypertension: yes/no
8. cardiovascular: yes/no
9. renal_chronic: yes/no
10. tobacco: yes/no	
11. outcome: alive/dead

In [163]:
data_file = "../../data_ml_2020/covid_categorical_good.csv"

In [164]:
import pandas as pd
data = pd.read_csv(data_file)
data = data.dropna(how="any")
data.columns

Index(['sex', 'age', 'diabetes', 'copd', 'asthma', 'imm_supr', 'hypertension',
       'cardiovascular', 'obesity', 'renal_chronic', 'tobacco', 'outcome'],
      dtype='object')

In [165]:
data_rows = data.to_numpy().tolist()
len(data_rows)

219179

In [166]:
columns_list = data.columns.to_numpy().tolist()
print(columns_list)

['sex', 'age', 'diabetes', 'copd', 'asthma', 'imm_supr', 'hypertension', 'cardiovascular', 'obesity', 'renal_chronic', 'tobacco', 'outcome']


<h4>Summary of the rules discovered from the dataset</h4>