# Classification Rules For COVID-19 dataset Using PRISM Algorithm

<h3>PRISM Algorithm Implementation</h3>

In [7]:
def get_unique_class_label(rows):
    
    class_set = set()
    
    for row in rows:
        
        # get the last column
        class_label = row[-1]
        if class_label not in class_set:
            class_set.add(class_label)
            
    return list(class_set)
        

In [66]:
def copy_rows(rows):
    nested_lst = []
    for row in rows:
        inner_lst = []
        for each_data in row:
            inner_lst.append(each_data)
            
        nested_lst.append(inner_lst)
    return nested_lst

In [32]:
# get specific rows based on attribute index and value of that attribute
def get_specific_rows(rows, attr_indx, val):
    return [row for row in rows if row[attr_indx] == val]

In [153]:
# get attributes that have not been used by the rule
def get_remaining_attributes(attributes, rule):
    if len(rule) == 0:
        return attributes
    
    rule_dic = rule[0]
    
    return [attribute if attribute not in rule_dic else None for attribute in attributes]
        

In [40]:
def get_unique_attribute_vals(rows, indx):
    attribute_val_set = set()
    lst = [row[indx] for row in rows]
    return list(set(lst))

In [70]:
def get_attribute_indx(attribute, attribute_lst):
    for indx, attr in enumerate(attribute_lst):
        if attribute == attr:
            return indx
    

In [83]:
def get_condition_based_rows(rows, rule, attr_lst):
    rule_dic = rule[0]
    
    covered_rows = copy_rows(rows)
    
    for key in rule_dic:
        indx = get_attribute_indx(key, attr_lst)
        
        covered_rows = [row for row in covered_rows if row[indx] == rule_dic[key]]
        
    return covered_rows

In [54]:
def calculate_accuracy(correct_rows, covered_rows):
    no_of_correct_class = len(correct_rows)
    no_of_covered_rows = len(covered_rows)
    
    # accuracy is between 0.0 and 1.0
    accuracy = no_of_correct_class / float(no_of_covered_rows)
    return accuracy, no_of_correct_class

In [109]:
def get_rule_with_highest_accuracy(possible_rules):
    rule_lst = []
    
    highest_acc = 0
    highest_correctness = 0
    
    for rule in possible_rules:
        acc = rule[2]
        correct_num = rule[3]
        
         
        if acc > highest_acc:
            highest_acc = acc
            highest_correctness = correct_num
            
            if len(rule_lst) == 0:
                rule_lst.append(rule)
            else:
                rule_lst[0] = rule
                
        elif acc == highest_acc:
            if correct_num > highest_correctness:
                highest_correctness = correct_num
                
                if len(rule_lst) == 0:
                    rule_lst.append(rule)
                else:
                    rule_lst[0] = rule
                    
    return rule_lst[0]
        
        

In [85]:
# remove rows which are covered by the rules
def get_remaining_rows(rows, covered_rows):
    return [row for row in rows if row not in covered_rows]
    

In [156]:
def learn_one_rule(rows, class_labels, attributes, accuracy, coverage):
    
    # for each class label
    for class_label in class_labels:
         
        # get rows with that class label
        # -1 refers to index of value of class label
        specific_rows = get_specific_rows(rows, -1, class_label)
        
        if len(specific_rows) == 0:
            continue
        
        rows_covered_by_rule = copy_rows(rows)
            
        
        rule_lst = []
        
       
    
        while True:
            all_possible_rules = []
            
            # remaining attributes that haven't been used in the rule
            remaining_attributes = get_remaining_attributes(attributes, rule_lst)


            for indx, attr in enumerate(remaining_attributes):
                
                # get unique attribute values
                remaining_attr_vals = get_unique_attribute_vals(rows_covered_by_rule, indx)

                for attr_val in remaining_attr_vals:

                    # note: both attribute and values of attribute results in condition
                    # eg. If 'Outlook' = 'Sunny' 

                    # get rows based on condition
                    # Eg. if 'Outlook' = 'Sunny'
                    # these are rows that satisfy only the condition
                    only_condition_rows = get_condition_based_rows(rows_covered_by_rule, [{attr: attr_val}], attributes)

                    print("only condition rows for {} with val={}: ".format(attr,attr_val))
                    print(only_condition_rows)
                    print("\n")

                    # get rows based on condition and class label
                    # Eg. if 'Outlook' = 'Sunny', then 'Yes'
                    # these are rows that satisfy both condition and class label
                    condition_class_label_rows = get_condition_based_rows(specific_rows, [{attr: attr_val}], attributes)

                    # calculate accuracy
                    calculated_acc, correct_num = calculate_accuracy(condition_class_label_rows, only_condition_rows)


                    all_possible_rules.append([{attr: attr_val}, class_label , calculated_acc, correct_num])

            highest_accuracy_rule = get_rule_with_highest_accuracy(all_possible_rules)

            print("highest accuracy rule: ")
            print(highest_accuracy_rule)

            # add rule info to rule_lst
            if len(rule_lst) == 0:
                rule_lst = highest_accuracy_rule
            else:
                # if rule info has been added before, then just update

                rule_dic = rule_lst[0]

                # update condition dictionary
                another_rule_dic = highest_accuracy_rule[0]
                for key in another_rule_dic:
                    attri_value = another_rule_dic[key]
                    rule_lst[0][key] = attri_value

                # update accuracy and correctness
                rule_lst[2] = highest_accuracy_rule[2]
                rule_lst[3] = highest_accuracy_rule[3]

            print('rules list: ')
            print(rule_lst)
            print('\n')
            rows_covered_by_rule = get_condition_based_rows(rows_covered_by_rule, rule_lst, attributes)
            
            if highest_accuracy_rule[2] == 1.0 or highest_accuracy_rule[3] < 3:
                break
                
                
        remaining_rows = get_remaining_rows(rows, rows_covered_by_rule)
        
        return remaining_rows, rule_lst
        
        
    
    

In [134]:
def PRISM_algorithm(rows, col_lst, accuracy=None, coverage=None):
    
    # copy all the data to another list 
    # so that when we remove rows, it will not affect original lst
    # here, remaining rows will be equal to original rows
    remaining_rows = copy_rows(rows)
    
    # eg. ['Yes', 'No'] or ['alive', 'dead']
    class_labels = get_unique_class_label(rows)
    
    # list of rules 
    rules = []
    
    # exclude the class label
    attributes = col_lst[:-1]
    
    # remaining_rows is not empty yet so learn a rule
    while len(remaining_rows) != 0:
        
        # rule_lst = list consisting of info about rule
        # eg of rule_lst: 
        # [{'humidity': 'normal', 'windy': 'False'}, 'Yes', 1.0, 2]
        remaining_rows, rule_lst = learn_one_rule(remaining_rows, class_labels, attributes, accuracy, coverage)
        
        
        # add tup to rules
        rules.append(rule_lst)
        
    return rules

<h5>Testing PRISM Algorithm using small dataset: weather dataset</h5>

In [5]:
col_lst = ['Outlook', 'Temp', 'Humidity', 'Windy', 'Play']

data_rows = [
    ['Sunny', 'Hot', 'High', 'False', 'No'],
    ['Sunny', 'Hot', 'High', 'True', 'No'],
    ['Overcast', 'Hot', 'High', 'False', 'Yes'],
    ['Rainy', 'Mild', 'High', 'False', 'Yes'],
    ['Rainy', 'Cool', 'Normal', 'False', 'Yes'],
    ['Rainy', 'Cool', 'Normal', 'True', 'No'],
    ['Overcast', 'Cool', 'Normal', 'True', 'Yes'],
    ['Sunny', 'Mild', 'High', 'False', 'No'],
    ['Sunny', 'Cool', 'Normal', 'False', 'Yes'],
    ['Rainy', 'Mild', 'Normal', 'False', 'Yes'],
    ['Sunny', 'Mild', 'Normal', 'True', 'Yes'],
    ['Overcast', 'Mild', 'High', 'True', 'Yes'],
    ['Overcast', 'Hot', 'Normal', 'False', 'Yes'],
    ['Rainy', 'Mild', 'High', 'True', 'No']
]

In [157]:
print("Get unique class value (eg. ['Yes','No'])")
test_class_labels = get_unique_class_label(data_rows)
print(test_class_labels)
print("\n")

print("Getting specific rows based on class label")
test_rows = get_specific_rows(data_rows, -1, 'Yes')
print(test_rows)
print("\n")

print("getting attributes that are not used in the rule")
rule = [{'Humidity': 'normal'}, 'Yes', 1.0, 0.29]
print(get_remaining_attributes(col_lst[:-1], rule))
print("\n")

print("getting unique attribute values based on 'Outlook' attribute")
print(get_unique_attribute_vals(data_rows, 0))
print("\n")

print("getting rows based on condition")
print("In this eg: Outlook = sunny condition")
rule = [{'Outlook': 'Sunny'}]
test_condition = get_condition_based_rows(data_rows, rule, col_lst[:-1])
print(test_condition)
print("\n")

print("getting rows based on condition and class")
print("In this eg: if Outlook = sunny then Yes")
test_condition_class = get_condition_based_rows(test_rows, rule, col_lst[:-1])
print(test_condition_class)
print("\n")

print("getting rows based on conditions")
print("In this eg: if Humidity = normal and Windy = false")
rule = [{'Humidity': 'Normal', 'Windy': 'False'}]
test_rem_data = [['Sunny', 'Hot', 'High', 'False', 'No'], ['Sunny', 'Hot', 'High', 'True', 'No'], ['Rainy', 'Mild', 'High', 'False', 'Yes'], ['Rainy', 'Cool', 'Normal', 'False', 'Yes'], ['Rainy', 'Cool', 'Normal', 'True', 'No'], ['Sunny', 'Mild', 'High', 'False', 'No'], ['Sunny', 'Cool', 'Normal', 'False', 'Yes'], ['Rainy', 'Mild', 'Normal', 'False', 'Yes'], ['Sunny', 'Mild', 'Normal', 'True', 'Yes'], ['Rainy', 'Mild', 'High', 'True', 'No']]
test_conditions_class = get_condition_based_rows(test_rem_data, rule, col_lst[:-1])
print(test_conditions_class)
print("\n")

print("Finding Accuracy")
test_acc,test_no_of_correct = calculate_accuracy(test_condition_class, test_condition)
print("accuracy: ",test_acc)
print("no of correctness: ", test_no_of_correct)
print("\n")

print("Getting rules with highest accuracy: ")
'''
test_possible_rules = [
    [{'Temp': 'Mild'}, 'Yes', 1.0, 2],
    [{'Windy': 'True'}, 'Yes', 0.5, 1],
    [{'Windy': 'False'}, 'Yes', 1.0, 3]
]
'''
test_possible_rules = [[{'Outlook': 'Sunny'}, 'Yes', 0.4, 2], [{'Outlook': 'Overcast'}, 'Yes', 1.0, 4], [{'Outlook': 'Rainy'}, 'Yes', 0.6, 3], [{'Temp': 'Hot'}, 'Yes', 0.5, 2], [{'Temp': 'Cool'}, 'Yes', 0.75, 3], [{'Temp': 'Mild'}, 'Yes', 0.6666666666666666, 4], [{'Humidity': 'Normal'}, 'Yes', 0.8571428571428571, 6], [{'Humidity': 'High'}, 'Yes', 0.42857142857142855, 3], [{'Windy': 'False'}, 'Yes', 0.75, 6], [{'Windy': 'True'}, 'Yes', 0.5, 3]]
test_best_rule = get_rule_with_highest_accuracy(test_possible_rules)
print(test_best_rule)
print("\n")


print("Getting remaining rows after first rule: if outlook = overcast, then yes")
test_rule = [{'Outlook': 'Overcast'}, 'Yes', 1.0, 4]
test_yes_class_rows = get_specific_rows(data_rows, -1, 'Yes')
test_covered_rows = get_condition_based_rows(test_yes_class_rows, test_rule, col_lst[:-1])
print(get_remaining_rows(data_rows, test_covered_rows))
print("\n")


print("Learn First rule")
test_remaining_data, test_rules = learn_one_rule(rows=data_rows,class_labels=test_class_labels,
                                            attributes=col_lst[:-1], accuracy=1.0,
                                            coverage=3
                                           )
print("Remaining_data: ")
print(test_remaining_data)
print("Rules found: ")
print(test_rules)
print("\n")

print("Learn Second rule")
test_remaining_data, test_rules = learn_one_rule(rows=test_remaining_data,class_labels=test_class_labels,
                                            attributes=col_lst[:-1], accuracy=1.0,
                                            coverage=3
                                           )
print("Remaining_data: ")
print(test_remaining_data)
print("Rules found: ")
print(test_rules)
'''
print("----------Testing PRISM Algorithm on weather dataset----------")
test_weather_rules = PRISM_algorithm(rows=data_rows, col_lst=col_lst, accuracy=1.0, coverage=3)
print(test_weather_rules)
'''


Get unique class value (eg. ['Yes','No'])
['Yes', 'No']


Getting specific rows based on class label
[['Overcast', 'Hot', 'High', 'False', 'Yes'], ['Rainy', 'Mild', 'High', 'False', 'Yes'], ['Rainy', 'Cool', 'Normal', 'False', 'Yes'], ['Overcast', 'Cool', 'Normal', 'True', 'Yes'], ['Sunny', 'Cool', 'Normal', 'False', 'Yes'], ['Rainy', 'Mild', 'Normal', 'False', 'Yes'], ['Sunny', 'Mild', 'Normal', 'True', 'Yes'], ['Overcast', 'Mild', 'High', 'True', 'Yes'], ['Overcast', 'Hot', 'Normal', 'False', 'Yes']]


getting attributes that are not used in the rule
['Outlook', 'Temp', None, 'Windy']


getting unique attribute values based on 'Outlook' attribute
['Sunny', 'Overcast', 'Rainy']


getting rows based on condition
In this eg: Outlook = sunny condition
[['Sunny', 'Hot', 'High', 'False', 'No'], ['Sunny', 'Hot', 'High', 'True', 'No'], ['Sunny', 'Mild', 'High', 'False', 'No'], ['Sunny', 'Cool', 'Normal', 'False', 'Yes'], ['Sunny', 'Mild', 'Normal', 'True', 'Yes']]


getting rows based on con

IndexError: list index out of range

<h3>Applying the PRISM Algorithm on COVID-19 dataset</h3>

In this dataset we have the following attributes:
1. sex: 1 -woman, 2-man
2. age: numeric
3. diabetes: yes/no
4. copd (chronic obstructive pulmonary disease): yes/no
5. asthma: yes/no
6. imm_supr (suppressed immune system): yes/no
7. hypertension: yes/no
8. cardiovascular: yes/no
9. renal_chronic: yes/no
10. tobacco: yes/no	
11. outcome: alive/dead

In [None]:
data_file = "../../data_ml_2020/covid_categorical_good.csv"

<h4>Summary of the rules discovered from the dataset</h4>