# IS-02 Machine Learning - Data and Web Science
## Lecture 4: Rule-based Learning
## Project 3 - CN2
### <i>Avgitidis Konstantinos AM: 65</i>

In [1]:
#Importing necessary libraries
import Orange
from Orange.classification.rules import CN2Learner,CN2UnorderedLearner
from Orange.classification.rules import Evaluator as ev
from Orange.classification.rules import LaplaceAccuracyEvaluator
import pandas as pd
import random
import pickle

In [2]:
#Load the wine data set into an Orange Table
winedata = Orange.data.Table.from_file(
    "wine.csv"
)

In [3]:
#Preview the raw CSV with pandas so we know what this dataset is all about.
#The file carries Orange's extra header rows (attribute types and flags), so
#pandas shows them as rows 0 and 1 ahead of the actual measurements.
wines = pd.read_csv('wine.csv')
wines.head(n=10)

Unnamed: 0,Alcohol,Malic Acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline,Wine
0,continuous,continuous,continuous,continuous,continuous,continuous,continuous,continuous,continuous,continuous,continuous,continuous,continuous,discrete
1,,,,,,,,,,,,,,class
2,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,1
3,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,1
4,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,1
5,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,1
6,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,1
7,14.2,1.76,2.45,15.2,112.0,3.27,3.39,0.34,1.97,6.75,1.05,2.85,1450.0,1
8,14.39,1.87,2.45,14.6,96.0,2.5,2.52,0.3,1.98,5.25,1.02,3.58,1290.0,1
9,14.06,2.15,2.61,17.6,121.0,2.6,2.51,0.31,1.25,5.05,1.06,3.58,1295.0,1


In [4]:
#Summarise the domain: attribute counts by type, a few attribute names and the class variable
wine_attributes = winedata.domain.attributes
n = len(wine_attributes)
n_cont = len([attr for attr in wine_attributes if attr.is_continuous])
n_disc = len([attr for attr in wine_attributes if attr.is_discrete])
print("%d attributes: %d continuous, %d discrete" % (n, n_cont, n_disc))

#Names of the first three attributes, joined into one line
first_three_names = [wine_attributes[position].name for position in range(3)]
print("First three attributes:", ", ".join(first_three_names))

class_variable = winedata.domain.class_var
print("Class:", class_variable.name)

13 attributes: 13 continuous, 0 discrete
First three attributes: Alcohol, Malic Acid, Ash
Class: Wine


In [5]:
#The three learners compared in this notebook:
#  1. ordered CN2 with its default rule evaluator
#  2. unordered CN2 with its default rule evaluator
#  3. ordered CN2 with the rule evaluator switched to Laplace accuracy
lr_ord = CN2Learner()
lr_unord = CN2UnorderedLearner()
lr_ord2 = CN2Learner()
lr_ord2.rule_finder.quality_evaluator = LaplaceAccuracyEvaluator()
learners = [lr_ord, lr_unord, lr_ord2]

#Labels written to the output DataFrame; position k describes learners[k]
ordering = ['Ordered', 'Unordered', 'Ordered']
evaluator = ['Default (Entropy)', 'Default (Laplace)', 'Laplace']

In [6]:
#Creating our DataFrame of Rule-Based models
#We really don't want to be training the models each time, so the results are
#pickled after the first run and simply loaded on later runs.
#The outer loop's range can be changed to a smaller int to decrease computation time
#Multithreading can also help cut down time
#NOTE: unpickling can execute arbitrary code - only load a cache file that this notebook wrote itself.
try:
    with open("rule_learning.p", "rb") as cache_file:
        models = pickle.load(cache_file)
except Exception:
    #Deliberately broad: whatever the reason the cache cannot be read
    #(missing, truncated, written by an incompatible version), we rebuild it.
    columns = ['Algorithm', 'Rule ordering', 'Evaluator','Beam width','Min rule coverage','Max rule length','Accuracy','Precision','Recall','F1','Rules']
    rows = []  #one list per trained model, in the same order as `columns`
    for _ in range(20):
        for learner, rule_ordering, evaluator_name in zip(learners, ordering, evaluator):
            finder = learner.rule_finder
            # consider 3 to 10 solution streams at one time
            finder.search_algorithm.beam_width = random.randint(3,10)
            #Don't constrain continuous values, it takes more time but better results
            finder.search_strategy.constrain_continuous = False
            # found rules must cover at least 7-15 examples
            finder.general_validator.min_covered_examples = random.randint(7,15)
            # found rules may combine at most 3 to 5 selectors (conditions)
            finder.general_validator.max_rule_length = random.randint(3,5)

            #Fit on the whole data set to obtain the rule list that gets reported
            classifier = learner(winedata)
            #Cross-Validate the results 5-fold to obtain the metrics
            cv = Orange.evaluation.CrossValidation(winedata, [learner], k=5)
            accuracy = Orange.evaluation.scoring.CA(cv)[0]
            precision = Orange.evaluation.scoring.Precision(cv,average='macro')[0]
            recall = Orange.evaluation.scoring.Recall(cv,average='macro')[0]
            f1 = Orange.evaluation.scoring.F1(cv,average='macro')[0]

            #All the rules of this model in one cell, one rule per line
            rules_text = '\n'.join(str(rule) for rule in classifier.rule_list)

            #Values must follow the order of `columns` (Precision before Recall)
            rows.append([
                'Rule-Based',
                rule_ordering,
                evaluator_name,
                finder.search_algorithm.beam_width,
                finder.general_validator.min_covered_examples,
                finder.general_validator.max_rule_length,
                accuracy,
                precision,
                recall,
                f1,
                rules_text,
            ])
    #Build the frame once; this also gives a clean 0..n-1 index
    models = pd.DataFrame(rows, columns=columns)
    with open("rule_learning.p", "wb") as cache_file:
        pickle.dump(models, cache_file)


In [7]:
from pandas import ExcelWriter

#Saving every model to an excel file (header row frozen, floats written with 6 decimals)
with ExcelWriter(path="RuleBased_results_all.xlsx", engine='openpyxl', mode='w') as xl_writer:
    models.to_excel(
        excel_writer=xl_writer,
        sheet_name='Sheet1',
        index=False,
        float_format="%.6f",
        freeze_panes=(1,0),
    )

In [8]:
#Keep only the best model of each evaluator, ranked by the sum of its four metrics
#astype(float) guards against metric columns that were stored with object dtype
models['Sum'] = models[['Accuracy','Precision','Recall','F1']].astype(float).sum(axis=1)
#sort_values returns a new frame, so the result has to be assigned;
#'Sum' is sorted descending so the best model comes first within each evaluator
models = models.sort_values(by=['Evaluator','Sum'],axis=0,ascending=[True,False])
#keep='first' now retains the highest-scoring row per evaluator
models = models.drop_duplicates(subset='Evaluator',keep='first')
#The helper column is no longer needed; renumber the surviving rows from 0
models = models.drop(columns='Sum').reset_index(drop=True)

In [9]:
#Saving best models to excel (header row frozen, floats written with 6 decimals)
with ExcelWriter(path="RuleBased_results.xlsx", engine='openpyxl', mode='w') as xl_writer:
    models.to_excel(
        excel_writer=xl_writer,
        sheet_name='Sheet1',
        index=False,
        float_format="%.6f",
        freeze_panes=(1,0),
    )