# Rules vs ML
All the data was obtained by running the following command:

`run RulesExperiment ...`

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
def load_predicates(cell, data_dir='/home/oleg/Desktop/data/predicates/roadmapepigenomics'):
    """Load the predicates table of one cell type.

    Reads ``<data_dir>/predicates_<cell>.csv`` as a comma separated file
    (lines starting with ``#`` are skipped), maps ``'+'`` to ``True`` and
    ``'-'`` / missing values to ``False``, and drops the first column.

    Parameters
    ----------
    cell : str
        Cell identifier used in the file name, e.g. ``'spleen'``.
    data_dir : str, optional
        Directory that contains the ``predicates_*.csv`` files. Defaults to
        the location that used to be hard-coded here, so existing calls keep
        working; pass another directory to run the notebook elsewhere.

    Returns
    -------
    pandas.DataFrame
        The table without its first column.
    """
    import os  # local import: keeps this cell self-contained

    path = os.path.join(data_dir, 'predicates_{0}.csv'.format(cell))
    # NOTE(review): `na_values=False` looks unintended - pandas stringifies the
    # value, so tokens such as 'False' (and probably '0') are parsed as
    # missing and then turned into False below. Kept as is; confirm intent.
    ps = pd.read_csv(path, sep=',', na_values=False, low_memory=False, comment='#')
    ps = ps.replace(np.nan, False).replace('+', True).replace('-', False)
    # The first column is not used by the rest of the notebook.
    return ps[ps.columns[1:]]
# Quick look at a single cell type. Stored under its own name because
# `predicates` is used further down for the {cell: DataFrame} dictionary.
spleen_predicates = load_predicates('spleen')
spleen_predicates.head()

In [None]:
# Thresholds a rule has to exceed to be kept for the analysis below.
MIN_CONVICTION = 5
MIN_SUPPORT = 1000

# Load the mined rules (lines starting with '#' are skipped).
rules = pd.read_csv('/home/oleg/Desktop/data/rules/roadmapepigenomics/rules.csv', sep=',', comment='#')
rules['id'] = rules['id'].astype('category')
# Highest conviction first.
rules = rules.sort_values(by='conviction', ascending=False)
# `.copy()` turns the filtered selection into an independent frame, so the
# columns added to `rules` in later cells do not raise SettingWithCopyWarning.
rules = rules[(rules['conviction'] > MIN_CONVICTION) & (rules['support'] > MIN_SUPPORT)].copy()
rules.head(5)

### KL itself is not that representative

In [None]:
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from math import log
# 3D bar chart: conviction on x, KL on y (bars are positioned along y via
# `zdir='y'`), support as the bar height on z.
fig, ax = plt.subplots(figsize=(6, 6), subplot_kw={'projection': '3d'})
ax.set(xlabel='Conviction', ylabel='KL', zlabel='Support')
ax.bar(rules['conviction'], rules['support'], zs=rules['KL'], zdir='y', alpha=0.5)
plt.show()

Load predicates.

In [None]:
# Build {cell id: predicates table} for every category of `rules['id']`.
# NOTE(review): the categories were fixed before `rules` was filtered, so this
# presumably also loads cells that no longer have any rule - confirm intent.
cells = rules['id'].cat.categories
predicates = dict()
for cell in cells:
    print('Loading', cell)
    predicates[cell] = load_predicates(cell)

### Rules mining vs ML

Precision and recall can be evaluated from the error type values (type I and type II errors), see: https://en.wikipedia.org/wiki/Accuracy_and_precision

Select rules with high conviction and test them vs ML.

Let us investigate how condition, target and support depend on conviction.

In [None]:
# Expose the rule statistics under confusion-matrix names:
# type I errors as FP, support as TP, type II errors as FN.
for count_name, source_column in (('FP', 'error_type_1'),
                                  ('TP', 'support'),
                                  ('FN', 'error_type_2')):
    rules[count_name] = rules[source_column]

# precision = TP / (TP + FP), recall = TP / (TP + FN)
predicted_positive = rules['TP'] + rules['FP']
actual_positive = rules['TP'] + rules['FN']
rules['precision'] = rules['TP'] / predicted_positive
rules['recall'] = rules['TP'] / actual_positive
rules.head()

In [None]:
# 3D bar chart: recall on x, precision on y (bars are positioned along y via
# `zdir='y'`), conviction as the bar height on z.
fig, ax = plt.subplots(figsize=(8, 8), subplot_kw={'projection': '3d'})
ax.set(xlabel='Recall', ylabel='Precision', zlabel='Conviction')
ax.bar(rules['recall'], rules['conviction'], zs=rules['precision'], zdir='y', alpha=0.5)
plt.show()

# Rules vs ML

In [None]:
import re

import numpy as np
from pandas import DataFrame
try:
    # `sklearn.cross_validation` was removed in scikit-learn 0.20.
    from sklearn.model_selection import train_test_split
except ImportError:  # very old scikit-learn that only ships the legacy module
    from sklearn.cross_validation import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# Fixed seed so the stochastic classifiers are built the same way on every run.
SEED = 42

# Display names, index-aligned with `classifiers`.
names = ["Decision Tree", "Random Forest", "AdaBoost"]
classifiers = [
    DecisionTreeClassifier(random_state=SEED),
    RandomForestClassifier(random_state=SEED),
    AdaBoostClassifier(random_state=SEED)]

# Result column names: "<name> precision" and "<name> recall" for every
# classifier, in alphabetical order.
headers = sorted([x + " precision" for x in names] + [x + " recall" for x in names])


# Optional "NO " prefix, then the base name (everything up to the first '['
# or '@'), then the remainder of the predicate.
_PREDICATE_PATTERN = re.compile(r'(NO )?([^\[@]+)(.*)')


def allow(p1, p2):
    """Tell whether two predicates may be used together in a single rule.

    Two predicates are allowed together only when their base names differ.
    The base name is the text before the first ``[`` or ``@``, ignoring a
    leading ``"NO "``.
    """
    first_base, second_base = (
        _PREDICATE_PATTERN.search(predicate).group(2) for predicate in (p1, p2)
    )
    return first_base != second_base
# Small sanity checks: predicates that share a base name must be rejected,
# predicates with different base names must be accepted.
assert not allow('H3K4me3@tss', 'H3K4me3@tes')
assert not allow('NO H3K4me3@tss', 'H3K4me3@tes')
assert allow('H3K4me2@tss', 'H3K4me3@tes')


def test_ml(X, target, random_state=None):
    """Train every classifier in `classifiers` to predict `target` and score it.

    Parameters
    ----------
    X : pandas.DataFrame
        Predicates table of one cell; must contain the column `target`.
    target : str
        Name of the predicate column to predict.
    random_state : int or None, optional
        Seed forwarded to `train_test_split`. The default `None` keeps the
        original behaviour (a different random split on every call).

    Returns
    -------
    dict
        ``"<classifier name> precision"`` and ``"<classifier name> recall"``
        for each entry of `names`, measured on the held-out 40% of the rows.
    """
    Y = X[target]
    # Skip the first five columns (presumably non-predicate metadata - TODO
    # confirm) and remove the target itself from the features.
    features = X[X.columns[5:]].drop(target, axis=1)
    # Predicates that `allow` rejects next to the target are excluded as well.
    ignored = {p for p in features.columns if not allow(p, target)}
    features = features.drop(list(ignored), axis=1)

    print("Ignored {1} => {0}".format(target, ignored))
    X_train, X_test, y_train, y_test = train_test_split(
        features, Y, test_size=.4, random_state=random_state)

    results = {}
    for name, clf in zip(names, classifiers):
        clf.fit(X_train, y_train)
        # Predict once and reuse the result for both metrics.
        y_pred = clf.predict(X_test)
        results[name + " precision"] = precision_score(y_test, y_pred)
        results[name + " recall"] = recall_score(y_test, y_pred)

    return results

## Load top rules and compare their precision and recall with ML algorithms

In [None]:
# Number of highest-ranked rules (in the current order of `rules`) to compare.
TOP_RULES = 20
rules2process = rules.head(TOP_RULES)

# Collect one record per rule and build the frame once at the end instead of
# growing a DataFrame row by row.
records = []
for i, (n, row) in enumerate(rules2process.iterrows(), start=1):
    cell = row['id']
    print("{0}/{1} {2} => {3}".format(i, len(rules2process), row['condition_name'], row['target_name']))
    ml = test_ml(predicates[cell], row['target_name'])
    record = {
        'index': n,
        'cell': cell,
        'condition': row['condition_name'],
        'target': row['target_name'],
        'conviction': row['conviction'],
        'precision': row['precision'],
        'recall': row['recall'],
    }
    # Add the per-classifier scores under the shared `headers` names.
    record.update((header, ml[header]) for header in headers)
    records.append(record)

results = DataFrame(
    records,
    columns=['index', 'cell', 'condition', 'target', 'conviction', 'precision', 'recall'] + headers)
results.head()

## The difference is clear - rule mining (RM) leads to higher precision!

In [None]:
import seaborn as sns
# One box per metric: the rule's own precision / recall next to the scores of
# every classifier.
metric_columns = ['precision', 'recall'] + headers
ax = sns.boxplot(data=results[metric_columns])
sns.despine(ax=ax, offset=10, trim=True)
plt.xticks(rotation=45)
plt.show()