In [1]:
from sklearn import tree
import graphviz
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn import metrics
import joblib
from sklearn.metrics import classification_report

In [2]:
def training_testing_tree_model(training_years, testing_year, balanced=False):
    '''
    Train a decision tree on the judgment files of the training years,
    evaluate it on the testing year, then save the rendered tree and the model.

    Args:
        training_years: an iterable of years; one
            "../../data/parsedjudgements/judgments<year>.csv" file is read per year
        testing_year: the year whose judgment file is used for evaluation;
            also the prefix of every output file name
        balanced: any truthy value fits a depth-5 tree with
            class_weight="balanced"; otherwise a depth-6 unweighted tree is fit
    Returns: None
    Raises: ValueError if training_years is empty
    Side effects: prints a classification report, renders
        "<testing_year>.tree.model.<balanced|unbalanced>" through graphviz
        (view=True also opens the viewer) and dumps the fitted model to
        "<testing_year>.tree.model.<balanced|unbalanced>.pkl"
    '''

    columns = ['Human_PM', 'Animal_PM', 'Not_PM', 'Disease_Exact',
       'Disease_General', 'Disease_Specific', 'Disease_Not', 'Gene_Exact',
       'Gene_Missing', 'Gene_Missing_Variant', 'Gene_Diff_Variant',
       'Demo_Match', 'Demo_Notdiscussed', 'Demo_Exclude']

    def _read_judgments(year):
        # One CSV of parsed judgments per year.
        return pd.read_csv("../../data/parsedjudgements/judgments"+str(year)+".csv")

    training_frames = [_read_judgments(year) for year in training_years]
    if not training_frames:
        # pd.concat would raise the same exception type with a vaguer message.
        raise ValueError("training_years must contain at least one year")
    training = pd.concat(training_frames, axis=0)
    training_features = training[columns]
    training_labels = training["Relevance"]

    testing = _read_judgments(testing_year)
    testing_features = testing[columns]
    testing_labels = testing["Relevance"]

    # The two variants differ only in hyper-parameters and output file suffix.
    if balanced:
        variant = 'balanced'
        tree_model = tree.DecisionTreeClassifier(criterion='entropy', max_depth=5,
                                                 class_weight="balanced")
    else:
        variant = 'unbalanced'
        tree_model = tree.DecisionTreeClassifier(criterion='entropy', max_depth=6)
    output_stem = str(testing_year) + '.tree.model.' + variant

    tree_model.fit(training_features, training_labels)

    predicted_labels = tree_model.predict(testing_features)
    print(classification_report(testing_labels, predicted_labels))

    # save pdf
    # NOTE(review): class_names assumes exactly three Relevance classes in
    # ascending order (0, 1, 2) -- confirm against the judgment files.
    dot_data = tree.export_graphviz(tree_model, out_file=None,
                      feature_names=columns,
                      class_names=['NonRelevant','Relevant','HighlyRelevant'],
                      filled=True, rounded=True,
                      special_characters=True)
    graph = graphviz.Source(dot_data)
    graph.render(output_stem, view=True)

    # save tree model
    joblib.dump(tree_model, output_stem + ".pkl")

In [3]:
# training_testing_tree_model([2017], 2018, balanced=True)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     16841
           1       1.00      1.00      1.00      2146
           2       1.00      1.00      1.00      3442

    accuracy                           1.00     22429
   macro avg       1.00      1.00      1.00     22429
weighted avg       1.00      1.00      1.00     22429



In [8]:
def probalistic_tree_unbalanced(node_dic):
    '''
    Aggregate the root-to-leaf path probabilities of the hand-coded
    unbalanced tree.

    Each of the ten paths is a product of branch terms: node_dic[name] for
    one branch of a node and 1 - node_dic[name] for the other. The path
    products are then summed per leaf label.

    Input: a dictionary of <node, value> pairs; the keys read are 'Not_PM',
           'Disease_Not', 'Gene_Missing', 'Gene_Exact', 'Demo_Exclude' and
           'Disease_General'
    Output: a list of probabilities (not rel, rel, highly rel)
    '''

    not_pm = node_dic['Not_PM']
    disease_not = node_dic['Disease_Not']
    gene_missing = node_dic['Gene_Missing']
    gene_exact = node_dic['Gene_Exact']
    demo_exclude = node_dic['Demo_Exclude']
    disease_general = node_dic['Disease_General']

    # Shared path prefixes. Each one extends the previous prefix on the right,
    # so every path product keeps a strict left-to-right multiplication order.
    pm = 1 - not_pm
    pm_disease = pm * (1 - disease_not)
    pm_disease_gene = pm_disease * (1 - gene_missing)
    inexact = pm_disease_gene * (1 - gene_exact)
    inexact_kept = inexact * (1 - demo_exclude)
    exact = pm_disease_gene * gene_exact
    exact_specific = exact * (1 - disease_general)
    exact_general = exact * disease_general

    non_relevant = [
        not_pm,                                # path 1
        pm * disease_not,                      # path 2
        pm_disease * gene_missing,             # path 3
        inexact * demo_exclude,                # path 4
        exact_specific * demo_exclude,         # path 8
        exact_general * demo_exclude,          # path 10
    ]

    # Paths 5 and 6 multiply by a Gene_Missing term a second time; both end
    # in the same label, so together they contribute exactly inexact_kept.
    relevant = [
        inexact_kept * (1 - gene_missing),     # path 5
        inexact_kept * gene_missing,           # path 6
        exact_general * (1 - demo_exclude),    # path 9
    ]

    highly_relevant = [
        exact_specific * (1 - demo_exclude),   # path 7
    ]

    return [np.sum(non_relevant), np.sum(relevant), np.sum(highly_relevant)]

In [20]:
def probalistic_tree_balanced(node_dic):
    '''
    Aggregate the root-to-leaf path probabilities of the hand-coded
    balanced tree.

    Each of the fourteen paths is a product of branch terms: node_dic[name]
    for one branch of a node and 1 - node_dic[name] for the other. The path
    products are then summed per leaf label.

    Input: a dictionary of <node, value> pairs; the keys read are
           'Gene_Exact', 'Not_PM', 'Gene_Missing', 'Disease_Not',
           'Demo_Exclude', 'Disease_General' and 'Demo_Match'
    Output: a list of probabilities (not rel, rel, highly rel)
    '''

    p_exact = node_dic['Gene_Exact']
    p_not_pm = node_dic['Not_PM']
    p_missing = node_dic['Gene_Missing']
    p_dis_not = node_dic['Disease_Not']
    p_exclude = node_dic['Demo_Exclude']
    p_general = node_dic['Disease_General']
    p_match = node_dic['Demo_Match']

    # Subtree where Gene_Exact takes its complement branch (paths 1-5).
    no_exact = 1 - p_exact
    no_exact_pm = no_exact * (1 - p_not_pm)
    no_exact_gene = no_exact_pm * (1 - p_missing)
    no_exact_dis = no_exact_gene * (1 - p_dis_not)

    # Subtree Gene_Exact * (1 - Disease_General) (paths 6-9).
    specific = p_exact * (1 - p_general)
    specific_dis = specific * (1 - p_dis_not)
    specific_kept = specific_dis * (1 - p_exclude)

    # Subtree Gene_Exact * Disease_General (paths 10-14).
    general = p_exact * p_general
    general_kept = general * (1 - p_exclude)
    general_gene = general_kept * (1 - p_missing)
    general_missing = general_kept * p_missing

    NON_REL, REL, HIGHLY_REL = 0, 1, 2
    labelled_paths = (
        (REL, no_exact_dis * (1 - p_exclude)),           # path 1
        (NON_REL, no_exact_dis * p_exclude),             # path 2
        (NON_REL, no_exact_gene * p_dis_not),            # path 3
        (NON_REL, no_exact_pm * p_missing),              # path 4
        (NON_REL, no_exact * p_not_pm),                  # path 5
        (HIGHLY_REL, specific_kept * (1 - p_exact)),     # path 6
        (HIGHLY_REL, specific_kept * p_exact),           # path 7
        (NON_REL, specific_dis * p_exclude),             # path 8
        (NON_REL, specific * p_dis_not),                 # path 9
        (REL, general_gene * (1 - p_match)),             # path 10
        (REL, general_gene * p_match),                   # path 11
        (REL, general_missing * (1 - p_missing)),        # path 12
        (REL, general_missing * p_missing),              # path 13
        (NON_REL, general * p_exclude),                  # path 14
    )

    # Bucket the products by label in path order, then total each bucket.
    buckets = ([], [], [])
    for label, prob in labelled_paths:
        buckets[label].append(prob)

    return [np.sum(bucket) for bucket in buckets]

In [1]:
def probalistic_tree_balanced_all_probs(node_dic):
    '''
    Compute the probability of every root-to-leaf path of the hand-coded
    balanced tree, without aggregating by label.

    Each path is a product of branch terms: node_dic[name] for one branch
    of a node and 1 - node_dic[name] for the other.

    Input: a dictionary of <node, value> pairs; the keys read are
           'Gene_Exact', 'Not_PM', 'Gene_Missing', 'Disease_Not',
           'Demo_Exclude', 'Disease_General' and 'Demo_Match'
    Output: a list of the 14 path probabilities, ordered path 1 .. path 14
    '''

    gene_exact = node_dic['Gene_Exact']
    not_pm = node_dic['Not_PM']
    gene_missing = node_dic['Gene_Missing']
    disease_not = node_dic['Disease_Not']
    demo_exclude = node_dic['Demo_Exclude']
    disease_general = node_dic['Disease_General']
    demo_match = node_dic['Demo_Match']

    # Prefixes for the complement branch of Gene_Exact (paths 1-5).
    left = 1 - gene_exact
    left_pm = left * (1 - not_pm)
    left_gene = left_pm * (1 - gene_missing)
    left_disease = left_gene * (1 - disease_not)

    # Prefixes for Gene_Exact * (1 - Disease_General) (paths 6-9).
    mid = gene_exact * (1 - disease_general)
    mid_disease = mid * (1 - disease_not)
    mid_kept = mid_disease * (1 - demo_exclude)

    # Prefixes for Gene_Exact * Disease_General (paths 10-14).
    right = gene_exact * disease_general
    right_kept = right * (1 - demo_exclude)
    right_gene = right_kept * (1 - gene_missing)
    right_missing = right_kept * gene_missing

    return [
        left_disease * (1 - demo_exclude),     # path 1 - relevant
        left_disease * demo_exclude,           # path 2 - non-relevant
        left_gene * disease_not,               # path 3 - non-relevant
        left_pm * gene_missing,                # path 4 - non-relevant
        left * not_pm,                         # path 5 - non-relevant
        mid_kept * (1 - gene_exact),           # path 6 - highly-relevant
        mid_kept * gene_exact,                 # path 7 - highly-relevant
        mid_disease * demo_exclude,            # path 8 - non-relevant
        mid * disease_not,                     # path 9 - non-relevant
        right_gene * (1 - demo_match),         # path 10 - relevant
        right_gene * demo_match,               # path 11 - relevant
        right_missing * (1 - gene_missing),    # path 12 - relevant
        right_missing * gene_missing,          # path 13 - relevant
        right * demo_exclude,                  # path 14 - non-relevant
    ]