In [1]:
%run /Users/jiamingqu/Desktop/proj/scripts/classifier/read.dataframe.ipynb

In [2]:
import json
import os
import re
import warnings

# Installed before the third-party imports so import-time warnings are silenced too.
warnings.filterwarnings("ignore")

import joblib
import numpy as np
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [2]:
# The Lexigram API token is a credential and must not be stored in the notebook.
# Export your own key before starting the kernel, e.g.
#     export LEXIGRAM_API_KEY="<your token>"
# An unset variable yields an empty token: cells that never call the API still
# run, while the find_* request helpers will be rejected by the server.
apiKey = "Bearer " + os.environ.get("LEXIGRAM_API_KEY", "")

In [3]:
def find_synonyms(conceptGraphId):
    '''
    Function to return the preferred label and synonyms of a concept
    Args:
        conceptGraphId in the lexigram KB
    Returns:
        A list whose first element is the concept's preferred label,
        followed by its synonyms (if the response carries any)
    Raises:
        requests.HTTPError if the API answers with an error status
    '''

    # Single slash before "v1", matching the URLs built by the sibling helpers.
    url = "https://api.lexigram.io/v1/lexigraph/concepts/" + conceptGraphId
    # A timeout keeps a stalled connection from hanging the notebook forever.
    r = requests.get(url, headers={'Authorization': apiKey}, timeout=30)
    # Fail on 4xx/5xx here instead of with a KeyError on the error payload below.
    r.raise_for_status()
    response = r.json()

    synonyms = [response['label']]
    # Tolerate a response without a "synonyms" field (or with a null one).
    synonyms += response.get('synonyms') or []
    return synonyms

In [4]:
def find_ancestors(conceptGraphId):
    '''
    Function to return a list of ancestors
    Args:
        conceptGraphId in the lexigram KB
    Returns:
        a list with the labels of the ancestors whose "types" field is
        exactly ['FINDINGS', 'PROBLEMS']
    Raises:
        requests.HTTPError if the API answers with an error status
    '''

    url = "https://api.lexigram.io/v1/lexigraph/concepts/" + conceptGraphId + "/ancestors"
    # A timeout keeps a stalled connection from hanging the notebook forever.
    r = requests.get(url, headers={'Authorization': apiKey}, timeout=30)
    # Fail on 4xx/5xx here instead of with a KeyError on the error payload below.
    r.raise_for_status()
    response = r.json()

    # The comparison is an exact, order-sensitive list match, so concepts
    # carrying any other type combination are skipped.
    return [match["label"]
            for match in response['results']
            if match["types"] == ['FINDINGS', 'PROBLEMS']]

In [5]:
def find_descendants(conceptGraphId):
    '''
    Function to return a list of descendants
    Args:
        conceptGraphId in the lexigram KB
    Returns:
        a list with the labels of the descendants whose "types" field is
        exactly ['FINDINGS', 'PROBLEMS']
    Raises:
        requests.HTTPError if the API answers with an error status
    '''

    url = "https://api.lexigram.io/v1/lexigraph/concepts/" + conceptGraphId + "/descendants"
    # A timeout keeps a stalled connection from hanging the notebook forever.
    r = requests.get(url, headers={'Authorization': apiKey}, timeout=30)
    # Fail on 4xx/5xx here instead of with a KeyError on the error payload below.
    r.raise_for_status()
    response = r.json()

    # The comparison is an exact, order-sensitive list match, so concepts
    # carrying any other type combination are skipped.
    return [match["label"]
            for match in response['results']
            if match["types"] == ['FINDINGS', 'PROBLEMS']]

In [6]:
# Cache the synonyms, ancestors and descendants of every topic on disk so the
# Lexigram API does not have to be queried again when features are generated.
def save_lexigram_output(year):
    '''
    Function to download and store the expansion terms of every disease topic
    Args:
        year: forwarded to read_query_topics and used as the output file prefix
    Returns:
        None; writes "<year>.disease.expansion.json" in the working directory
    '''

    expansions_by_topic = {}

    for topic_id, disease in read_query_topics(year, "disease").items():
        concept_id = find_lexigram_id(disease)

        if concept_id == 0:
            # 0 is treated as "no concept" (presumably find_lexigram_id's
            # not-found value): keep the topic, with empty term lists.
            expansion = {"synonyms": [], "ancestors": [], "descendants": []}
        else:
            expansion = {
                "synonyms": find_synonyms(concept_id),
                "ancestors": find_ancestors(concept_id),
                "descendants": find_descendants(concept_id),
            }

        expansions_by_topic[topic_id] = expansion

    output_path = str(year) + ".disease.expansion.json"
    with open(output_path, 'w') as out_file:
        json.dump(expansions_by_topic, out_file)

In [7]:
# save_lexigram_output(2017)
# save_lexigram_output(2018)
# save_lexigram_output(2019)

In [8]:
def _load_json_document(path, default=None):
    '''Parse the JSON document stored at path; return default if the file is blank.'''
    with open(path, 'r') as f:
        text = f.read()
    if not text.strip():
        return default
    return json.loads(text)


def _count_term_occurrences(text, terms):
    '''Sum the non-overlapping occurrences in text of every lower-cased term.'''
    return sum(text.count(term.lower()) for term in terms)


def generate_features(year, corpus_dir="../../../data/corpus/"):
    '''
    Function to build the term-count feature table for one year and save it
    Args:
        year: selects the dataframe, the query topics and the
              "<year>.disease.expansion.json" file written by save_lexigram_output
        corpus_dir: folder holding one "<trec_doc_id>.txt" file per document
    Returns:
        None; writes "<year>.disease.features.csv" with the columns
        count_match_self, count_match_ancestor, count_match_descendant,
        topicid, docid and label
    Raises:
        ValueError if the expansion file is empty
    '''

    # read dataframe and drop incomplete rows
    df = read_dataframe(year, 'disease')
    df = df.dropna().reset_index(drop=True)

    # read query topics
    query_topics = read_query_topics(year, "disease")

    # read expansion terms
    expansion_path = str(year) + '.disease.expansion.json'
    expansion_terms_dict = _load_json_document(expansion_path)
    if expansion_terms_dict is None:
        raise ValueError("{} is empty; run save_lexigram_output({}) first".format(expansion_path, year))

    # read acronyms: a dict of <disease, acronyms>
    acronyms_dict = _load_json_document("acronyms.json", default={})

    columns = ["count_match_self", "count_match_ancestor", "count_match_descendant",
               "topicid", "docid", "label"]
    # Rows are collected in a list and turned into a DataFrame once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the table on every call.
    records = []

    print("Parsing year {}".format(year))

    for topic in set(df.trec_topic_number):

        disease = query_topics[topic]
        # JSON object keys are always strings, hence str(topic).
        expansion_terms = expansion_terms_dict[str(topic)]
        synonyms = expansion_terms["synonyms"]
        ancestors = expansion_terms["ancestors"]
        descendants = expansion_terms["descendants"]
        # split() without an argument drops empty tokens; an empty acronym
        # would otherwise "match" len(text) + 1 times in str.count.
        acronyms = acronyms_dict[disease].split() if disease in acronyms_dict else []

        df_topic = df.loc[df.trec_topic_number == topic]
        for _, row in df_topic.iterrows():

            docid = str(row["trec_doc_id"])
            with open(os.path.join(corpus_dir, docid + ".txt"), 'r') as f:
                raw_text = " ".join(line.strip() for line in f)
            text = raw_text.lower()

            # 1) count disease itself (original term+synonyms+acronyms)
            count_match_self = text.count(disease.lower())
            count_match_self += _count_term_occurrences(text, synonyms)
            # do not downcase and count acronyms
            # otherwise you will get a lot of match of something like "cc", "aa"
            count_match_self += sum(raw_text.count(acronym) for acronym in acronyms)

            # 2) count ancestors, 3) plus the general descriptors
            count_match_ancestor = _count_term_occurrences(text, ancestors)
            count_match_ancestor += _count_term_occurrences(text, ["human cancer", "human tumor"])

            # 4) count descendants
            count_match_descendant = _count_term_occurrences(text, descendants)

            records.append({"count_match_self": count_match_self,
                            "count_match_ancestor": count_match_ancestor,
                            "count_match_descendant": count_match_descendant,
                            "topicid": row["trec_topic_number"],
                            "docid": docid,
                            "label": row["disease_desc"]})

        print("Topic {} has been parsed".format(topic))

    # columns= fixes the column order and keeps the header when there are no rows.
    feature_table = pd.DataFrame(records, columns=columns)

    assert df.shape[0] == feature_table.shape[0]
    feature_table.to_csv(str(year) + ".disease.features.csv", index=False, sep=",")

In [9]:
# generate_features(2017)
# generate_features(2018)

In [3]:
def training_testing_classifier(training_years, testing_years, random_state=42):
    '''
    Function to train a logistic regression on the saved features and evaluate it
    Args:
        training_years: iterable of years whose "<year>.disease.features.csv"
                        files are concatenated into the training set
        testing_years: a single year (despite the plural name); it selects the
                       test feature file and prefixes the saved model file
        random_state: seed handed to SMOTE and to the classifier so that
                      repeated runs give the same report and the same model
    Returns:
        None; prints the classification report and writes
        "<testing_years>.disease.classifier.pkl"
    '''

    features = ["count_match_self", "count_match_ancestor", "count_match_descendant"]

    df_training = pd.concat([pd.read_csv(str(year) + ".disease.features.csv")
                             for year in training_years])
    df_testing = pd.read_csv(str(testing_years) + ".disease.features.csv")

    training_features = df_training[features]
    testing_features = df_testing[features]
    training_labels = df_training.label
    testing_labels = df_testing.label

    # over-sampling (training data only, the test set keeps its distribution)
    training_features, training_labels = SMOTE(random_state=random_state).fit_resample(
        training_features, training_labels)

    # training
    logistic_model = LogisticRegression(multi_class="ovr", penalty='l1', solver='liblinear',
                                        C=0.5, random_state=random_state)
    logistic_model.fit(training_features, training_labels)

    predicted_labels = logistic_model.predict(testing_features)
    print(classification_report(testing_labels, predicted_labels))

    joblib.dump(logistic_model, str(testing_years) + ".disease.classifier.pkl")

In [4]:
# training_testing_classifier([2017],2018)

               precision    recall  f1-score   support

        Exact       0.70      0.74      0.72      5168
 More General       0.12      0.30      0.17       686
More Specific       0.54      0.12      0.20      1915
  Not Disease       0.72      0.80      0.76      1455

     accuracy                           0.59      9224
    macro avg       0.52      0.49      0.46      9224
 weighted avg       0.63      0.59      0.58      9224



In [2]:
def print_original_distribution(training_years):
    '''
    Function to print how many rows each label has before over-sampling
    Args:
        training_years: iterable of years whose "<year>.disease.features.csv"
                        files are read from the working directory
    Returns:
        None; the label counts are printed
    '''

    yearly_frames = [pd.read_csv(str(year) + ".disease.features.csv")
                     for year in training_years]
    label_counts = pd.concat(yearly_frames).label.value_counts()

    print(label_counts)

In [3]:
# print_original_distribution([2017])

Exact            4149
Not Disease      2914
More Specific    1273
More General      938
Name: label, dtype: int64
