In [2]:
%run /Users/jiamingqu/Desktop/proj/scripts/searching/evaluation.functions.ipynb
%run /Users/jiamingqu/Desktop/proj/scripts/classifier/read.dataframe.ipynb
%run /Users/jiamingqu/Desktop/proj/scripts/classifier/demo/demo.classifier.ipynb
%run /Users/jiamingqu/Desktop/proj/scripts/classifier/gene/gene.classifier.ipynb

In [3]:
import pandas as pd
import joblib
import json
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
stop = stopwords.words("english")

In [4]:
def generate_demo_features(text, topic, demo_classifier, query_topics_demo):
    """Build the demographic (age / gender) re-ranking features for one document.

    Parameters
    ----------
    text : str
        Document text, passed unchanged to the age/gender helper functions.
    topic : hashable
        Key used to look up the topic's demographic string in
        ``query_topics_demo``.
    demo_classifier : object
        Anything exposing ``predict_proba``; it is called with one row of six
        features and its first three outputs are read as
        (exclude, match, not discussed).
    query_topics_demo : mapping
        Maps ``topic`` to a string whose part before the first ``"-"`` parses
        as an integer age and whose second space-separated token is the gender.

    Returns
    -------
    dict
        ``{'low': {...raw indicators...}, 'high': {...re-weighted probabilities...}}``.
    """

    def _split_missing(outcome, sentinel):
        # Helpers answer either with a sentinel string (nothing found) or with
        # a value; turn that into a (missing_flag, value) pair.
        if outcome == sentinel:
            return 1, 0
        return 0, outcome

    # parse the demographic part of the query topic
    topic_demo = query_topics_demo[topic]
    query_age = int(topic_demo.split("-")[0])
    query_gender = topic_demo.split(" ")[1]

    # numeric age, textual age-group keywords, then gender
    numeric_missing, numeric_diff = _split_missing(
        check_age_diff_numeric(query_age, text), "MissingNumericAge")
    textual_missing, textual_match = _split_missing(
        count_age_group_keywords_text(query_age, text), "MissingTextAge")
    sex_missing, sex_diff = _split_missing(
        check_gender_diff(query_gender, text), "MissingGender")

    # low level features
    low_level = {
        'age_missing_numeric': numeric_missing,
        'age_diff_numeric': numeric_diff,
        'age_missing_text': textual_missing,
        'age_match_text': textual_match,
        'gender_missing': sex_missing,
        'gender_diff': sex_diff,
    }
    result = {'low': low_level}

    # high level features, i.e., probabilities
    # NOTE: the classifier's column order differs from the dictionary order above.
    row = [numeric_diff, numeric_missing, textual_missing,
           textual_match, sex_diff, sex_missing]
    raw = demo_classifier.predict_proba([row])[0]

    # Re-weight each class probability by a fixed ratio and renormalise.
    # NOTE(review): 7126 / 815 / 607 look like training class counts - confirm.
    weighted_exclude = raw[0] / (7126 / 815)
    weighted_match = raw[1] / (7126 / 607)
    weighted_not = raw[2] / (7126 / 7126)
    normaliser = weighted_exclude + weighted_match + weighted_not

    # in accord with labels
    result['high'] = {
        'Demo_Exclude': weighted_exclude / normaliser,
        'Demo_Match': weighted_match / normaliser,
        'Demo_Notdiscussed': weighted_not / normaliser,
    }

    return result

In [5]:
def generate_disease_features(raw_text, topic, disease_classifier, query_topics_disease, disease_expansion_terms, acronyms_dict):
    """Build the disease re-ranking features for one document.

    Parameters
    ----------
    raw_text : str
        Document text. Acronyms are counted on this original casing; every
        other term is counted on a lower-cased copy.
    topic : hashable
        Key into ``query_topics_disease``; ``str(topic)`` is the key into
        ``disease_expansion_terms``.
    disease_classifier : object
        Anything exposing ``predict_proba``; called with one row
        ``[self, ancestor, descendant]`` and its first four outputs are read
        as (exact, general, specific, not).
    query_topics_disease : mapping
        Maps ``topic`` to the query disease name.
    disease_expansion_terms : mapping
        Maps ``str(topic)`` to a dict with ``"synonyms"``, ``"ancestors"`` and
        ``"descendants"`` lists.
    acronyms_dict : mapping
        Maps a disease name to a space-separated string of acronyms.

    Returns
    -------
    dict
        ``{'low': {...raw counts...}, 'high': {...re-weighted probabilities...}}``.
    """
    query_disease = query_topics_disease[topic]

    topic_terms = disease_expansion_terms[str(topic)]
    synonym_terms = topic_terms["synonyms"]
    ancestor_terms = topic_terms["ancestors"]
    descendant_terms = topic_terms["descendants"]
    acronym_terms = acronyms_dict[query_disease].split(" ") if query_disease in acronyms_dict else []

    lowered = raw_text.lower()

    # 1) the disease itself: its name, its synonyms and its acronyms
    self_hits = lowered.count(query_disease.lower())
    self_hits += sum(lowered.count(term.lower()) for term in synonym_terms)
    # acronyms are matched case-sensitively on the raw text,
    # otherwise short ones such as "cc" or "aa" would match almost everywhere
    self_hits += sum(raw_text.count(term) for term in acronym_terms)

    # 2) ancestors, plus 3) two general descriptors counted as ancestors
    ancestor_hits = sum(lowered.count(term.lower()) for term in ancestor_terms)
    ancestor_hits += sum(lowered.count(term.lower()) for term in ("human cancer", "human tumor"))

    # 4) descendants
    descendant_hits = sum(lowered.count(term.lower()) for term in descendant_terms)

    # low level features
    result = {
        'low': {
            'count_match_self': self_hits,
            'count_match_ancestor': ancestor_hits,
            'count_match_descendant': descendant_hits,
        }
    }

    # high level features, i.e., probabilities
    raw = disease_classifier.predict_proba([[self_hits, ancestor_hits, descendant_hits]])[0]

    # Re-weight each class probability by a fixed ratio and renormalise.
    # NOTE(review): 4149 / 938 / 1273 / 2914 look like training class counts - confirm.
    weighted_exact = raw[0] / (4149 / 4149)
    weighted_general = raw[1] / (4149 / 938)
    weighted_specific = raw[2] / (4149 / 1273)
    weighted_not = raw[3] / (4149 / 2914)
    normaliser = weighted_exact + weighted_general + weighted_specific + weighted_not

    # in accord with labels
    result['high'] = {
        'Disease_Exact': weighted_exact / normaliser,
        'Disease_General': weighted_general / normaliser,
        'Disease_Specific': weighted_specific / normaliser,
        'Disease_Not': weighted_not / normaliser,
    }

    return result

In [6]:
def clean_text(text, stop_words=None):
    """Normalise ``text`` for keyword counting.

    Steps: lower-case, replace every character that is neither a word
    character nor whitespace with a space, drop stand-alone digit runs
    (``\\b\\d+\\b`` - tokens such as ``a549`` are kept), split on any
    whitespace, remove stop words and re-join with single spaces.

    The previous version also called ``text.replace("\\t", " ")`` and
    ``text.replace("\\n", " ")`` without using the return value; strings are
    immutable, so those calls did nothing. Tabs and newlines are (and always
    were) removed by ``str.split()``, which splits on any whitespace run, so
    the no-op calls and the explicit multi-space collapse are dropped without
    changing the result.

    Parameters
    ----------
    text : str
        Raw text.
    stop_words : iterable of str, optional
        Tokens to remove. Defaults to the module-level ``stop`` list
        (NLTK English stop words loaded in the import cell).

    Returns
    -------
    str
        Remaining tokens joined by single spaces.
    """
    if stop_words is None:
        stop_words = stop
    # set membership instead of scanning the stop-word list for every token
    excluded = set(stop_words)

    lowered = text.lower()
    # remove punctuation
    without_punctuation = re.sub(r"[^\w\s]", " ", lowered)
    # remove digits
    without_digits = re.sub(r"\b\d+\b", " ", without_punctuation)

    # split() swallows tabs, newlines and repeated spaces in one go
    return " ".join(token for token in without_digits.split() if token not in excluded)

In [7]:
def generate_pm_features(text, pm_classifier):
    """Build the precision-medicine (PM) re-ranking features for one document.

    The text is normalised with ``clean_text`` and three keyword lists are
    counted with ``str.count``. These are substring counts: ``'rat'`` also
    matches inside ``'rate'``, and ``'xenografts'`` is counted by both
    ``'xenografts'`` and ``'xenograft'``.

    Fix: the animal keyword list was missing a comma after ``'canie'``, so
    Python concatenated the adjacent literals into the single keyword
    ``'canievivo'`` and ``'vivo'`` was never counted. The two keywords are now
    separate entries.
    NOTE(review): if ``pm_classifier`` was trained on features produced with
    the concatenated keyword, ``match_animal`` now differs from training for
    documents containing "vivo" - confirm against the training notebook.

    Parameters
    ----------
    text : str
        Raw document text.
    pm_classifier : object
        Anything exposing ``predict_proba``; called with one row
        ``[match_human, match_animal, match_not]`` and its first three
        outputs are read as (animal, human, not).

    Returns
    -------
    dict
        ``{'low': {...keyword counts...}, 'high': {...re-weighted probabilities...}}``.
    """
    # first clean text
    cleaned = clean_text(text)

    # NOTE(review): 'canie' looks like a typo for 'canine'; kept as-is because
    # changing it would alter the feature - confirm against the training keywords.
    keyword_animal = ('mice', 'mouse', 'model', 'mammary',
                      'rat', 'xenografts', 'dog', 'canie',
                      'vivo', 'cycle', 'mutated', 'preclinical',
                      'prostate', 'pten', 'liver', 'met', 'animal',
                      'mgkg', 'human', 'xenograft')

    keyword_human = ('gastrectomy', 'imatinib', 'gastric', 'stomach',
                     'fgfr1', 'prognostic', 'mutation', 'gastrointestinal',
                     'mutations', 'families', 'shorter', 'inhibitor',
                     'kit', 'located', 'lethal', 'kras', 'dose',
                     'tract', 'pfs', 'mutated')

    keyword_not = ('transplantation', 'symptoms', 'female', 'male',
                   'driver', 'pressure', 'pancreaticoduodenectomy',
                   'surface', 'triple', 'women',
                   'a549', 'mortality', 'adjuvant',
                   'bypass', 'basis', 'myxoid')

    match_human = sum(cleaned.count(keyword) for keyword in keyword_human)
    match_animal = sum(cleaned.count(keyword) for keyword in keyword_animal)
    match_not = sum(cleaned.count(keyword) for keyword in keyword_not)

    features = {
        'low': {
            'match_human': match_human,
            'match_animal': match_animal,
            'match_not': match_not,
        }
    }

    prob = pm_classifier.predict_proba([[match_human, match_animal, match_not]])[0]

    # adjust predicted prob bc we used SMOTE oversampling during training:
    # divide each class probability by its ratio, then renormalise
    weighted_animal = prob[0] / (13368 / 536)
    weighted_human = prob[1] / (13368 / 8738)
    weighted_not = prob[2] / (13368 / 13368)
    normaliser = weighted_animal + weighted_human + weighted_not

    features['high'] = {
        'Animal_PM': weighted_animal / normaliser,
        'Human_PM': weighted_human / normaliser,
        'Not_PM': weighted_not / normaliser,
    }

    return features

In [8]:
def generate_gene_features(text, topic, gene_classifier, query_topics_gene):
    """Build the gene re-ranking features for one document.

    ``query_topics_gene[topic]`` is a comma-separated list of gene entries.
    Each entry is parsed with ``parsing_query_gene`` into
    ``(genes, variant, other_info)``, where ``variant`` / ``other_info`` equal
    ``0`` when absent. Every entry yields one raw feature row and one
    classifier prediction; the returned values are the means over all entries.

    Parameters
    ----------
    text : str
        Document text. Single-token genes, their aliases and variants are
        counted case-sensitively; multi-word genes and ``other_info`` tokens
        are counted case-insensitively.
    topic : hashable
        Key into ``query_topics_gene``.
    gene_classifier : object
        Anything exposing ``predict_proba``; called once per gene entry and its
        first four outputs are read as
        (different variant, exact, missing gene, missing variant).
    query_topics_gene : mapping
        Maps ``topic`` to the comma-separated gene string.

    Returns
    -------
    dict
        ``{'low': {...mean raw features...}, 'high': {...mean re-weighted probabilities...}}``.
    """
    low_names = ('match_gene', 'has_variation', 'match_variation',
                 'match_total_variation', 'has_other_info', 'match_other_info')
    high_names = ('Gene_Diff_Variant', 'Gene_Exact', 'Gene_Missing', 'Gene_Missing_Variant')

    # there might be several genes in the query:
    # collect one value per entry and average at the end
    low_samples = {name: [] for name in low_names}
    high_samples = {name: [] for name in high_names}

    for entry in query_topics_gene[topic].split(","):

        genes, variant, other_info = parsing_query_gene(entry.strip())

        # 1) match gene
        gene_hits = 0
        for gene in genes:
            if " " in gene:
                # special case: the gene is a phrase, count each word case-insensitively
                for part in gene.split(" "):
                    gene_hits += text.lower().count(part.strip().lower())
            else:
                gene_hits += text.count(gene)
                for alias in get_gene_alias_and_variations(gene)["aliases"]:
                    gene_hits += text.count(alias)

        # 2) match variation: the queried variant (if any) ...
        variant_given = 1 if variant != 0 else 0
        variant_hits = text.count(variant) if variant_given else 0
        # ... and every known variation of the queried genes, in both cases
        known_variant_hits = 0
        for gene in genes:
            for known_variant in get_gene_alias_and_variations(gene)["variants"]:
                known_variant_hits += text.count(known_variant)

        # 3) match other info
        if other_info != 0:
            other_given = 1
            other_hits = 0
            for token in other_info.split(" "):
                other_hits += text.lower().count(token.strip().lower())
        else:
            other_given, other_hits = 0, 0

        row = [gene_hits, variant_given, variant_hits,
               known_variant_hits, other_given, other_hits]
        for name, value in zip(low_names, row):
            low_samples[name].append(value)

        # make predictions
        raw = gene_classifier.predict_proba([row])[0]

        # Re-weight each class probability by a fixed ratio and renormalise.
        # NOTE(review): 4578 / 602 / 3696 / 1551 look like training class counts - confirm.
        weighted_differ = raw[0] / (4578 / 602)
        weighted_exact = raw[1] / (4578 / 4578)
        weighted_missing_gene = raw[2] / (4578 / 3696)
        weighted_missing_variant = raw[3] / (4578 / 1551)
        normaliser = (weighted_differ + weighted_exact
                      + weighted_missing_gene + weighted_missing_variant)

        high_samples['Gene_Diff_Variant'].append(weighted_differ / normaliser)
        high_samples['Gene_Exact'].append(weighted_exact / normaliser)
        high_samples['Gene_Missing'].append(weighted_missing_gene / normaliser)
        high_samples['Gene_Missing_Variant'].append(weighted_missing_variant / normaliser)

    # add the average to dictionary
    return {
        'low': {name: np.mean(low_samples[name]) for name in low_names},
        'high': {name: np.mean(high_samples[name]) for name in high_names},
    }

In [13]:
def _load_last_json_line(path):
    """Parse each line of ``path`` as JSON and return the last parsed value.

    Returns an empty dict when the file has no lines. The resource files are
    read line by line (last line wins), matching how they were read before.
    """
    parsed = dict()
    with open(path, 'r') as handle:
        for line in handle:
            parsed = json.loads(line)
    return parsed


def generate_reranking_features(year):
    '''
    Generate re-ranking features for every (topic, document) pair of the
    initial retrieval run of ``year`` and write them to two CSV files in the
    current working directory:

    * ``<year>.low.features.csv``  - low-level: raw features
    * ``<year>.high.features.csv`` - high-level (called "mid-level" in earlier
      notes): predicted probabilities

    Reads ``../../<year>.basic.query.result.txt`` (tab separated, with the
    columns TOPIC_NO, ID, TITLE and CONTENT) plus the classifiers and JSON
    resources under ``../classifier/``. Prints one progress line per topic and
    the shape of both frames. Returns ``None``.

    Changes compared with the earlier version:

    * rows are collected in lists and each DataFrame is built once;
      ``DataFrame.append`` in a loop copies the frame on every row and was
      removed in pandas 2.0. Because the frames are no longer grown from an
      empty object-typed frame, pandas infers numeric dtypes for the columns.
    * topics are processed in sorted order instead of ``set`` iteration order,
      so the row order of the CSV files is deterministic.
    * the redundant ``f.close()`` calls after ``with`` blocks are gone.
    '''

    low_columns = ['count_match_self', 'count_match_ancestor', 'count_match_descendant',
                   'age_missing_numeric', 'age_diff_numeric', 'age_missing_text',
                   'age_match_text', 'gender_missing', 'gender_diff',
                   'match_gene', 'has_variation', 'match_variation',
                   'match_total_variation', 'has_other_info', 'match_other_info',
                   'match_human', 'match_animal', 'match_not',
                   'topicid', 'docid', 'year']

    high_columns = ['Human_PM', 'Animal_PM', 'Not_PM', 'Disease_Exact',
                    'Disease_General', 'Disease_Specific', 'Disease_Not',
                    'Gene_Exact', 'Gene_Missing', 'Gene_Missing_Variant', 'Gene_Diff_Variant',
                    'Demo_Match', 'Demo_Notdiscussed', 'Demo_Exclude',
                    'topicid', 'docid', 'year']

    initial_retrieval = pd.read_csv("../../"+str(year)+".basic.query.result.txt",sep="\t")
    initial_retrieval.ID = initial_retrieval.ID.astype(str)

    # load classifiers and useful resources
    # NOTE: joblib.load unpickles arbitrary objects - only load trusted files.
    # disease
    query_topics_disease = read_query_topics(year,"disease")
    disease_classifier = joblib.load("../classifier/disease/" + str(year) + ".disease.classifier.pkl")
    disease_expansion_terms = _load_last_json_line("../classifier/disease/" + str(year) + ".disease.expansion.json")
    # read acronyms: a dict of <disease, acronyms>
    acronyms_dict = _load_last_json_line("../classifier/disease/acronyms.json")
    # demo
    query_topics_demo = read_query_topics(year,"demo")
    demo_classifier = joblib.load("../classifier/demo/" + str(year) + ".demo.classifier.pkl")
    # gene
    query_topics_gene = read_query_topics(year,"gene")
    gene_classifier = joblib.load("../classifier/gene/" + str(year) + ".gene.classifier.pkl")
    # pm
    pm_classifier = joblib.load("../classifier/pm/"+str(year)+".pm.classifier.pkl")

    # one dict per (topic, document) pair for each of the two feature levels
    low_records = []
    high_records = []

    for topic in sorted(set(initial_retrieval.TOPIC_NO)):

        print("Parsing topic No. {}".format(topic))
        result_by_topic = initial_retrieval.loc[initial_retrieval.TOPIC_NO==topic]

        for _, row in result_by_topic.iterrows():

            docid = row["ID"]
            text = row["TITLE"] + " " + row["CONTENT"]

            # generate pm features
            pm_features = generate_pm_features(text,pm_classifier)

            # generate demo features
            demo_features = generate_demo_features(text,topic,demo_classifier,query_topics_demo)

            # generate disease features
            disease_features = generate_disease_features(text,topic,disease_classifier,
                                                         query_topics_disease,disease_expansion_terms,acronyms_dict)

            # generate gene features
            gene_features = generate_gene_features(text,topic,gene_classifier,query_topics_gene)

            # merge the four feature groups; dict(**a, **b) raises on duplicate keys
            record_low = dict(**pm_features["low"], **demo_features["low"],
                              **disease_features["low"], **gene_features["low"])
            record_low['topicid'] = topic
            record_low['docid'] = docid
            record_low['year'] = year

            record_high = dict(**pm_features["high"], **demo_features["high"],
                               **disease_features["high"], **gene_features["high"])
            record_high['topicid'] = topic
            record_high['docid'] = docid
            record_high['year'] = year

            low_records.append(record_low)
            high_records.append(record_high)

    # two dataframes of two levels of features, built once with a fixed column order
    df_low = pd.DataFrame(low_records, columns=low_columns)
    df_high = pd.DataFrame(high_records, columns=high_columns)

    # save to csv
    df_low.to_csv(str(year)+".low.features.csv", sep=",", index=False)
    df_high.to_csv(str(year)+".high.features.csv", sep=",", index=False)

    print(df_low.shape)
    print(df_high.shape)

In [14]:
# generate_reranking_features(2018)

Parsing topic No. 1
Parsing topic No. 2
Parsing topic No. 3
Parsing topic No. 4
Parsing topic No. 5
Parsing topic No. 6
Parsing topic No. 7
Parsing topic No. 8
Parsing topic No. 9
Parsing topic No. 10
Parsing topic No. 11
Parsing topic No. 12
Parsing topic No. 13
Parsing topic No. 14
Parsing topic No. 15
Parsing topic No. 16
Parsing topic No. 17
Parsing topic No. 18
Parsing topic No. 19
Parsing topic No. 20
Parsing topic No. 21
Parsing topic No. 22
Parsing topic No. 23
Parsing topic No. 24
Parsing topic No. 25
Parsing topic No. 26
Parsing topic No. 27
Parsing topic No. 28
Parsing topic No. 29
Parsing topic No. 30
Parsing topic No. 31
Parsing topic No. 32
Parsing topic No. 33
Parsing topic No. 34
Parsing topic No. 35
Parsing topic No. 36
Parsing topic No. 37
Parsing topic No. 38
Parsing topic No. 39
Parsing topic No. 40
Parsing topic No. 41
Parsing topic No. 42
Parsing topic No. 43
Parsing topic No. 44
Parsing topic No. 45
Parsing topic No. 46
Parsing topic No. 47
Parsing topic No. 48
P