In [2]:
%run /Users/jiamingqu/Desktop/proj/scripts/classifier/read.dataframe.ipynb
%run /Users/jiamingqu/Desktop/proj/scripts/classifier/demo/demo.classifier.ipynb
%run /Users/jiamingqu/Desktop/proj/scripts/classifier/gene/gene.classifier.ipynb

In [3]:
import pandas as pd
import joblib
import json
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
stop = stopwords.words("english")

In [4]:
def generate_demo_features(text, topic, query_topics_demo):
    """
    Build the demographic (age / gender) low-level features for one document.

    The demographic string of ``topic`` is looked up in ``query_topics_demo``.
    The part before the first "-" is parsed as the integer age and the second
    space-separated token is used as the gender (presumably strings such as
    "38-year-old female" -- confirm against the topic files).

    Each of the three checks returns either a sentinel string, meaning the
    information was not found in ``text``, or a value.  The result is unpacked
    into a (missing flag, value) pair of features.

    Parameters
    ----------
    text : str
        Document text handed to the check helpers.
    topic : hashable
        Key into ``query_topics_demo``.
    query_topics_demo : mapping
        Topic id -> demographic description string.

    Returns
    -------
    dict
        Keys: 'age_missing_numeric', 'age_diff_numeric', 'age_missing_text',
        'age_match_text', 'gender_missing', 'gender_diff'.
    """

    def _unpack(outcome, sentinel):
        # Sentinel -> flag as missing with a neutral 0; otherwise keep the value.
        if outcome == sentinel:
            return (1, 0)
        return (0, outcome)

    # parse the query demographics before touching the document
    demographic = query_topics_demo[topic]
    query_age = int(demographic.split("-")[0])
    query_gender = demographic.split(" ")[1]

    # numeric age mentioned in the text
    numeric_missing, numeric_diff = _unpack(
        check_age_diff_numeric(query_age, text), "MissingNumericAge")

    # age-group keywords mentioned in the text
    text_missing, text_match = _unpack(
        count_age_group_keywords_text(query_age, text), "MissingTextAge")

    # gender mentioned in the text
    gender_missing, gender_diff = _unpack(
        check_gender_diff(query_gender, text), "MissingGender")

    return {
        'age_missing_numeric': numeric_missing,
        'age_diff_numeric': numeric_diff,
        'age_missing_text': text_missing,
        'age_match_text': text_match,
        'gender_missing': gender_missing,
        'gender_diff': gender_diff,
    }

In [5]:
def generate_disease_features(raw_text, topic, query_topics_disease, disease_expansion_terms, acronyms_dict):
    """
    Count how often the query disease and its related terms occur in a document.

    All counts are plain substring counts (``str.count``), not whole-word
    matches.

    Parameters
    ----------
    raw_text : str
        Document text in its original casing.
    topic : hashable
        Topic id; used as-is to index ``query_topics_disease`` and as
        ``str(topic)`` to index ``disease_expansion_terms``.
    query_topics_disease : mapping
        Topic id -> disease name.
    disease_expansion_terms : mapping
        ``str(topic)`` -> dict with the keys "synonyms", "ancestors" and
        "descendants", each an iterable of strings.
    acronyms_dict : mapping
        Disease name -> space-separated acronyms.  Diseases without an entry
        simply have no acronyms.

    Returns
    -------
    dict
        'count_match_self'       : disease name + synonyms (case-insensitive)
                                   + acronyms (case-sensitive).
        'count_match_ancestor'   : ancestors + the generic descriptors
                                   "human cancer" / "human tumor"
                                   (case-insensitive).
        'count_match_descendant' : descendants (case-insensitive).
    """

    def _count_terms(haystack, terms, fold_case=True):
        # Sum of substring occurrences of every non-empty term.
        # Empty terms are skipped on purpose: "abc".count("") == len("abc") + 1,
        # so a single empty synonym/acronym would swamp the real counts.
        total = 0
        for term in terms:
            needle = term.lower() if fold_case else term
            if needle:
                total += haystack.count(needle)
        return total

    disease = query_topics_disease[topic]
    expansion_terms = disease_expansion_terms[str(topic)]

    # split(" ") yields "" for repeated/leading/trailing spaces; those empty
    # entries are filtered out inside _count_terms.
    acronyms = acronyms_dict[disease].split(" ") if disease in acronyms_dict else []

    text = raw_text.lower()

    # 1) the disease itself: name, synonyms, acronyms
    count_match_self = _count_terms(text, [disease])
    count_match_self += _count_terms(text, expansion_terms["synonyms"])
    # acronyms are matched against the original casing; lower-casing them
    # would produce many spurious hits for short strings such as "cc" or "aa"
    count_match_self += _count_terms(raw_text, acronyms, fold_case=False)

    # 2) ancestors, plus 3) generic descriptors
    count_match_ancestor = _count_terms(text, expansion_terms["ancestors"])
    count_match_ancestor += _count_terms(text, ["human cancer", "human tumor"])

    # 4) descendants
    count_match_descendant = _count_terms(text, expansion_terms["descendants"])

    return {
        'count_match_self': count_match_self,
        'count_match_ancestor': count_match_ancestor,
        'count_match_descendant': count_match_descendant,
    }

In [6]:
def clean_text(text):
    """
    Normalise ``text`` for keyword matching.

    Steps, in order: lower-case; turn tabs and newlines into spaces; replace
    every character that is neither a word character nor whitespace with a
    space; drop tokens consisting only of digits; collapse runs of spaces;
    remove English stop words (module-level ``stop`` list).

    Returns the remaining tokens joined by single spaces.
    """
    text = text.lower()

    # str.replace returns a new string -- the result has to be assigned.
    # (Previously the return values were discarded, making both calls no-ops.)
    text = text.replace("\t", " ").replace("\n", " ")

    # remove punctuation (anything that is not a word character or whitespace)
    text = re.sub(r"[^\w\s]", " ", text)
    # remove stand-alone digit sequences
    text = re.sub(r"\b\d+\b", " ", text)
    # collapse multiple spaces
    text = re.sub(r' +', ' ', text)

    # set membership instead of scanning the stop word list once per token
    stop_words = set(stop)
    tokens = [token for token in text.split() if token not in stop_words]

    return " ".join(tokens)

In [7]:
def generate_pm_features(text):
    """
    Count keyword hits for three hand-picked keyword groups in a document.

    The text is first normalised with ``clean_text`` and then every keyword is
    counted with ``str.count``.  Matching is therefore substring based: e.g.
    'rat' also hits "rate", 'male' also hits "female", and "mutations" counts
    once for 'mutation' and once for 'mutations'.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    dict
        'match_human', 'match_animal', 'match_not' -- summed occurrence counts
        of the respective keyword group.
    """
    text = clean_text(text)

    # NOTE: the comma after 'canie' was missing, which made Python concatenate
    # the adjacent literals into the single keyword 'canievivo', so 'vivo' was
    # never matched.
    # NOTE(review): 'canie' looks like a typo for 'canine' -- confirm before
    # changing, since it alters the feature values.
    # NOTE(review): clean_text replaces punctuation with spaces, so "mg/kg"
    # becomes "mg kg" and 'mgkg' only matches a literal "mgkg" -- confirm.
    keyword_animal = ['mice', 'mouse', 'model', 'mammary',
                      'rat', 'xenografts', 'dog', 'canie',
                      'vivo', 'cycle', 'mutated', 'preclinical',
                      'prostate', 'pten', 'liver', 'met', 'animal',
                      'mgkg', 'human', 'xenograft']

    keyword_human = ['gastrectomy', 'imatinib', 'gastric', 'stomach',
                     'fgfr1', 'prognostic', 'mutation', 'gastrointestinal',
                     'mutations', 'families', 'shorter', 'inhibitor',
                     'kit', 'located', 'lethal', 'kras', 'dose',
                     'tract', 'pfs', 'mutated']

    keyword_not = ['transplantation', 'symptoms', 'female', 'male',
                   'driver', 'pressure', 'pancreaticoduodenectomy',
                   'surface', 'triple', 'women',
                   'a549', 'mortality', 'adjuvant',
                   'bypass', 'basis', 'myxoid']

    def _hits(keywords):
        # total number of (substring) occurrences of all keywords
        return sum(text.count(keyword) for keyword in keywords)

    return {
        'match_human': _hits(keyword_human),
        'match_animal': _hits(keyword_animal),
        'match_not': _hits(keyword_not),
    }

In [8]:
def generate_gene_features(text, topic, query_topics_gene):
    """
    Build the gene-related low-level features for one document.

    The topic's gene field may list several comma-separated gene queries.
    Each one is parsed with ``parsing_query_gene`` into (genes, variant,
    other_info), where ``variant`` / ``other_info`` equal ``0`` when absent.
    Features are computed per gene query and then averaged over all of them.

    Per gene query:
      * match_gene            -- occurrences of each gene symbol and of its
                                 aliases (case-sensitive).  A gene containing
                                 a space is treated as a phrase whose words
                                 are counted case-insensitively.
      * has_variation         -- 1 if the query names a variant, else 0.
      * match_variation       -- occurrences of that variant (0 if none).
      * match_total_variation -- occurrences of every known variant of the
                                 query genes, whether or not the query names
                                 a variant.
      * has_other_info        -- 1 if the query carries extra info, else 0.
      * match_other_info      -- case-insensitive occurrences of each word of
                                 the extra info (0 if none).

    All counts are substring counts (``str.count``).

    Returns
    -------
    dict
        The six feature names above mapped to their mean over the gene queries.
    """
    lowered_text = text.lower()

    def _count_words_ci(phrase):
        # Case-insensitive count of every space-separated word of `phrase`.
        # Empty tokens (from repeated spaces) are skipped: str.count("")
        # returns len(text) + 1 and would swamp the real counts.
        total = 0
        for word in phrase.split(" "):
            word = word.strip()
            if word:
                total += lowered_text.count(word.lower())
        return total

    # one entry per gene query; averaged at the end
    match_gene_list = []
    has_variation_list = []
    match_variation_list = []
    match_total_variation_list = []
    has_other_info_list = []
    match_other_info_list = []

    for query_gene in query_topics_gene[topic].split(","):

        genes, variant, other_info = parsing_query_gene(query_gene.strip())

        # 1) match gene symbols and their aliases
        match_gene = 0
        for gene in genes:
            if " " not in gene:
                match_gene += text.count(gene)
                for alias in get_gene_alias_and_variations(gene)["aliases"]:
                    match_gene += text.count(alias)
            else:
                # special case: the gene is a phrase
                match_gene += _count_words_ci(gene)

        # 2) match variations
        has_variation = 1 if variant != 0 else 0
        match_variation = text.count(variant) if has_variation else 0
        # every known variant of the query genes is counted regardless of
        # whether the query itself names a variant
        match_total_variation = 0
        for gene in genes:
            for known_variant in get_gene_alias_and_variations(gene)["variants"]:
                match_total_variation += text.count(known_variant)

        # 3) match other info
        has_other_info = 1 if other_info != 0 else 0
        match_other_info = _count_words_ci(other_info) if has_other_info else 0

        match_gene_list.append(match_gene)
        has_variation_list.append(has_variation)
        match_variation_list.append(match_variation)
        match_total_variation_list.append(match_total_variation)
        has_other_info_list.append(has_other_info)
        match_other_info_list.append(match_other_info)

    # average over all gene queries of the topic
    return {
        'match_gene': np.mean(match_gene_list),
        'has_variation': np.mean(has_variation_list),
        'match_variation': np.mean(match_variation_list),
        'match_total_variation': np.mean(match_total_variation_list),
        'has_other_info': np.mean(has_other_info_list),
        'match_other_info': np.mean(match_other_info_list),
    }

In [12]:
def read_text_docid(docid, folder_path="/Users/jiamingqu/Desktop/corpus/"):
    """
    Read ``<folder_path>/<docid>.txt`` and return it as a single line of text.

    Every line is stripped of surrounding whitespace and the lines are joined
    with single spaces (blank lines therefore show up as consecutive spaces).

    Parameters
    ----------
    docid : str or int
        Document id; the file name is ``str(docid) + ".txt"``.
    folder_path : str, optional
        Directory holding the corpus.  Defaults to the location that used to
        be hard-coded, so existing callers are unaffected.

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. it does not exist).
    """
    import os

    file_path = os.path.join(folder_path, str(docid) + ".txt")
    with open(file_path, 'r') as f:
        return " ".join(line.strip() for line in f)

In [15]:
def _load_json_file(path):
    """Parse the single JSON document stored in the file at ``path``."""
    with open(path, 'r') as f:
        return json.load(f)


def generate_ltr_low_features(year, from_retrieval=True):
    '''
    Generate the low-level learning-to-rank features for a year and write
    them to a CSV file in the current working directory.

    For every (topic, document) row the disease, gene, demographic and
    "pm" keyword features are computed and stored together with the topic id,
    document id, year and relevance label.

    Parameters
    ----------
    year : int
        Topic year; selects the query topics, the disease expansion file and
        the input/output file names.
    from_retrieval : bool, optional
        True  -- documents come from the retrieval result file of ``year``
                 (title + content columns), labelled through a left join with
                 the qrel file; documents without a judgement get relevance 0.
        False -- documents come from the parsed judgement file of ``year`` and
                 their text is read from disk via ``read_text_docid``.

    Output
    ------
    "<year>.low.features.from.retrieval.csv" or "<year>.low.features.csv".
    '''

    # column order of the feature table
    features = ['count_match_self', 'count_match_ancestor', 'count_match_descendant',
                'age_missing_numeric', 'age_diff_numeric', 'age_missing_text',
                'age_match_text', 'gender_missing', 'gender_diff',
                'match_gene', 'has_variation', 'match_variation',
                'match_total_variation', 'has_other_info', 'match_other_info',
                'match_human', 'match_animal', 'match_not',
                'topicid', 'docid', 'year', 'rel']

    # read dataframe: from retrieval results or from qrel
    if from_retrieval:
        # The paths used to be hard-coded to 2017, silently pairing 2017
        # documents with the topics of whatever `year` was requested.
        # NOTE(review): read_csv treats the first line as a header before the
        # columns are renamed -- confirm the result file really has one.
        df = pd.read_csv(
            "../../scripts/searching/{0}.searching/{0}.basic.query.result.txt".format(year),
            sep="\t")
        df.columns = ['topicid', 'q0', 'docid', 'rank', 'score', 'run_name', 'title', 'content']
        df['topicid'] = df['topicid'].astype(int)
        qrel = pd.read_csv("../../data/topics/{}qrel.txt".format(year), sep=" ", header=None)
        qrel.columns = ['topicid', 'q0', 'docid', 'Relevance']
        df = df.merge(qrel, on=['topicid', 'docid'], how='left')
        # unjudged documents count as not relevant
        df['Relevance'] = df['Relevance'].fillna(0)
        # a blanket fillna(0) would turn a missing title/content into the
        # integer 0 and break the string concatenation below
        df[['title', 'content']] = df[['title', 'content']].fillna("")
    else:
        df = pd.read_csv("../../data/parsedjudgements/judgments" + str(year) + ".csv")
        df['topicid'] = df['topicid'].astype(int)

    # load resources and query topics
    # disease
    query_topics_disease = read_query_topics(year, "disease")
    disease_expansion_terms = _load_json_file(
        "../../scripts/classifier/disease/" + str(year) + ".disease.expansion.json")
    # acronyms: a dict of <disease, acronyms>
    acronyms_dict = _load_json_file("../../scripts/classifier/disease/acronyms.json")

    # demo
    query_topics_demo = read_query_topics(year, "demo")

    # gene
    query_topics_gene = read_query_topics(year, "gene")

    # Collect plain dicts and build the frame once at the end:
    # DataFrame.append copied the whole table on every row and no longer
    # exists in pandas >= 2.0.
    records = []

    # groupby iterates the topics in sorted order (deterministic output)
    for topic, df_topic in df.groupby('topicid'):
        topic = int(topic)  # plain int for dict lookups and str(topic)
        print("Parsing Topic No.{}".format(topic))

        # itertuples keeps each column's own dtype (docid is not upcast)
        for row in df_topic.itertuples(index=False):

            rel = row.Relevance
            docid = str(row.docid)

            if from_retrieval:
                text = row.title + " " + row.content
            else:
                text = read_text_docid(docid)

            disease_features = generate_disease_features(
                text, topic, query_topics_disease, disease_expansion_terms, acronyms_dict)
            gene_features = generate_gene_features(text, topic, query_topics_gene)
            demo_features = generate_demo_features(text, topic, query_topics_demo)
            pm_features = generate_pm_features(text)

            record_low = dict(**disease_features, **gene_features, **demo_features, **pm_features)
            record_low['topicid'] = topic
            record_low['docid'] = docid
            record_low['year'] = year
            record_low['rel'] = rel
            records.append(record_low)

    df_low = pd.DataFrame(records, columns=features)

    # every input row must have produced exactly one feature row
    assert df_low.shape[0] == df.shape[0]
    print(df_low.shape)

    if from_retrieval:
        df_low.to_csv(str(year) + ".low.features.from.retrieval.csv", sep=",", index=False)
    else:
        df_low.to_csv(str(year) + ".low.features.csv", sep=",", index=False)

In [18]:
# generate_ltr_low_features(2017, from_retrieval=True)