In [1]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import re

In [2]:
def parsing_gene_raw_judgements(year):
    """
    Parse the gene annotations in one year's raw judgements file.

    Reads ``../../../data/rawjudgements/judgments<year>.csv`` (relative to the
    current working directory), drops the rows whose ``pm_rel_desc`` is
    "Not PM" and the rows whose ``trec_doc_id`` starts with "A", then emits
    one output row for every gene slot (gene1, gene2, gene3) whose
    ``geneN_annotation_desc`` is not missing.

    Args:
        year: the year whose judgements file is read; it is also stored in
            the ``year`` column of every output row.

    Returns:
        a dataframe of [year, topicid, docid, gene, label], ordered by input
        row and then by gene slot. It is empty (but still has those five
        columns) when no row qualifies.
    """

    output_columns = ["year", "topicid", "docid", "gene", "label"]
    raw_judgements = pd.read_csv("../../../data/rawjudgements/judgments"+str(year)+".csv")

    # exclude NOT PM papers
    raw_judgements = raw_judgements.loc[raw_judgements.pm_rel_desc != "Not PM"]

    # Topic and doc id first, then (name, annotation) pairs for each gene slot.
    needed_columns = [
        "trec_topic_number",
        "trec_doc_id",
        "gene1_name",
        "gene1_annotation_desc",
        "gene2_name",
        "gene2_annotation_desc",
        "gene3_name",
        "gene3_annotation_desc",
    ]

    # Collect plain dicts and build the frame once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call in the versions that still had it.
    records = []
    judgement_rows = raw_judgements[needed_columns].itertuples(index=False, name=None)
    for topicid, docid, *gene_slots in judgement_rows:

        if str(docid).startswith("A"): # exclude conference papers
            continue

        # gene_slots is [name1, label1, name2, label2, name3, label3].
        for gene_name, gene_label in zip(gene_slots[0::2], gene_slots[1::2]):
            if not pd.isna(gene_label):
                records.append({"year": year,
                                "topicid": topicid,
                                "docid": docid,
                                "gene": gene_name,
                                "label": gene_label})

    return pd.DataFrame(records, columns=output_columns)

In [13]:
def parsing_query_gene(query_gene):
    """
    Extract gene, variation, other information from a piece of gene text

    Args:
        query_gene: genes in the query
    Returns:
        a tuple of ([gene(s)], variants, other_info)
            gene(s): a list, because one piece of text can name several
                genes, e.g., xxx-yyy fusion
            variants: the text between the first "(" and the last ")",
                0 if there is no variation
            other_info: "fusion" for fusions, otherwise whatever text is left
                once the gene name is removed (e.g. amplification, deletion,
                inactivating, truncation, rearrangement), 0 if nothing is left
        When nothing can be parsed (the input is not a string, or no gene
        pattern matches), the input is returned untouched as
        ([query_gene], 0, 0).
    """

    # Non-string input (e.g. a NaN from a missing cell) cannot be parsed.
    # Checking up front replaces the previous bare ``except:``, which also
    # swallowed KeyboardInterrupt/SystemExit and could hide genuine bugs.
    if not isinstance(query_gene, str):
        return ([query_gene], 0, 0)

    if "fusion" in query_gene.lower():
        first_token = query_gene.split(" ")[0]
        if "-" in query_gene: # two genes fusion
            return (first_token.split("-"), 0, "fusion")
        # one gene fusion
        return ([first_token.strip()], 0, "fusion")

    if "(" in query_gene:
        # with variations
        gene_match = re.search(r'[\w\s]+', query_gene)
        if gene_match is None:
            # nothing matched
            return ([query_gene], 0, 0)
        variation = query_gene[query_gene.find("(")+1 : query_gene.rfind(")")]
        return ([gene_match.group(0).strip()], variation.strip(), 0)

    # only genes, no variants or fusion
    gene_match = re.search(r'[A-Z]+[0-9]*[-]*[A-Z]*[0-9]*', query_gene)
    if gene_match is None:
        # nothing matched
        return ([query_gene], 0, 0)

    gene = gene_match.group(0).strip()
    # Whatever remains after removing the gene name is the extra information.
    other_info = query_gene.replace(gene, "").strip()
    if other_info != "":
        return ([gene], 0, other_info)
    return ([gene], 0, 0)