In [38]:
# This file parses the raw judgement into binray values

In [9]:
import pandas as pd

In [10]:
def parse_judgements(raw_file_path, parsed_file_path, qrel_file_path):
    
    '''
    Args: The path of the raw judgement and the qrel file
    Returns: Saving the parsed judgement (with binary values)
    '''
    
    df_raw = pd.read_csv(raw_file_path)
    df_raw.fillna("unknown", inplace=True)
    
    # merge with qrel files to get the relevance
    df_qrel = pd.read_csv(qrel_file_path, sep = " ", header = None)
    df_qrel.columns = ["topic", "q0", "docid", "relevance"]
    
    df = pd.merge(df_raw, df_qrel, 
                  left_on = ["trec_topic_number", "trec_doc_id"],
                  right_on = ["topic", "docid"])
    
    # create a new dataframe for binary values
    df_binary = pd.DataFrame(
            columns = ["topicid", "docid",
            "Human_PM", "Animal_PM", "Not_PM",
            "Disease_Exact", "Disease_General", "Disease_Specific", "Disease_Not", 
            "Gene_Exact", "Gene_Missing", "Gene_Missing_Variant", "Gene_Diff_Variant", 
            "Demo_Match", "Demo_Notdiscussed", "Demo_Exclude", 
            "Relevance"])
    
    for index, row in df.iterrows():
    
        topicid = df.loc[index, "trec_topic_number"]
        docid = df.loc[index, "trec_doc_id"]
        
        # PM or not
        (Human_PM, Animal_PM, Not_PM) = (0, 0, 0)
        pm_rel = df.loc[index, "pm_rel_desc"]
        if pm_rel == "Human PM":
            Human_PM = 1
        elif pm_rel == "Animal PM":
            Animal_PM = 1
        elif pm_rel == "Not PM":
            Not_PM = 1
        
        # Disease
        (Disease_Exact, Disease_General, Disease_Specific, Disease_Not) = (0, 0, 0, 0)
        disease = df.loc[index, "disease_desc"]
        if disease == "Exact":
            Disease_Exact = 1
        elif disease == "More General":
            Disease_General = 1
        elif disease == "More Specific":
            Disease_Specific = 1
        elif disease == "Not Disease":
            Disease_Not = 1

        # demographic information
        (Demo_Match, Demo_Notdiscussed, Demo_Exclude) = (0, 0, 0)
        demo_info = df.loc[index, "demographics_desc"]
        if demo_info == "Matches":
            Demo_Match = 1
        elif demo_info == "Not Discussed":
            Demo_Notdiscussed = 1
        elif demo_info == "Excludes":
            Demo_Exclude = 1
    
        # 1st gene
        (Gene1_Exact, Gene1_Missing, Gene1_Missing_Variant, Gene1_Diff_Variant) = (0, 0, 0, 0)
        gene1_info = df.loc[index, "gene1_annotation_desc"]
        if gene1_info == "Missing Gene":
            Gene1_Missing = 1
        elif gene1_info == "Exact":
            Gene1_Exact = 1
        elif gene1_info == "Missing Variant":
            Gene1_Missing_Variant = 1
        elif gene1_info == "Different Variant":
            Gene1_Diff_Variant = 1
    
        # 2st gene
        (Gene2_Exact, Gene2_Missing, Gene2_Missing_Variant, Gene2_Diff_Variant) = (0, 0, 0, 0)
        gene2_info = df.loc[index, "gene2_annotation_desc"]
        if gene2_info == "Missing Gene":
            Gene2_Missing = 1
        elif gene2_info == "Exact":
            Gene2_Exact = 1
        elif gene2_info == "Missing Variant":
            Gene2_Missing_Variant = 1
        elif gene2_info == "Different Variant":
            Gene2_Diff_Variant = 1
    
        # 3rd gene
        (Gene3_Exact, Gene3_Missing, Gene3_Missing_Variant, Gene3_Diff_Variant) = (0, 0, 0, 0)
        gene3_info = df.loc[index, "gene3_annotation_desc"]
        if gene3_info == "Missing Gene":
            Gene3_Missing = 1
        elif gene3_info == "Exact":
            Gene3_Exact = 1
        elif gene3_info == "Missing Variant":
            Gene3_Missing_Variant = 1
        elif gene3_info == "Different Variant":
            Gene3_Diff_Variant = 1
        
        # the number of genes
        number_of_gene = 1
        if gene2_info != "unknown":
            number_of_gene = 2
        if gene3_info != "unknown":
            number_of_gene = 3
        
        # aggregate gene labels
        Gene_Exact = (Gene1_Exact + Gene2_Exact + Gene3_Exact)/number_of_gene
        Gene_Missing = (Gene1_Missing + Gene2_Missing + Gene3_Missing)/number_of_gene
        Gene_Missing_Variant = (Gene1_Missing_Variant + Gene2_Missing_Variant + Gene3_Missing_Variant)/number_of_gene
        Gene_Diff_Variant = (Gene1_Diff_Variant + Gene2_Diff_Variant + Gene3_Diff_Variant)/number_of_gene
        
        # relevance
        rel = df.loc[index, "relevance"]
    
        # add to the binary feature data frame
        df_binary = df_binary.append({"topicid": topicid, 
                                      "docid": docid,
                                      "Human_PM": Human_PM, 
                                      "Animal_PM": Animal_PM, 
                                      "Not_PM": Not_PM,
                                      "Disease_Exact": Disease_Exact, 
                                      "Disease_General": Disease_General, 
                                      "Disease_Specific": Disease_Specific, 
                                      "Disease_Not":Disease_Not,
                                      "Gene_Exact": Gene_Exact, 
                                      "Gene_Missing": Gene_Missing, 
                                      "Gene_Missing_Variant": Gene_Missing_Variant, 
                                      "Gene_Diff_Variant": Gene_Diff_Variant, 
                                      "Demo_Match": Demo_Match, 
                                      "Demo_Notdiscussed": Demo_Notdiscussed, 
                                      "Demo_Exclude": Demo_Exclude,
                                      "Relevance": rel},
                                      ignore_index=True)
    
        if index % 5000 == 0:
            print(index)
        
    df_binary.to_csv(parsed_file_path, index=False)

In [11]:
def main():
    print("parsing 2017")
    parse_judgements("../../data/rawjudgements/judgments2017.csv", 
                     "../../data/parsedjudgements/judgments2017.csv",
                     "../../data/topics/2017qrel.txt")
    
    print("parsing 2018")
    parse_judgements("../../data/rawjudgements/judgments2018.csv", 
                     "../../data/parsedjudgements/judgments2018.csv",
                     "../../data/topics/2018qrel.txt")
    
    print("parsing 2019")
    parse_judgements("../../data/rawjudgements/judgments2019.csv", 
                     "../../data/parsedjudgements/judgments2019.csv",
                     "../../data/topics/2019qrel.txt")

In [12]:
if __name__ == '__main__':
    main()

parsing 2017
0
5000
10000
15000
20000
parsing 2018
0
5000
10000
15000
20000
parsing 2019
0
5000
10000
15000
