In [1]:
%run /Users/jiamingqu/Desktop/proj/scripts/classifier/gene/gene.parsing.functions.ipynb
%run /Users/jiamingqu/Desktop/proj/scripts/searching/query.expansion.ipynb
%run /Users/jiamingqu/Desktop/proj/scripts/classifier/read.dataframe.ipynb

In [4]:
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
import joblib
import warnings
warnings.filterwarnings("ignore")
from imblearn.over_sampling import SMOTE, ADASYN

In [4]:
alias_kb = pd.read_csv("/Users/jiamingqu/Desktop/2017.2018/data/knowledge.base/Homo_sapiens.gene_info", sep="\t", index_col=False)
variations_kb = pd.read_csv("/Users/jiamingqu/Desktop/2017.2018/data/knowledge.base/PMKB_Interpretations_Complete_20191118-1534.csv", \
                             sep=",", index_col=False)

In [5]:
def get_gene_alias_and_variations(gene, aliases = alias_kb, variations=variations_kb):
    
    '''
    Get a gene's aliases and all its variants
    Args:
        gene name, alias knowledge base, variations knowledge base
    Returns:
        a dictionary of {"alias": [], "variants": []}
    '''
    
    dic = {}
    
    # add aliases
    aliases = aliases[["Symbol", "Synonyms"]]
    if gene in list(aliases.Symbol):
        dic["aliases"] = aliases.loc[aliases.Symbol==gene].Synonyms.values[0].split("|")
    else:
        dic["aliases"] = []
    
    # add variants
    if gene in list(variations.Gene):
        # may have several records
        df_gene_variations = variations.loc[variations.Gene == gene]
        variation_list = []
        
        # iterate the result df
        for index,row in df_gene_variations.iterrows():
            
            # one record in the df
            gene_variations = df_gene_variations.loc[index, "Variant(s)"]
            for i in str(gene_variations).split("|"):
                i_cleaned = i.replace(gene, "")
                variation_list.append(i_cleaned.strip())
                
        # de-dup
        dic["variants"] = list(set(variation_list))
    else:
        dic["variants"] = []
        
    return dic

In [6]:
def generate_features(year):
    
    # read parsed gene dataframe
    gene_df = parsing_gene_raw_judgements(year)
    
    # feature table
    feature_table = pd.DataFrame(columns=["match_gene", 
                                          "has_variation", "match_variation", "match_total_variation",
                                          "has_other_info", "match_other_info", 
                                          "topicid","docid","label"])
    # folder path of the corpus
    folder_path = "../../../data/corpus/"
    
    # start generating features, iterate through topics
    for topic in set(gene_df.topicid):
        
        print("Start parsing topic {}".format(topic))
        
        sub_gene_df = gene_df.loc[gene_df.topicid == topic]
        
        # iterate through documents
        for index,rows in sub_gene_df.iterrows():
        
            # read documents
            docid = sub_gene_df.loc[index, "docid"]
            lines = []
            with open(folder_path+docid+".txt",'r') as f:
                for line in f.readlines():
                    lines.append(line.strip())
            full_text = " ".join(lines)
            
            query_gene = sub_gene_df.loc[index,"gene"]
            genes,variant,other_info = parsing_query_gene(query_gene)
        
            # 1) match gene
            match_gene = 0
            for gene in genes:
                if " " not in gene: 
                    match_gene += full_text.count(gene)
                    aliases = get_gene_alias_and_variations(gene)["aliases"]
                    for alias in aliases:
                        match_gene += full_text.count(alias)
                else: # special case: the gene is a phrase
                    for g in gene.split(" "):
                        g = g.strip()
                        match_gene += full_text.lower().count(g.lower())
                    

            # 2) match variation
            if variant != 0:
                has_variation,match_variation,match_total_variation = 1,0,0
                # match variation
                match_variation += full_text.count(variant)
                # match all the variations
                for gene in genes:
                    variants = get_gene_alias_and_variations(gene)["variants"]
                    for v in variants:
                        match_total_variation += full_text.count(v)
            else:
                has_variation,match_variation,match_total_variation = 0,0,0
                # match all the variations
                for gene in genes:
                    variants = get_gene_alias_and_variations(gene)["variants"]
                    for v in variants:
                        match_total_variation += full_text.count(v)

                        
            # 3) match other info
            if other_info != 0:
                has_other_info, match_other_info = 1,0
                for o in other_info.split(" "):
                    o = o.strip()
                    match_other_info += full_text.lower().count(o.lower())
            else:
                has_other_info, match_other_info = 0,0

                
            topicid = sub_gene_df.loc[index,"topicid"]
            docid = sub_gene_df.loc[index,"docid"]
            label = sub_gene_df.loc[index,"label"]

            feature_table = feature_table.append({"match_gene": match_gene, 
                                                  "has_variation": has_variation, 
                                                  "match_variation": match_variation, 
                                                  "match_total_variation": match_total_variation,
                                                  "has_other_info": has_other_info, 
                                                  "match_other_info":match_other_info,
                                                  "topicid":topic,
                                                  "docid":docid,
                                                  "label":label}, ignore_index=True)
            
        print("Topic {} has been parsed".format(topic))
        
    # sanity check and save results
    assert feature_table.shape[0] == gene_df.shape[0]
    feature_table.to_csv(str(year) + ".gene.features.csv", index=False, sep = ",")

In [1]:
# generate_features(2017)
# generate_features(2018)

In [7]:
def training_testing_classifier(training_years, testing_years):
    
    df_list=[]
    for year in training_years:
        df = pd.read_csv(str(year)+".gene.features.csv")
        df_list.append(df)
    df_training=pd.concat(df_list)
    df_testing=pd.read_csv(str(testing_years)+".gene.features.csv")
    
    features = ["match_gene", "has_variation", "match_variation", "match_total_variation",
                "has_other_info", "match_other_info"]
    training_features=df_training[features]
    testing_features=df_testing[features]
    training_labels=df_training.label
    testing_labels=df_testing.label
    
    # over-sampling
    training_features, training_labels = SMOTE().fit_resample(training_features, training_labels)
    
    # training
    logistic_model = LogisticRegression(multi_class="ovr", penalty='l2', C=0.5, max_iter=5000)
    logistic_model.fit(training_features, training_labels)
    
    predicted_labels = logistic_model.predict(testing_features)
    print(classification_report(testing_labels, predicted_labels))
    
    joblib.dump(logistic_model, str(testing_years)+".gene.classifier.pkl")

In [8]:
# training_testing_classifier([2017],2018)

                   precision    recall  f1-score   support

Different Variant       0.04      0.10      0.06       846
            Exact       0.75      0.39      0.51      4425
     Missing Gene       0.61      0.71      0.66      2226
  Missing Variant       0.33      0.52      0.41      1208

         accuracy                           0.46      8705
        macro avg       0.43      0.43      0.41      8705
     weighted avg       0.59      0.46      0.49      8705



In [2]:
def print_original_distribution(training_years):
    
    df_list=[]
    for year in training_years:
        df = pd.read_csv(str(year)+".gene.features.csv")
        df_list.append(df)
    df_training=pd.concat(df_list)
    
    print(df_training.label.value_counts())

In [3]:
# print_original_distribution([2017])

Exact                4578
Missing Gene         3696
Missing Variant      1551
Different Variant     602
Name: label, dtype: int64
