In [1]:
# Executes the shared data-loading notebook inside this kernel's namespace.
# The cells below use `read_dataframe` and `pd` without importing them, so both are
# presumably defined by that notebook -- TODO confirm.
# NOTE(review): this is an absolute, user-specific path; it will not resolve on another
# machine unless the same directory layout exists there.
%run /Users/jiamingqu/Desktop/proj/scripts/classifier/read.dataframe.ipynb

In [2]:
# Dependencies for feature extraction (re, nltk) and for training / evaluating
# the classifier (scikit-learn, imbalanced-learn, joblib).
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
import joblib
import nltk
from nltk.corpus import stopwords
# Module-level English stop-word list; clean_text() filters tokens against it.
# Presumably requires the NLTK "stopwords" corpus to be downloaded beforehand -- TODO confirm.
stop = stopwords.words("english")
from nltk.tokenize import word_tokenize
import re

import warnings
# NOTE(review): no category or module is given, so this silences every warning for the
# rest of the session, including library deprecation warnings.
warnings.filterwarnings("ignore")
from imblearn.over_sampling import SMOTE, ADASYN

In [4]:
def clean_text(text, stop_words=None):
    """Normalise raw document text into a space-separated string of tokens.

    Steps, in order: lower-case, turn tabs/newlines into spaces, replace
    punctuation with spaces, drop stand-alone numbers, collapse runs of
    spaces, and remove stop words.

    Parameters
    ----------
    text : str
        Raw text to clean.
    stop_words : iterable of str, optional
        Tokens to drop. Defaults to the module-level ``stop`` list, which
        is what the original implementation always used.

    Returns
    -------
    str
        The remaining tokens joined by single spaces ("" if nothing is left).
    """
    if stop_words is None:
        stop_words = stop
    # Set membership is O(1); the original tested every token against a list.
    if not isinstance(stop_words, (set, frozenset)):
        stop_words = set(stop_words)

    # to lower case
    text = text.lower()

    # replace tabs and new line characters with spaces
    # (str.replace returns a new string; the original discarded the result)
    text = text.replace("\t", " ").replace("\n", " ")

    # remove punctuation (anything that is neither a word character nor whitespace)
    text = re.sub(r"[^\w\s]", " ", text)

    # remove stand-alone numbers; digits embedded in a word (e.g. "a549") are kept
    text = re.sub(r"\b\d+\b", " ", text)

    # remove multiple white spaces
    text = re.sub(r" +", " ", text)

    # remove stopwords
    tokens = [token for token in text.split() if token not in stop_words]

    return " ".join(tokens)

In [5]:
def _count_keyword_hits(text, keywords):
    """Return the summed ``str.count`` of every keyword in ``text``."""
    return sum(text.count(keyword) for keyword in keywords)


def generate_features(year):
    """Build the keyword-count feature table for one year and save it as CSV.

    For every row of the dataframe returned by ``read_dataframe(year, "pm")``
    the matching corpus file ``<trec_doc_id>.txt`` is read, cleaned with
    ``clean_text`` and scanned for three keyword lists. The per-document
    counts plus topic id, doc id and label are written to
    ``<year>.pm.features.csv`` in the current working directory.

    Parameters
    ----------
    year : int or str
        Passed to ``read_dataframe`` and used as the output file prefix.

    Raises
    ------
    AssertionError
        If the feature table does not have one row per input row.
    """
    # read parsed gene dataframe
    pm_df = read_dataframe(year, "pm")

    # folder path of the corpus (relative to the current working directory)
    folder_path = "../../../data/corpus/"

    keyword_animal = ['mice', 'mouse', 'model', 'mammary',
                      'rat', 'xenografts', 'rats', 'models',
                      'vivo', 'cycle', 'mutated', 'preclinical',
                      'prostate', 'pten', 'liver', 'met', 'animals',
                      'mgkg', 'human', 'xenograft']

    keyword_human = ['gastrectomy', 'imatinib', 'gastric', 'stomach',
                     'fgfr1', 'prognostic', 'mutation', 'gastrointestinal',
                     'mutations', 'families', 'shorter', 'inhibitor',
                     'kit', 'located', 'lethal', 'kras', 'dose',
                     'tract', 'pfs', 'mutated']

    keyword_not = ['lung', 'transplantation', 'symptoms', 'female',
                   'apoptotic', 'driver', 'cervical', 'pressure',
                   'pancreaticoduodenectomy', 'surface', 'triple', 'women',
                   'however', 'a549', 'cervix', 'mortality', 'adjuvant',
                   'bypass', 'basis', 'myxoid']

    columns = ["match_human", "match_animal", "match_not",
               "topicid", "docid", "label"]

    # Collect one dict per document and build the frame once at the end.
    # DataFrame.append copied the whole table on every call and no longer
    # exists in pandas >= 2.0.
    records = []

    # Iterate the columns directly rather than pm_df.loc[index, ...]:
    # .loc returns a Series (not a scalar) when index labels are duplicated.
    row_iter = zip(pm_df.index,
                   pm_df["trec_doc_id"],
                   pm_df["trec_topic_number"],
                   pm_df["pm_rel_desc"])

    for index, raw_docid, topicid, label in row_iter:

        # read document
        docid = str(raw_docid)
        with open(folder_path + docid + ".txt", 'r') as f:
            full_text = " ".join(line.strip() for line in f)
        full_text = clean_text(full_text)

        # NOTE(review): str.count matches substrings, so e.g. 'rat' is also
        # counted inside 'rate' and 'met' inside 'method'. Kept as-is because
        # changing it would alter the stored features; confirm it is intended.
        records.append({"match_human": _count_keyword_hits(full_text, keyword_human),
                        "match_animal": _count_keyword_hits(full_text, keyword_animal),
                        "match_not": _count_keyword_hits(full_text, keyword_not),
                        "topicid": topicid,
                        "docid": docid,
                        "label": label})

        # progress indicator (relies on integer index labels)
        if index % 1000 == 0:
            print(index)

    # passing `columns` fixes the column order and keeps the header for empty input
    feature_table = pd.DataFrame(records, columns=columns)

    # sanity check and save results
    assert feature_table.shape[0] == pm_df.shape[0]
    feature_table.to_csv(str(year) + ".pm.features.csv", index=False, sep=",")

In [6]:
# generate_features(2017)
# generate_features(2018)

In [3]:
def _as_year_list(years):
    """Return ``years`` as a list, wrapping a single year (int, str, ...) in one."""
    if isinstance(years, str):
        return [years]
    try:
        return list(years)
    except TypeError:
        # not iterable -> a single scalar year such as 2018
        return [years]


def _load_feature_tables(years):
    """Read ``<year>.pm.features.csv`` for each year and stack them into one frame."""
    frames = [pd.read_csv(str(year) + ".pm.features.csv") for year in years]
    if not frames:
        raise ValueError("at least one year is required")
    # ignore_index avoids repeated row labels when several years are stacked
    return pd.concat(frames, ignore_index=True)


def training_testing_classifier(training_years, testing_years, random_state=42):
    """Train a logistic-regression classifier on keyword features and evaluate it.

    Feature CSVs (``<year>.pm.features.csv``, read from the current working
    directory) of the training years are concatenated, over-sampled with
    SMOTE and used to fit the model. The classification report for the test
    year(s) is printed and the fitted model is dumped with joblib to
    ``<testing years joined by "_">.pm.classifier.pkl``.

    Parameters
    ----------
    training_years : iterable of (int or str), or a single year
        Years whose feature files form the training set.
    testing_years : int, str, or iterable of them
        Year(s) whose feature files form the test set. A single year yields
        the same file names as before; a list is also accepted now (the
        original built the file name from ``str(list)`` in that case).
    random_state : int or None, default 42
        Seed passed to SMOTE so the over-sampling, and therefore the fitted
        model, is reproducible. Pass ``None`` for the previous unseeded
        behaviour.

    Raises
    ------
    ValueError
        If no training or testing year is given.
    """
    training_years = _as_year_list(training_years)
    testing_years = _as_year_list(testing_years)

    df_training = _load_feature_tables(training_years)
    df_testing = _load_feature_tables(testing_years)

    features = ["match_human", "match_animal", "match_not"]
    training_features = df_training[features]
    testing_features = df_testing[features]
    training_labels = df_training.label
    testing_labels = df_testing.label

    # over-sampling (training data only, so the test distribution stays untouched)
    sampler = SMOTE(random_state=random_state)
    training_features, training_labels = sampler.fit_resample(training_features, training_labels)

    # training
    # NOTE(review): recent scikit-learn releases appear to deprecate `multi_class`;
    # OneVsRestClassifier(LogisticRegression(C=0.5)) is the suggested replacement -- confirm
    # against the installed version before changing, since it alters the pickled object type.
    logistic_model = LogisticRegression(multi_class="ovr", C=0.5)
    logistic_model.fit(training_features, training_labels)

    predicted_labels = logistic_model.predict(testing_features)
    print(classification_report(testing_labels, predicted_labels))

    model_tag = "_".join(str(year) for year in testing_years)
    joblib.dump(logistic_model, model_tag + ".pm.classifier.pkl")

In [8]:
# training_testing_classifier([2017],2018)

              precision    recall  f1-score   support

   Animal PM       0.10      0.63      0.17       590
    Human PM       0.59      0.41      0.48      8634
      Not PM       0.72      0.68      0.70     13205

    accuracy                           0.58     22429
   macro avg       0.47      0.57      0.45     22429
weighted avg       0.65      0.58      0.60     22429



In [3]:
def print_original_distribution(training_years):
    """Print how many rows carry each label across the given years' feature files.

    Reads ``<year>.pm.features.csv`` from the current working directory for
    every entry of ``training_years``, stacks the tables and prints the
    ``value_counts`` of their ``label`` column. Returns nothing.
    """
    yearly_tables = [pd.read_csv(str(yr) + ".pm.features.csv") for yr in training_years]
    combined = pd.concat(yearly_tables)

    label_counts = combined.label.value_counts()
    print(label_counts)

In [5]:
# print_original_distribution([2017])

Not PM       13368
Human PM      8738
Animal PM      536
Name: label, dtype: int64
