In [1]:
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score, matthews_corrcoef
from sklearn.model_selection import train_test_split
from fuzzywuzzy import fuzz
import os.path
import sys



In [2]:
def search_basic(text: str, key_terms: list) -> int:
    """Flag whether any key term occurs in ``text`` as a case-insensitive substring.

    Every ``*`` is removed from a key term before matching (presumably the
    term lists use it as a wildcard marker), so ``"resilien*"`` matches any
    text containing ``"resilien"``.

    Args:
        text: The text to search. Non-string values (e.g. the NaN that pandas
            yields for an empty cell) are treated as containing no match.
        key_terms: Terms to look for.

    Returns:
        1 if at least one non-empty key term is found in ``text``, otherwise 0.
    """
    # Missing abstracts are not strings; calling .lower() on them would raise.
    if not isinstance(text, str):
        return 0

    haystack = text.lower()
    for term in key_terms:
        needle = term.lower().replace("*", "")
        # An empty needle is a substring of every string, so a blank entry in
        # the term list would otherwise mark every text as a match.
        if needle and needle in haystack:
            return 1  # one hit decides the result; no need to keep scanning
    return 0
def levenstein(text: str, key_terms: list, cutoff: int = 90) -> int:
    """Flag whether any key term fuzzily matches part of ``text``.

    Each term is lower-cased and stripped of ``*`` characters, then scored
    against the lower-cased text with ``fuzz.partial_ratio``. A term counts
    as a match when its score is strictly greater than ``cutoff``.

    Args:
        text: The text to search. Non-string values (e.g. the NaN that pandas
            yields for an empty cell) are treated as containing no match.
        key_terms: Terms to look for.
        cutoff: Score threshold a term must exceed to count as a match.

    Returns:
        1 if at least one non-empty key term scores above ``cutoff``,
        otherwise 0.
    """
    # Missing abstracts are not strings; calling .lower() on them would raise.
    if not isinstance(text, str):
        return 0

    haystack = text.lower()
    for term in key_terms:
        needle = term.lower().replace("*", "")
        # Skip blank terms explicitly so the result does not depend on how
        # the fuzzy scorer happens to treat empty strings.
        if not needle:
            continue
        if fuzz.partial_ratio(needle, haystack) > cutoff:
            return 1  # one hit decides the result; skip the remaining scoring
    return 0

In [3]:

def _read_key_terms(path: str) -> list:
    """Return the lines of the file at ``path`` with newline characters removed."""
    # The context manager closes the handle as soon as the list is built,
    # instead of leaving an open file for the garbage collector to clean up.
    with open(path) as handle:
        return [line.replace("\n", "") for line in handle]


# All inputs are resolved relative to sys.path[0].
# NOTE(review): this assumes sys.path[0] is the project directory in the
# running kernel -- confirm for the environment this notebook is run in.
DATASETS_DIR = sys.path[0] + "/datasets"
KEY_TERMS_DIR = DATASETS_DIR + "/key_terms"

# One key-term list per topic, one term per line in each file.
key_terms_resilience = _read_key_terms(KEY_TERMS_DIR + "/resilience_final.txt")
key_terms_biomarkers = _read_key_terms(KEY_TERMS_DIR + "/biomarkers+disease_final.txt")
key_terms_stressors = _read_key_terms(KEY_TERMS_DIR + "/stressors_final_removed_ed.txt")
key_terms_conditions = _read_key_terms(KEY_TERMS_DIR + "/conditions_final.txt")

master_sheet = pd.read_csv(DATASETS_DIR + "/train_test.csv")
cancer_sheet = pd.read_csv(DATASETS_DIR + "/validation.csv")
print(master_sheet.head())
print(cancer_sheet.head())

        Key  Publication.Year  \
0  225CCC2R            2010.0   
1  2267DYLY            2000.0   
2  2272VXWK            1998.0   
3  229SFPIF            2017.0   
4  22ACI86B            2001.0   

                                              Author  \
0                   Beesley H.; Rhodes J.; Salmon P.   
1                    Sidiropoulos, A.; Muthny, F. A.   
2  Ehlers, C. L.; Garcia-Andrade, C.; Wall, T. L....   
3  Modgil, Shweta; Cameotra, Swaranjit S.; Sharma...   
4  Ito, K.; Olsen, S. L.; Qiu, W.; Deeley, R. G.;...   

                                               Title  \
0  Anger and childhood sexual abuse are independe...   
1  [Subjective theories of illness in the Greek c...   
2  Determinants of P3 amplitude and response to a...   
3  Early Life Pb Exposure and its Effect on Later...   
4  Mutation of a single conserved tryptophan in m...   

                                   Publication.Title  \
0               British journal of health psychology   
1  Gesundheitsw

In [4]:
def construct_report(df: object, df_name: str,topic_name: str, key_terms: list):
    """Print evaluation metrics for both keyword matchers on one topic.

    For the exact-substring matcher (``search_basic``) and then the fuzzy
    matcher (``levenstein``), every entry of ``df["Abstract.Note"]`` is
    classified against ``key_terms`` and compared with the labels in
    ``df[topic_name]``. For each matcher this prints a header line, the
    accuracy, the Matthews correlation coefficient (labelled "MM") and the
    scikit-learn classification report.

    Args:
        df: Table providing an "Abstract.Note" column and a ``topic_name``
            label column.
        df_name: Dataset label used only in the printed header.
        topic_name: Name of the label column; also printed in the header.
        key_terms: Key terms handed to both matchers.
    """
    matchers = (("Basic Search", search_basic), ("Fuzzy Search", levenstein))
    for label, matcher in matchers:
        print(df_name, topic_name, label)
        predicted = [
            matcher(text=abstract, key_terms=key_terms)
            for abstract in list(df["Abstract.Note"])
        ]
        # Labels are read after predicting, mirroring the original call order.
        expected = list(df[topic_name])
        print("Accuracy", accuracy_score(y_true=expected, y_pred=predicted))
        print("MM", matthews_corrcoef(y_true=expected, y_pred=predicted))
        print(classification_report(y_true=expected, y_pred=predicted))


In [None]:
# Datasets to evaluate and the key-term list belonging to each topic column.
df_dict = {"master_sheet":master_sheet, "cancer_sheet":cancer_sheet}
terms ={"Resilience":key_terms_resilience, "Biomarkers":key_terms_biomarkers, "Conditions":key_terms_conditions, "Stressors":key_terms_stressors}

# Only a 10% hold-out slice of master_sheet is scored. The split does not
# depend on the topic, so it is computed once here rather than once per topic;
# with the fixed random_state the selected rows are the same either way.
train, test_df = train_test_split(df_dict["master_sheet"], random_state=42, test_size=0.1)

for dataset_name, dataset in df_dict.items():
    # master_sheet is scored on its hold-out slice, every other dataset in full.
    eval_df = test_df if dataset_name == "master_sheet" else dataset
    for topic_name, topic_terms in terms.items():
        construct_report(df=eval_df, df_name=dataset_name, topic_name=topic_name, key_terms=topic_terms)

master_sheet Resilience Basic Search
Accuracy 0.3812423873325213
MM 0.11057416666891337
              precision    recall  f1-score   support

           0       0.84      0.13      0.22       567
           1       0.33      0.94      0.49       254

   micro avg       0.38      0.38      0.38       821
   macro avg       0.58      0.54      0.35       821
weighted avg       0.68      0.38      0.30       821

master_sheet Resilience Fuzzy Search
