# Entity Cleaning (lowercase, lemmatization)

In [None]:
# Defining the length checking function
def length_checker ():
    print(len(text))

In [2]:
# Defining pre-processing function

from nltk.stem import WordNetLemmatizer 
def preprocessEntity (text):
    processed1 = lowercase (text)
    processed2 = lemmatization(processed1)
    normalized = normalizeCOVID (processed2)
    return normalized

def lowercase (text):
    lowercased = text.lower()
    return lowercased

def lemmatization(text):
    lemmatizer = WordNetLemmatizer()
    lemmatized = ' '.join([lemmatizer.lemmatize(w) for w in text.split()])
    return lemmatized

def normalizeCOVID (text):
    covid_variants1 = ['coronarivus disease', 'covid19 coronavirus',  'coronarivus', 'sars-cov-2', '2019-ncov']
    covid_variants2 = ['corona','sars-cov']
    for variant in covid_variants1:
        if variant in text:
            text = text.replace(variant, 'covid-19')
            
    if (('corona' in text) and ('coronavirus' not in text)):
            text = text.replace('corona', 'covid-19')
            
    if (('sars-cov' in text) and ('sars-cov-2' not in text)):
            text = text.replace('corona', 'covid-19')
    return text
        
    
preprocessEntity ('normalized bananas coronarivus disease coronarivus disease sars-cov-2')  

'normalized banana covid-19 covid-19 covid-19'

## Clean entities (given by entity extraction)

In [55]:
# Loading classified entities (STAGE 1 -BERN)
import pandas as pd
paths = (r"C:\Users\huyen\OneDrive - UNT System\COURSES\INFO 5731\Final_Project\Group6_Working\AUG4\KG\ENTS\BERN_Sample10k\All_ENT_Types_V2\All_Bern_Type_Disease_V2.csv",
r"C:\Users\huyen\OneDrive - UNT System\COURSES\INFO 5731\Final_Project\Group6_Working\AUG4\KG\ENTS\BERN_Sample10k\All_ENT_Types_V2\All_Bern_Type_Drug_V2.csv",
r"C:\Users\huyen\OneDrive - UNT System\COURSES\INFO 5731\Final_Project\Group6_Working\AUG4\KG\ENTS\BERN_Sample10k\All_ENT_Types_V2\All_Bern_Type_Gene_V2.csv",
r"C:\Users\huyen\OneDrive - UNT System\COURSES\INFO 5731\Final_Project\Group6_Working\AUG4\KG\ENTS\BERN_Sample10k\All_ENT_Types_V2\All_Bern_Type_Species_V2.csv")
def load_file(path, type_):
    with open (path, 'r', encoding = 'utf-8') as file:
        data = pd.read_csv(file)
    return pd.DataFrame(data[type_].unique(), columns = [type_])  # get the unique

disease = load_file(paths[0], 'disease' )
drug = load_file(paths[1], 'drug' )
gene = load_file(paths[2], 'gene' )
species = load_file(paths[3], 'species' )
print(len(disease), len(drug), len(gene), len(species))

8129 4753 16888 1809


In [56]:
# drop null values
disease = disease.dropna()
drug = drug.dropna()
gene = gene.dropna()
species = species.dropna()


In [57]:
# pre-process all entities
disease_pre = disease[['disease']].applymap(preprocessEntity)
drug_pre = drug[['drug']].applymap(preprocessEntity)
gene_pre = gene[['gene']].applymap(preprocessEntity)
species_pre = species[['species']].applymap(preprocessEntity)

print(len(disease_pre.disease.unique()), len(drug_pre.drug.unique()), len(gene_pre.gene.unique()), len(species_pre.species.unique()))

7095 4231 15799 1572


In [6]:
def witeFile(path, df):
    with open (path, 'w', encoding = 'utf-8', newline = '') as file:
        df.to_csv(file)
    return df

In [59]:
paths = (r"C:\Users\huyen\OneDrive - UNT System\COURSES\INFO 5731\Final_Project\Group6_Working\AUG4\KG\ENTS\BERN_Sample10k\All_ENT_Types_V2\All_Bern_Type_Disease_V3.csv",
r"C:\Users\huyen\OneDrive - UNT System\COURSES\INFO 5731\Final_Project\Group6_Working\AUG4\KG\ENTS\BERN_Sample10k\All_ENT_Types_V2\All_Bern_Type_Drug_V3.csv",
r"C:\Users\huyen\OneDrive - UNT System\COURSES\INFO 5731\Final_Project\Group6_Working\AUG4\KG\ENTS\BERN_Sample10k\All_ENT_Types_V2\All_Bern_Type_Gene_V3.csv",
r"C:\Users\huyen\OneDrive - UNT System\COURSES\INFO 5731\Final_Project\Group6_Working\AUG4\KG\ENTS\BERN_Sample10k\All_ENT_Types_V2\All_Bern_Type_Species_V3.csv")

witeFile(paths[0], disease_pre)
witeFile(paths[1], drug_pre)
witeFile(paths[2], gene_pre)
witeFile(paths[3], species_pre)

Unnamed: 0,species
0,canine
1,mers
2,dog
3,patient
4,mouse
...,...
1804,pasteurella
1805,mycoplasma dispar
1806,pasteurella sp
1807,fusobacterium necrophorum


## Clean subject/object (given by OpenIE)


In [3]:
# Loading the OpenIE triples
import pandas as pd
with open (r"C:\Users\huyen\OneDrive - UNT System\COURSES\INFO 5731\Final_Project\Group6_Working\AUG4\KG\OpenIE\refined-S-R-O_triples_onlyBERN_3.csv",'r', encoding = 'utf-8') as file:
    df = pd.read_csv(file)
df.head()

Unnamed: 0.1,Unnamed: 0,subject,relation,object
0,0,Japan,be,human virus
1,1,PRT,be_included_in,time
2,2,rSARS-CoV,lack,8a
3,3,mouse,sample/be_sampled_from,measurement
4,4,patient,met,Center


In [4]:
df['subject'] = df[['subject']].applymap(preprocessEntity)
df['object'] = df[['object']].applymap(preprocessEntity)

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,subject,relation,object
0,0,japan,be,human virus
1,1,prt,be_included_in,time
2,2,rsars-cov,lack,8a
3,3,mouse,sample/be_sampled_from,measurement
4,4,patient,met,center


In [8]:
witeFile(r"C:\Users\huyen\OneDrive - UNT System\COURSES\INFO 5731\Final_Project\Group6_Working\AUG4\KG\OpenIE\refined-S-R-O_triples_onlyBERN_3_cleaned.csv", df)

Unnamed: 0.1,Unnamed: 0,subject,relation,object
0,0,japan,be,human virus
1,1,prt,be_included_in,time
2,2,rsars-cov,lack,8a
3,3,mouse,sample/be_sampled_from,measurement
4,4,patient,met,center
...,...,...,...,...
35575,35690,hct recipient,work_on,patient
35576,35691,covid-19,be_declared,pandemic
35577,35692,respiratory syndrome,be/have,acute respiratory illness
35578,35693,isg20,be_in_replication_in,yfv
