In [1]:
import pandas as pd
import requests
import io

In [2]:
# Defining the length checking function
def length_ranker (list_ent):
    lengths = []
    for ent in list_ent:
        lengths.append(len(ent))
    return sorted(lengths, reverse=False)

def length_checker (thres, text):
    if len(text)<=thres:
        return text


In [3]:
# Defining pre-processing function

from nltk.stem import WordNetLemmatizer 
def preprocessEntity (text):
    processed1 = lowercase (text)
    processed2 = lemmatization(processed1)
    normalized = normalizeCOVID (processed2)
    return normalized

def lowercase (text):
    lowercased = text.lower()
    return lowercased

def lemmatization(text):
    lemmatizer = WordNetLemmatizer()
    lemmatized = ' '.join([lemmatizer.lemmatize(w) for w in text.split()])
    return lemmatized

def normalizeCOVID (text):
    covid_variants1 = ['coronarivus disease', 'covid19 coronavirus',  'coronarivus', 'sars-cov-2', '2019-ncov']
    covid_variants2 = ['corona','sars-cov']
    for variant in covid_variants1:
        if variant in text:
            text = text.replace(variant, 'covid-19')
            
    if (('corona' in text) and ('coronavirus' not in text)):
            text = text.replace('corona', 'covid-19')
            
    if (('sars-cov' in text) and ('sars-cov-2' not in text)):
            text = text.replace('corona', 'covid-19')
    return text
        
    
preprocessEntity ('normalized bananas coronarivus disease coronarivus disease sars-cov-2')  

'normalized banana covid-19 covid-19 covid-19'

In [4]:
# Clean final triples

# Downloading the csv file from your GitHub account
re_pdf = requests.get('https://raw.githubusercontent.com/HuyenNguyenHelen/CORD-19-KG/master/Data/new_triples_with_predefined_relations_pdf_June20_.csv').content
re_pmc = requests.get('https://raw.githubusercontent.com/HuyenNguyenHelen/CORD-19-KG/master/Data/new_triples_with_predefined_relations_pmc_June20_.csv').content
pdf = pd.read_csv(io.StringIO(re_pdf.decode('utf-8')))
pmc =  pd.read_csv(io.StringIO(re_pmc.decode('utf-8')))
dataset = pd.concat([pmc, pdf])
print(dataset.columns)
dataset.drop(columns='Unnamed: 0', inplace=True)
dataset = dataset[['subject','new_relation', 'object' ]]
dataset.head(5)

Index(['Unnamed: 0', 'Unnamed: 0.1', 'subject', 'object', 'new_relation'], dtype='object')


Unnamed: 0,subject,new_relation,object
0,dysfunction,disease_species,child
1,et dysfunction,disease_species,child
2,et dysfunction,disease_species,child
3,rsv,disease_disease,virus
4,treatment,treat_procedure_species,child


In [9]:
dataset['subject'].apply(lambda x: length_checker (1, x)).value_counts()

i    2944
u    1302
n     719
2     465
c     365
m     336
r     333
s     290
3     284
a     254
5     209
g     186
6     171
b     159
l     154
t     150
f     146
p     131
d     117
h      84
β      84
α      78
y      76
j      66
w      60
k      59
v      52
γ      49
z      46
e      33
o      28
θ      22
ε       8
ω       6
Name: subject, dtype: int64

In [10]:
lens = pd.DataFrame(length_ranker(dataset['subject']))
lens[lens[0]==1]

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,1
...,...
9461,1
9462,1
9463,1
9464,1


After observing ents with length =<1, we found that they are nonsense, so we decided to remove triples having them

In [11]:
sub_data = dataset[dataset['subject'].apply(lambda x: len(x)>1)]
final_data = sub_data[sub_data['object'].apply(lambda x: len(x)>1)]

# checking length
print(final_data['subject'].apply(lambda x: length_checker (1, x)).value_counts())
print(final_data['object'].apply(lambda x: length_checker (1, x)).value_counts())

final_data

Series([], Name: subject, dtype: int64)
Series([], Name: object, dtype: int64)


Unnamed: 0,subject,new_relation,object
0,dysfunction,disease_species,child
1,et dysfunction,disease_species,child
2,et dysfunction,disease_species,child
3,rsv,disease_disease,virus
4,treatment,treat_procedure_species,child
...,...,...,...
175873,vegf,gene_disease,edema
175874,virus,disease_species,person
175875,covid,disease_disease,respiratory failure
175876,covid,disease_disease,acute respiratory failure


In [18]:
final_data['subject'] = final_data['subject'].replace( 'covid', 'covid-19')
final_data ['object'] = final_data['object'].replace('covid', 'covid-19')
print(final_data[final_data['subject']=='covid'])
print(final_data[final_data['object']=='covid'])


Empty DataFrame
Columns: [subject, new_relation, object]
Index: []
Empty DataFrame
Columns: [subject, new_relation, object]
Index: []


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data['subject'] = final_data['subject'].replace( 'covid', 'covid-19')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data ['object'] = final_data['object'].replace('covid', 'covid-19')


In [21]:
final_data
with open(r'C:\Users\huyen\OneDrive\Documents\GitHub\CORD-19-KG\Data\all-final-cleaned-triples.csv', 'w', encoding = 'utf-8') as f:
    final_data.to_csv(f)