In [1]:
import articles.article_fetch as article_fetch
import articles.articles_info as article_info
from articles import article

import clinical_trials.clinical_trial_fetch as ct_fetch
import clinical_trials.clinical_trials_info as ct_info
from clinical_trials import clinical_trial

import csv
import csv_data.csv as csv_data

from dataframe import create_dataframe
from dataframe import calculate_attributes

import pandas as pd

import importlib

# Load the manually curated gold standard (semicolon-separated, ISO-8859-1 encoded).
gold_standard = pd.read_csv('ClinicalPmidsALL.csv', sep=';', encoding='ISO-8859-1')

# Both helpers are called only for their effect on `gold_standard` (return values unused):
# the first turns the textual common answer into a numerical one, the second corrects
# entries whose common answer is known to be wrong.
csv_data.numerical_answers(gold_standard)
csv_data.correctData(gold_standard)

# Fetch the XML document of every clinical trial listed in the 'CT' column.
Clinical_trials = ct_fetch.get_xml_doms(gold_standard['CT'].tolist())

# Fetch the XML of every PubMed article; the fetcher is given the PMIDs as strings.
# Fetching the articles takes a while.
PubMed_id_string = [str(pmid) for pmid in gold_standard['PMID'].tolist()]
PubMed_articles = article_fetch.fetch_many_articles(PubMed_id_string, local=True)

# Pair each clinical trial with its article and common answer; documents that could not
# be fetched are left out of the resulting frame.
df = create_dataframe.get_base_dataframe(gold_standard, Clinical_trials, PubMed_articles)

# Organization-extraction strategy for trials and articles: 'standard', 'spacy' or 'stanford'.
ct_org_sample = ar_org_sample = 'spacy'

article 27953647  not found.
article 27948541  not found.
article 27943881  not found.
article 27949797  not found.
article 27955116  not found.
article 27950623  not found.
article 27945102  not found.
clinical trial NCT02659670  not found.
article 27198327  not found.


In [2]:
# Name parts of the investigator for every clinical trial in the frame.
ct_last_names, ct_first_name_initials, ct_first_names = ct_info.get_all_name_parts(
    df['CT'].tolist(), gold_standard
)

# Store each name part in its own column.
df['ct_last_name'] = ct_last_names
df['ct_first_name_initial'] = ct_first_name_initials
df['ct_first_name'] = ct_first_names

In [3]:
# Keep only rows with no missing value in any column, then renumber the index from 0.
df = df.dropna(axis=0, how='any').reset_index(drop=True)

In [4]:
# Name parts of the author for every article in the frame.
ar_last_names, ar_first_name_initials, ar_first_names = article_info.get_all_name_parts(
    df['PubMed'].tolist(), gold_standard
)

df['ar_last_name'] = ar_last_names
df['ar_first_name_initial'] = ar_first_name_initials
df['ar_first_name'] = ar_first_names

# Drop rows with any missing value and renumber the index. The result must be assigned
# back: `dropna` returns a new frame, so the previous bare call discarded it and left
# `df` untouched (the trailing `print()` only hid the stray cell output).
# NOTE(review): this now removes rows whose article name parts are missing, mirroring
# what is done after the clinical-trial name parts are added — confirm that is intended.
df = df.dropna().reset_index(drop=True)




In [None]:
# With the base information in place, enrich the frame with further clinical-trial attributes.


def _insert_if_absent(frame, position, column, values):
    """Insert `values` as `column` at `position` unless `frame` already has that column.

    The guard keeps the cell re-runnable: `DataFrame.insert` would otherwise be asked
    to add a column that already exists.
    """
    if column not in frame.columns:
        frame.insert(position, column, values)


# Organization names, looked up per trial with the investigator's last name and
# first-name initial, using the configured extraction strategy.
ct_organization_names = ct_info.get_all_organization_names(
    df['CT'].tolist(),
    df['ct_last_name'].tolist(),
    df['ct_first_name_initial'].tolist(),
    sample=ct_org_sample,
)
_insert_if_absent(df, 6, 'ct_organization', ct_organization_names)

# E-mail addresses.
ct_mails = ct_info.get_all_mails(
    df['CT'].tolist(),
    df['ct_last_name'].tolist(),
    df['ct_first_name_initial'].tolist(),
)
_insert_if_absent(df, 7, 'ct_mail', ct_mails)

# Years.
ct_years = ct_info.get_all_years(df['CT'].tolist())
_insert_if_absent(df, 8, 'ct_year', ct_years)

# Initials derived from the first names.
ct_initials = ct_info.get_all_initials(df['ct_first_name'].tolist())
_insert_if_absent(df, 9, 'ct_initials', ct_initials)

NCT00730210 doesn't have an organization.
NCT02220283 doesn't have an organization.


In [None]:
# Number of entries in the 'ct_mail' column that are not None.
print(sum(1 for mail in df['ct_mail'] if mail is not None))

In [None]:
# Mirror the clinical-trial attributes on the article side.

# Organizations, looked up per article with the author's last name and first-name initial.
ar_organization_names = article_info.get_all_organizations(
    df['PubMed'].tolist(),
    df['ar_last_name'].tolist(),
    df['ar_first_name_initial'].tolist(),
    sample=ar_org_sample,
)
df['ar_organization'] = ar_organization_names

# E-mail addresses.
ar_mails = article_info.get_all_mails(
    df['PubMed'].tolist(),
    df['ar_last_name'].tolist(),
    df['ar_first_name_initial'].tolist(),
)
df['ar_mail'] = ar_mails

# Years.
ar_years = article_info.get_all_years(df['PubMed'].tolist())
df['ar_year'] = ar_years

# Initials derived from the first names.
# NOTE(review): the column is passed as a pandas Series here, whereas the clinical-trial
# counterpart receives a plain list — confirm the helper accepts both.
ar_initials = article_info.get_all_initials(df['ar_first_name'])
df['ar_initials'] = ar_initials

# Report how many organizations and e-mails are not None.
print(len([org for org in ar_organization_names if org is not None]))
print(len([mail for mail in ar_mails if mail is not None]))

In [None]:
# Derive the pairwise (clinical trial vs. article) attributes used by the classifiers.
first_name_equalities = calculate_attributes.get_string_arrays_similarity(
    df['ct_first_name'].tolist(),
    df['ar_first_name'].tolist(),
)
organization_similarities = calculate_attributes.get_organization_similarity(
    df['ct_organization'].tolist(),
    df['ar_organization'].tolist(),
    ct_org_sample,
    ar_org_sample,
)
email_equalities = calculate_attributes.get_arrays_equality(
    df['ct_mail'].tolist(),
    df['ar_mail'].tolist(),
)
year_differences = calculate_attributes.get_year_differences(
    df['ct_year'].tolist(),
    df['ar_year'].tolist(),
)
last_name_lengths = calculate_attributes.get_last_name_lengths(df['ct_last_name'].tolist())
initials_equality = calculate_attributes.get_arrays_equality(
    df['ct_initials'].tolist(),
    df['ar_initials'].tolist(),
)

# Append the derived attributes as new columns; the dict order fixes the column order.
derived_columns = {
    'first_name_equality': first_name_equalities,
    'organization_similarity': organization_similarities,
    'email_equality': email_equalities,
    'year_difference': year_differences,
    'last_name_length': last_name_lengths,
    'initials_equality': initials_equality,
}
for column_name, column_values in derived_columns.items():
    df[column_name] = column_values

In [None]:
# Raw and intermediate columns that the classifier does not need.
columns_not_for_classifier = [
    'CT', 'PubMed',
    'ct_last_name', 'ct_first_name_initial', 'ct_first_name',
    'ct_organization', 'ct_mail', 'ct_year', 'ct_initials',
    'ar_last_name', 'ar_first_name_initial', 'ar_first_name',
    'ar_organization', 'ar_mail', 'ar_initials', 'ar_year',
]

# `drop` returns a new frame, so `df` keeps every column for later inspection.
df_to_save = df.drop(columns=columns_not_for_classifier)

In [None]:
# Write the classifier-ready dataframe to "dataframe.csv" as UTF-8, without the index column.
# pandas writes the file itself, which (a) no longer rebinds the name `csv` to a string
# and thereby shadows the `csv` module imported at the top of the notebook, (b) needs no
# mid-notebook imports, and (c) guarantees the file handle is closed even if writing fails.
df_to_save.to_csv("dataframe.csv", index=False, encoding="utf-8")

In [None]:
# Development helper: re-execute the project modules so that edits made to their source
# files are picked up by the running kernel without restarting it.
# Objects created before the reload keep referring to the old definitions.
importlib.reload(article_info)
importlib.reload(ct_info)
importlib.reload(calculate_attributes)
importlib.reload(clinical_trial)

In [None]:
# Ad-hoc inspection: show the row at position 40 of the working frame.
# Raises IndexError if the frame has fewer than 41 rows.
df.iloc[40]

In [None]:
# Ad-hoc inspection: display the working frame as the cell output.
df

In [None]:
# Exploratory check: count name tags for which an e-mail is available.
trials = df['CT'].tolist()
last_names = df['ct_last_name'].tolist()
first_initials = df['ct_first_name_initial'].tolist()
extracted_mails = df['ct_mail'].tolist()

# A tag is counted when it carries an e-mail itself OR when the trial's extracted mail is
# truthy; in the latter case every tag of that trial is counted.
# NOTE(review): confirm that `or` (rather than `and`) is the intended condition.
count = sum(
    1
    for trial, last_name, initial, mail in zip(trials, last_names, first_initials, extracted_mails)
    for tag in trial.get_all_name_tags(last_name, initial)
    if tag.email is not None or mail
)

print(count)

In [None]:
# Exploratory check: for every trial with more than one 'location' element,
# print the countries of its first two locations.
for trial in df['CT'].tolist():
    locations = trial.clinical_trial.findAll('location')
    if len(locations) > 1:
        first_location, second_location = locations[0], locations[1]
        print(first_location.country.text, second_location.country.text, '\n')

In [None]:
# Exploratory check: print the first 'overall_official' element of every trial that has one.
for trial in df['CT'].tolist():
    officials = trial.clinical_trial.findAll('overall_official')
    if len(officials) > 0:
        print(officials[0], '\n')

In [None]:

# Exploratory check: print each similarity score next to the two organizations it compares.
similarity_scores = df['organization_similarity'].tolist()
ct_organizations = df['ct_organization']
ar_organizations = df['ar_organization']

# The organization columns are looked up by index label, which matches the row position
# because the frame's index was reset earlier in the notebook.
for row, score in enumerate(similarity_scores):
    print(score, ct_organizations[row], '\n', ar_organizations[row], '\n\n')