In [1]:
import articles.article_fetch as article_fetch
import articles.articles_info as article_info
from articles import article

import clinical_trials.clinical_trial_fetch as ct_fetch
import clinical_trials.clinical_trials_info as ct_info
from clinical_trials import clinical_trial

import csv
import csv_data.csv as csv_data

from dataframe import create_dataframe
from dataframe import calculate_attributes

import pandas as pd

import importlib

# Load the gold-standard table (semicolon-separated, ISO-8859-1 encoded).
gold_standard = pd.read_csv(
    'ClinicalPmidsALL.csv',
    sep=';',
    encoding='ISO-8859-1',
)

# Turn the common answer from strings into numbers.
# (The return value is ignored, so this presumably edits gold_standard in place.)
csv_data.numerical_answers(gold_standard)

# Fix the rows of the CSV whose common answer is sometimes not correct.
csv_data.correctData(gold_standard)

# Take every clinical-trial ID and get the matching clinical trials as XML.
Clinical_trials = ct_fetch.get_xml_doms(gold_standard['CT'].tolist())

# Take every PMID (as a string) and get the matching articles as XML.
PubMed_id_string = [str(pmid) for pmid in gold_standard['PMID'].tolist()]
# Fetching the articles takes time.
PubMed_articles = article_fetch.fetch_many_articles(PubMed_id_string, local=True)

# Pair every clinical trial with its article and common answer; entries that
# are not present are removed.
df = create_dataframe.get_base_dataframe(
    gold_standard,
    Clinical_trials,
    PubMed_articles,
)

article 27953647  not found.
article 27948541  not found.
article 27943881  not found.
article 27949797  not found.
article 27955116  not found.
article 27950623  not found.
article 27945102  not found.
clinical trial NCT02659670  not found.
article 27198327  not found.


In [2]:
#lam = gold_standard['CT']
#lam.index[lam == 'NCT00003204'].tolist()[0]

# Last names, first-name initials and first names for every clinical trial.
ct_last_names, ct_first_name_initials, ct_first_names = ct_info.get_all_name_parts(
    df['CT'].tolist(),
    gold_standard,
)

# One column per name part, added in the same order as the lists above.
df['ct_last_name'] = ct_last_names
df['ct_first_name_initial'] = ct_first_name_initials
df['ct_first_name'] = ct_first_names


In [3]:
# Debug aid: print(sum(x is not None for x in ct_last_names))
# Discard every row that has at least one missing value, then renumber the
# index from zero without keeping the old index as a column.
df = df.dropna(axis=0, how='any').reset_index(drop=True)

In [4]:
# Last names, first-name initials and first names for every article.
ar_last_names, ar_first_name_initials, ar_first_names = article_info.get_all_name_parts(
    df['PubMed'].tolist(), gold_standard)

df['ar_last_name'], df['ar_first_name_initial'], df['ar_first_name'] = [
    ar_last_names, ar_first_name_initials, ar_first_names]

# Remove the rows with missing values and renumber the index from zero.
# dropna() returns a new frame instead of modifying df, so the result must be
# assigned back for the rows to actually be removed.
df = df.dropna().reset_index(drop=True)




In [5]:
# With the base information in place, add further attributes.

# Organization of the principal investigator; the investigator's name is not
# needed for this lookup.
ct_organization_names = ct_info.get_all_organization_names(df['CT'].tolist())

# Insert the organizations as column 5, unless the column already exists
# (keeps the cell safe to re-run).
if 'ct_organization' not in df.columns:
    df.insert(loc=5, column='ct_organization', value=ct_organization_names)

# E-mail addresses, looked up by trial, last name and first-name initial.
ct_mails = ct_info.get_all_mails(
    df['CT'].tolist(),
    df['ct_last_name'].tolist(),
    df['ct_first_name_initial'].tolist(),
)

# Insert the e-mails as column 6, again only when the column is missing.
if 'ct_mail' not in df.columns:
    df.insert(loc=6, column='ct_mail', value=ct_mails)

NCT00730210 doesn't have an organization.
NCT02220283 doesn't have an organization.


In [6]:
# Number of clinical trials for which an e-mail address was found.
print(len([mail_address for mail_address in ct_mails if mail_address is not None]))

230


In [7]:
# Add the same attributes (organization and e-mail) for the articles.

# Organizations, looked up by article, last name and first-name initial.
ar_organization_names = article_info.get_all_organizations(
    df['PubMed'].tolist(),
    df['ar_last_name'].tolist(),
    df['ar_first_name_initial'].tolist(),
)
df['ar_organization'] = ar_organization_names

# E-mail addresses, looked up with the same three inputs.
ar_mails = article_info.get_all_mails(
    df['PubMed'].tolist(),
    df['ar_last_name'].tolist(),
    df['ar_first_name_initial'].tolist(),
)
df['ar_mail'] = ar_mails

# How many organizations and how many e-mails are not None.
print(sum(1 for organization in ar_organization_names if organization is not None))
print(sum(1 for mail_address in ar_mails if mail_address is not None))

475
126


In [8]:
# Compute the attributes the classifiers will use.

# First names: clinical trial vs. article.
first_name_equalities = calculate_attributes.get_string_arrays_similarity(
    df['ct_first_name'].tolist(),
    df['ar_first_name'].tolist(),
)

# Organizations: clinical trial vs. article.
organization_similarities = calculate_attributes.get_organization_similarity(
    df['ct_organization'].tolist(),
    df['ar_organization'].tolist(),
)

# E-mail addresses: clinical trial vs. article.
email_equalities = calculate_attributes.get_arrays_equality(
    df['ct_mail'].tolist(),
    df['ar_mail'].tolist(),
)

# Store each attribute in its own column of the data frame.
df['first_name_equality'] = first_name_equalities
df['organization_similarity'] = organization_similarities
df['email_equality'] = email_equalities

In [9]:
# Build the frame that goes to the classifier by removing the columns it does
# not need: the raw objects and the per-source name/organization/mail columns.
df_to_save = df.drop(
    [
        'CT', 'PubMed',
        'ct_last_name', 'ct_first_name_initial', 'ct_first_name',
        'ct_organization', 'ct_mail',
        'ar_last_name', 'ar_first_name_initial', 'ar_first_name',
        'ar_organization', 'ar_mail',
    ],
    axis='columns',
)

In [10]:
# Writing the dataframe to file
import csv
import codecs
# Serialise the frame to CSV text. The text is bound to `csv_text`, not `csv`,
# so the imported `csv` module is not replaced by a string.
csv_text = df_to_save.to_csv(index=False)

# Write the text as UTF-8. newline="" disables line-ending translation, so the
# bytes on disk are exactly the encoded text, and the `with` block closes the
# file even if the write raises.
with open("dataframe.csv", "w", encoding="utf-8", newline="") as output_file:
    output_file.write(csv_text)

In [11]:
# Reload the project modules so the kernel uses their current source.
for stale_module in (article_info, ct_info, calculate_attributes):
    importlib.reload(stale_module)
# Kept as the final expression so the cell still echoes the reloaded module.
importlib.reload(clinical_trial)

<module 'clinical_trials.clinical_trial' from 'C:\\Users\\Brescia\\Anaconda3\\Diploma\\clinical_trials\\clinical_trial.py'>

In [12]:
# Article organization stored for the row labelled 4.
df.loc[4, 'ar_organization']

'Diabetes, Endocrinology, and Obesity Branch, National Institute of Diabetes and Digestive and Kidney Diseases, National Institutes of Health, Bethesda, Maryland 20892, USA. phillipg@intra.niddk.nih.gov'

In [13]:
from bs4 import BeautifulSoup

# Tiny XML document whose <tag> text spans two lines, parsed with the XML parser.
soup = BeautifulSoup(markup="<xml><tag>taggo\ntaggo2</tag></xml>", features="xml")

In [14]:
# First child of <tag> inside <xml>.
soup.find('xml').find('tag').contents[0]

'taggo\ntaggo2'

In [15]:
# Every column of the row at position 40.
df.iloc[40, :]

CT                         <clinical_trials.clinical_trial.ClinicalTrial ...
PubMed                     <articles.article.Article object at 0x00000155...
common_answer                                                              0
ct_last_name                                                            saba
ct_first_name_initial                                                      s
ct_organization                                     university of pittsburgh
ct_mail                                                                 None
ct_first_name                                                        samir f
ar_last_name                                                            saba
ar_first_name_initial                                                      s
ar_first_name                                                            s m
ar_organization            Fakultät für Physik & Astronomie, Ruhr-Univers...
ar_mail                                                                 None

In [16]:
# Preview only the first ten rows: echoing the whole frame bloats the notebook
# and puts every stored e-mail address into the saved output.
df.head(10)

Unnamed: 0,CT,PubMed,common_answer,ct_last_name,ct_first_name_initial,ct_organization,ct_mail,ct_first_name,ar_last_name,ar_first_name_initial,ar_first_name,ar_organization,ar_mail,first_name_equality,organization_similarity,email_equality
0,<clinical_trials.clinical_trial.ClinicalTrial ...,<articles.article.Article object at 0x00000155...,1,abdulkarim,b,ahs cancer control alberta,,bassam,abdulkarim,b,b,,,0.166667,0.000000,0
1,<clinical_trials.clinical_trial.ClinicalTrial ...,<articles.article.Article object at 0x00000155...,1,gawin,f,"friends research institute, inc.",,frank,gawin,f,frank,,,1.000000,0.000000,0
2,<clinical_trials.clinical_trial.ClinicalTrial ...,<articles.article.Article object at 0x00000155...,1,deutsch,s,washington d.c. veterans affairs medical center,,steven,deutsch,s,steven,,,1.000000,0.000000,0
3,<clinical_trials.clinical_trial.ClinicalTrial ...,<articles.article.Article object at 0x00000155...,1,maisiak,r,university of alabama at birmingham,,richard s,maisiak,r,richard s,,,1.000000,0.000000,0
4,<clinical_trials.clinical_trial.ClinicalTrial ...,<articles.article.Article object at 0x00000155...,1,gorden,p,national institute of diabetes and digestive a...,,phillip,gorden,p,phillip,"Diabetes, Endocrinology, and Obesity Branch, N...",phillipg@intra.niddk.nih.gov,1.000000,0.300000,0
5,<clinical_trials.clinical_trial.ClinicalTrial ...,<articles.article.Article object at 0x00000155...,1,hochster,h,eastern cooperative oncology group,,howard,hochster,h,howard,"Division of Medical Oncology, New York Univers...",howard.hochster@med.nyu.edu,1.000000,0.000000,0
6,<clinical_trials.clinical_trial.ClinicalTrial ...,<articles.article.Article object at 0x00000155...,1,fabian,c,university of kansas medical center,bkimler@kumc.edu,carol,fabian,c,carol,,,1.000000,0.000000,0
7,<clinical_trials.clinical_trial.ClinicalTrial ...,<articles.article.Article object at 0x00000155...,1,dematteo,r,american college of surgeons,,ronald,dematteo,r,ronald,"Karen T. Brown, Richard K. Do, Mithat Gonen, A...",,1.000000,0.250000,0
8,<clinical_trials.clinical_trial.ClinicalTrial ...,<articles.article.Article object at 0x00000155...,0,klein,j,university of rochester,,jonathan d,klein,j,julius,"University of California; Irvine, CA, USA.",,0.100000,0.333333,0
9,<clinical_trials.clinical_trial.ClinicalTrial ...,<articles.article.Article object at 0x00000155...,1,dispenzieri,a,mayo clinic,,angela,dispenzieri,a,a,,,0.166667,0.000000,0


In [17]:
#for i in range(100):
#    print(df['CT'][i].get_mail(df['ct_last_name'][i], df['ct_first_name_initial'][i]))
    
#df['CT'][4].clinical_trial.clinical_results.point_of_contact.email

# Location element of the clinical trial in the row labelled 4.
# (The variable is called `mail`, but it holds the location node.)
mail = df.loc[4, 'CT'].clinical_trial.location

# Every descendant tag of the location that is named 'contact'.
contacts = list(filter(lambda node: node.name == 'contact', mail.findAll()))

# Only the first contact is inspected.
contact = contacts[0]

print(contact.find('last_name').text)

For more information at the NIH Clinical Center contact Office of Patient Recruitment (OPR)
