In [1]:
import src.articles.article_fetch as article_fetch
import src.articles.articles_info as article_info
from src.articles import article

import src.clinical_trials.clinical_trial_fetch as ct_fetch
import src.clinical_trials.clinical_trials_info as ct_info
from src.clinical_trials import clinical_trial

import csv
import src.csv_data.csv as csv_data

from src.dataframe import create_dataframe
from src.dataframe import calculate_attributes

from src.common_functions import common_functions

import pandas as pd

import importlib

# I get the gold standard
gold_standard = pd.read_csv('src/ClinicalPmidsALL.csv', encoding = 'ISO-8859-1', sep = ';')

# Now I change the common answer from string to numerical
csv_data.numerical_answers(gold_standard)

# Let's correct data on csv (common answer sometimes is not correct)
csv_data.correct_data(gold_standard)

# I get all the clinical trials ID and I get the articles in xml
Clinical_trials = ct_fetch.get_xml_doms(gold_standard['CT'].tolist())

# I get all the PMID and I get the articles in xml
PubMed_id_string = list(map(str, gold_standard['PMID'].tolist())) # I get the PMID as list of strings
PubMed_articles = article_fetch.fetch_many_articles(PubMed_id_string, local=True)# It takes time to fetch the articles

# Let's map every clinical trial to every article (and remove the one that are not present) and common answer
df = create_dataframe.get_base_dataframe(gold_standard, Clinical_trials, PubMed_articles)

# standard, spacy or stanford
ct_org_sample = 'standard'
ar_org_sample = 'spacy'

article 27953647  not found.
article 27948541  not found.
article 27943881  not found.
article 27949797  not found.
article 27955116  not found.
article 27950623  not found.
article 27945102  not found.
clinical trial NCT02659670  not found.
article 27198327  not found.


In [2]:
#lam = gold_standard['CT']
#lam.index[lam == 'NCT00003204'].tolist()[0]

ct_last_names, ct_first_name_initials, ct_first_names = ct_info.get_all_name_parts(df['CT'].tolist(), gold_standard)

df['ct_last_name'], df['ct_first_name_initial'], df['ct_first_name'] = [ct_last_names, ct_first_name_initials, ct_first_names]

In [3]:
# print(sum(x is not None for x in ct_last_names))
df = df.dropna().reset_index(drop = True)

In [4]:
ar_last_names, ar_first_name_initials, ar_first_names = article_info.get_all_name_parts(df['PubMed'].tolist(), gold_standard)

df['ar_last_name'], df['ar_first_name_initial'], df['ar_first_name'] = [ar_last_names, ar_first_name_initials, ar_first_names]
df.dropna().reset_index(drop = True)
print()




In [5]:
# now that we have the base information, let's add other attributes

# we get the organization of the principal investigator, so we don't need to pass his/her name
ct_organization_names = ct_info.get_all_organization_names(df['CT'].tolist(), df['ct_last_name'].tolist(),
                                                           df['ct_first_name_initial'].tolist(),
                                                           sample=ct_org_sample)

# Now we insert it in the dataframe
if('ct_organization' not in df):
    df.insert(6, 'ct_organization', ct_organization_names)
    

# I get the e-mails
ct_mails = ct_info.get_all_mails(df['CT'].tolist(), df['ct_last_name'].tolist(), df['ct_first_name_initial'].tolist())

if('ct_mail' not in df):
    df.insert(7, 'ct_mail', ct_mails)
    
# I get the year
ct_years = ct_info.get_all_years(df['CT'].tolist())
    
if('ct_year' not in df):
    df.insert(8, 'ct_year', ct_years)
    
# I get the initials of the name
ct_initials = ct_info.get_all_initials(df['ct_first_name'].tolist())

if('ct_initials' not in df):
    df.insert(9, 'ct_initials', ct_initials)
    
# I get the title
ct_titles = ct_info.get_all_titles(df['CT'].tolist())

if('ct_title' not in df):
    df.insert(10, 'ct_title', ct_titles)
    
# I get the country and the city
ct_countries, ct_cities = ct_info.get_all_countries_and_cities(df['CT'].tolist())

if('ct_country' not in df):
    df.insert(11, 'ct_country', ct_countries)
    
if('ct_city' not in df):
    df.insert(12, 'ct_city', ct_cities)

NCT00730210 doesn't have an organization.
NCT02220283 doesn't have an organization.


In [6]:
print(sum(x is not None for x in df['ct_mail']))

267


In [7]:
# Let's add the same attributes for the articles

ar_organization_names, ar_locations = article_info.get_all_organizations_locations(df['PubMed'].tolist(),
                                                           df['ar_last_name'].tolist(),
                                                           df['ar_first_name_initial'].tolist(),
                                                           sample = ar_org_sample)
df['ar_organization'] = ar_organization_names

ar_mails = article_info.get_all_mails(df['PubMed'].tolist(), df['ar_last_name'].tolist(),
                                      df['ar_first_name_initial'].tolist())
df['ar_mail'] = ar_mails

ar_years = article_info.get_all_years(df['PubMed'].tolist())

df['ar_year'] = ar_years

ar_initials = article_info.get_all_initials(df['ar_first_name'])

df['ar_initials'] = ar_initials

ar_titles = article_info.get_all_titles(df['PubMed'].tolist())

df['ar_title'] = ar_titles
df['ar_location'] = ar_locations

print(sum(x is not None for x in ar_organization_names))
print(sum(x is not None for x in ar_mails))

475
126


In [8]:
# I now calculate useful attributes for the classifiers
first_name_equalities = calculate_attributes.get_string_arrays_similarity(df['ct_first_name'].tolist(),
                                                                          df['ar_first_name'].tolist())
organization_similarities, organization_type_equalities = calculate_attributes.get_organization_similarity_and_type_equality(df['ct_organization'].tolist(),
                                                                             df['ar_organization'].tolist(),
                                                                             ct_org_sample, ar_org_sample)
email_equalities = calculate_attributes.get_arrays_equality(df['ct_mail'].tolist(), df['ar_mail'].tolist())
year_differences = calculate_attributes.get_year_differences(df['ct_year'].tolist(), df['ar_year'].tolist())
last_name_lengths = calculate_attributes.get_last_name_lengths(df['ct_last_name'].tolist())
initials_equality = calculate_attributes.get_arrays_equality(df['ct_initials'].tolist(), df['ar_initials'].tolist())
namespace_sizes = calculate_attributes.get_namespace_ambiguities(df['ct_last_name'].tolist(),
                                                                 df['ct_first_name_initial'].tolist())
country_equalities, city_equalities = calculate_attributes.get_location_equalities(df['ct_country'].tolist(),
                                                                                   df['ct_city'].tolist(),
                                                                                   df['ar_location'].tolist())
jds_similarities, sts_similarities = calculate_attributes.get_all_jds_sts_similarities(df['CT'].tolist(),
                                                                    df['PubMed'].tolist(), 4, 52,
                                                                    'percentage_ranking', 'percentage_ranking')

# Let's add the attributes to the data frame
df['first_name_equality'], df['organization_similarity'] = [first_name_equalities, organization_similarities]
df['email_equality'], df['year_difference'] = [email_equalities, year_differences]
df['last_name_length'], df['initials_equality'] = [last_name_lengths, initials_equality]
df['namespace_size'], df ['country_equality'] = [namespace_sizes, country_equalities]
df['city_equality'], df['organization_type_equality'] = [city_equalities, organization_type_equalities]
df['jds_similarity'], df['sts_similarity'] = [jds_similarities, sts_similarities]

not yet connected


In [9]:
# Now let's normalize and/or standardize the attributes that need this
df['namespace_size'] = common_functions.normalize(df['namespace_size'])
df['last_name_length'] = common_functions.normalize(df['last_name_length'])
df['year_difference'] = common_functions.normalize(df['year_difference'])

'''
import numpy as np

np_array = np.array(df['namespace_size'].tolist())
std = np_array.std()
mean = np_array.mean()
for i in range(len(np_array)):
    np_array[i] = (np_array[i] - mean)/std
    
df['namespace_size'] = np_array.tolist()

np_array = np.array(df['last_name_length'].tolist())
std = np_array.std()
mean = np_array.mean()
for i in range(len(np_array)):
    np_array[i] = (np_array[i] - mean)/std
    
df['last_name_length'] = np_array.tolist()

np_array = np.array(df['year_difference'].tolist())
std = np_array.std()
mean = np_array.mean()
for i in range(len(np_array)):
    np_array[i] = (np_array[i] - mean)/std
    
df['year_difference'] = np_array.tolist()
'''

ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:25333)
Traceback (most recent call last):
  File "C:\Users\Brescia\Anaconda3\lib\site-packages\py4j\java_gateway.py", line 1174, in send_command
    self.socket.sendall(command.encode("utf-8"))
ConnectionResetError: [WinError 10054] Connessione in corso interrotta forzatamente dall'host remoto

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Brescia\Anaconda3\lib\site-packages\py4j\java_gateway.py", line 1014, in send_command
    response = connection.send_command(command)
  File "C:\Users\Brescia\Anaconda3\lib\site-packages\py4j\java_gateway.py", line 1178, in send_command
    "Error while sending", e, proto.ERROR_ON_SEND)
py4j.protocol.Py4JNetworkError: Error while sending

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Brescia\Anaconda3\lib\sit

ConnectionRefusedError: [WinError 10061] Impossibile stabilire la connessione. Rifiuto persistente del computer di destinazione
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:25333)
Traceback (most recent call last):
  File "C:\Users\Brescia\Anaconda3\lib\site-packages\py4j\java_gateway.py", line 958, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Brescia\Anaconda3\lib\site-packages\py4j\java_gateway.py", line 1096, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [WinError 10061] Impossibile stabilire la connessione. Rifiuto persistente del computer di destinazione
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:25333)
Traceback (most recent call last):
  File "C:\Users\Brescia\Anaconda3\lib\si

ConnectionRefusedError: [WinError 10061] Impossibile stabilire la connessione. Rifiuto persistente del computer di destinazione
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:25333)
Traceback (most recent call last):
  File "C:\Users\Brescia\Anaconda3\lib\site-packages\py4j\java_gateway.py", line 958, in _get_connection
    connection = self.deque.pop()
IndexError: pop from an empty deque

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\Brescia\Anaconda3\lib\site-packages\py4j\java_gateway.py", line 1096, in start
    self.socket.connect((self.address, self.port))
ConnectionRefusedError: [WinError 10061] Impossibile stabilire la connessione. Rifiuto persistente del computer di destinazione
ERROR:py4j.java_gateway:An error occurred while trying to connect to the Java server (127.0.0.1:25333)
Traceback (most recent call last):
  File "C:\Users\Brescia\Anaconda3\lib\si

"\nimport numpy as np\n\nnp_array = np.array(df['namespace_size'].tolist())\nstd = np_array.std()\nmean = np_array.mean()\nfor i in range(len(np_array)):\n    np_array[i] = (np_array[i] - mean)/std\n    \ndf['namespace_size'] = np_array.tolist()\n\nnp_array = np.array(df['last_name_length'].tolist())\nstd = np_array.std()\nmean = np_array.mean()\nfor i in range(len(np_array)):\n    np_array[i] = (np_array[i] - mean)/std\n    \ndf['last_name_length'] = np_array.tolist()\n\nnp_array = np.array(df['year_difference'].tolist())\nstd = np_array.std()\nmean = np_array.mean()\nfor i in range(len(np_array)):\n    np_array[i] = (np_array[i] - mean)/std\n    \ndf['year_difference'] = np_array.tolist()\n"

In [10]:
# Now we delete the ones we don't need in the classifier
df_to_save = df.drop(['CT', 'PubMed', 'ct_last_name', 'ct_first_name_initial', 'ct_first_name',
              'ct_organization', 'ct_mail', 'ct_year', 'ct_initials', 'ct_title', 'ct_country', 'ct_city', 
                'ar_last_name','ar_first_name_initial','ar_first_name','ar_organization', 'ar_mail', 'ar_initials',
                      'ar_year', 'ar_title', 'ar_location'], axis = 1)

In [11]:
# Writing the dataframe to file
import csv
import codecs
csv = df_to_save.to_csv(index=False)

file = codecs.open("src\\dataframe.csv", "w", "utf-8")
file.write(csv)
file.close()

In [12]:
importlib.reload(article_info)
importlib.reload(ct_info)
importlib.reload(calculate_attributes)
importlib.reload(clinical_trial)
importlib.reload(common_functions)

<module 'src.common_functions.common_functions' from 'C:\\Users\\Brescia\\Anaconda3\\Diploma\\src\\common_functions\\common_functions.py'>

In [29]:
df.iloc[0]

CT                            <src.clinical_trials.clinical_trial.ClinicalTr...
PubMed                        <src.articles.article.Article object at 0x0000...
common_answer                                                                 1
ct_last_name                                                         abdulkarim
ct_first_name_initial                                                         b
ct_first_name                                                            bassam
ct_organization                                      AHS Cancer Control Alberta
ct_mail                                                                    None
ct_year                                                                    2008
ct_initials                                                                   b
ct_title                      Combining Radiotherapy and Temozolomide With D...
ct_country                                                                   []
ct_city                                 

In [14]:
df

Unnamed: 0,CT,PubMed,common_answer,ct_last_name,ct_first_name_initial,ct_first_name,ct_organization,ct_mail,ct_year,ct_initials,...,email_equality,year_difference,last_name_length,initials_equality,namespace_size,country_equality,city_equality,organization_type_equality,jds_similarity,sts_similarity
0,<src.clinical_trials.clinical_trial.ClinicalTr...,<src.articles.article.Article object at 0x0000...,1,abdulkarim,b,bassam,AHS Cancer Control Alberta,,2008,b,...,0,0.000000,0.470588,1,0.000627,0,0,0,0.629170,0.441395
1,<src.clinical_trials.clinical_trial.ClinicalTr...,<src.articles.article.Article object at 0x0000...,1,gawin,f,frank,"Friends Research Institute, Inc.",,1999,f,...,0,0.045455,0.176471,1,0.000533,0,0,0,0.217699,0.103129
2,<src.clinical_trials.clinical_trial.ClinicalTr...,<src.articles.article.Article object at 0x0000...,1,deutsch,s,steven,Washington D.C. Veterans Affairs Medical Center,,1999,s,...,0,0.181818,0.294118,1,0.005061,0,0,0,0.000000,0.034479
3,<src.clinical_trials.clinical_trial.ClinicalTr...,<src.articles.article.Article object at 0x0000...,1,maisiak,r,richard s,University of Alabama at Birmingham,,1999,rs,...,0,0.106061,0.294118,1,0.000421,0,0,0,0.032278,0.028387
4,<src.clinical_trials.clinical_trial.ClinicalTr...,<src.articles.article.Article object at 0x0000...,1,gorden,p,phillip,National Institute of Diabetes and Digestive a...,gordenp@extra.niddk.nih.gov,1999,p,...,0,0.196970,0.235294,1,0.003331,1,1,1,0.661631,0.418099
5,<src.clinical_trials.clinical_trial.ClinicalTr...,<src.articles.article.Article object at 0x0000...,1,hochster,h,howard,Eastern Cooperative Oncology Group,,2000,h,...,0,0.121212,0.352941,1,0.001637,1,0,0,0.267082,0.374291
6,<src.clinical_trials.clinical_trial.ClinicalTr...,<src.articles.article.Article object at 0x0000...,1,fabian,c,carol j,University of Kansas Medical Center,bkimler@kumc.edu,2000,cj,...,0,0.106061,0.235294,0,0.002161,0,0,0,0.423678,0.374529
7,<src.clinical_trials.clinical_trial.ClinicalTr...,<src.articles.article.Article object at 0x0000...,1,dematteo,r,ronald,American College of Surgeons,,2001,r,...,0,0.227273,0.352941,1,0.004509,0,0,0,0.140057,0.345120
8,<src.clinical_trials.clinical_trial.ClinicalTr...,<src.articles.article.Article object at 0x0000...,0,klein,j,jonathan d,University of Rochester,,2002,jd,...,0,0.181818,0.176471,0,0.042241,1,0,1,0.000000,0.035419
9,<src.clinical_trials.clinical_trial.ClinicalTr...,<src.articles.article.Article object at 0x0000...,1,dispenzieri,a,angela,Mayo Clinic,,2002,a,...,0,0.015152,0.529412,1,0.006418,0,0,0,0.545796,0.270666


In [28]:
lista = df['PubMed'].tolist()
abstracts = []

count = 0
for i in range(len(lista)):
    coso = lista[i].article.findAll('MeshHeading')
    if len(coso) == 2:
        print(coso[0].text)
        
print(count)


Emergency Medicine
trends


Famous Persons


Humans

0
