In [1]:
import articles.article_fetch as article_fetch
import articles.articles_info as article_info
from articles import article

import csv
import codecs

import pandas as pd
from classifiers.random_forest import RandomForest
from classifiers.svm_classifier import SVMClassifier
from classifiers.mlp_classifier import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_recall_fscore_support

import clinical_trials.clinical_trial_fetch as ct_fetch
import clinical_trials.clinical_trials_info as ct_info
from clinical_trials import clinical_trial

import csv
import csv_data.csv as csv_data

from dataframe import create_dataframe
from dataframe import calculate_attributes

from common_functions import common_functions

import pandas as pd

import importlib

# Which organization-extraction variant to use for clinical trials (ct) and
# for articles (ar). Accepted values per the original note: standard, spacy or stanford.
ct_org_sample = 'standard'
ar_org_sample = 'spacy'

# Load the gold standard: a semicolon-separated, ISO-8859-1 encoded CSV.
gold_standard = pd.read_csv('ClinicalPmidsALL.csv', sep=';', encoding='ISO-8859-1')

# Both helpers receive the frame itself and their return value is not used here.
# Per the original notes: the first turns the "common answer" strings into
# numbers, the second corrects answers that are known to be wrong in the CSV.
csv_data.numerical_answers(gold_standard)
csv_data.correctData(gold_standard)

# Fetch the XML of every clinical trial listed in the 'CT' column.
Clinical_trials = ct_fetch.get_xml_doms(gold_standard['CT'].tolist())

# Fetch every PubMed article; the PMIDs are passed as a list of strings.
# Fetching the articles takes a while.
PubMed_id_string = [str(pmid) for pmid in gold_standard['PMID'].tolist()]
PubMed_articles = article_fetch.fetch_many_articles(PubMed_id_string, local=True)

# Pair each clinical trial with its article and the common answer
# (per the original note, pairs whose documents are missing are left out).
df = create_dataframe.get_base_dataframe(gold_standard, Clinical_trials, PubMed_articles)

  from numpy.core.umath_tests import inner1d


article 27953647  not found.
article 27948541  not found.
article 27943881  not found.
article 27949797  not found.
article 27955116  not found.
article 27950623  not found.
article 27945102  not found.
clinical trial NCT02659670  not found.
article 27198327  not found.


In [2]:
# Split the name attached to each clinical trial into last name,
# first-name initial and first name (three lists, one entry per row of df).
ct_last_names, ct_first_name_initials, ct_first_names = ct_info.get_all_name_parts(
    df['CT'].tolist(),
    gold_standard,
)

# Store each list in its own column.
df['ct_last_name'] = ct_last_names
df['ct_first_name_initial'] = ct_first_name_initials
df['ct_first_name'] = ct_first_names

In [3]:
# Keep only the rows without missing values, then renumber the index from 0.
df = (
    df
    .dropna()
    .reset_index(drop=True)
)

In [4]:
# Split the name attached to each article into last name, first-name initial
# and first name (three lists, one entry per row of df).
ar_last_names, ar_first_name_initials, ar_first_names = article_info.get_all_name_parts(
    df['PubMed'].tolist(),
    gold_standard,
)

df['ar_last_name'] = ar_last_names
df['ar_first_name_initial'] = ar_first_name_initials
df['ar_first_name'] = ar_first_names

# Drop the rows with missing values and renumber the index, exactly as is done
# after the clinical-trial name parts are added. DataFrame.dropna returns a new
# frame, so the result has to be assigned back; calling it without assignment
# (as this cell previously did) leaves df unchanged.
df = df.dropna().reset_index(drop=True)




In [5]:
# With the base columns in place, add the remaining clinical-trial attributes.


def _insert_if_missing(frame, position, column, values):
    """Insert `values` as `column` at `position` unless `frame` already has that column.

    The guard makes it possible to re-run the cell without `DataFrame.insert`
    failing on a column that already exists.
    """
    if column in frame:
        return
    frame.insert(position, column, values)


ct_documents = df['CT'].tolist()

# Organization of each clinical trial, extracted with the configured variant.
ct_organization_names = ct_info.get_all_organization_names(
    ct_documents,
    df['ct_last_name'].tolist(),
    df['ct_first_name_initial'].tolist(),
    sample=ct_org_sample,
)
_insert_if_missing(df, 6, 'ct_organization', ct_organization_names)

# E-mail addresses.
ct_mails = ct_info.get_all_mails(
    ct_documents,
    df['ct_last_name'].tolist(),
    df['ct_first_name_initial'].tolist(),
)
_insert_if_missing(df, 7, 'ct_mail', ct_mails)

# Years.
ct_years = ct_info.get_all_years(ct_documents)
_insert_if_missing(df, 8, 'ct_year', ct_years)

# Initials derived from the first names.
ct_initials = ct_info.get_all_initials(df['ct_first_name'].tolist())
_insert_if_missing(df, 9, 'ct_initials', ct_initials)

# Titles.
ct_titles = ct_info.get_all_titles(ct_documents)
_insert_if_missing(df, 10, 'ct_title', ct_titles)

# Countries and cities.
ct_countries, ct_cities = ct_info.get_all_countries_and_cities(ct_documents)
_insert_if_missing(df, 11, 'ct_country', ct_countries)
_insert_if_missing(df, 12, 'ct_city', ct_cities)

NCT00730210 doesn't have an organization.
NCT02220283 doesn't have an organization.


In [6]:
# Number of rows whose clinical-trial e-mail entry is not None.
ct_mails_present = [mail for mail in df['ct_mail'] if mail is not None]
print(len(ct_mails_present))

267


In [7]:
# Add to the articles the same attributes that were added to the clinical trials.

# Organizations and locations, extracted with the configured variant.
ar_organization_names, ar_locations = article_info.get_all_organizations_locations(
    df['PubMed'].tolist(),
    df['ar_last_name'].tolist(),
    df['ar_first_name_initial'].tolist(),
    sample=ar_org_sample,
)
df['ar_organization'] = ar_organization_names

# E-mail addresses.
ar_mails = article_info.get_all_mails(
    df['PubMed'].tolist(),
    df['ar_last_name'].tolist(),
    df['ar_first_name_initial'].tolist(),
)
df['ar_mail'] = ar_mails

# Years.
ar_years = article_info.get_all_years(df['PubMed'].tolist())
df['ar_year'] = ar_years

# Initials derived from the first names. The column is passed as a plain list,
# like every other helper call in this notebook (including the clinical-trial
# counterpart), instead of as a pandas Series.
ar_initials = article_info.get_all_initials(df['ar_first_name'].tolist())
df['ar_initials'] = ar_initials

# Titles.
ar_titles = article_info.get_all_titles(df['PubMed'].tolist())
df['ar_title'] = ar_titles

df['ar_location'] = ar_locations

# How many rows have an organization / an e-mail entry that is not None.
print(sum(x is not None for x in ar_organization_names))
print(sum(x is not None for x in ar_mails))

475
126


In [8]:
# I now calculate useful attributes for the classifiers
first_name_equalities = calculate_attributes.get_string_arrays_similarity(df['ct_first_name'].tolist(),
                                                                          df['ar_first_name'].tolist())
organization_similarities, organization_type_equalities = calculate_attributes.get_organization_similarity_and_type_equality(df['ct_organization'].tolist(),
                                                                             df['ar_organization'].tolist(),
                                                                             ct_org_sample, ar_org_sample)
email_equalities = calculate_attributes.get_arrays_equality(df['ct_mail'].tolist(), df['ar_mail'].tolist())
year_differences = calculate_attributes.get_year_differences(df['ct_year'].tolist(), df['ar_year'].tolist())
last_name_lengths = calculate_attributes.get_last_name_lengths(df['ct_last_name'].tolist())
initials_equality = calculate_attributes.get_arrays_equality(df['ct_initials'].tolist(), df['ar_initials'].tolist())
namespace_sizes = calculate_attributes.get_namespace_ambiguities(df['ct_last_name'].tolist(),
                                                                 df['ct_first_name_initial'].tolist())
country_equalities, city_equalities = calculate_attributes.get_location_equalities(df['ct_country'].tolist(),
                                                                                   df['ct_city'].tolist(),
                                                                                   df['ar_location'].tolist())

# Let's add the attributes to the data frame
df['first_name_equality'], df['organization_similarity'] = [first_name_equalities, organization_similarities]
df['email_equality'], df['year_difference'] = [email_equalities, year_differences]
df['last_name_length'], df['initials_equality'] = [last_name_lengths, initials_equality]
df['namespace_size'], df ['country_equality'] = [namespace_sizes, country_equalities]
df['city_equality'], df['organization_type_equality'] = [city_equalities, organization_type_equalities]

In [9]:
from java_libraries import java_server

clinical_trials = df['CT'].tolist()
articles = df['PubMed'].tolist()

# The server stays open after this cell; it is closed by a later cell.
server = java_server.JavaServer()


def _text_with_title(document):
    """Return the document's text pieces followed by its title, joined by single spaces.

    A new list is built instead of appending to the list returned by
    `get_text()`, so the document's own data is never modified.
    NOTE(review): whether `get_text()` hands out an internal list is not visible
    from this notebook; if it does, appending to it in place would add the title
    once more on every re-run of this cell.
    """
    return " ".join([*document.get_text(), document.get_title()])


# For every (clinical trial, article) pair keep the server's jds and sts
# results as a two-element list: [clinical-trial result, article result].
jds = []
sts = []
for trial, paper in zip(clinical_trials, articles):
    ct_texts = _text_with_title(trial)
    ar_texts = _text_with_title(paper)

    jds.append([server.get_jds(ct_texts), server.get_jds(ar_texts)])
    sts.append([server.get_sts(ct_texts), server.get_sts(ar_texts)])

In [10]:
# Shut down the Java server that was started to collect the jds/sts results;
# it is not used by any of the following cells.
server.close_server()

In [None]:
# Grid search over how many leading jds entries (i) and sts entries (kk) are
# compared with the "basic" similarity. Each configuration is scored by the
# weighted precision of a random forest, averaged over several train/test splits.
n_splits = 100  # train/test splits (random_state = 0 .. n_splits - 1) per configuration

# Columns that hold documents or raw attributes rather than model features.
non_feature_columns = [
    'CT', 'PubMed', 'ct_last_name', 'ct_first_name_initial', 'ct_first_name',
    'ct_organization', 'ct_mail', 'ct_year', 'ct_initials', 'ct_title', 'ct_country', 'ct_city',
    'ar_last_name', 'ar_first_name_initial', 'ar_first_name', 'ar_organization', 'ar_mail',
    'ar_initials', 'ar_year', 'ar_title', 'ar_location',
]

max_precision = 0
# Bound up front so the final print cannot fail when no configuration scores above 0.
best_i = None
best_kk = None

# Normalize once, before the search. Doing it inside the loop re-normalized
# already-normalized columns on every iteration, which is either wasted work or,
# if normalize() is not idempotent, makes the features drift between configurations.
df['namespace_size'] = common_functions.normalize(df['namespace_size'])
df['last_name_length'] = common_functions.normalize(df['last_name_length'])
df['year_difference'] = common_functions.normalize(df['year_difference'])

for i in range(1, 20):
    # Depends only on i, so it is computed once per outer iteration.
    jds_similarities = [
        calculate_attributes.get_jds_sts_basic_similarities(ct_jds[:i], ar_jds[:i])
        for ct_jds, ar_jds in jds
    ]

    for kk in range(1, 20):
        sts_similarities = [
            calculate_attributes.get_jds_sts_basic_similarities(ct_sts[:kk], ar_sts[:kk])
            for ct_sts, ar_sts in sts
        ]

        df['jds_similarity'] = jds_similarities
        df['sts_similarity'] = sts_similarities

        # Save the feature columns as CSV. The text is kept in `csv_text` so the
        # imported `csv` module is not overwritten, and the `with` block closes
        # the file even if the write fails.
        df_to_save = df.drop(non_feature_columns, axis=1)
        csv_text = df_to_save.to_csv(index=False)
        with codecs.open("dataframe.csv", "w", "utf-8") as csv_file:
            csv_file.write(csv_text)

        # Read the features back from disk.
        # NOTE(review): the CSV round trip is kept because it may change how
        # values are typed (e.g. missing values); confirm before removing it.
        df_model = pd.read_csv('dataframe.csv', encoding='utf-8')
        x = df_model.drop('common_answer', axis=1)
        y = df_model['common_answer']

        precisions = []
        for k in range(n_splits):
            x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=k)
            classifier = RandomForest(0)
            classifier.create_model(x_train, y_train)
            pred = classifier.predict(x_test)
            precisions.append(precision_recall_fscore_support(y_test, pred, average='weighted')[0])
        avg_precision = sum(precisions) / n_splits

        # Report both search parameters (kk was previously missing from this line).
        print("i:", i, "kk:", kk, "average precision:", avg_precision)

        if avg_precision > max_precision:
            max_precision = avg_precision
            best_i = i
            best_kk = kk

print(best_i, best_kk)

In [None]:
# Search over how many leading jds/sts entries (i, shared by both) are compared
# with the "percentage" similarity. Each value of i is scored by the weighted
# precision of a random forest, averaged over several train/test splits.
n_splits = 100  # train/test splits (random_state = 0 .. n_splits - 1) per configuration

# Columns that hold documents or raw attributes rather than model features.
non_feature_columns = [
    'CT', 'PubMed', 'ct_last_name', 'ct_first_name_initial', 'ct_first_name',
    'ct_organization', 'ct_mail', 'ct_year', 'ct_initials', 'ct_title', 'ct_country', 'ct_city',
    'ar_last_name', 'ar_first_name_initial', 'ar_first_name', 'ar_organization', 'ar_mail',
    'ar_initials', 'ar_year', 'ar_title', 'ar_location',
]

max_precision = 0
# Bound up front so the final print cannot fail when no configuration scores above 0.
best_i = None

# Normalize once, before the search. Doing it inside the loop re-normalized
# already-normalized columns on every iteration, which is either wasted work or,
# if normalize() is not idempotent, makes the features drift between configurations.
df['namespace_size'] = common_functions.normalize(df['namespace_size'])
df['last_name_length'] = common_functions.normalize(df['last_name_length'])
df['year_difference'] = common_functions.normalize(df['year_difference'])

for i in range(1, 100):
    jds_similarities = [
        calculate_attributes.get_jds_sts_percentage_similarities(ct_jds[:i], ar_jds[:i])
        for ct_jds, ar_jds in jds
    ]
    sts_similarities = [
        calculate_attributes.get_jds_sts_percentage_similarities(ct_sts[:i], ar_sts[:i])
        for ct_sts, ar_sts in sts
    ]

    df['jds_similarity'] = jds_similarities
    df['sts_similarity'] = sts_similarities

    # Save the feature columns as CSV. The text is kept in `csv_text` so the
    # imported `csv` module is not overwritten, and the `with` block closes the
    # file even if the write fails.
    df_to_save = df.drop(non_feature_columns, axis=1)
    csv_text = df_to_save.to_csv(index=False)
    with codecs.open("dataframe.csv", "w", "utf-8") as csv_file:
        csv_file.write(csv_text)

    # Read the features back from disk.
    # NOTE(review): the CSV round trip is kept because it may change how values
    # are typed (e.g. missing values); confirm before removing it.
    df_model = pd.read_csv('dataframe.csv', encoding='utf-8')
    x = df_model.drop('common_answer', axis=1)
    y = df_model['common_answer']

    precisions = []
    for k in range(n_splits):
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=k)
        classifier = RandomForest(0)
        classifier.create_model(x_train, y_train)
        pred = classifier.predict(x_test)
        precisions.append(precision_recall_fscore_support(y_test, pred, average='weighted')[0])
    avg_precision = sum(precisions) / n_splits

    print("i:", i, "average precision:", avg_precision)

    if avg_precision > max_precision:
        max_precision = avg_precision
        best_i = i

print(best_i)

In [None]:
# Search over how many leading jds/sts entries (i, shared by both) are compared
# with the "ranking" similarity. Each value of i is scored by the weighted
# precision of a random forest, averaged over several train/test splits.
n_splits = 100  # train/test splits (random_state = 0 .. n_splits - 1) per configuration

# Columns that hold documents or raw attributes rather than model features.
non_feature_columns = [
    'CT', 'PubMed', 'ct_last_name', 'ct_first_name_initial', 'ct_first_name',
    'ct_organization', 'ct_mail', 'ct_year', 'ct_initials', 'ct_title', 'ct_country', 'ct_city',
    'ar_last_name', 'ar_first_name_initial', 'ar_first_name', 'ar_organization', 'ar_mail',
    'ar_initials', 'ar_year', 'ar_title', 'ar_location',
]

max_precision = 0
# Bound up front so the final print cannot fail when no configuration scores above 0.
best_i = None

# Normalize once, before the search. Doing it inside the loop re-normalized
# already-normalized columns on every iteration, which is either wasted work or,
# if normalize() is not idempotent, makes the features drift between configurations.
df['namespace_size'] = common_functions.normalize(df['namespace_size'])
df['last_name_length'] = common_functions.normalize(df['last_name_length'])
df['year_difference'] = common_functions.normalize(df['year_difference'])

for i in range(1, 100):
    jds_similarities = [
        calculate_attributes.get_jds_sts_ranking_similarities(ct_jds[:i], ar_jds[:i])
        for ct_jds, ar_jds in jds
    ]
    sts_similarities = [
        calculate_attributes.get_jds_sts_ranking_similarities(ct_sts[:i], ar_sts[:i])
        for ct_sts, ar_sts in sts
    ]

    df['jds_similarity'] = jds_similarities
    df['sts_similarity'] = sts_similarities

    # Save the feature columns as CSV. The text is kept in `csv_text` so the
    # imported `csv` module is not overwritten, and the `with` block closes the
    # file even if the write fails.
    df_to_save = df.drop(non_feature_columns, axis=1)
    csv_text = df_to_save.to_csv(index=False)
    with codecs.open("dataframe.csv", "w", "utf-8") as csv_file:
        csv_file.write(csv_text)

    # Read the features back from disk.
    # NOTE(review): the CSV round trip is kept because it may change how values
    # are typed (e.g. missing values); confirm before removing it.
    df_model = pd.read_csv('dataframe.csv', encoding='utf-8')
    x = df_model.drop('common_answer', axis=1)
    y = df_model['common_answer']

    precisions = []
    for k in range(n_splits):
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=k)
        classifier = RandomForest(0)
        classifier.create_model(x_train, y_train)
        pred = classifier.predict(x_test)
        precisions.append(precision_recall_fscore_support(y_test, pred, average='weighted')[0])
    avg_precision = sum(precisions) / n_splits

    print("i:", i, "average precision:", avg_precision)

    if avg_precision > max_precision:
        max_precision = avg_precision
        best_i = i

print(best_i)

In [12]:
# Grid search over how many leading jds entries (i) and sts entries (kk) are
# compared with the "percentage ranking" similarity. Each configuration is
# scored by the weighted precision of a random forest, averaged over several
# train/test splits.
n_splits = 15  # train/test splits (random_state = 0 .. n_splits - 1) per configuration

# Columns that hold documents or raw attributes rather than model features.
non_feature_columns = [
    'CT', 'PubMed', 'ct_last_name', 'ct_first_name_initial', 'ct_first_name',
    'ct_organization', 'ct_mail', 'ct_year', 'ct_initials', 'ct_title', 'ct_country', 'ct_city',
    'ar_last_name', 'ar_first_name_initial', 'ar_first_name', 'ar_organization', 'ar_mail',
    'ar_initials', 'ar_year', 'ar_title', 'ar_location',
]

max_precision = 0
# Bound up front so the final print cannot fail when no configuration scores above 0.
best_i = None
best_kk = None

# Normalize once, before the search. Doing it inside the loop re-normalized
# already-normalized columns on every iteration, which is either wasted work or,
# if normalize() is not idempotent, makes the features drift between configurations.
df['namespace_size'] = common_functions.normalize(df['namespace_size'])
df['last_name_length'] = common_functions.normalize(df['last_name_length'])
df['year_difference'] = common_functions.normalize(df['year_difference'])

for i in range(4, 50):
    # Depends only on i, so it is computed once per outer iteration.
    jds_similarities = [
        calculate_attributes.get_jds_sts_percentage_ranking_similarities(ct_jds[:i], ar_jds[:i])
        for ct_jds, ar_jds in jds
    ]

    for kk in range(1, 50):
        sts_similarities = [
            calculate_attributes.get_jds_sts_percentage_ranking_similarities(ct_sts[:kk], ar_sts[:kk])
            for ct_sts, ar_sts in sts
        ]

        df['jds_similarity'] = jds_similarities
        df['sts_similarity'] = sts_similarities

        # Save the feature columns as CSV. The text is kept in `csv_text` so the
        # imported `csv` module is not overwritten, and the `with` block closes
        # the file even if the write fails.
        df_to_save = df.drop(non_feature_columns, axis=1)
        csv_text = df_to_save.to_csv(index=False)
        with codecs.open("dataframe.csv", "w", "utf-8") as csv_file:
            csv_file.write(csv_text)

        # Read the features back from disk.
        # NOTE(review): the CSV round trip is kept because it may change how
        # values are typed (e.g. missing values); confirm before removing it.
        df_model = pd.read_csv('dataframe.csv', encoding='utf-8')
        x = df_model.drop('common_answer', axis=1)
        y = df_model['common_answer']

        precisions = []
        for k in range(n_splits):
            x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=k)
            classifier = RandomForest(0)
            classifier.create_model(x_train, y_train)
            pred = classifier.predict(x_test)
            precisions.append(precision_recall_fscore_support(y_test, pred, average='weighted')[0])
        avg_precision = sum(precisions) / n_splits

        # Each label now sits directly in front of its own value (the
        # "average precision:" label used to be printed before kk).
        print("i:", i, "kk:", kk, "average precision:", avg_precision)

        if avg_precision > max_precision:
            max_precision = avg_precision
            best_i = i
            best_kk = kk

print(best_i, best_kk)

i: 4 average precision: kk: 1 0.8871453039693695
i: 4 average precision: kk: 2 0.8849539626403151
i: 4 average precision: kk: 3 0.8828043849315894
i: 4 average precision: kk: 4 0.8834250961484948
i: 4 average precision: kk: 5 0.8857022645417897
i: 4 average precision: kk: 6 0.8838896215698352
i: 4 average precision: kk: 7 0.8838774557081963
i: 4 average precision: kk: 8 0.8839015357802716
i: 4 average precision: kk: 9 0.8858048558172129
i: 4 average precision: kk: 10 0.8904061853664359
i: 4 average precision: kk: 11 0.8892342634239935
i: 4 average precision: kk: 12 0.8887784118744395
i: 4 average precision: kk: 13 0.8869539583629461
i: 4 average precision: kk: 14 0.88708454395335
i: 4 average precision: kk: 15 0.8874596249745109
i: 4 average precision: kk: 16 0.8867620505722483
i: 4 average precision: kk: 17 0.8879746673054871
i: 4 average precision: kk: 18 0.8871773611057993
i: 4 average precision: kk: 19 0.8870156283506719
i: 4 average precision: kk: 20 0.8876676275638232
i: 4 averag

KeyboardInterrupt: 

In [None]:
# Development helper: re-execute the project modules below so that changes made
# to their source files are picked up without restarting the kernel.
# importlib.reload updates each module object in place, so the names imported
# at the top of the notebook keep pointing at the refreshed modules.
importlib.reload(article_info)
importlib.reload(ct_info)
importlib.reload(calculate_attributes)
importlib.reload(clinical_trial)
importlib.reload(common_functions)

In [None]:
# Preview the first rows only; displaying the whole frame floods the notebook
# output with every row and column.
df.head()