In [1]:
import articles.article_fetch as article_fetch
import articles.articles_info as article_info
from articles import article

import clinical_trials.clinical_trial_fetch as ct_fetch
import clinical_trials.clinical_trials_info as ct_info
from clinical_trials import clinical_trial

import csv
import csv_data.csv as csv_data

from dataframe import create_dataframe
from dataframe import calculate_attributes

import pandas as pd

import importlib

# I get the gold standard
gold_standard = pd.read_csv('ClinicalPmidsALL.csv', encoding = 'ISO-8859-1', sep = ';')

# Now I change the common answer from string to numerical
csv_data.numerical_answers(gold_standard)

# Let's correct data on csv (common answer sometimes is not correct)
csv_data.correctData(gold_standard)

# I get all the clinical trials ID and I get the articles in xml
Clinical_trials = ct_fetch.get_xml_doms(gold_standard['CT'].tolist())

# I get all the PMID and I get the articles in xml
PubMed_id_string = list(map(str, gold_standard['PMID'].tolist())) # I get the PMID as list of strings
PubMed_articles = article_fetch.fetch_many_articles(PubMed_id_string, local=True) # It takes time to fetch the articles

# Let's map every clinical trial to every article (and remove the one that are not present) and common answer
df = create_dataframe.get_base_dataframe(gold_standard, Clinical_trials, PubMed_articles)

article 27953647  not found.
article 27948541  not found.
article 27943881  not found.
article 27949797  not found.
article 27955116  not found.
article 27950623  not found.
article 27945102  not found.
clinical trial NCT02659670  not found.
article 27198327  not found.


In [2]:
#lam = gold_standard['CT']
#lam.index[lam == 'NCT00003204'].tolist()[0]

ct_last_names, ct_first_name_initials, ct_first_names = ct_info.get_all_name_parts(df['CT'].tolist(), gold_standard)

df['ct_last_name'], df['ct_first_name_initial'], df['ct_first_name'] = [ct_last_names, ct_first_name_initials, ct_first_names]


In [3]:
# print(sum(x is not None for x in ct_last_names))
df = df.dropna().reset_index(drop = True)

In [4]:
ar_last_names, ar_first_name_initials, ar_first_names = article_info.get_all_name_parts(df['PubMed'].tolist(), gold_standard)

df['ar_last_name'], df['ar_first_name_initial'], df['ar_first_name'] = [ar_last_names, ar_first_name_initials, ar_first_names]
df.dropna().reset_index(drop = True)
print()




In [5]:
# now that we have the base information, let's add other attributes

# we get the organization of the principal investigator, so we don't need to pass his/her name
ct_organization_names = ct_info.get_all_organization_names(df['CT'].tolist(), df['ct_last_name'].tolist(),
                                                           df['ct_first_name_initial'].tolist())

# Now we insert it in the dataframe
if('ct_organization' not in df):
    df.insert(6, 'ct_organization', ct_organization_names)
    

# I get the e-mails
ct_mails = ct_info.get_all_mails(df['CT'].tolist(), df['ct_last_name'].tolist(), df['ct_first_name_initial'].tolist())

if('ct_mail' not in df):
    df.insert(7, 'ct_mail', ct_mails)
    
# I get the year
ct_years = ct_info.get_all_years(df['CT'].tolist())
    
if('ct_year' not in df):
    df.insert(8, 'ct_year', ct_years)
    
# I get the initials of the name

ct_initials = ct_info.get_all_initials(df['ct_first_name'].tolist())

if('ct_initials' not in df):
    df.insert(9, 'ct_initials', ct_initials)

NCT00730210 doesn't have an organization.
NCT02220283 doesn't have an organization.


In [6]:
print(sum(x is not None for x in df['ct_mail']))

267


In [7]:
# Let's add the same attributes for the articles

ar_organization_names = article_info.get_all_organizations(df['PubMed'].tolist(), df['ar_last_name'].tolist(),
                                                           df['ar_first_name_initial'].tolist())
df['ar_organization'] = ar_organization_names

ar_mails = article_info.get_all_mails(df['PubMed'].tolist(), df['ar_last_name'].tolist(),
                                      df['ar_first_name_initial'].tolist())
df['ar_mail'] = ar_mails

ar_years = article_info.get_all_years(df['PubMed'].tolist())

df['ar_year'] = ar_years

ar_initials = article_info.get_all_initials(df['ar_first_name'])

df['ar_initials'] = ar_initials

print(sum(x is not None for x in ar_organization_names))
print(sum(x is not None for x in ar_mails))

475
126


In [8]:
# I now calculate useful attributes for the classifiers
first_name_equalities = calculate_attributes.get_string_arrays_similarity(df['ct_first_name'].tolist(),
                                                                          df['ar_first_name'].tolist())
organization_similarities = calculate_attributes.get_organization_similarity(df['ct_organization'].tolist(),
                                                                             df['ar_organization'].tolist())
email_equalities = calculate_attributes.get_arrays_equality(df['ct_mail'].tolist(), df['ar_mail'].tolist())
year_differences = calculate_attributes.get_year_differences(df['ct_year'].tolist(), df['ar_year'].tolist())
last_name_lengths = calculate_attributes.get_last_name_lengths(df['ct_last_name'].tolist())
initials_equality = calculate_attributes.get_arrays_equality(df['ct_initials'].tolist(), df['ar_initials'].tolist())

# Let's add the attributes to the data frame
df['first_name_equality'], df['organization_similarity'] = [first_name_equalities, organization_similarities]
df['email_equality'], df['year_difference'] = [email_equalities, year_differences]
df['last_name_length'], df['initials_equality'] = [last_name_lengths, initials_equality]

In [9]:
# Now we delete the ones we don't need in the classifier
df_to_save = df.drop(['CT', 'PubMed', 'ct_last_name', 'ct_first_name_initial', 'ct_first_name',
              'ct_organization', 'ct_mail', 'ct_year', 'ct_initials', 'ar_last_name', 'ar_first_name_initial',
                'ar_first_name','ar_organization', 'ar_mail', 'ar_initials', 'ar_year'], axis = 1)

In [10]:
# Writing the dataframe to file
import csv
import codecs
csv = df_to_save.to_csv(index=False)

file = codecs.open("dataframe.csv", "w", "utf-8")
file.write(csv)
file.close()

In [11]:
importlib.reload(article_info)
importlib.reload(ct_info)
importlib.reload(calculate_attributes)
importlib.reload(clinical_trial)

<module 'clinical_trials.clinical_trial' from 'C:\\Users\\Brescia\\Anaconda3\\Diploma\\clinical_trials\\clinical_trial.py'>

In [12]:
df.iloc[40]

CT                         <clinical_trials.clinical_trial.ClinicalTrial ...
PubMed                     <articles.article.Article object at 0x0000028C...
common_answer                                                              0
ct_last_name                                                            saba
ct_first_name_initial                                                      s
ct_first_name                                                        samir f
ct_organization                                     University of Pittsburgh
ct_mail                                                                 None
ct_year                                                                 2005
ct_initials                                                               sf
ar_last_name                                                            saba
ar_first_name_initial                                                      s
ar_first_name                                                            s m

In [13]:
df

Unnamed: 0,CT,PubMed,common_answer,ct_last_name,ct_first_name_initial,ct_first_name,ct_organization,ct_mail,ct_year,ct_initials,...,ar_organization,ar_mail,ar_year,ar_initials,first_name_equality,organization_similarity,email_equality,year_difference,last_name_length,initials_equality
0,<clinical_trials.clinical_trial.ClinicalTrial ...,<articles.article.Article object at 0x0000028C...,1,abdulkarim,b,bassam,AHS Cancer Control Alberta,,2008,b,...,,,2008,b,0.166667,0.000000,0,0,10,1
1,<clinical_trials.clinical_trial.ClinicalTrial ...,<articles.article.Article object at 0x0000028C...,1,gawin,f,frank,"Friends Research Institute, Inc.",,1999,f,...,,,2002,f,1.000000,0.000000,0,3,5,1
2,<clinical_trials.clinical_trial.ClinicalTrial ...,<articles.article.Article object at 0x0000028C...,1,deutsch,s,steven,Washington D.C. Veterans Affairs Medical Center,,1999,s,...,,,2011,s,1.000000,0.000000,0,12,7,1
3,<clinical_trials.clinical_trial.ClinicalTrial ...,<articles.article.Article object at 0x0000028C...,1,maisiak,r,richard s,University of Alabama at Birmingham,,1999,rs,...,,,2006,rs,1.000000,0.000000,0,7,7,1
4,<clinical_trials.clinical_trial.ClinicalTrial ...,<articles.article.Article object at 0x0000028C...,1,gorden,p,phillip,National Institute of Diabetes and Digestive a...,gordenp@extra.niddk.nih.gov,1999,p,...,"Diabetes, Endocrinology, and Obesity Branch, N...",phillipg@intra.niddk.nih.gov,2012,p,1.000000,0.900000,0,13,6,1
5,<clinical_trials.clinical_trial.ClinicalTrial ...,<articles.article.Article object at 0x0000028C...,1,hochster,h,howard,Eastern Cooperative Oncology Group,,2000,h,...,"Division of Medical Oncology, New York Univers...",howard.hochster@med.nyu.edu,2008,h,1.000000,0.250000,0,8,8,1
6,<clinical_trials.clinical_trial.ClinicalTrial ...,<articles.article.Article object at 0x0000028C...,1,fabian,c,carol j,University of Kansas Medical Center,bkimler@kumc.edu,2000,cj,...,,,2007,c,0.714286,0.000000,0,7,6,0
7,<clinical_trials.clinical_trial.ClinicalTrial ...,<articles.article.Article object at 0x0000028C...,1,dematteo,r,ronald,American College of Surgeons,,2001,r,...,"Karen T. Brown, Richard K. Do, Mithat Gonen, A...",,2016,r,1.000000,0.750000,0,15,8,1
8,<clinical_trials.clinical_trial.ClinicalTrial ...,<articles.article.Article object at 0x0000028C...,0,klein,j,jonathan d,University of Rochester,,2002,jd,...,"University of California; Irvine, CA, USA.",,2014,j,0.100000,0.666667,0,12,5,0
9,<clinical_trials.clinical_trial.ClinicalTrial ...,<articles.article.Article object at 0x0000028C...,1,dispenzieri,a,angela,Mayo Clinic,,2002,a,...,,,2001,a,0.166667,0.000000,0,1,11,1


In [14]:
lista = df['CT'].tolist()
a = df['ct_last_name'].tolist()
b = df['ct_first_name_initial'].tolist()
mails = df['ct_mail'].tolist()

count = 0

for i in range(len(lista)):
    tags = lista[i].get_all_name_tags(a[i],b[i])
    
    for tag in tags:
        if tag.email is not None or mails[i]:
            #print(tag.email, mails[i])
            count += 1
        
print(count)

501


In [15]:
lista = df['CT'].tolist()

for i in range(len(lista)):
    loc = lista[i].clinical_trial.findAll('location')
    if len(loc) > 1:
        print(loc[0].country.text, loc[1].country.text, '\n')

United States United States 

United States United States 

United States United States 

United States United States 

United States United States 

United States United States 

United States United States 

Hong Kong Hong Kong 

United States United States 

United States United States 

United States United States 

Japan Japan 

United States United States 

Canada Canada 

United States United States 

United States United States 

United States United States 

United States United States 

Canada Canada 

Japan Japan 

United States United States 

United States Korea, Republic of 

United States United States 

United States United States 

United States United States 

Denmark Denmark 

United States United States 

United Kingdom United Kingdom 

United States United States 

Belgium Belgium 

United States United States 

Canada Canada 

United States United States 

United States United States 

United States United States 

United States United States 

United States Unite

In [16]:
lista = df['CT'].tolist()

for i in range(len(lista)):
    loc = lista[i].clinical_trial.findAll('overall_official')
    if len(loc) >= 1:
        print(loc[0], '\n')

<overall_official>
<last_name>Bassam Abdulkarim, MD, FRCPC</last_name>
<role>Principal Investigator</role>
<affiliation>AHS Cancer Control Alberta</affiliation>
</overall_official> 

<overall_official>
<last_name>Frank Gawin, M.D.</last_name>
<role>Principal Investigator</role>
<affiliation>Friends Research Institute, Inc.</affiliation>
</overall_official> 

<overall_official>
<last_name>Steven Deutsch, M.D.</last_name>
<role>Principal Investigator</role>
<affiliation>Washington D.C. Veterans Affairs Medical Center</affiliation>
</overall_official> 

<overall_official>
<last_name>Richard S. Maisiak, PhD, MSPH</last_name>
<role>Principal Investigator</role>
<affiliation>University of Alabama at Birmingham</affiliation>
</overall_official> 

<overall_official>
<last_name>Phillip Gorden, M.D.</last_name>
<role>Principal Investigator</role>
<affiliation>National Institute of Diabetes and Digestive and Kidney Diseases (NIDDK)</affiliation>
</overall_official> 

<overall_official>
<last_name

<overall_official>
<last_name>Steven Grant</last_name>
<role>Principal Investigator</role>
<affiliation>Massey Cancer Center</affiliation>
</overall_official> 

<overall_official>
<last_name>Masashi Fujii, MD PhD</last_name>
<role>Principal Investigator</role>
<affiliation>Surugadai Nihon University Hospital</affiliation>
</overall_official> 

<overall_official>
<last_name>Aaron S Fink, MD</last_name>
<role>Principal Investigator</role>
<affiliation>Atlanta VA Medical and Rehab Center, Decatur, GA</affiliation>
</overall_official> 

<overall_official>
<last_name>Carsten Bokemeyer, MD</last_name>
<role>Principal Investigator</role>
<affiliation>University Hospital Tuebingen (PI until 30Nov2004)</affiliation>
</overall_official> 

<overall_official>
<last_name>Gen Yasuda, MD</last_name>
<role>Principal Investigator</role>
<affiliation>Yokohama City University Center Hospital</affiliation>
</overall_official> 

<overall_official>
<last_name>Sharon Compton, PhD</last_name>
<role>Principal 


<overall_official>
<last_name>James P Steinberg, MD</last_name>
<role>Principal Investigator</role>
<affiliation>Emory University</affiliation>
</overall_official> 

<overall_official>
<last_name>Stella M. Davies, MBBSPhd MRCP</last_name>
<role>Principal Investigator</role>
<affiliation>Children's Hospital Medical Center, Cincinnati</affiliation>
</overall_official> 

<overall_official>
<last_name>David H McDermott, M.D.</last_name>
<role>Principal Investigator</role>
<affiliation>National Institute of Allergy and Infectious Diseases (NIAID)</affiliation>
</overall_official> 

<overall_official>
<last_name>Sarah L Blair, MD</last_name>
<role>Principal Investigator</role>
<affiliation>University of California, San Diego</affiliation>
</overall_official> 

<overall_official>
<last_name>James D Carlson, Pharm. D.</last_name>
<role>Principal Investigator</role>
<affiliation>PRACS Institute, Ltd.</affiliation>
</overall_official> 

<overall_official>
<last_name>Philip Bejon, PhD</last_name


<overall_official>
<last_name>Sandrine Faivre</last_name>
<role>Principal Investigator</role>
<affiliation>Hopital Beaujon</affiliation>
</overall_official> 

<overall_official>
<last_name>Gordon W. Peterson, MD</last_name>
<role>Principal Investigator</role>
<affiliation>Neurology, Faculty Physicians and Surgeons of Loma Linda University School of Medicine</affiliation>
</overall_official> 

<overall_official>
<last_name>Stephen McRae, MD</last_name>
<role>Principal Investigator</role>
<affiliation>UT MD Anderson Cancer Center</affiliation>
</overall_official> 

<overall_official>
<last_name>Joseph Fay, MD</last_name>
<role>Principal Investigator</role>
<affiliation>Baylor Health Care System</affiliation>
</overall_official> 

<overall_official>
<last_name>Kathleen T Brady, MD, PhD</last_name>
<role>Principal Investigator</role>
<affiliation>Medical University of South Carolina</affiliation>
</overall_official> 

<overall_official>
<last_name>Krittaecho Siripassorn, MD</last_name>
<r

</overall_official> 

<overall_official>
<last_name>Nelson M Oyesiku, MD, PhD</last_name>
<role>Principal Investigator</role>
<affiliation>Emory University</affiliation>
</overall_official> 

<overall_official>
<last_name>Ming-Chung Wang, M.D.</last_name>
<role>Principal Investigator</role>
<affiliation>Chang Gung Memorial Hospital</affiliation>
</overall_official> 

<overall_official>
<last_name>Wang Weu</last_name>
<role>Study Director</role>
<affiliation>Comprehensive Weight Management Center Taipei Medical University Hospital</affiliation>
</overall_official> 

<overall_official>
<last_name>Kyung Sang Yu, Ph.D</last_name>
<role>Principal Investigator</role>
<affiliation>Seoul National University Hospital</affiliation>
</overall_official> 

<overall_official>
<last_name>Samuel T. Kuna, MD</last_name>
<role>Principal Investigator</role>
<affiliation>Philadelphia VA Medical Center, Philadelphia, PA</affiliation>
</overall_official> 

<overall_official>
<last_name>Karin Potthoff, MD</l

<overall_official>
<last_name>David MacLeod, MB BS</last_name>
<role>Principal Investigator</role>
<affiliation>Duke Anesthesiology</affiliation>
</overall_official> 

<overall_official>
<last_name>Stefan Breitenstein, MD</last_name>
<role>Principal Investigator</role>
<affiliation>Kantonsspital Winterthur KSW</affiliation>
</overall_official> 

<overall_official>
<last_name>Ofir Frenkel, M.D</last_name>
<role>Principal Investigator</role>
<affiliation>Sheba Medical Center</affiliation>
</overall_official> 

<overall_official>
<last_name>David L. Katz, MD, MPH</last_name>
<role>Principal Investigator</role>
<affiliation>Yale-Griffin Prevention Research Center</affiliation>
</overall_official> 

<overall_official>
<last_name>Daniel J Jackson, MD</last_name>
<role>Principal Investigator</role>
<affiliation>University of Wisconsin, Madison</affiliation>
</overall_official> 

<overall_official>
<last_name>Michal Fried, Ph.D.</last_name>
<role>Principal Investigator</role>
<affiliation>Nati

In [21]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

lista = df['ar_organization'].tolist()
for item in lista:
    if item is None:
        continue
    doc = nlp(item.replace(';',','))
    print(item, [(X.text, X.label_) for X in doc.ents if X.label_ == 'ORG'], '\n')

Diabetes, Endocrinology, and Obesity Branch, National Institute of Diabetes and Digestive and Kidney Diseases, National Institutes of Health, Bethesda, Maryland 20892, USA. phillipg@intra.niddk.nih.gov [('Obesity Branch', 'ORG'), ('National Institute of Diabetes and Digestive', 'ORG'), ('National Institutes of Health', 'ORG')] 

Division of Medical Oncology, New York University School of Medicine, NYU Cancer Institute, NY 10016, USA. howard.hochster@med.nyu.edu [('Division of Medical Oncology', 'ORG'), ('New York University School of Medicine', 'ORG'), ('NYU Cancer Institute', 'ORG')] 

Karen T. Brown, Richard K. Do, Mithat Gonen, Anne M. Covey, George I. Getrajdman, Constantinos T. Sofocleous, William R. Jarnagin, Michael I. D'Angelica, Peter J. Allen, Joseph P. Erinjeri, Lynn A. Brody, Gerald P. O'Neill, Kristian N. Johnson, Alessandra R. Garcia, Christopher Beattie, Stephen B. Solomon, Ronald DeMatteo, and Ghassan K. Abou-Alfa, Memorial Sloan Kettering Cancer Center; Binsheng Zhao a

University of Illinois at Peoria, University of Illinois at Chicago, Chicago, Illinois, USA. [('University of Illinois at Peoria', 'ORG'), ('University of Illinois', 'ORG')] 

Department of Obstetrics and Gynecology and Women's Health, New York, NY, 10461, USA. [("Department of Obstetrics and Gynecology and Women's Health", 'ORG')] 

University of Pennsylvania Perelman School of Medicine, Philadelphia. [('University of Pennsylvania Perelman School of Medicine', 'ORG')] 

State Key Laboratory of Nuclear Physics and Technology, Peking University, Beijing, China. [('Nuclear Physics and Technology', 'ORG'), ('Peking University', 'ORG')] 

Gulf Medical University, Ajman, UAE. [('Gulf Medical University', 'ORG'), ('UAE', 'ORG')] 

Perinatal Epidemiology Research Unit, Aarhus University Hospital, Aarhus, Denmark. [('Perinatal Epidemiology Research Unit', 'ORG'), ('Aarhus University Hospital', 'ORG')] 

Perinatology Research Branch, Program for Perinatal Research and Obstetrics, Division of In

Bloorview Research Institute, Holland Bloorview Kids Rehabilitation Hospital, Toronto, Ontario, Canada; Departments of Occupational Science and Occupational Therapy, University of Toronto, Toronto, Ontario, Canada. [('Bloorview Research Institute', 'ORG'), ('Holland Bloorview Kids Rehabilitation Hospital', 'ORG'), ('Departments of Occupational Science', 'ORG'), ('Occupational Therapy', 'ORG'), ('University of Toronto', 'ORG')] 

Intermountain Healthcare, Institute for Healthcare Delivery Research, Salt Lake City, UT, USA. sarah.goodlin@ihc.com [('Intermountain Healthcare', 'ORG'), ('Institute for Healthcare Delivery Research', 'ORG'), ('UT', 'ORG')] 

Department of Family Medicine, McMaster University, Canada; Kingston Community Health Centres, Canada; Department of Family Medicine, Queen's University, Canada. Electronic address: imaan@kchc.ca. [('Department of Family Medicine', 'ORG'), ('McMaster University', 'ORG'), ('Kingston Community Health Centres', 'ORG'), ('Department of Family

School of Psychiatry,University of New South Wales, Sydney, New South Wales,Australia. [('School of Psychiatry,University of New South Wales', 'ORG')] 

b Sugioka Memorial Hospital Clinical Research Center , Fukuoka , Japan ; [('Sugioka Memorial Hospital Clinical Research Center', 'ORG')] 

Hofstra North Shore-Long Island Jewish Medical Center School of Medicine at Hofstra University, Hempstead, NY; [('Hofstra University', 'ORG')] 

Hofstra North Shore-Long Island Jewish Medical Center School of Medicine at Hofstra University, Hempstead, NY; [('Hofstra University', 'ORG')] 

Herzog Hospital, Givat Shaul, Jerusalem, Israel ; Faculty of Medicine, Hebrew University, Jerusalem, Israel. [('Herzog Hospital', 'ORG'), ('Faculty of Medicine', 'ORG'), ('Hebrew University', 'ORG')] 

Department of Immunology, Genetics and Pathology, Uppsala University, Uppsala, Sweden. [('Department of Immunology, Genetics', 'ORG'), ('Uppsala University', 'ORG')] 

Department of Gastroenterology and General Surge

Ronald Levy, Andrew J. Gentles, Chih Long Liu, Robert Tibshirani, and Ash A. Alizadeh, Stanford University Medical Center, Palo Alto; Christos E. Emmanouilides and John M. Timmerman, University of California Los Angeles Medical Center, Los Angeles; Lori A. Kunkel, Diane E. Ingolia, and Dan W. Denney Jr, Genitope, Fremont, CA; Kristen N. Ganjoo and Michael J. Robertson, Indiana University Medical Center, Indianapolis, IN; John P. Leonard, Weill Medical College of Cornell University, New York, NY; Julie M. Vose, University of Nebraska Medical Center, Omaha, NE; Ian W. Flinn and Richard F. Ambinder, Johns Hopkins University Oncology Center, Baltimore, MD; Joseph M. Connors, British Columbia Cancer Agency Centre for Lymphoid Cancer, Vancouver, British Columbia; Neil L. Berinstein, Toronto-Sunnybrook Regional Cancer Centre, Toronto, Ontario; Andrew R. Belch, Cross Cancer Institute, Edmonton, Alberta, Canada; Nancy L. Bartlett, Washington University School of Medicine, St Louis, MO; Craig Ni

Department of Interventional Radiology, The University of Texas MD Anderson Cancer Center, Unit 1471, PO Box 301402, Houston, TX, 77230-1402, USA. [('Department of Interventional Radiology', 'ORG'), ('The University of Texas MD Anderson Cancer Center', 'ORG'), ('PO', 'ORG')] 

State Key Laboratory of Nuclear Physics and Technology, Peking University, Beijing, China. [('Nuclear Physics and Technology', 'ORG'), ('Peking University', 'ORG')] 

Department of Psychiatry and Behavioral Sciences, Medical University of South Carolina, Charleston, SC, 29425, United States. [('Department of Psychiatry and Behavioral Sciences', 'ORG'), ('Medical University of South Carolina', 'ORG')] 

Department of Neurological Surgery, University of Washington School of Medicine, Seattle, WA, USA. [('Department of Neurological Surgery, University of Washington School of Medicine', 'ORG'), ('WA, USA', 'ORG')] 

Department of Orthopedic Surgery, University of Mississippi Medical Center, Jackson, Mississippi. [('D

Department of Medicine/Diabetes Unit, Massachusetts General Hospital, Boston, MA, United States. [('Department of Medicine/Diabetes Unit', 'ORG'), ('Massachusetts General Hospital', 'ORG')] 

Maumenee B-110, 600 N. Wolfe St, Baltimore, MD 21287, USA. [] 

Sue C. Kaste, Deqing Pei, Cheng Cheng, Michael D. Neel, Raul C. Ribeiro, Monika L. Metzger, Deepa Bhojwani, Hiroto Inaba, Patrick Campbell, Jeffrey E. Rubnitz, Sima Jeha, John T. Sandlund, James R. Downing, Mary V. Relling, Ching-Hon Pui, and Scott C. Howard, St Jude Children's Research Hospital; Sue C. Kaste, University of Tennessee Health Sciences Center, Memphis, TN; and W. Paul Bowman, Cook Children's Medical Center, Fort Worth, TX. [('Ching-Hon Pui', 'ORG'), ("St Jude Children's Research Hospital", 'ORG'), ('University of Tennessee Health Sciences Center', 'ORG'), ("Cook Children's Medical Center", 'ORG')] 

Department of Psychiatry, Tokyo Metropolitan Police Hospital, Tokyo, Japan; Department of Psychiatry, Showa University Nort

Department of Gynaecology and Institute of Clinical Epidemiology, Martin Luther University, Halle an der Saale, Germany; School of Public Health, Departments of Pathology and Gynaecology, and Radiotherapy Center, School of Medicine, Addis Ababa University, Addis Ababa, Ethiopia; Department of Epidemiology, School of Public Health, Boston University, Boston, Massachusetts, USA eva.kantelhardt@medizin.uni-halle.de. [('Department of Gynaecology and Institute of Clinical Epidemiology', 'ORG'), ('Martin Luther University', 'ORG'), ('School of Public Health', 'ORG'), ('Departments of Pathology and Gynaecology', 'ORG'), ('Radiotherapy Center', 'ORG'), ('School of Medicine', 'ORG'), ('Addis Ababa University', 'ORG'), ('Department of Epidemiology, School of Public Health', 'ORG'), ('Boston University', 'ORG')] 

Endocrinology and Metabolism Research Center (EMRC), Vali-Asr Hospital, School of Medicine, Tehran University of Medical Sciences, Tehran, Iran. [('Endocrinology and Metabolism Research

Department of Endocrinology, Metabolism and Diabetes, Nationwide Children's Hospital, Columbus OH. [("Department of Endocrinology, Metabolism and Diabetes, Nationwide Children's Hospital", 'ORG'), ('Columbus OH', 'ORG')] 

Department of Chemistry, University of Illinois at Urbana-Champaign, Urbana, Illinois 61801. [('Department of Chemistry', 'ORG'), ('University of Illinois', 'ORG')] 

Department of Pediatrics, Pediatric Gastroenterology and Liver Unit, Sapienza University of Rome, University Hospital Umberto I, Viale Regina Elena, 324-00161 Rome, RM, Italy. giovanni.dinardo@uniroma1.it [('Department of Pediatrics', 'ORG'), ('Liver Unit', 'ORG'), ('Sapienza University of Rome', 'ORG')] 

Hospital Universitario Virgen Macarena, Sevilla, Spain. [] 

From the Department of Anesthesiology, National Taiwan University Hospital (Y.-C.Y., C.-Y.W., Y.-J.C., C.-M.L., Z.-G.W., W.-Z.S.) and the Graduate Institutes of Physiology, College of Medicine (L.C.-H.Y.), National Taiwan University, Taipei,

Department of Anesthesia, Faculty of Medicine, Erciyes University, Kayseri, Turkey. [('Department of Anesthesia', 'ORG'), ('Faculty of Medicine', 'ORG'), ('Erciyes University', 'ORG')] 

Food Science and Human Nutrition Department, University of Florida, Institute of Food and Agricultural Sciences, P.O. Box 110370, 359 FSHN Bldg., 572 Newell Dr., Gainesville, FL 32611-0370, USA. [('Food Science and Human Nutrition Department', 'ORG'), ('University of Florida', 'ORG'), ('Institute of Food and Agricultural Sciences', 'ORG'), ('Newell', 'ORG')] 

Departments of Infection Control 6901 and Clinical Microbiology 9301, Copenhagen University Hospital (Rigshospitalet), Denmark. Electronic address: tobias.ibfelt@regionh.dk. [('Clinical Microbiology 9301', 'ORG'), ('Copenhagen University Hospital', 'ORG'), ('Rigshospitalet', 'ORG')] 

State Key Laboratory of Nuclear Physics and Technology, Peking University, Beijing, China. [('Nuclear Physics and Technology', 'ORG'), ('Peking University', 'ORG')]

Brain & Mind Research Institute, Nerve Research Foundation, Level 7, Building F, Brain and Mind Research Institute, the University of Sydney, 94 Mallett Street, Camperdown, NSW 2050, Australia. ambrose.chan@sydney.edu.au [('Brain & Mind Research Institute', 'ORG'), ('Nerve Research Foundation', 'ORG'), ('Level 7', 'ORG'), ('Mind Research Institute', 'ORG'), ('the University of Sydney', 'ORG'), ('NSW', 'ORG')] 

Amita Avadhani is director, Post Masters DNP-Practice (Executive and online) assistant professor/coordinator-Adult Gerontology Acute/Critical Care Nurse Practitioner Program (MSN) Rutgers University (formerly UMDNJ), Newark, NJ and a critical care nurse practitioner at Saint Peters University Hospital, New Brunswick, N.J. Lorraine Steefel is the nurse educator/clinical coordinator at Rutgers University Behavioral Health Care/University Correctional Health Care, Trenton, N.J. [('Post Masters DNP-Practice', 'ORG'), ('MSN', 'ORG'), ('Rutgers University', 'ORG'), ('UMDNJ', 'ORG'), (

University of North Carolina at Chapel Hill. [('University of North Carolina', 'ORG'), ('Chapel Hill', 'ORG')] 

Department for Hematology, Oncology and Clinical Immunology, University Hospital, Heinrich-Heine-University, Duesseldorf, Germany. [('Department for Hematology, Oncology and Clinical Immunology', 'ORG'), ('University Hospital', 'ORG'), ('Heinrich-Heine-University', 'ORG')] 

Chinese Academy of Medical Sciences, Dong Cheng District,Beijing,People's Republic of China. [('Chinese Academy of Medical Sciences', 'ORG')] 

Children's Hospital of Pittsburgh, Pittsburgh, PA 15213, USA. [("Children's Hospital of Pittsburgh", 'ORG')] 

Buenos Aires University Medical School, Buenos Aires, Argentina. [('Buenos Aires University Medical School', 'ORG')] 

State Key Laboratory of Cancer Biology and Xijing Hospital of Digestive Diseases, Xijing Hospital, Fourth Military Medical University, Xi'an, Shaanxi, China; State Key Laboratory of Cancer Biology and Department of Biochemistry and Molec

Department of Osteopathic Medicine, College of Osteopathic Medicine, New York Institute of Technology (NYITCOM), Old Westbury, NY, USA. [('Department of Osteopathic Medicine', 'ORG'), ('College of Osteopathic Medicine', 'ORG'), ('New York Institute of Technology', 'ORG'), ('NYITCOM', 'ORG')] 

Department of Radiation Oncology, Erasmus MC-Daniel den Hoed Cancer Center, Groene Hilledijk 301, Rotterdam, The Netherlands. a.al-mamgani@erasmusmc.nl [('Department of Radiation Oncology', 'ORG'), ('Hoed Cancer Center', 'ORG')] 

BC Centre for Excellence in HIV/AIDS, Vancouver, BC, Canada; Faculty of Health Sciences, Simon Fraser University, Vancouver, BC, Canada. [('BC Centre for Excellence', 'ORG'), ('Faculty of Health Sciences', 'ORG'), ('Simon Fraser University', 'ORG')] 

State Key Laboratory of Nuclear Physics and Technology, Peking University, Beijing, China. [('Nuclear Physics and Technology', 'ORG'), ('Peking University', 'ORG')] 

Department of Medical Imaging, University of Toronto, T

Department of Molecular and Human Genetics, Baylor College of Medicine, Houston, TX 77030, USA. [('Department of Molecular and Human Genetics', 'ORG'), ('Baylor College of Medicine', 'ORG'), ('TX 77030', 'ORG')] 

a Department of Oncology and Radiation Physics , Skåne University Hospital, Lund University , Lund , Sweden. [('Department of Oncology and Radiation Physics', 'ORG'), ('Skåne University Hospital', 'ORG'), ('Lund University', 'ORG')] 

Department of Internal medicine, Division of Gastroenterology, University Hospital Center Rijeka, Croatia. [('Department of Internal', 'ORG'), ('Division of Gastroenterology', 'ORG'), ('University Hospital Center Rijeka', 'ORG')] 

Division of Gastrointestinal Surgery & Gastric Cancer Center, The First Affiliated Hospital of Sun Yat-sen University, Guangzhou, China. [('Division of Gastrointestinal Surgery & Gastric Cancer Center', 'ORG'), ('The First Affiliated Hospital', 'ORG')] 

Silver School of Social Work, New York University, New York, NY 

Department of Psychiatry and Behavioral Sciences, Stanford University School of Medicine, Stanford, California 94305-5719, USA. vcarrion@stanford.edu [('Department of Psychiatry and Behavioral Sciences', 'ORG'), ('Stanford University School of Medicine', 'ORG'), ('Stanford', 'ORG')] 

Service de médecine interne I Hôpital Laennec, Paris. [] 

a From the Institute of Biomedicine, Department of Infectious Diseases , University of Gothenburg , Gothenburg , Sweden. [('the Institute of Biomedicine, Department of Infectious Diseases , University of Gothenburg', 'ORG')] 

INSERM U1149, Faculté de Médecine Xavier Bichat, 16 rue Henri Huchard, Paris, France. [('rue Henri Huchard', 'ORG')] 

State Key Laboratory of Nuclear Physics and Technology, Peking University, Beijing, China. [('Nuclear Physics and Technology', 'ORG'), ('Peking University', 'ORG')] 

Clinical Nutrition Research Centre, Centre for Translational Medicine, Yong Loo Lin School of Medicine, Singapore; Singapore Institute for Cli

Moscow Clinical Scientific Centre, Central Scientific Research Institute of Gastroenterology, Moscow, Russia. [('Moscow Clinical Scientific Centre', 'ORG'), ('Central Scientific Research Institute of Gastroenterology', 'ORG')] 

Cardiology Department, CHU Poitiers, Poitiers, France. [('Cardiology Department', 'ORG'), ('CHU Poitiers', 'ORG'), ('Poitiers', 'ORG')] 

Department of Obstetrics & Gynecology. [('Department of Obstetrics & Gynecology', 'ORG')] 

Department of Health and Exercise Science, Colorado State University, 220 Moby B Complex, Fort Collins, CO 80523-1582, USA.  thorsten.rudroff@colostate.edu. [('Department of Health and Exercise Science', 'ORG'), ('Colorado State University', 'ORG')] 

University of Alabama at Birmingham, USA. roger.bedimo@med.va.gov [('University of Alabama', 'ORG')] 

School of Sport, Exercise and Health Sciences, Loughborough University, Loughborough, UK. [('Health Sciences', 'ORG'), ('Loughborough University', 'ORG'), ('Loughborough', 'ORG')] 

I-Ch

In [18]:
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize


stanford_classifier = 'C:/Users/Brescia/Anaconda3/Diploma/stanfordNER/classifiers/english.conll.4class.distsim.crf.ser.gz'
stanford_ner_path = 'C:/Users/Brescia/Anaconda3/Diploma/stanfordNER/stanford-ner-3.9.2.jar'

# Creating Tagger Object
st = StanfordNERTagger(stanford_classifier, stanford_ner_path, encoding='utf-8')

import os
from itertools import groupby
java_path = "C:/Program Files/Java/jdk-11.0.1/bin/java.exe"
os.environ['JAVAHOME'] = java_path
count = 0
count_star = 0
# Classifier ready bitches
# Now classify, bitch!
lista = df['ar_organization'].tolist()
for item in lista:
    if item is None:
        continue
    text = item.replace(';','')
    tokenized_text = word_tokenize(text)
    classified_text = st.tag(tokenized_text)
    print(text)

    count_star += 1
    flag = True
    for tag, chunk in groupby(classified_text, lambda x:x[1]):
        if tag == "ORGANIZATION":
            if flag:
                flag = False
                count+=1
            print("%-12s"%tag, " ".join(w for w, t in chunk))
    print('\n')

print(count_star, count)

Diabetes, Endocrinology, and Obesity Branch, National Institute of Diabetes and Digestive and Kidney Diseases, National Institutes of Health, Bethesda, Maryland 20892, USA. phillipg@intra.niddk.nih.gov
ORGANIZATION Obesity Branch
ORGANIZATION National Institute of Diabetes
ORGANIZATION Kidney Diseases
ORGANIZATION National Institutes of Health


Division of Medical Oncology, New York University School of Medicine, NYU Cancer Institute, NY 10016, USA. howard.hochster@med.nyu.edu
ORGANIZATION New York University School of Medicine
ORGANIZATION NYU Cancer Institute


Karen T. Brown, Richard K. Do, Mithat Gonen, Anne M. Covey, George I. Getrajdman, Constantinos T. Sofocleous, William R. Jarnagin, Michael I. D'Angelica, Peter J. Allen, Joseph P. Erinjeri, Lynn A. Brody, Gerald P. O'Neill, Kristian N. Johnson, Alessandra R. Garcia, Christopher Beattie, Stephen B. Solomon, Ronald DeMatteo, and Ghassan K. Abou-Alfa, Memorial Sloan Kettering Cancer Center Binsheng Zhao and Lawrence H. Schwartz,

State Key Laboratory of Nuclear Physics and Technology, Peking University, Beijing, China.
ORGANIZATION State Key Laboratory of Nuclear Physics
ORGANIZATION Peking University


Gulf Medical University, Ajman, UAE.
ORGANIZATION Gulf Medical University


Perinatal Epidemiology Research Unit, Aarhus University Hospital, Aarhus, Denmark.
ORGANIZATION Perinatal Epidemiology Research Unit
ORGANIZATION Aarhus University Hospital
ORGANIZATION Aarhus


Perinatology Research Branch, Program for Perinatal Research and Obstetrics, Division of Intramural Research, Eunice Kennedy Shriver National Institute of Child Health and Human Development, National Institutes of Health, Bethesda, MD, and Detroit, MI, Department of Obstetrics and Gynecology, University of Michigan, Ann Arbor, MI, Department of Epidemiology and Biostatistics, Michigan State University, East Lansing, MI.
ORGANIZATION Perinatology Research Branch
ORGANIZATION Perinatal Research and Obstetrics
ORGANIZATION Intramural Research
ORGANI

ORGANIZATION Respiratory Diseases
ORGANIZATION University of Ghent


Division of Critical Care Medicine, Department of Pediatrics, Children's Hospital of Wisconsin, Milwaukee, Wis.
ORGANIZATION Critical Care Medicine
ORGANIZATION Department of Pediatrics
ORGANIZATION Milwaukee


Sibley Heart Center Cardiology, Children's Healthcare of Atlanta, Atlanta, Ga.


Department of Medical Oncology, Dana-Farber Cancer Institute, Harvard Medical School, Boston, MA 02115.
ORGANIZATION Department of Medical Oncology
ORGANIZATION Dana-Farber Cancer Institute
ORGANIZATION Harvard Medical School


Gastroenterology & Hepatology.
ORGANIZATION Gastroenterology & Hepatology


Johns Hopkins University School of Medicine, Baltimore, Maryland 21224, USA. Hejones@jhmi.edu
ORGANIZATION Johns Hopkins University School of Medicine
ORGANIZATION Hejones


Johns Hopkins University School of Medicine, Baltimore, Maryland 21224, USA. Hejones@jhmi.edu
ORGANIZATION Johns Hopkins University School of Medicine
ORGANIZATI

Shoklo Malaria Research Unit, Mae Sot Tak, Thailand Faculty of Tropical Medicine, Mahidol University, Bangkok, Thailand Nuffield Department of Clinical Medicine, Centre for Clinical Vaccinology and Tropical Medicine, University of Oxford, Oxford, United Kingdom. Electronic address: francois@tropmedres.ac.
ORGANIZATION Shoklo Malaria Research Unit
ORGANIZATION Tropical Medicine
ORGANIZATION Mahidol University
ORGANIZATION Thailand Nuffield Department of Clinical Medicine
ORGANIZATION Centre for Clinical Vaccinology
ORGANIZATION Tropical Medicine
ORGANIZATION University of Oxford


Department of Emergency Medicine, Allegheny Health Network, Pittsburgh, PA.
ORGANIZATION Department of Emergency Medicine
ORGANIZATION Allegheny Health Network
ORGANIZATION PA


Northwestern University Feinberg School of Medicine, Northwestern Memorial Hospital, Department of Surgery, Chicago, Illinois.
ORGANIZATION Northwestern University Feinberg School of Medicine
ORGANIZATION Department of Surgery


Depart

ORGANIZATION Development Division of the National Association of County and City Health Officials
ORGANIZATION DC


Banner Alzheimer's Institute,Arizona,USA.
ORGANIZATION Institute


Pulmonary-Critical Care Medicine Branch, National Heart, Lung, and Blood Institute, National Institutes of Health, Bethesda, MD 20892, USA. levines@nhlbi.nih.gov
ORGANIZATION Pulmonary-Critical Care Medicine Branch
ORGANIZATION National Heart
ORGANIZATION Blood Institute
ORGANIZATION National Institutes of Health


Department of Pediatrics, Washington University School of Medicine, St Louis, MO 63110, USA. manary@kids.wustl.edu
ORGANIZATION Department of Pediatrics
ORGANIZATION Washington University School of Medicine


Department of Urology, National Taiwan University College of Medicine, Taipei, Taiwan.
ORGANIZATION Department of Urology
ORGANIZATION National Taiwan University College of Medicine


Albert Einstein College of Medicine, Children's Hospital at Montefiore, Bronx, New York 10467, USA.
ORGANIZ

Ronald Levy, Andrew J. Gentles, Chih Long Liu, Robert Tibshirani, and Ash A. Alizadeh, Stanford University Medical Center, Palo Alto Christos E. Emmanouilides and John M. Timmerman, University of California Los Angeles Medical Center, Los Angeles Lori A. Kunkel, Diane E. Ingolia, and Dan W. Denney Jr, Genitope, Fremont, CA Kristen N. Ganjoo and Michael J. Robertson, Indiana University Medical Center, Indianapolis, IN John P. Leonard, Weill Medical College of Cornell University, New York, NY Julie M. Vose, University of Nebraska Medical Center, Omaha, NE Ian W. Flinn and Richard F. Ambinder, Johns Hopkins University Oncology Center, Baltimore, MD Joseph M. Connors, British Columbia Cancer Agency Centre for Lymphoid Cancer, Vancouver, British Columbia Neil L. Berinstein, Toronto-Sunnybrook Regional Cancer Centre, Toronto, Ontario Andrew R. Belch, Cross Cancer Institute, Edmonton, Alberta, Canada Nancy L. Bartlett, Washington University School of Medicine, St Louis, MO Craig Nichols, Oreg

Département de médecine familiale et de médecine d'urgence, Université Laval Association des troubles de l'humeur et d'anxiété du Québec (ATHAQ) Unité de médecine familiale, Hôpital Saint-François d'Assise, CSSS Vieille-Capitale.
ORGANIZATION Université Laval Association
ORGANIZATION CSSS Vieille-Capitale


Department of Vascular and Endovascular Surgery and Department of Interventional Radiology, University Hospital Leuven, Leuven, Belgium.
ORGANIZATION Department of Vascular
ORGANIZATION Endovascular Surgery and Department of Interventional Radiology


Department of Physiology and Biophysics, Dalhousie University, Halifax, Canada. jwroy@dal.ca
ORGANIZATION Department of Physiology and Biophysics
ORGANIZATION Dalhousie University


Laboratoire de Physiologie des Interactions, Hopital Arnaud De Villeneuve, Montpellier, France.
ORGANIZATION Laboratoire de Physiologie des Interactions
ORGANIZATION Montpellier


1] Fred Hutchinson Cancer Research Center, Seattle, WA, USA [2] University of

KeyboardInterrupt: 