In [1770]:
import psycopg2
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 100)
import re

host = 'aact-db.ctti-clinicaltrials.org'
port = '5432'
user = ''
password = ''
database = 'aact'

conn_str = "host={} dbname={} user={} password={}".format(host, database, user, password)
conn = psycopg2.connect(conn_str)
print(conn_str.split())

['host=aact-db.ctti-clinicaltrials.org', 'dbname=aact', 'user=csyuming', 'password=19911030']


### Extract Data from SQL

In [948]:
# Alzheimer related clinical trials
sql = """SELECT stud.*, a.name AS sponsor_name, a.agency_class, i.name AS drug_name , i.intervention_type
            FROM studies stud 
            INNER JOIN (SELECT * FROM sponsors WHERE lead_or_collaborator = 'lead') a ON a.nct_id = stud.nct_id 
            INNER JOIN (SELECT DISTINCT name, nct_id, intervention_type FROM interventions) i ON i.nct_id = stud.nct_id 
            WHERE stud.nct_id IN (SELECT nct_id FROM browse_conditions WHERE downcase_mesh_term LIKE '%alzheimer%')
            AND stud.study_type = 'Interventional'
            AND (intervention_type = 'Drug')
            AND stud.nct_id IN (SELECT nct_id FROM designs WHERE primary_purpose = 'Treatment')
            """
df=pd.read_sql(sql, con=conn)

In [949]:
# save to local drive
df.to_csv('Alzheimer_Clinical_Trails.csv', index=False)

In [950]:
df = pd.read_csv('./Alzheimer_Clinical_Trails.csv')
df.intervention_type.value_counts()

Drug    1378
Name: intervention_type, dtype: int64

In [951]:
df1 = df.copy()

### Explorary Data Analysis

Causes of duplicate ***nct_id*** in joined table
- nct_id is primary key, it is unique in ***studies***
- only one ***lead*** sponsor associate on nct_id
- duplicate nct_id in the joined table is caused by ***one-to-many*** relationship between ***studies*** and ***interventions***  

We are going to try to clean the format of drug name next. Many contain dosage in the name.

In [952]:
# duplicate nut_id
df1['nct_id'].value_counts()[df1['nct_id'].value_counts()>1]

NCT01677754    6
NCT00766363    6
NCT03030105    6
NCT00874939    5
NCT02788513    5
              ..
NCT00566501    2
NCT00744978    2
NCT02547818    2
NCT02840279    2
NCT00440050    2
Name: nct_id, Length: 446, dtype: int64

#### Clean Drug Names Contain ***'mg/'*** ####

In [953]:
# clean drug name contains 'mg/day'
ix = df1[df1['drug_name'].str.contains('mg/')].index
loc = [i.find(' ') for i in df1[df1['drug_name'].str.contains('mg/')]['drug_name']]

for i,l in zip(ix,loc):
    df1.loc[i, 'drug_name'] = df1.loc[i, 'drug_name'][:l]

In [954]:
# double check 
df1[df1['drug_name'].str.contains('mg/')]

Unnamed: 0,nct_id,nlm_download_date_description,study_first_submitted_date,results_first_submitted_date,disposition_first_submitted_date,last_update_submitted_date,study_first_submitted_qc_date,study_first_posted_date,study_first_posted_date_type,results_first_submitted_qc_date,results_first_posted_date,results_first_posted_date_type,disposition_first_submitted_qc_date,disposition_first_posted_date,disposition_first_posted_date_type,last_update_submitted_qc_date,last_update_posted_date,last_update_posted_date_type,start_month_year,start_date_type,start_date,verification_month_year,verification_date,completion_month_year,completion_date_type,completion_date,primary_completion_month_year,primary_completion_date_type,primary_completion_date,target_duration,study_type,acronym,baseline_population,brief_title,official_title,overall_status,last_known_status,phase,enrollment,enrollment_type,source,limitations_and_caveats,number_of_arms,number_of_groups,why_stopped,has_expanded_access,expanded_access_type_individual,expanded_access_type_intermediate,expanded_access_type_treatment,has_dmc,is_fda_regulated_drug,is_fda_regulated_device,is_unapproved_device,is_ppsd,is_us_export,biospec_retention,biospec_description,ipd_time_frame,ipd_access_criteria,ipd_url,plan_to_share_ipd,plan_to_share_ipd_description,created_at,updated_at,sponsor_name,agency_class,drug_name,intervention_type


Now, after modified drug names there should be some duplicates. We are going to drop thoes duplicates

In [955]:
len(df1) - len(df1.drop_duplicates())

13

We are going to drop 15 rows have the same drug name

In [956]:
df1.drop_duplicates(inplace=True)

#### Clean Drug Names Contain mg

- Start with drug_name has 2 letters
- locate ' '(blanck space), if a space after mg the drug name would later words; if no space after mg then drug name would be first word

Find rows have ***one space*** after 'mg' in drug name, example: 15mg T3D-959

In [957]:
# rows has two words in drug name and contain 'mg'
mg_one_space = df1[(df1['drug_name'].str.count(' ') == 1) & (df1['drug_name'].str.contains('mg'))]

# find rows have a space after mg
mg_one_space_after = mg_one_space[mg_one_space['drug_name'].str.contains('mg ')]
mg_one_space_after['drug_name']

4           15mg T3D-959
9       250mg rilapladib
14    400mg LM11A-31-BHS
17    800mg LM11A-31-BHS
Name: drug_name, dtype: object

In [958]:
# for loop to change the name one by one
for ix in mg_one_space_after.index:
    old_name = mg_one_space_after.loc[ix, 'drug_name']
    space_location = old_name.find(' ')
    # plus 1 to add space location
    new_name = old_name[space_location + 1:]
    #assign new name
    mg_one_space.loc[ix, 'drug_name'] = new_name
    mg_one_space_after.loc[ix, 'drug_name'] = new_name
    df1.loc[ix, 'drug_name'] = new_name

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [959]:
# double check
mg_one_space_after['drug_name']

4          T3D-959
9       rilapladib
14    LM11A-31-BHS
17    LM11A-31-BHS
Name: drug_name, dtype: object

Find rows have a space before 'mg', example :'Azeliragon 5mg'

In [960]:
# since we already clean the drug name has dosage after, the reset have dosage before space
mg_one_space_before = df1[(df1['drug_name'].str.count(' ') == 1) & (df1['drug_name'].str.contains('mg'))]
mg_one_space_before['drug_name']

123       Azeliragon 5mg
218          CNP520 15mg
219          CNP520 50mg
338     donepezil 5-10mg
340        Donepezil 5mg
1236         S47445 15mg
1237         S47445 50mg
1238          S47445 5mg
1265      SB-742457 15mg
1266      SB-742457 35mg
Name: drug_name, dtype: object

In [961]:
# for loop to change the name one by one
for ix in mg_one_space_before.index:
    old_name = mg_one_space_before.loc[ix, 'drug_name']
    space_location = old_name.find(' ')
    # plus 1 to add space location
    new_name = old_name[:space_location]
    #assign new name
    mg_one_space.loc[ix, 'drug_name'] = new_name
    mg_one_space_before.loc[ix, 'drug_name'] = new_name
    df1.loc[ix, 'drug_name'] = new_name

In [962]:
mg_one_space_before['drug_name']

123     Azeliragon
218         CNP520
219         CNP520
338      donepezil
340      Donepezil
1236        S47445
1237        S47445
1238        S47445
1265     SB-742457
1266     SB-742457
Name: drug_name, dtype: object

Clean drug names have ***two spaces*** and have dosage in it
- ***1st case***: 'mg' is in between example: 30 mg T3D-959
- ***2nd case***: 'mg' at the end example: Atabecestat, 25 mg
- ***3rd case***: there is no space between dosage amount and unit 'mg' example: AD-35 90mg group

In [963]:
# rows has three words in drug name and contain 'mg'
mg_two_space = df1[(df1['drug_name'].str.count(' ') == 2) & (df1['drug_name'].str.contains('mg'))]
# number of rows
mg_two_space['drug_name'].head()

11       30 mg T3D-959
15       45 mg T3D-959
55         AD-35 60 mg
56    AD-35 60mg group
57    AD-35 90mg group
Name: drug_name, dtype: object

In [964]:
# function to split dosage for the first case
def split_dosage(drug_name):
    # space before and after 'mg' identify the first case
    try:
        return re.split(r" mg ", drug_name)[1]
    except:
        return drug_name

In [965]:
# apply the function to drug_name column
mg_two_space['drug_name'] = mg_two_space['drug_name'].apply(split_dosage)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [967]:
print('Orginal name: ' + mg_two_space.loc[55, 'drug_name']+
      '\nNew name after Regex split: '+re.split(r"\s\d+\.?\d*\smg$", mg_two_space.loc[55, 'drug_name'])[0])

Orginal name: AD-35 60 mg
New name after Regex split: AD-35


In [968]:
# function to split dosage for the second case
def split_dosage(drug_name):
    # space before and after 'mg' identify the sencond case
    try:
        return re.split(r"\s\d+\.?\d*\smg$", drug_name)[0]
    except:
        return drug_name

In [969]:
# apply the function to drug_name column
mg_two_space['drug_name'] = mg_two_space['drug_name'].apply(split_dosage)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [970]:
mg_two_space['drug_name']

11                             T3D-959
15                             T3D-959
55                               AD-35
56                    AD-35 60mg group
57                    AD-35 90mg group
101                       Atabecestat,
102                       Atabecestat,
115                AZD0530 100mg daily
116                AZD0530 125mg daily
337                          Donepezil
488                             GV1001
489                             GV1001
515                       Idalopirdine
537                       JNJ-54861911
538                      JNJ-54861911,
539                      JNJ-54861911,
541                      JNJ-54861911,
542                       JNJ-54861911
543                      JNJ-54861911,
544                      JNJ-54861911,
545                      JNJ-54861911,
561                        Lemborexant
562                        Lemborexant
563                        Lemborexant
564                        Lemborexant
662                      

Looks like it works with majority drug names except few don't have space between dosage amount and unit example: ***AD-35 60mg group***. Simple Regex split would work for these

In [971]:
print('Orginal name: ' + mg_two_space.loc[1282, 'drug_name']+
      '\nNew name after Regex split: '+re.split(r"\s\d+mg", mg_two_space.loc[1282, 'drug_name'])[0])

Orginal name: Sodium oligo-mannurarate 600mg
New name after Regex split: Sodium oligo-mannurarate


In [972]:
# function to split dosage for the second case
def split_dosage(drug_name):
    # space before and after 'mg' identify the sencond case
    try:
        return re.split(r"\s\d+mg", drug_name)[0]
    except:
        return drug_name
# apply the function to drug_name column
mg_two_space['drug_name'] = mg_two_space['drug_name'].apply(split_dosage)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [973]:
mg_two_space['drug_name']

11                       T3D-959
15                       T3D-959
55                         AD-35
56                         AD-35
57                         AD-35
101                 Atabecestat,
102                 Atabecestat,
115                      AZD0530
116                      AZD0530
337                    Donepezil
488                       GV1001
489                       GV1001
515                 Idalopirdine
537                 JNJ-54861911
538                JNJ-54861911,
539                JNJ-54861911,
541                JNJ-54861911,
542                 JNJ-54861911
543                JNJ-54861911,
544                JNJ-54861911,
545                JNJ-54861911,
561                  Lemborexant
562                  Lemborexant
563                  Lemborexant
564                  Lemborexant
662                      MK-0249
663                      MK-0249
731                  PF-01913539
741                 Pimavanserin
742                 Pimavanserin
1157      

In [974]:
# replace the new name into our df1 dataframe
df1.loc[mg_two_space.index, 'drug_name'] = mg_two_space['drug_name']

Next we are going to look up drug names contain ***three spaces***. The names are geting more compalicated with more spaces involved

In [975]:
# rows has three words in drug name and contain 'mg'
mg_three_space = df1[(df1['drug_name'].str.count(' ') == 3) & (df1['drug_name'].str.contains('mg'))]
# number of rows
mg_three_space['drug_name']

64                    AGB101 220 mg tablet
215     CKD-355A (D797/Memantine HCl 20mg)
216     CKD-355B (D797/Memantine HCl 20mg)
224            Comparator: Placebo 5-10 mg
249              D324 (Memantine HCl 10mg)
540        JNJ-54861911, 10 milligram (mg)
590                   LUPRON DEPOT 22.5 mg
1134               PTI-125, 100 mg tablets
1135                  PTI-125 50 mg tablet
1218    Rosiglitazone Extended Release 2mg
1219    Rosiglitazone Extended Release 2mg
1220    Rosiglitazone Extended Release 8mg
1221    Rosiglitazone Extended Release 8mg
1235                 RVT-101 35 mg tablets
Name: drug_name, dtype: object

In [977]:
print('Old name: '+mg_three_space.loc[1218, 'drug_name'] +
      '\nNew name: ' + re.split(r'\s\d+\.?\-?\d*\s*mg', mg_three_space.loc[1218, 'drug_name'])[0])

Old name: Rosiglitazone Extended Release 2mg
New name: Rosiglitazone Extended Release


In [978]:
# function to split dosage for the second case
def split_dosage(drug_name):
    # space before and after 'mg' identify the sencond case
    try:
        return re.split(r'\s\d+\.?\-?\d*\s*mg', drug_name)[0]
    except:
        return drug_name
mg_three_space['drug_name'] = mg_three_space['drug_name'].apply(split_dosage)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [979]:
mg_three_space.drug_name

64                               AGB101
215        CKD-355A (D797/Memantine HCl
216        CKD-355B (D797/Memantine HCl
224                 Comparator: Placebo
249                 D324 (Memantine HCl
540     JNJ-54861911, 10 milligram (mg)
590                        LUPRON DEPOT
1134                           PTI-125,
1135                            PTI-125
1218     Rosiglitazone Extended Release
1219     Rosiglitazone Extended Release
1220     Rosiglitazone Extended Release
1221     Rosiglitazone Extended Release
1235                            RVT-101
Name: drug_name, dtype: object

We have one more need to correct, it use ***milligram*** instead of ***mg***

In [980]:
# function to split dosage for the second case
def split_dosage(drug_name):
    # space before and after 'mg' identify the sencond case
    try:
        return re.split(r'\s\d+\.?\d*\smilligram\s', drug_name)[0]
    except:
        return drug_name
mg_three_space['drug_name'] = mg_three_space['drug_name'].apply(split_dosage)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [981]:
mg_three_space.drug_name

64                              AGB101
215       CKD-355A (D797/Memantine HCl
216       CKD-355B (D797/Memantine HCl
224                Comparator: Placebo
249                D324 (Memantine HCl
540                      JNJ-54861911,
590                       LUPRON DEPOT
1134                          PTI-125,
1135                           PTI-125
1218    Rosiglitazone Extended Release
1219    Rosiglitazone Extended Release
1220    Rosiglitazone Extended Release
1221    Rosiglitazone Extended Release
1235                           RVT-101
Name: drug_name, dtype: object

In [982]:
# replace the drug name to main dataframe
df1.loc[mg_three_space.index, 'drug_name'] = mg_three_space['drug_name']

All look good now, next we are going to work with drug names have more than three spaces

In [983]:
# rows has three words in drug name and contain 'mg'
mg_three_space_more = df1[(df1['drug_name'].str.count(' ') > 3) & (df1['drug_name'].str.contains('mg'))]
# number of rows
mg_three_space_more['drug_name']

0                                   10 mg IR in Study 326
8                                   23 mg SR in Study 326
45      acetaminophen 650 mg qid and placebo qid PRN. ...
52                        Active Treatment- CT1812 100 mg
53                        Active Treatment- CT1812 300 mg
54                   AD-35 30 mg + Placebo of AD-35 30 mg
90                           Aricept (donepezil IR 10 mg)
91                           Aricept (donepezil SR 23 mg)
225                      Comparator: Placebo 5mg (run in)
261             Deferiprone 600mg delayed release tablets
280                                  Dimebon 20 mg po TID
339                                   Donepezil 5 - 10 mg
516                       Idalopirdine (Lu AE58054) 60 mg
1061                      Placebo + Donepezil 5mg or 10mg
1079                    Placebo of AD-35 60mg /AD-35 30mg
1080                    Placebo of AD-35 60mg /AD-35 60mg
1133                           PTI-125 100 mg oral tablet
1350          

In [984]:
# function to split dosage for the second case
def split_dosage(drug_name):
    # space before and after 'mg' identify the sencond case
    try:
        return re.split(r'\s\d+\.?\-?\d*\s*mg', drug_name)[0]
    except:
        return drug_name
mg_three_space_more['drug_name'] = mg_three_space_more['drug_name'].apply(split_dosage)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [985]:
# function to split dosage for the second case
def split_dosage(drug_name):
    # space before and after 'mg' identify the sencond case
    try:
        return re.split(r"\s\d+mg", drug_name)[0]
    except:
        return drug_name
# apply the function to drug_name column
mg_three_space_more['drug_name'] = mg_three_space_more['drug_name'].apply(split_dosage)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [986]:
# replace the drug name to main dataframe
df1.loc[mg_three_space_more.index, 'drug_name'] = mg_three_space_more['drug_name']

Since We have cleaned many drug names, now we may expect to see some duplicate rows, we are going to remove those rows

In [987]:
df1.drop_duplicates(inplace=True,)
df1.reset_index(drop=True, inplace=True)

We cleaned drug names have dosage related issue, next we'll remove clinical trails whice are designed for blined test

#### Clean Drug Names Contain Placebo ####

***placebo*** is a drug for blind test, one of test group will recieve ***placebo*** which has no active ingredients referred as control group. Another group will receive the treatmet to be evaluated.  * 

There are common Alzheimer drugs are used in the trails as standard of care, Some related trails are testing various investigational drugs as add-on therapy to donepezil. Some are testing efficacy, safety or tolerability of marketed drugs under certain conditions** 
 - donepezil
 - rivastigmine
 - galantamine
 - memantine  
*source: design_group of NCT01677754  
**source: https://www.alzforum.org/therapeutics/donepezil;  
https://www.alz.org/alzheimers-dementia/treatments/medications-for-memory

In [991]:
# example of add-on trails related to donepezil
df1[df1['nct_id'] == 'NCT01255046'].loc[322, 'brief_title']

'Study of STA-1 as an Add-on Treatment to Donepezil'

In [992]:
df1['nct_id'].value_counts()[df1['nct_id'].value_counts()>1]

NCT01677754    6
NCT03030105    6
NCT02788513    5
NCT02051335    5
NCT00506415    5
              ..
NCT00904683    2
NCT02127476    2
NCT03116126    2
NCT00684710    2
NCT00440050    2
Name: nct_id, Length: 440, dtype: int64

We still have 500+ duplicate nct_id

In [993]:
# lowercase drug name and find 'placebo'
ix = df1[df1['drug_name'].str.lower().str.contains('placebo')].index
df1.drop(ix, inplace=True)
df1.reset_index(inplace=True, drop=True)

In [994]:
# lowercase drug name and find 'placebo'
ix = df1[df1['drug_name'].str.lower().str.contains('donepezil')].index
df1.drop(ix, inplace=True)
df1.reset_index(inplace=True, drop=True)

In [995]:
# lowercase drug name and find 'placebo'
ix = df1[df1['drug_name'].str.lower().str.contains('rivastigmine')].index
df1.drop(ix, inplace=True)
df1.reset_index(inplace=True, drop=True)

In [996]:
# lowercase drug name and find 'placebo'
ix = df1[df1['drug_name'].str.lower().str.contains('memantine')].index
df1.drop(ix, inplace=True)
df1.reset_index(inplace=True, drop=True)

In [997]:
# lowercase drug name and find 'placebo'
ix = df1[df1['drug_name'].str.lower().str.contains('galantamine')].index
df1.drop(ix, inplace=True)
df1.reset_index(inplace=True, drop=True)

In [998]:
# lowercase drug name and find 'placebo'
ix = df1[df1['drug_name'].str.lower().str.contains('dimebon')].index
df1.drop(ix, inplace=True)
df1.reset_index(inplace=True, drop=True)

In [1026]:
# lowercase drug name and find 'placebo'
ix = df1[df1['drug_name'].str.lower().str.contains('aricept')].index
df1.drop(ix, inplace=True)
df1.reset_index(inplace=True, drop=True)

In [1027]:
df1['nct_id'].value_counts()[df1['nct_id'].value_counts()>1]

NCT00208819    4
NCT02388152    4
NCT02788513    4
NCT00015548    4
NCT03752463    3
              ..
NCT03030105    2
NCT00000179    2
NCT01436045    2
NCT00884533    2
NCT01258452    2
Name: nct_id, Length: 78, dtype: int64

After removing rows contain 'placebo', we have 100+ duplicate nct_id

#### Remove Vitamin Related Trials

In [1028]:
# drop these vitamin related trials
ix = df1[df1['drug_name'].str.lower().str.contains('vitamin', na=False)].index
df1.drop(ix, inplace=True)
df1.reset_index(inplace=True, drop=True)

In [1029]:
df1['nct_id'].value_counts()[df1['nct_id'].value_counts()>1]

NCT00208819    4
NCT02388152    4
NCT02788513    4
NCT00015548    4
NCT03752463    3
              ..
NCT03030105    2
NCT00000179    2
NCT01436045    2
NCT00884533    2
NCT01258452    2
Name: nct_id, Length: 78, dtype: int64

#### Clean Minor Differences With One Space

There are many minor adjustments we need to do for the drug names, we are going to scale the list down to ***industry(agency_class)*** only.

In [1030]:
# agency_class industry only
df1_industry = df1[df1['agency_class']=='Industry']

In [1031]:
# there are about 
print('The number of compnies in the list: {}'.format(len(df1_industry['sponsor_name'].unique())))

The number of compnies in the list: 146


Clean the rows with one space in drug name

If rows has the same nct_id, split the drug name into two parts and use the most common part as the drug name

In [1032]:
# get index for drug names have one space
nct_id_count = df1_industry[df1_industry['drug_name'].str.count(' ') == 1]['nct_id'].value_counts()
nct_id_count[nct_id_count>1]

NCT01203384    3
NCT01258452    2
NCT02667496    2
NCT02244541    2
NCT01211782    2
Name: nct_id, dtype: int64

In [1033]:
trial_id = nct_id_count[nct_id_count>1].index

In [1034]:
[print(i, df1_industry[df1_industry['nct_id'] == i]['drug_name'].values) for i in trial_id]

NCT01203384 ['CHF5074 1x' 'CHF5074 2x' 'CHF5074 3x']
NCT01258452 ['CHF 5074' 'CHF 5974']
NCT02667496 ['Florbetapir F18' 'Sargramostim GZ402664']
NCT02244541 ['ANAVEX2-73 Intravenous' 'ANAVEX2-73 Oral']
NCT01211782 ['caprylic triglyceride' 'long-chain triglyceride']


[None, None, None, None, None]

In [1035]:
# manually correct some
df1_industry.loc[df1_industry['nct_id'] == 'NCT01203384', 'drug_name'] = 'CHF5074'
df1_industry.loc[df1_industry['nct_id'] == 'NCT01258452', 'drug_name'] = 'CHF5074'
df1_industry.loc[df1_industry['nct_id'] == 'NCT02667496', 'drug_name'] = 'sargramostim'
df1_industry.loc[df1_industry['nct_id'] == 'NCT01211782', 'drug_name'] = 'triglyceride'
df1_industry.loc[df1_industry['nct_id'] == 'NCT02244541', 'drug_name'] = 'ANAVEX2-73'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [1036]:
# get index for drug names have more spaces
nct_id_count = df1_industry[df1_industry['drug_name'].str.count(' ') > 1]['nct_id'].value_counts()
nct_id_count[nct_id_count>1]

NCT02788513    4
NCT02388152    4
NCT01602393    3
NCT01421056    3
NCT01303744    3
NCT02353598    3
NCT01230853    2
NCT03867253    2
NCT03790709    2
NCT01513967    2
NCT00566501    2
NCT04052737    2
NCT02710188    2
NCT01739348    2
NCT01766336    2
NCT01569516    2
NCT00141661    2
Name: nct_id, dtype: int64

In [1037]:
trial_id = nct_id_count[nct_id_count>1].index
[print(i, df1_industry[df1_industry['nct_id'] == i]['drug_name'].values) for i in trial_id]

NCT02788513 ['BI 425809 dose 1' 'BI 425809 dose 2' 'BI 425809 dose 3'
 'BI 425809 dose 4']
NCT02388152 ['Lu AF20513, double high dose' 'Lu AF20513, high dose'
 'Lu AF20513, low dose' 'Lu AF20513, medium dose']
NCT01602393 ['CHF 5074 1x' 'CHF 5074 2x' 'CHF 5074 3x']
NCT01421056 ['CHF 5074 1x' 'CHF 5074 2x' 'CHF 5074 3x']
NCT01303744 ['CHF 5074 1x' 'CHF 5074 2x' 'CHF 5074 3x']
NCT02353598 ['Crenezumab dose level 1' 'Crenezumab dose level 2'
 'Crenezumb dose level 3']
NCT01230853 ['Active Comparator: A' 'Active Comparator B']
NCT03867253 ['ORY-2001 High dose' 'ORY-2001 Low dose']
NCT03790709 ['High dose ANAVEX2-73' 'Mid dose ANAVEX2-73']
NCT01513967 ['RPh201, botanical drug product' 'RPh201, botanical extract product']
NCT00566501 ['10 mg IR in Study 326' '23 mg SR in Study 326']
NCT04052737 ['Normal Saline along with standard treatment'
 'PMZ-1620 (sovateltide) along with standard treatment']
NCT02710188 ['HTL0009936 immediate release' 'HTL0009936 modified release']
NCT01739348 ['Verubec

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [1038]:
# manually modify the name
df1_industry.loc[df1_industry['nct_id'] == 'NCT02788513', 'drug_name'] = 'BI 425809'
df1_industry.loc[df1_industry['nct_id'] == 'NCT02388152', 'drug_name'] = 'Lu AF20513'
df1_industry.loc[df1_industry['nct_id'] == 'NCT01602393', 'drug_name'] = 'CHF5074'
df1_industry.loc[df1_industry['nct_id'] == 'NCT01421056', 'drug_name'] = 'CHF5074'

df1_industry.loc[df1_industry['nct_id'] == 'NCT01303744', 'drug_name'] = 'CHF5074'
df1_industry.loc[df1_industry['nct_id'] == 'NCT02353598', 'drug_name'] = 'Crenezumab'
df1_industry.loc[df1_industry['nct_id'] == 'NCT01513967', 'drug_name'] = 'RPh201'
df1_industry.loc[df1_industry['nct_id'] == 'NCT00860275', 'drug_name'] = 'BMS-708163'
df1_industry.loc[df1_industry['nct_id'] == 'NCT01723670', 'drug_name'] = 'CHF5074'
df1_industry.loc[df1_industry['nct_id'] == 'NCT02710188', 'drug_name'] = 'HTL0009936'
df1_industry.loc[df1_industry['nct_id'] == 'NCT01766336', 'drug_name'] = 'ELND005'
df1_industry.loc[df1_industry['nct_id'] == 'NCT03867253', 'drug_name'] = 'ORY-2001'
df1_industry.loc[df1_industry['nct_id'] == 'NCT01739348', 'drug_name'] = 'Verubecestat'
df1_industry.loc[df1_industry['nct_id'] == 'NCT00566501', 'drug_name'] = 'IR' # will be removed
df1_industry.loc[df1_industry['nct_id'] == 'NCT01230853', 'drug_name'] = 'Active Comparator: A' # will be removed
df1_industry.loc[df1_industry['nct_id'] == 'NCT04052737', 'drug_name'] = 'PMZ-1620'
df1_industry.loc[df1_industry['nct_id'] == 'NCT02562989', 'drug_name'] = '[18F]MK-6240'
df1_industry.loc[df1_industry['nct_id'] == 'NCT03019536', 'drug_name'] = 'LY3303560'
df1_industry.loc[df1_industry['nct_id'] == 'NCT01569516', 'drug_name'] = 'Octohydroaminoacridine'
df1_industry.loc[df1_industry['nct_id'] == 'NCT00141661', 'drug_name'] = 'PF-04494700'
df1_industry.loc[df1_industry['nct_id'] == 'NCT03790709', 'drug_name'] = 'ANAVEX2-73'


After correcting the name, there should be some duplicates, we are going drop those

In [1039]:
print('We are going to drop {} rows'.format(len(df1_industry) - len(df1_industry.drop_duplicates())))

We are going to drop 31 rows


In [1040]:
df1_industry.drop_duplicates(inplace=True)
df1_industry.reset_index(inplace=True, drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Finally, we want all the drug names have the same format, we are going downcase the name

In [1041]:
df1_industry['drug_name'] = df1_industry['drug_name'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


### Look Into Drugs by Each Company

In [1042]:
print('There are {} drug companies have clinical trials related Alzheimer'.format(len(df1_industry['sponsor_name'].unique())))

There are 146 drug companies have clinical trials related Alzheimer


In [1116]:
# companies have more than one clinical trials related Alzheimer
trials_more_than_one = df1_industry.groupby(['sponsor_name'])[['drug_name']].nunique()[df1_industry.groupby('sponsor_name')['drug_name'].nunique()>2]
trials_more_than_one.sort_values('drug_name',ascending=False)

Unnamed: 0_level_0,drug_name
sponsor_name,Unnamed: 1_level_1
Pfizer,14
GlaxoSmithKline,13
Wyeth is now a wholly owned subsidiary of Pfizer,12
Merck Sharp & Dohme Corp.,12
Eli Lilly and Company,11
Eisai Inc.,10
AstraZeneca,7
Sanofi,7
Hoffmann-La Roche,6
"Genentech, Inc.",6


The process to correct name:
 - It looks like removing contents after space could be a way to correct name
 - Remove signs except '-' such as '+', ','
 - Double check the name only has letters
 - Pure numbers
 - length less than 3
 - words like drug, experimental, normal, active

In [1058]:
# right a function to extract drug name
def name_split(drug_name):
    try:
        return re.split(r'\s', drug_name)[0]
    except:
        return drug_name

In [1113]:
# fix name one by one
df1_industry.loc[(df1_industry['sponsor_name']=='GlaxoSmithKline'), 'drug_name'] = df1_industry[(df1_industry['sponsor_name']=='GlaxoSmithKline')]['drug_name'].apply(name_split)
df1_industry.loc[(df1_industry['sponsor_name']=='Pfizer'), 'drug_name'] = df1_industry[(df1_industry['sponsor_name']=='Pfizer')]['drug_name'].apply(name_split)
df1_industry.loc[(df1_industry['sponsor_name']=='Wyeth is now a wholly owned subsidiary of Pfizer'), 'drug_name'] = df1_industry[(df1_industry['sponsor_name']=='Wyeth is now a wholly owned subsidiary of Pfizer')]['drug_name'].apply(name_split)
df1_industry.loc[(df1_industry['sponsor_name']=='Merck Sharp & Dohme Corp.'), 'drug_name'] = df1_industry[(df1_industry['sponsor_name']=='Merck Sharp & Dohme Corp.')]['drug_name'].apply(name_split)
df1_industry.loc[(df1_industry['sponsor_name']=='Eli Lilly and Company'), 'drug_name'] = df1_industry[(df1_industry['sponsor_name']=='Eli Lilly and Company')]['drug_name'].apply(name_split)
df1_industry.loc[(df1_industry['sponsor_name']=='Eisai Inc.'), 'drug_name'] = df1_industry[(df1_industry['sponsor_name']=='Eisai Inc.')]['drug_name'].apply(name_split)
df1_industry.loc[(df1_industry['sponsor_name']=='Sanofi'), 'drug_name'] = df1_industry[(df1_industry['sponsor_name']=='Sanofi')]['drug_name'].apply(name_split)
df1_industry.loc[(df1_industry['sponsor_name']=='Hoffmann-La Roche'), 'drug_name'] = df1_industry[(df1_industry['sponsor_name']=='Hoffmann-La Roche')]['drug_name'].apply(name_split)
df1_industry.loc[(df1_industry['sponsor_name']=='AstraZeneca'), 'drug_name'] = df1_industry[(df1_industry['sponsor_name']=='AstraZeneca')]['drug_name'].apply(name_split)
df1_industry.loc[(df1_industry['sponsor_name']=='Biogen'), 'drug_name'] = df1_industry[(df1_industry['sponsor_name']=='Biogen')]['drug_name'].apply(name_split)
df1_industry.loc[(df1_industry['sponsor_name']=='Otsuka Pharmaceutical Co., Ltd.'), 'drug_name'] = df1_industry[(df1_industry['sponsor_name']=='Otsuka Pharmaceutical Co., Ltd.')]['drug_name'].apply(name_split)
df1_industry.loc[(df1_industry['sponsor_name']=='Novartis Pharmaceuticals'), 'drug_name'] = df1_industry[(df1_industry['sponsor_name']=='Novartis Pharmaceuticals')]['drug_name'].apply(name_split)
df1_industry.loc[(df1_industry['sponsor_name']=='Otsuka Pharmaceutical Development & Commercialization, Inc.'), 'drug_name'] = df1_industry[(df1_industry['sponsor_name']=='Otsuka Pharmaceutical Development & Commercialization, Inc.')]['drug_name'].apply(name_split)
df1_industry.loc[(df1_industry['sponsor_name']=='Bristol-Myers Squibb'), 'drug_name'] = df1_industry[(df1_industry['sponsor_name']=='Bristol-Myers Squibb')]['drug_name'].apply(name_split)
df1_industry.loc[(df1_industry['sponsor_name']=='Cerecin'), 'drug_name'] = df1_industry[(df1_industry['sponsor_name']=='Cerecin')]['drug_name'].apply(name_split)
df1_industry.loc[(df1_industry['sponsor_name']=='JANSSEN Alzheimer Immunotherapy Research & Development, LLC'), 'drug_name'] = df1_industry[(df1_industry['sponsor_name']=='JANSSEN Alzheimer Immunotherapy Research & Development, LLC')]['drug_name'].apply(name_split)
df1_industry.loc[(df1_industry['sponsor_name']=='Avanir Pharmaceuticals'), 'drug_name'] = df1_industry[(df1_industry['sponsor_name']=='Avanir Pharmaceuticals')]['drug_name'].apply(name_split)
df1_industry.loc[(df1_industry['sponsor_name']=='GemVax & Kael'), 'drug_name'] = df1_industry[(df1_industry['sponsor_name']=='GemVax & Kael')]['drug_name'].apply(name_split)
df1_industry.loc[(df1_industry['sponsor_name']=='Eisai Co., Ltd.'), 'drug_name'] = df1_industry[(df1_industry['sponsor_name']=='Eisai Co., Ltd.')]['drug_name'].apply(name_split)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


### Further Clean Base On Similarity

For this step, we are going to group the drug name by company and drop the duplicates
- There two rows have na, we are going to fillna as blank space
- Create a column as index helper to facilitate the function that compare the similarity of each drug name
- We'll take deeper dive into the drug name has a strong similarity
- Drop duplicated drug name based on similarity comparison
- Clean the companies name, some have their name in the datasets in different format

In [1460]:
# import text similarity tool
import difflib
import jellyfish

In [1461]:
# group drug names by company level
co_phase_drug = df1_industry[['sponsor_name','drug_name']].drop_duplicates()
co_phase_drug

Unnamed: 0,sponsor_name,drug_name
0,Eisai Inc.,ir
1,Hoffmann-La Roche,11c-l-deprenyl-d2
2,"T3D Therapeutics, Inc.",t3d-959
3,GlaxoSmithKline,[18f]gsk2647544
4,"Genentech, Inc.",[18f]gtp1
...,...,...
465,Pfizer,warfarin
466,Sanofi,xaliproden
468,Actinogen Medical,xanamem™
469,"Inmune Bio, Inc.",xpro1595


Before we do. anything, we are going to drop rows where length of the name only has 3 or less than 3 letters

In [1462]:
co_phase_drug[co_phase_drug['drug_name'].str.len()<=3]

Unnamed: 0,sponsor_name,drug_name
0,Eisai Inc.,ir
7,"Otsuka Pharmaceutical Co., Ltd.",2
9,"Otsuka Pharmaceutical Co., Ltd.",3
11,"Otsuka Pharmaceutical Co., Ltd.",6
88,Charsire Biotechnology Corp.,bac
211,Wyeth is now a wholly owned subsidiary of Pfizer,gsi
387,GlaxoSmithKline,rsg
444,Immungenetics AG,tep


In [1463]:
co_phase_drug.drop(co_phase_drug[co_phase_drug['drug_name'].str.len()<=3].index, inplace=True)

In [1464]:
# create index helper to facilate lookup function
co_phase_drug['ix_helper'] = co_phase_drug['sponsor_name'] + co_phase_drug['drug_name']

In [1475]:
# create function to find similar name
def count_similar_name(ix_helper):
    try:
    #     print(ix_helper)
        company_name = co_phase_drug[co_phase_drug['ix_helper'] == ix_helper]['sponsor_name'].values[0]
        # create drug list by company name to compare with each name within the company's trials
        drug_list = list(co_phase_drug[co_phase_drug['sponsor_name'] == company_name]['drug_name'])
        # the drug we are going to compare the list
        drug = co_phase_drug[co_phase_drug['ix_helper'] == ix_helper]['drug_name'].values[0]
        # remove the drug name we are going to compare with
        drug_list.remove(drug)
        return int(len(difflib.get_close_matches(drug, drug_list, cutoff=0.7)))
    except IndexError:
        return ''

In [1476]:
# create function to find similar name
def similar_name(ix_helper):
    try:
    #     print(ix_helper)
        company_name = co_phase_drug[co_phase_drug['ix_helper'] == ix_helper]['sponsor_name'].values[0]
        # create drug list by company name to compare with each name within the company's trials
        drug_list = list(co_phase_drug[co_phase_drug['sponsor_name'] == company_name]['drug_name'])
        # the drug we are going to compare the list
        drug = co_phase_drug[co_phase_drug['ix_helper'] == ix_helper]['drug_name'].values[0]
        # remove the drug name we are going to compare with
        drug_list.remove(drug)
        return difflib.get_close_matches(drug, drug_list, cutoff=0.7)
    except IndexError:
        return ''

In [1477]:
# apply the function to df
co_phase_drug['count_similar_name'] = co_phase_drug['ix_helper'].apply(count_similar_name)
co_phase_drug['similar_name'] = co_phase_drug['ix_helper'].apply(similar_name)

We'll start to look into these rows have ***1 similar drug name***. It looks like they would either   
- include a sign in the name, example: masitinib (ab1010)  
- or it has sort of description, example: [cor388 capsule]  
So, our solution is to keep the one has the shorter name. For example, we'll keep 'bryostatin' instead of 'bryostatin 1'  
If they are same length, we'll keep those because majority of those are different drugs

In [1478]:
# df for rows have one similar name
similar_name_one = co_phase_drug[co_phase_drug['count_similar_name']==1]

In [1479]:
# finding the length difference betwenn comparable name and drug name
diff_len = (similar_name_one['similar_name'].apply(lambda x: len(x[0])) 
            - similar_name_one['drug_name'].apply(lambda x: len(x)) )
# the one has the longer name
similar_name_one[diff_len<0]

Unnamed: 0,sponsor_name,drug_name,ix_helper,count_similar_name,similar_name
3,GlaxoSmithKline,[18f]gsk2647544,GlaxoSmithKline[18f]gsk2647544,1,[gsk2647544]
36,Pfizer,acc-001+,Pfizeracc-001+,1,[acc-001]
131,"Neurotrope Bioscience, Inc.",bryostatin 1,"Neurotrope Bioscience, Inc.bryostatin 1",1,[bryostatin]
169,FORUM Pharmaceuticals Inc,drug: evp-6124,FORUM Pharmaceuticals Incdrug: evp-6124,1,[evp-6124]
249,"Janssen Research & Development, LLC","jnj-54861911,","Janssen Research & Development, LLCjnj-54861911,",1,[jnj-54861911]
294,Merck Sharp & Dohme Corp.,mk-0249,Merck Sharp & Dohme Corp.mk-0249,1,[mk0249]
341,ACADIA Pharmaceuticals Inc.,pimavanserin tartrate,ACADIA Pharmaceuticals Inc.pimavanserin tartrate,1,[pimavanserin]


Looks like we can drop all of them, because those are duplicates, they just inputed differently

In [1480]:
# drop those rows
co_phase_drug.drop(similar_name_one[diff_len<0].index, inplace=True)

In [1481]:
co_phase_drug[co_phase_drug['count_similar_name'] == 2]

Unnamed: 0,sponsor_name,drug_name,ix_helper,count_similar_name,similar_name
27,"AbbVie (prior sponsor, Abbott)",abt-288,"AbbVie (prior sponsor, Abbott)abt-288",2,"[abt-384, abt-126]"
63,Otsuka Pharmaceutical Development & Commercial...,aripiprazole,Otsuka Pharmaceutical Development & Commercial...,2,"[brexpiprazole, brexpiprazole,]"
112,Biogen,biib037,Biogenbiib037,2,"[biib076, biib092]"
113,Biogen,biib076,Biogenbiib076,2,"[biib037, biib092]"
114,Biogen,biib092,Biogenbiib092,2,"[biib076, biib037]"
124,Otsuka Pharmaceutical Development & Commercial...,brexpiprazole,Otsuka Pharmaceutical Development & Commercial...,2,"[brexpiprazole,, aripiprazole]"
128,Otsuka Pharmaceutical Development & Commercial...,"brexpiprazole,",Otsuka Pharmaceutical Development & Commercial...,2,"[brexpiprazole, aripiprazole]"
193,FORUM Pharmaceuticals Inc,evp-6124,FORUM Pharmaceuticals Incevp-6124,2,"[evp-0962, drug: evp-6124]"
436,"FUJIFILM Toyama Chemical Co., Ltd.",t-817ma,"FUJIFILM Toyama Chemical Co., Ltd.t-817ma",2,"[t-817ma-l, t-817ma-h]"
438,"FUJIFILM Toyama Chemical Co., Ltd.",t-817ma-h,"FUJIFILM Toyama Chemical Co., Ltd.t-817ma-h",2,"[t-817ma-l, t-817ma]"


We'll manually drop these rows because not all of them are duplicates, we tried to play with cutoff parameter but we couldn't find the best one

In [1489]:
co_phase_drug.drop(co_phase_drug[co_phase_drug['drug_name']=='brexpiprazole,'].index, inplace = True)
co_phase_drug.drßop(co_phase_drug[co_phase_drug['drug_name']=='t-817ma-h'].index, inplace = True)
co_phase_drug.drop(co_phase_drug[co_phase_drug['drug_name']=='t-817ma-l'].index, inplace = True)

We fixed many drug names, we'll clean the dataframe first, we'll only keep sponsor names and drug names and drop duplicates

In [1673]:
co_phase_drug = co_phase_drug[['sponsor_name', 'drug_name']]

We'll start to cosolidate entities into one because some have their name differently multiple times, see the example below

In [1493]:
co_phase_drug[co_phase_drug['sponsor_name'].str.contains('Janssen')]['sponsor_name'].value_counts()

Janssen Research & Development, LLC    3
Janssen Pharmaceutical K.K.            2
Janssen Pharmaceutica N.V., Belgium    2
Janssen, LP                            1
Janssen-Cilag Pty Ltd                  1
Name: sponsor_name, dtype: int64

We are going to only look at the first word of the sponsor name. We'll use the first word to find out other names companies used. it works for the majority of cases

In [1637]:
# split name
sponsor_name_df = co_phase_drug[['sponsor_name']].drop_duplicates()
sponsor_name_df['short_name'] = [re.split(r'\s', name)[0] for name in sponsor_name_df['sponsor_name']]

In [1639]:
# create a df to save all other names used for each short name
short_sponsor_name = sponsor_name_df[['short_name']].drop_duplicates()
short_sponsor_name['names_used'] = [list(sponsor_name_df[sponsor_name_df['sponsor_name'].str.contains(shor_name)]['sponsor_name'].values) 
 for shor_name in short_sponsor_name['short_name']]

In [1642]:
short_sponsor_name['number_of_names'] = short_sponsor_name['names_used'].apply(lambda x: len(x))

As we can seee below, there are few cases we need adjust the name manually. 
- Abbvie is split from Abbott, so all trials should be under Abbview
- There are two Mercks, there are not the same one
- Chase should be under Allergan based on its description
- There two companies have 'High' in its name
- Short name 'H' doesn't really make sense, we need to look into it



In [1656]:
short_sponsor_name[short_sponsor_name['number_of_names']>1]

Unnamed: 0,short_name,names_used,number_of_names
12,Pfizer,"[Pfizer, Wyeth is now a wholly owned subsidiar...",2
16,AbbVie,"[AbbVie, AbbVie (prior sponsor, Abbott)]",2
40,Eisai,"[Eisai Inc., Eisai Co., Ltd., Eisai Limited]",3
51,Allergan,"[Allergan, Chase Pharmaceuticals Corporation, ...",2
63,Otsuka,[Otsuka Pharmaceutical Development & Commercia...,2
69,Janssen,"[Janssen Research & Development, LLC, Janssen ...",5
253,Kyowa,"[Kyowa Hakko Kirin Pharma, Inc., Kyowa Kirin C...",2


In [1655]:
# we are going to drop Abbott from this df, later we'll use this as a lookup table 
short_sponsor_name.drop(short_sponsor_name[short_sponsor_name['short_name']=='Abbott'].index, inplace=True)
short_sponsor_name.drop(short_sponsor_name[short_sponsor_name['short_name']=='Merck'].index, inplace=True)
short_sponsor_name.drop(short_sponsor_name[short_sponsor_name['short_name']=='High'].index, inplace=True)
short_sponsor_name.drop(short_sponsor_name[short_sponsor_name['short_name']=='H.'].index, inplace=True)
short_sponsor_name.drop(short_sponsor_name[short_sponsor_name['short_name']=='Wyeth'].index, inplace=True)
short_sponsor_name.drop(short_sponsor_name[short_sponsor_name['short_name']=='Chase'].index, inplace=True)
short_sponsor_name.drop(short_sponsor_name[short_sponsor_name['short_name']=='Changchun'].index, inplace=True)
short_sponsor_name.drop(short_sponsor_name[short_sponsor_name['short_name']=='Janssen,'].index, inplace=True)
short_sponsor_name.drop(short_sponsor_name[short_sponsor_name['short_name']=='Janssen-Cilag'].index, inplace=True)
# modified name
added_shortname = pd.DataFrame({'short_name':['Merck Sharp & Dohme', 'Merck KGaA', 'High Point', 'Changchun Huayang High-tech', 'H. Lundbeck A/S'],
             'names_used': ['Merck Sharp & Dohme Corp.', 'Merck KGaA, Darmstadt, Germany', 'High Point Pharmaceuticals, LLC.', 'Changchun Huayang High-tech Co., Ltd', 'H. Lundbeck A/S']})
# add modified names into df
short_sponsor_name = short_sponsor_name.append(added_shortname, sort=False)

In [1792]:
# there are some manually added rows above dont have number of names used, we are going to add manually
short_sponsor_name.fillna(1, inplace=True)

In [1810]:
# covert list into string to facilita our look up 
for row in range(len(short_sponsor_name[:132])):
    lt = short_sponsor_name.iloc[row, 1]
    string = ''
    for i in range(len(lt)):
        string += ' '+lt[i]
#     print(string)
    short_sponsor_name.iloc[row, 1] = string

### Final Clean

In [1721]:
co_phase_drug[co_phase_drug['drug_name'].str.contains('drug')]

Unnamed: 0,sponsor_name,drug_name


In [1720]:
# can't modify the name by one regex
co_phase_drug['drug_name'] = [re.sub(r'^drug:?\s?[-]?','', i) for i in co_phase_drug['drug_name']]
co_phase_drug['drug_name'] = [re.sub(r'^drug','', i) for i in co_phase_drug['drug_name']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [1722]:
co_phase_drug[co_phase_drug['drug_name'].str.contains('active')]

Unnamed: 0,sponsor_name,drug_name
39,Novartis Pharmaceuticals,active
40,Eisai Inc.,active
41,Cognition Therapeutics,active treatment- ct1812


In [1735]:
# can't modify the name by one regex
co_phase_drug['drug_name'] = [re.sub(r'^active\s?', '',i) for i in co_phase_drug['drug_name']]
co_phase_drug['drug_name'] = [re.sub(r'treatment-?', '',i) for i in co_phase_drug['drug_name']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [1739]:
# drop these row, these doesn't make sense. double checked with original dataset
co_phase_drug.drop(co_phase_drug[co_phase_drug['drug_name'].str.contains('normal')].index, inplace=True)
co_phase_drug.drop(co_phase_drug[co_phase_drug['drug_name'].str.contains('experimental')].index, inplace=True)
# these are the one we fixed above
co_phase_drug.drop(co_phase_drug[co_phase_drug['drug_name'] == ''].index, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [1768]:
short_sponsor_name[short_sponsor_name.fillna(1)['number_of_names']>1]

Unnamed: 0,short_name,names_used,number_of_names
12,Pfizer,"[Pfizer, Wyeth is now a wholly owned subsidiar...",2.0
16,AbbVie,"[AbbVie, AbbVie (prior sponsor, Abbott)]",2.0
40,Eisai,"[Eisai Inc., Eisai Co., Ltd., Eisai Limited]",3.0
51,Allergan,"[Allergan, Chase Pharmaceuticals Corporation, ...",2.0
63,Otsuka,[Otsuka Pharmaceutical Development & Commercia...,2.0
69,Janssen,"[Janssen Research & Development, LLC, Janssen ...",5.0
253,Kyowa,"[Kyowa Hakko Kirin Pharma, Inc., Kyowa Kirin C...",2.0


In [1812]:
short_sponsor_name[short_sponsor_name.fillna(1)['number_of_names']==1]

Unnamed: 0,short_name,names_used,number_of_names
1,Hoffmann-La,Hoffmann-La Roche,1.0
2,T3D,"T3D Therapeutics, Inc.",1.0
4,"Genentech,","Genentech, Inc.",1.0
6,GlaxoSmithKline,GlaxoSmithKline,1.0
8,Bellus,Bellus Health Inc,1.0
...,...,...,...
0,Merck Sharp & Dohme,Merck Sharp & Dohme Corp.,1.0
1,Merck KGaA,"Merck KGaA, Darmstadt, Germany",1.0
2,High Point,"High Point Pharmaceuticals, LLC.",1.0
3,Changchun Huayang High-tech,"Changchun Huayang High-tech Co., Ltd",1.0


In [1814]:
short_sponsor_name['number_of_names'].value_counts()

1.0    130
2.0      5
5.0      1
3.0      1
Name: number_of_names, dtype: int64

In [1887]:
# our final step is to unify the names company filled
def name_unify(filled_name):
#     print(filled_name)
    num_names = (short_sponsor_name[short_sponsor_name['names_used'].str.contains(filled_name, regex=False)])['number_of_names'].values
    if num_names > 1:
        return (short_sponsor_name[short_sponsor_name['names_used'].str.contains(filled_name, regex=False)])['short_name'].values[0]
    else:
        return (short_sponsor_name[short_sponsor_name['names_used'].str.contains(filled_name, regex=False)])['names_used'].values[0]
         
        

In [1889]:
co_phase_drug['fixed_name'] = co_phase_drug['sponsor_name'].apply(lambda x: name_unify(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [1898]:
co_phase_drug['fixed_name'].value_counts()

Pfizer                           24
 Eli Lilly and Company           11
Merck Sharp & Dohme Corp.        11
 GlaxoSmithKline                 10
Eisai                            10
                                 ..
 PRAECIS Pharmaceuticals Inc.     1
 Epix Pharmaceuticals, Inc.       1
 Debiopharm International SA      1
 Baxalta now part of Shire        1
 VIVUS, Inc.                      1
Name: fixed_name, Length: 132, dtype: int64

In [1899]:
co_phase_drug[co_phase_drug['sponsor_name'].str.contains('Eli')]

Unnamed: 0,sponsor_name,drug_name,fixed_name
70,Eli Lilly and Company,atomoxetine,Eli Lilly and Company
273,Eli Lilly and Company,ly2062430,Eli Lilly and Company
276,Eli Lilly and Company,ly2599666,Eli Lilly and Company
277,Eli Lilly and Company,ly3002813,Eli Lilly and Company
278,Eli Lilly and Company,ly3154207,Eli Lilly and Company
279,Eli Lilly and Company,ly3202626,Eli Lilly and Company
280,Eli Lilly and Company,ly3303560,Eli Lilly and Company
281,Eli Lilly and Company,ly450139,Eli Lilly and Company
283,Eli Lilly and Company,ly451395,Eli Lilly and Company
410,Eli Lilly and Company,semagacestat,Eli Lilly and Company
