In [45]:
import pandas as pd
from collections import Counter
from string import punctuation
import re
import matplotlib.pyplot as plt 
import seaborn as sns
# from src.data.cleaning import prepare
from cleaning import prepare

import os
from sentence_transformers import SentenceTransformer

# Load Data

In [62]:
df = pd.read_csv('../data/processed/drugs.csv', usecols=['target', 'text'])

In [63]:
df.head()

Unnamed: 0,target,text
0,ORAL,"DOSAGE Adults- Take 4 or 6 Pellets by mouth, t..."
1,ORAL,DIRECTIONS Adults: Dissolve 3 to 5 under the t...
2,OPHTHALMIC,DOSAGE AND ADMINISTRATION The recommended dosa...
3,ORAL,2 DOSAGE AND ADMINISTRATION Use the lowest eff...
4,TOPICAL,"Directions wet face, apply to hand, massage fa..."


# Split on Whitespace

The simplest tokenization just splits on whitespace. Let's try this and explore the results. 

In [64]:
pipeline = [str.lower, str.split]

In [65]:
df['tokens'] = df['text'].apply(prepare, pipeline=pipeline)

In [66]:
df.head()

Unnamed: 0,target,text,tokens
0,ORAL,"DOSAGE Adults- Take 4 or 6 Pellets by mouth, t...","[dosage, adults-, take, 4, or, 6, pellets, by,..."
1,ORAL,DIRECTIONS Adults: Dissolve 3 to 5 under the t...,"[directions, adults:, dissolve, 3, to, 5, unde..."
2,OPHTHALMIC,DOSAGE AND ADMINISTRATION The recommended dosa...,"[dosage, and, administration, the, recommended..."
3,ORAL,2 DOSAGE AND ADMINISTRATION Use the lowest eff...,"[2, dosage, and, administration, use, the, low..."
4,TOPICAL,"Directions wet face, apply to hand, massage fa...","[directions, wet, face,, apply, to, hand,, mas..."


In [67]:
# create list of all tokens
all_tokens = []
df['tokens'].apply(lambda x: all_tokens.extend(x))
print(len(all_tokens))

24358746


In [68]:
token_counts = Counter(all_tokens)

In [69]:
types = token_counts.keys()

In [71]:
print("Splitting on whitespace yields {0} tokens and {1} types.".format(token_counts.total(), len(types)))

AttributeError: 'Counter' object has no attribute 'total'

In [72]:
token_counts.most_common(50)

[('the', 846404),
 ('of', 765669),
 ('to', 631902),
 ('and', 574627),
 ('in', 428281),
 ('for', 367696),
 ('a', 363863),
 ('be', 331774),
 ('mg', 324571),
 ('or', 315604),
 ('with', 313071),
 ('dose', 311762),
 ('is', 273367),
 ('patients', 240487),
 ('should', 199925),
 ('dosage', 195962),
 ('not', 180478),
 ('daily', 147639),
 ('2', 147061),
 ('may', 143068),
 ('as', 141042),
 ('tablets', 135945),
 ('use', 120403),
 ('at', 119251),
 ('recommended', 114867),
 ('(', 106858),
 ('by', 98517),
 ('on', 95948),
 ('than', 91705),
 ('every', 90506),
 ('treatment', 88874),
 ('if', 88039),
 ('once', 87647),
 ('1', 86242),
 ('years', 86151),
 ('hours', 84649),
 ('administration', 82023),
 ('12', 80834),
 ('10', 78578),
 ('doses', 76065),
 ('children', 75545),
 ('after', 75072),
 ('are', 74675),
 ('clinical', 74213),
 ('day', 72662),
 ('4', 70195),
 (')', 70024),
 ('•', 68803),
 ('[see', 66986),
 ('5', 63551)]

The top 30 typtes contain many stopwords. A few of them contain punctuation. 

Let's look for other types containing punctuation.

In [73]:
punct_set = set(punctuation)

In [74]:
def contains_punct(text):
    for char in text:
        if char in punct_set:
            return True
    return False

In [75]:
types_with_punct = {t: count for t, count in token_counts.items() if contains_punct(t)}

In [76]:
types_with_punct = sorted(types_with_punct.items(), key=lambda item: item[1], reverse=True)

In [77]:
types_with_punct[:50]

[('(', 106858),
 (')', 70024),
 ('[see', 66986),
 ('mg/day', 45334),
 ('.', 43865),
 ('2.1', 36907),
 ('daily.', 36106),
 ('2.2', 32877),
 ('mg/kg', 32048),
 (',', 31251),
 ('extended-release', 28812),
 ('(see', 27649),
 (').', 24202),
 ('2.5', 24118),
 ('2.3', 24060),
 ('day.', 21921),
 (']', 19855),
 ('].', 19442),
 ('dose.', 18567),
 ('2.4', 18455),
 ('days.', 17290),
 ('hours.', 15738),
 (')]', 14937),
 ('-', 14935),
 ('mg/kg/day', 13231),
 ('injection,', 12968),
 ('tablets,', 12738),
 ('(e.g.,', 12000),
 ('mg,', 11678),
 ('delayed-release', 11676),
 ('age:', 11565),
 ('dose,', 11523),
 ('daily,', 11468),
 ('patients,', 11039),
 ('mg.', 10648),
 ('however,', 10559),
 ('and/or', 10281),
 ('doses.', 10255),
 (')].', 10096),
 ('mg/m', 10073),
 ('day,', 9960),
 ('therapy.', 9565),
 ('mg/day.', 9423),
 ('hours,', 9303),
 ('weeks.', 9232),
 ('[', 9083),
 ('response.', 8933),
 ('patients.', 8899),
 ('recommended.', 8879),
 ('patient.', 8774)]

It seems fairly common for words to be combined with '/'. Let's take a look at these specifically. 

In [78]:
[(t, count) for t, count in dict(types_with_punct).items() if '/' in t][:50]

[('mg/day', 45334),
 ('mg/kg', 32048),
 ('mg/kg/day', 13231),
 ('and/or', 10281),
 ('mg/m', 10073),
 ('mg/day.', 9423),
 ('ml/min', 8107),
 ('mg/ml', 6079),
 ('mg/day,', 4357),
 ('ml/min/1.73', 3471),
 ('mg/kg/day,', 3095),
 ('mg/5', 3033),
 ('mcg/kg/day', 2646),
 ('mg/125', 2465),
 ('ml/min)', 2311),
 ('1/2', 2206),
 ('ml/min,', 2059),
 ('mcg/kg/min', 2004),
 ('mcg/ml', 1944),
 ('mg/kg/day.', 1939),
 ('lopinavir/ritonavir', 1880),
 ('mg/day)', 1827),
 ('(ml/min)', 1771),
 ('ml/min.', 1743),
 ('ml/minute/1.73', 1721),
 ('mg/day).', 1670),
 ('atazanavir/ritonavir', 1666),
 ('(mg/day)', 1632),
 ('/', 1584),
 ('mg/ml)', 1501),
 ('/l', 1479),
 ('mg/dl', 1445),
 ('pharyngitis/tonsillitis', 1301),
 ('mcg/kg', 1171),
 ('mg/25', 1145),
 ('ng/ml', 1042),
 ('ml/min/1.73m', 908),
 ('mg/kg)', 885),
 ('cells/mm', 885),
 ('ml/min),', 865),
 ('mg/kg,', 852),
 ('mcg/day', 849),
 ('ml/minute', 831),
 ('ml/h', 807),
 ('caregiver/family', 791),
 ('mg/kg.', 782),
 ('mg/ml.', 771),
 ('(olanzapine/fluoxetin

Most of these represent units of measurement (e.g. 'mg/day'). However, some of them represent combinations of distinct concepts (e.g. 'caregiver/family', 'pharyngitis/tonsillitis'). Splitting on whitespace would treat these as a single token, which would add unnecessary noise to the corpus. Let's try splitting on whitespace AND on '/'.  

# Split on Whitespace and '/'

In [79]:
def tokenize(text):
    pattern = re.compile(r'[\s/]')
    tokens = re.split(pattern, text)
    tokens = [t for t in tokens if t != '']
    return tokens

In [80]:
tokenize('mg/day foo bar')

['mg', 'day', 'foo', 'bar']

In [81]:
pipeline = [str.lower, tokenize]

In [82]:
df['tokens_slash'] = df['text'].apply(prepare, pipeline=pipeline)

In [83]:
df.head()

Unnamed: 0,target,text,tokens,tokens_slash
0,ORAL,"DOSAGE Adults- Take 4 or 6 Pellets by mouth, t...","[dosage, adults-, take, 4, or, 6, pellets, by,...","[dosage, adults-, take, 4, or, 6, pellets, by,..."
1,ORAL,DIRECTIONS Adults: Dissolve 3 to 5 under the t...,"[directions, adults:, dissolve, 3, to, 5, unde...","[directions, adults:, dissolve, 3, to, 5, unde..."
2,OPHTHALMIC,DOSAGE AND ADMINISTRATION The recommended dosa...,"[dosage, and, administration, the, recommended...","[dosage, and, administration, the, recommended..."
3,ORAL,2 DOSAGE AND ADMINISTRATION Use the lowest eff...,"[2, dosage, and, administration, use, the, low...","[2, dosage, and, administration, use, the, low..."
4,TOPICAL,"Directions wet face, apply to hand, massage fa...","[directions, wet, face,, apply, to, hand,, mas...","[directions, wet, face,, apply, to, hand,, mas..."


In [84]:
# create list of all tokens
all_tokens = []
df['tokens_slash'].apply(lambda x: all_tokens.extend(x))
print(len(all_tokens))

24662826


In [85]:
token_counts = Counter(all_tokens)

In [86]:
types = token_counts.keys()

In [None]:
print("Splitting on whitespace and '/' yields {0} tokens and {1} types.".format(token_counts.total(), len(types)))

In [None]:
token_counts.most_common(50)

# Split on Whitespace and '-'

Some of the most common types also included '-'. Let's take a look at them. 

In [None]:
{t: count for t, count in token_counts.items() if '-' in t}

In contrast to words combined with '/', these combined with '-' tend to represent a single concept. Splitting them into separate tokens would lose important information (e.g. 'non-psychotic'). 

# Final Tokenization

Based on the above analysis, we will split on whitespace and '/', and remove punctuation. 

In [87]:
punct_set = set(punctuation)
punct_set.remove('/') # don't remove '/' because we need it for tokenization 

In [88]:
def remove_punctuation(text):
    return "".join([char for char in text if char not in punct_set])  

In [89]:
pipeline = [str.lower, remove_punctuation, tokenize]

In [90]:
prepare('This is an example/test sentence!', pipeline=pipeline)

['this', 'is', 'an', 'example', 'test', 'sentence']

In [91]:
df['tokens_final'] = df['text'].apply(prepare, pipeline=pipeline)

In [92]:
df.head()

Unnamed: 0,target,text,tokens,tokens_slash,tokens_final
0,ORAL,"DOSAGE Adults- Take 4 or 6 Pellets by mouth, t...","[dosage, adults-, take, 4, or, 6, pellets, by,...","[dosage, adults-, take, 4, or, 6, pellets, by,...","[dosage, adults, take, 4, or, 6, pellets, by, ..."
1,ORAL,DIRECTIONS Adults: Dissolve 3 to 5 under the t...,"[directions, adults:, dissolve, 3, to, 5, unde...","[directions, adults:, dissolve, 3, to, 5, unde...","[directions, adults, dissolve, 3, to, 5, under..."
2,OPHTHALMIC,DOSAGE AND ADMINISTRATION The recommended dosa...,"[dosage, and, administration, the, recommended...","[dosage, and, administration, the, recommended...","[dosage, and, administration, the, recommended..."
3,ORAL,2 DOSAGE AND ADMINISTRATION Use the lowest eff...,"[2, dosage, and, administration, use, the, low...","[2, dosage, and, administration, use, the, low...","[2, dosage, and, administration, use, the, low..."
4,TOPICAL,"Directions wet face, apply to hand, massage fa...","[directions, wet, face,, apply, to, hand,, mas...","[directions, wet, face,, apply, to, hand,, mas...","[directions, wet, face, apply, to, hand, massa..."


In [93]:
# create list of all tokens
all_tokens = []
df['tokens_final'].apply(lambda x: all_tokens.extend(x))
print(len(all_tokens))

24246051


In [94]:
token_counts = Counter(all_tokens)

In [None]:
types = token_counts.keys()

In [None]:
print("Splitting on whitespace and '/' and removing punctuation yields {0} tokens and {1} types.".format(token_counts.total(), len(types)))

In [None]:
token_counts.most_common(50)

### Vectorization - BERT (Google NLP Model)

In [112]:
## Utilizing Google's NLP vectorization model - applied via SentenceTransformer. 
model = SentenceTransformer('bert-base-nli-stsb-mean-tokens')

## Create final sentence, based on final tokens, to vectorize via semantic model

def join_text(tokens):
    new_text = ' '.join(tokens)
    
    return new_text

df['final_text'] = df.apply(lambda row: join_text(row['tokens_final']), axis=1)

Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.98k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/630 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/409 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [114]:
## Create vector using BERT Model

bert_vector = model.encode(df['final_text'].astype(str))
df['bert_vector'] = list(bert_vector)

df.head()

Unnamed: 0,target,text,tokens,tokens_slash,tokens_final,final_text,bert_vector
0,ORAL,"DOSAGE Adults- Take 4 or 6 Pellets by mouth, t...","[dosage, adults-, take, 4, or, 6, pellets, by,...","[dosage, adults-, take, 4, or, 6, pellets, by,...","[dosage, adults, take, 4, or, 6, pellets, by, ...",dosage adults take 4 or 6 pellets by mouth thr...,"[0.20267482, 0.48228976, 0.46310183, 0.0596330..."
1,ORAL,DIRECTIONS Adults: Dissolve 3 to 5 under the t...,"[directions, adults:, dissolve, 3, to, 5, unde...","[directions, adults:, dissolve, 3, to, 5, unde...","[directions, adults, dissolve, 3, to, 5, under...",directions adults dissolve 3 to 5 under the to...,"[-0.51945925, 0.6701943, 0.34755662, -0.211656..."
2,OPHTHALMIC,DOSAGE AND ADMINISTRATION The recommended dosa...,"[dosage, and, administration, the, recommended...","[dosage, and, administration, the, recommended...","[dosage, and, administration, the, recommended...",dosage and administration the recommended dosa...,"[0.08212981, 0.13064659, 0.39888293, -0.040129..."
3,ORAL,2 DOSAGE AND ADMINISTRATION Use the lowest eff...,"[2, dosage, and, administration, use, the, low...","[2, dosage, and, administration, use, the, low...","[2, dosage, and, administration, use, the, low...",2 dosage and administration use the lowest eff...,"[-0.70748466, 0.4393211, -0.34770426, -0.28706..."
4,TOPICAL,"Directions wet face, apply to hand, massage fa...","[directions, wet, face,, apply, to, hand,, mas...","[directions, wet, face,, apply, to, hand,, mas...","[directions, wet, face, apply, to, hand, massa...",directions wet face apply to hand massage face...,"[-0.040679857, 0.66494775, 1.1360956, 0.220075..."


In [115]:
output_path = os.path.join(os.path.abspath('../data/processed'), 'df_vectorized.pkl')
output_path
df.to_pickle(output_path)  