# Classifier

This code trains a Naive Bayes classifier on the processed text to predict a drug's route of administration. The final model will be saved to the models/ directory.

In [1]:
import nltk
import random
import numpy as np
import pandas as pd
from collections import Counter, defaultdict
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from datetime import datetime
import re
import pickle

## Load Data

In [2]:
## Load the pre-processed drug label data (target, raw text and cleaned tokens).
## NOTE: unpickling executes code from the file, so only load pickles produced by this project.
processed_path = '../data/processed/drugs.pkl'
df = pd.read_pickle(processed_path)
df.head(5)

Unnamed: 0,target,text,tokens
0,ORAL,"DOSAGE Adults- Take 4 or 6 Pellets by mouth, t...","[adults, take, pellets, mouth, three, times, d..."
1,ORAL,DIRECTIONS Adults: Dissolve 3 to 5 under the t...,"[adults, dissolve, tongue, three, times, day, ..."
2,OPHTHALMIC,DOSAGE AND ADMINISTRATION The recommended dosa...,"[recommended, regimen, treatment, bacterial, c..."
3,ORAL,2 DOSAGE AND ADMINISTRATION Use the lowest eff...,"[use, lowest, effective, shortest, duration, c..."
4,TOPICAL,"Directions wet face, apply to hand, massage fa...","[wet, face, apply, hand, massage, face, gently..."


## Classifier Prep - Target Variable

In [3]:
## The target column currently holds many distinct labels (listed below).
## Five of them account for more than 90% of the data, so every label outside
## the top 5 by count will be reclassified as OTHER.

df['target'].unique().tolist()

['ORAL',
 'OPHTHALMIC',
 'TOPICAL',
 'INTRAVENOUS',
 'RESPIRATORY (INHALATION)',
 'VAGINAL',
 'SUBLINGUAL',
 'INTRAMUSCULAR',
 'DENTAL',
 'IRRIGATION',
 'INTRATHECAL',
 'EPIDURAL',
 'SUBCUTANEOUS',
 'NASAL',
 'RECTAL',
 'CUTANEOUS',
 'INTRA-ARTICULAR',
 'TRANSDERMAL',
 'INTRAOCULAR',
 'PERCUTANEOUS',
 'INTRACARDIAC',
 'INTRAVITREAL',
 'AURICULAR (OTIC)',
 'SUBMUCOSAL',
 'BUCCAL',
 'PERINEURAL',
 'INFILTRATION',
 'INTRALESIONAL',
 'PERIODONTAL',
 'PARENTERAL',
 'INTRACAVITARY',
 'INTRAVASCULAR',
 'ENDOTRACHEAL',
 'INTRACAVERNOUS',
 'EXTRACORPOREAL',
 'INTRADERMAL',
 'INTRA-ARTERIAL',
 'SUBARACHNOID',
 'INTRAUTERINE',
 'OROPHARYNGEAL',
 'INTRATYMPANIC',
 'INTRACAMERAL',
 'HEMODIALYSIS',
 'URETHRAL',
 'INTRAPERITONEAL',
 'TRANSMUCOSAL',
 'INTRAVESICAL',
 'ENTERAL',
 'INTRABRONCHIAL',
 'INTRACANALICULAR',
 'URETERAL',
 'RETROBULBAR',
 'INTRAPLEURAL',
 'INTRASPINAL',
 'SUBGINGIVAL',
 'INTRASINAL',
 'INTRAVENTRICULAR']

In [4]:
## Row counts per label, most frequent first.
## The top 5 target values are ORAL, TOPICAL, INTRAVENOUS, DENTAL and INTRAMUSCULAR;
## all other labels will be converted to OTHER.
(
    df.groupby('target')
      .count()
      .sort_values('text', ascending=False)
      .head(20)
)

Unnamed: 0_level_0,text,tokens
target,Unnamed: 1_level_1,Unnamed: 2_level_1
ORAL,46628,46628
TOPICAL,27823,27823
INTRAVENOUS,2857,2857
DENTAL,1401,1401
INTRAMUSCULAR,1378,1378
OPHTHALMIC,1344,1344
SUBLINGUAL,798,798
NASAL,643,643
SUBCUTANEOUS,325,325
RESPIRATORY (INHALATION),325,325


In [5]:
## Keep the five most frequent labels and collapse every other label into OTHER.
## NOTE: this overwrites df['target'] in place, so the original labels are gone after this cell runs.
kept_labels = ['ORAL', 'TOPICAL', 'INTRAVENOUS', 'DENTAL', 'INTRAMUSCULAR']
is_rare_label = ~df['target'].isin(kept_labels)
df.loc[is_rare_label, 'target'] = 'OTHER'

df['target'].unique().tolist()

['ORAL', 'OTHER', 'TOPICAL', 'INTRAVENOUS', 'INTRAMUSCULAR', 'DENTAL']

## Classifier Prep - Feature Words

In [6]:
## Join each row's token list into a single space-separated string (new column tokens_str).
## The following cell pairs these strings with the target variable.

df['tokens_str'] = df['tokens'].apply(' '.join)
df.head()

Unnamed: 0,target,text,tokens,tokens_str
0,ORAL,"DOSAGE Adults- Take 4 or 6 Pellets by mouth, t...","[adults, take, pellets, mouth, three, times, d...",adults take pellets mouth three times daily su...
1,ORAL,DIRECTIONS Adults: Dissolve 3 to 5 under the t...,"[adults, dissolve, tongue, three, times, day, ...",adults dissolve tongue three times day directe...
2,OTHER,DOSAGE AND ADMINISTRATION The recommended dosa...,"[recommended, regimen, treatment, bacterial, c...",recommended regimen treatment bacterial conjun...
3,ORAL,2 DOSAGE AND ADMINISTRATION Use the lowest eff...,"[use, lowest, effective, shortest, duration, c...",use lowest effective shortest duration consist...
4,TOPICAL,"Directions wet face, apply to hand, massage fa...","[wet, face, apply, hand, massage, face, gently...",wet face apply hand massage face gently rinse ...


In [7]:
## Pair each document's token string with its label: [[tokens_str, target], ...].
## zip() walks the two columns positionally, so this does not depend on the DataFrame's
## index labels. (df['col'][i] is a label lookup: it can raise KeyError or pick the wrong
## row whenever the index is not exactly 0..n-1, and it is much slower per row.)
drugs_data = [
    [tokens, target]
    for tokens, target in zip(df['tokens_str'], df['target'])
]

## Peek at 5 random examples (random.choices samples with replacement)
random.choices(drugs_data,k=5)

[['years age ask doctor use studies done show product work using product read enclosed user’s guide complete important information begin using lozenge quit day smoke first cigarette minutes waking use mg nicotine lozenge smoke first cigarette within minutes waking use mg nicotine lozenge according following week schedule weeks weeks weeks lozenge every hours lozenge every hours lozenge every hours nicotine lozenge medicine must used certain way get best results place lozenge mouth allow lozenge slowly dissolve minutes minimize swallowing chew swallow lozenge may feel warm tingling sensation occasionally move lozenge one side mouth completely dissolved minutes eat drink minutes using lozenge mouth improve chances quitting use least lozenges per day first weeks use one lozenge time continuously use one lozenge another since may cause hiccups heartburn nausea side effects use lozenges hours use lozenges per day important complete treatment feel need use lozenge longer period keep smoking 

In [8]:
## Vocabulary selection: only words seen more than `word_cutoff` times across
## all documents are kept as model features.
word_cutoff = 5

## Flatten every document's token string into one long list of words
tokens = [word for doc_text, _label in drugs_data for word in doc_text.split()]
word_dist = nltk.FreqDist(tokens)

feature_words = {word for word, count in word_dist.items() if count > word_cutoff}

print(f"With a word cutoff of {word_cutoff}, we have {len(feature_words)} as features in the model.")

With a word cutoff of 5, we have 15810 as features in the model.


In [9]:
def tokenize(text) :
    """Split `text` on whitespace and return the resulting list of tokens."""
    return WhitespaceTokenizer().tokenize(text)



def drugs_features(text,fw) :
    """Build the word-presence feature dictionary for one piece of text.

    Args:
    * text: a continuous string of text. It is assumed to be already
    cleaned and case folded; it is tokenized on whitespace here.
    * fw: the collection of *feature words* under consideration. Only
    tokens found in `fw` are kept, which filters out very rarely
    occurring words.

    Returns:
    A dictionary mapping each token of `text` that is in `fw` to True.
    Repeated tokens produce a single entry.

    Example: with `text` = "quick quick brown fox" and
    `fw` = {'quick','fox','jumps'} the result is
    {'quick' : True,
    'fox' : True}
    """
    return {token: True for token in tokenize(text) if token in fw}

In [10]:
## Turn every [text, target] pair into a (feature dict, target) pair for the classifier
featuresets = [
    (drugs_features(doc_text, feature_words), label)
    for doc_text, label in drugs_data
]

## Classifier - Modeling

In [11]:
## Number of shuffled examples held out for testing
test_size = 500

## Seed the RNG so the shuffle is repeatable on a fresh run.
## NOTE: featuresets is shuffled in place, so re-running only this cell
## shuffles the already-shuffled list and yields a different order.
random.seed(20220507)
random.shuffle(featuresets)

In [12]:
## The first `test_size` shuffled examples form the test set; the remainder is used for training
test_set = featuresets[:test_size]
train_set = featuresets[test_size:]

classifier = nltk.NaiveBayesClassifier.train(train_set)

## Accuracy on the held-out test set
print(nltk.classify.accuracy(classifier, test_set))

0.698


In [13]:
# Show the 25 most informative features of the trained classifier
classifier.show_most_informative_features(25)

Most Informative Features
                  stable = True           INTRAV : TOPICA =   5773.3 : 1.0
                 reapply = True           TOPICA : ORAL   =   5678.6 : 1.0
                      iv = True           INTRAM : TOPICA =   5671.7 : 1.0
                swimming = True           TOPICA : ORAL   =   5494.4 : 1.0
                injected = True           INTRAM : TOPICA =   5389.8 : 1.0
                 diluted = True           INTRAV : TOPICA =   5323.3 : 1.0
                   aging = True           TOPICA : ORAL   =   4970.6 : 1.0
                spectrum = True           TOPICA : ORAL   =   4598.6 : 1.0
          reconstitution = True           INTRAV : TOPICA =   4483.1 : 1.0
                lactated = True           INTRAV : TOPICA =   4308.1 : 1.0
          individualized = True           INTRAM : TOPICA =   4168.2 : 1.0
                     rub = True           TOPICA : ORAL   =   4052.5 : 1.0
                 divided = True           INTRAM : TOPICA =   4047.4 : 1.0

In [14]:
# Class labels present in the target column after the OTHER reclassification
list(df['target'].unique())

['ORAL', 'OTHER', 'TOPICAL', 'INTRAVENOUS', 'INTRAMUSCULAR', 'DENTAL']

In [15]:
# Dictionary of counts by actual drug category vs. predicted / classified.
# First key is the actual label, second key is the label estimated by the classifier.
# NOTE: every example is scored here, including the ones the classifier was
# trained on, so these counts are not a held-out evaluation.
drug_types = list(df['target'].unique())
results = defaultdict(lambda: defaultdict(int))

# Pre-fill every (actual, estimated) cell so that zero counts show up explicitly
for d in drug_types :
    for d1 in drug_types :
        results[d][d1] = 0

# featuresets already holds the feature dict for every example, so reuse it
# rather than re-tokenizing each document. The counts do not depend on the
# order of the examples, so no shuffling is needed.
for features, target in featuresets :
    estimated_type = classifier.classify(features)
    results[target][estimated_type] += 1

In [16]:
# Display the nested counts: outer key = actual label, inner key = estimated label
results

defaultdict(<function __main__.<lambda>()>,
            {'ORAL': defaultdict(int,
                         {'ORAL': 29331,
                          'OTHER': 370,
                          'TOPICAL': 13,
                          'INTRAVENOUS': 10606,
                          'INTRAMUSCULAR': 5745,
                          'DENTAL': 563}),
             'OTHER': defaultdict(int,
                         {'ORAL': 406,
                          'OTHER': 3332,
                          'TOPICAL': 104,
                          'INTRAVENOUS': 929,
                          'INTRAMUSCULAR': 464,
                          'DENTAL': 6}),
             'TOPICAL': defaultdict(int,
                         {'ORAL': 250,
                          'OTHER': 3103,
                          'TOPICAL': 22135,
                          'INTRAVENOUS': 1412,
                          'INTRAMUSCULAR': 783,
                          'DENTAL': 140}),
             'INTRAVENOUS': defaultdict(int,
            

## Output Model -> Pickle

In [17]:
## Write the trained model and its feature vocabulary to pickle files,
## allowing them to be applied to new test cases in the Dash app.
for out_path, artifact in (
    ('../models/classifier.pkl', classifier),
    ('../models/classifier_features.pkl', feature_words),
):
    with open(out_path, 'wb') as f:
        pickle.dump(artifact, f)