In [427]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import random

import spacy
from wordcloud import WordCloud, STOPWORDS
from spacy.util import minibatch, compounding
from spacy.training.example import Example

In [428]:
data = pd.read_csv('drug_review_dataset_with_sentiment.csv')

In [429]:
data.head()

Unnamed: 0.1,Unnamed: 0,drugName,condition,review,rating,date,usefulCount,drug_class,sentiment,sentiment_label
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,"May 20, 2012",27,arb blocker,0.0,neutral
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192,,0.168333,positive
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17,,0.06721,positive
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10,,0.179545,positive
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37,,0.194444,positive


In [436]:
# Load a English pipeline optimized for CPU
nlp0 = spacy.load('en_core_web_sm')
# Get All components of this NLP object
nlp0.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [437]:
ner0 = nlp0.get_pipe('ner')
ner0.add_label("my_ner")

1

In [448]:
# Example
ex1 = "James went to London to buy Ibuprofen last year 2019"
docx = nlp0(ex1)
type(docx)

spacy.tokens.doc.Doc

In [449]:
for entity in docx.ents:
    print(entity, entity.label_)

James PERSON
London GPE
Ibuprofen ORG
last year 2019 DATE


In [451]:
# Drugs Names
all_drugs = data['drugName'].unique().tolist()
all_drugs = [x.lower() for x in all_drugs]

In [456]:
def process_review(review):
    processed_token = []
    for token in review.split():
        token = ''.join(e.lower() for e in token if e.isalnum())
        processed_token.append(token)
    return ' '.join(processed_token)
# Get formatted training data
count = 0
TRAIN_DATA = []
for _, item in data.iterrows():
    ent_dict = {}
    if count < 1000:
        review = process_review(item['review'])
        visited_items = []
        entities = []
        for token in review.split():
            if token in all_drugs:
                 for i in re.finditer(token, review):
                        if token not in visited_items:
                            entity = (i.span()[0], i.span()[1], 'DRUG')
                            visited_items.append(token)
                            entities.append(entity)
        if len(entities) >0:
            ent_dict['entities'] = entities
            train_item = (review, ent_dict)
            TRAIN_DATA.append(train_item)
            count+=1

In [457]:
TRAIN_DATA[0]

('it has no side effect i take it in combination of bystolic 5 mg and fish oil',
 {'entities': [(50, 58, 'DRUG')]})

In [458]:
n_iter = 25
def train_ner(training_data):
    TRAIN_DATA = training_data
    nlp = spacy.blank('en')
    print("Created blank 'en' model")
    if "ner" not in nlp.pipe_names:
#         nlp.add_pipe(nlp.create_pipe("ner"))
        ner = nlp.add_pipe("ner")
#         nlp.add_pipe('ner')
    else:
        ner = nlp.get_pipe("ner")
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])
    nlp.begin_training()
    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        batches = minibatch(TRAIN_DATA, size = compounding(4, 32, 1.001))  
        for batch in batches:
            texts, annotations = zip(*batch)
            example = []
            # Update the model with iterating each text
            for i in range(len(texts)):
                doc = nlp.make_doc(texts[i])
                example.append(Example.from_dict(doc, annotations[i]))
            # Update the model
            nlp.update(example, drop=0.5, losses=losses) 
        print("Losses",losses)
    return nlp

# Train model on ner
nlp2 = train_ner(TRAIN_DATA)

Created blank 'en' model


  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,
  entities=ent_str[:50] + "..." if len(ent_str) > 50 else ent_str,


Losses {'ner': 4240.667081179259}
Losses {'ner': 1511.740977854308}
Losses {'ner': 1235.2724320308541}
Losses {'ner': 1083.129265892262}
Losses {'ner': 990.2446226106299}
Losses {'ner': 897.5710719975282}
Losses {'ner': 853.7843505693569}
Losses {'ner': 808.8977543206397}
Losses {'ner': 756.4445785716539}
Losses {'ner': 707.1916989792508}
Losses {'ner': 668.8769093458371}
Losses {'ner': 648.3471243869164}
Losses {'ner': 630.2421060050856}
Losses {'ner': 587.5715613316444}
Losses {'ner': 577.660072767273}
Losses {'ner': 527.2593933256702}
Losses {'ner': 550.197244776995}
Losses {'ner': 510.5873199272852}
Losses {'ner': 454.21367159656177}
Losses {'ner': 472.94112902668286}
Losses {'ner': 457.1023422283899}
Losses {'ner': 448.18269076954806}
Losses {'ner': 456.4263114455587}
Losses {'ner': 416.55294655513075}
Losses {'ner': 420.49390833231826}


In [460]:
docx2 = nlp2(ex1)
docx2

James went to London to buy Ibuprofen last year 2019

In [461]:
for entity in docx2.ents:
    print(entity, entity.label_)

Ibuprofen DRUG


In [463]:
TRAIN_DATA[:5]

[('had a cat bite me on my handfull set of teeth and sharp hand swelled and was red took augmentin 8 hours after being bitten and within 3 days swelling went down and redness had faded 6 years as an animal control officer and never injured oncehelp my friend pick out a cat at the local humane society and get bittenoh the irony this is an excellent medicine for cat bites had a friend tell me her dad got bitten by a cat and he ended up in the hospital and almost died from an infection lucky me from my experience i knew to seek medical attention immediately anyone bitten by a cat should know they have about an 85 chance of infection and need to take it very seriously or you039ll end up in hospital',
  {'entities': [(86, 95, 'DRUG')]}),
 ('i have ocd anxiety and adhd i also pick my cuticles everyday the official name is dermatillomania and it039s closely related to pulling hair out i039ve been picking my cuticles for 25 years there has never been a time when i did not pick them to some deg

In [462]:
for text, _ in TRAIN_DATA[:5]:
    doc = nlp2(text)
    result = [(ent, ent.label_) for ent in doc.ents]
    print(result)

[(augmentin, 'DRUG')]
[(wellbutrin, 'DRUG'), (paxil, 'DRUG')]
[(fiorinal, 'DRUG'), (imitrex, 'DRUG'), (percocet, 'DRUG'), (maxalt, 'DRUG')]
[(suboxone, 'DRUG')]
[(differin, 'DRUG')]


In [464]:
def extract_drug_entity(text):
    docx = nlp2(text)
    result = [(ent, ent.label_) for ent in docx.ents]
    return result

In [466]:
data['review'][0:30].apply(extract_drug_entity)

0                                                    []
1                                   [((Intuniv), DRUG)]
2                                                    []
3                                         [((-), DRUG)]
4                                 [((oxycontin), DRUG)]
5                                                    []
6                                                    []
7     [((Zoloft), DRUG), ((Clonidine), DRUG), ((Abil...
8                                                    []
9                                   [((chateal), DRUG)]
10                                        [((-), DRUG)]
11               [((Cymbalta), DRUG), ((Deplin), DRUG)]
12                                  [((Pentasa), DRUG)]
13                                   [((Delsym), DRUG)]
14                                [((Nexplanon), DRUG)]
15                                  [((Saxenda), DRUG)]
16                            [((Ciorofloxacin), DRUG)]
17    [((amitriptyline), DRUG), ((cymbalta), DRU

In [151]:
text = 'Funk is a music genre that originated in African American communities in the mid-1960s when musicians created a rhythmic, danceable new form of music through a mixture of soul, jazz, and rhythm and blues. It de-emphasizes melody and chord progressions and focuses on a strong rhythmic groove of a bassline played by an electric bassist and a drum part played by a percussionist, often at slower tempos than other popular music.'

In [152]:
doc = nlp(text)

In [153]:
doc

Funk is a music genre that originated in African American communities in the mid-1960s when musicians created a rhythmic, danceable new form of music through a mixture of soul, jazz, and rhythm and blues. It de-emphasizes melody and chord progressions and focuses on a strong rhythmic groove of a bassline played by an electric bassist and a drum part played by a percussionist, often at slower tempos than other popular music.

In [154]:
for token in doc:
    print('"' + token.text + '"')

"Funk"
"is"
"a"
"music"
"genre"
"that"
"originated"
"in"
"African"
"American"
"communities"
"in"
"the"
"mid-1960s"
"when"
"musicians"
"created"
"a"
"rhythmic"
","
"danceable"
"new"
"form"
"of"
"music"
"through"
"a"
"mixture"
"of"
"soul"
","
"jazz"
","
"and"
"rhythm"
"and"
"blues"
"."
"It"
"de"
"-"
"emphasizes"
"melody"
"and"
"chord"
"progressions"
"and"
"focuses"
"on"
"a"
"strong"
"rhythmic"
"groove"
"of"
"a"
"bassline"
"played"
"by"
"an"
"electric"
"bassist"
"and"
"a"
"drum"
"part"
"played"
"by"
"a"
"percussionist"
","
"often"
"at"
"slower"
"tempos"
"than"
"other"
"popular"
"music"
"."


In [155]:
for token in doc[:20]:
    print("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}".format(
        token.text,
        token.idx,
        token.lemma_,
        token.is_punct,
        token.is_space,
        token.shape_,
        token.pos_,
        token.tag_
    ))

Funk	0	funk	False	False	Xxxx	NOUN	NN
is	5	be	False	False	xx	AUX	VBZ
a	8	a	False	False	x	DET	DT
music	10	music	False	False	xxxx	NOUN	NN
genre	16	genre	False	False	xxxx	NOUN	NN
that	22	that	False	False	xxxx	DET	WDT
originated	27	originate	False	False	xxxx	VERB	VBD
in	38	in	False	False	xx	ADP	IN
African	41	african	False	False	Xxxxx	ADJ	JJ
American	49	american	False	False	Xxxxx	ADJ	JJ
communities	58	community	False	False	xxxx	NOUN	NNS
in	70	in	False	False	xx	ADP	IN
the	73	the	False	False	xxx	DET	DT
mid-1960s	77	mid-1960	False	False	xxx-ddddx	NOUN	NNS
when	87	when	False	False	xxxx	ADV	WRB
musicians	92	musician	False	False	xxxx	NOUN	NNS
created	102	create	False	False	xxxx	VERB	VBD
a	110	a	False	False	x	DET	DT
rhythmic	112	rhythmic	False	False	xxxx	ADJ	JJ
,	120	,	True	False	,	PUNCT	,


In [156]:
for ent in doc.ents:
    print(ent.text, ent.label_)

African American NORP
the mid-1960s DATE


In [157]:
from spacy import displacy
displacy.render(doc, style='ent', jupyter=True)

In [158]:
for sent in doc.sents:
    print(sent)

Funk is a music genre that originated in African American communities in the mid-1960s when musicians created a rhythmic, danceable new form of music through a mixture of soul, jazz, and rhythm and blues.
It de-emphasizes melody and chord progressions and focuses on a strong rhythmic groove of a bassline played by an electric bassist and a drum part played by a percussionist, often at slower tempos than other popular music.


In [159]:
list(doc.sents)

[Funk is a music genre that originated in African American communities in the mid-1960s when musicians created a rhythmic, danceable new form of music through a mixture of soul, jazz, and rhythm and blues.,
 It de-emphasizes melody and chord progressions and focuses on a strong rhythmic groove of a bassline played by an electric bassist and a drum part played by a percussionist, often at slower tempos than other popular music.]

In [160]:
newdoc = nlp(list(doc.sents)[0].text)

In [161]:
for token in newdoc:
    print("{0}/{1} <--{2}-- {3}/{4}".format(token.text, token.tag_, token.dep_,token.head.text, token.head.tag_))

Funk/NN <--nsubj-- is/VBZ
is/VBZ <--ROOT-- is/VBZ
a/DT <--det-- genre/NN
music/NN <--compound-- genre/NN
genre/NN <--attr-- is/VBZ
that/WDT <--nsubj-- originated/VBD
originated/VBD <--relcl-- genre/NN
in/IN <--prep-- originated/VBD
African/JJ <--amod-- American/JJ
American/JJ <--amod-- communities/NNS
communities/NNS <--pobj-- in/IN
in/IN <--prep-- originated/VBD
the/DT <--det-- mid-1960s/NNS
mid-1960s/NNS <--pobj-- in/IN
when/WRB <--advmod-- created/VBD
musicians/NNS <--nsubj-- created/VBD
created/VBD <--relcl-- mid-1960s/NNS
a/DT <--det-- form/NN
rhythmic/JJ <--amod-- form/NN
,/, <--punct-- form/NN
danceable/JJ <--amod-- form/NN
new/JJ <--amod-- form/NN
form/NN <--dobj-- created/VBD
of/IN <--prep-- form/NN
music/NN <--pobj-- of/IN
through/IN <--prep-- created/VBD
a/DT <--det-- mixture/NN
mixture/NN <--pobj-- through/IN
of/IN <--prep-- mixture/NN
soul/NN <--pobj-- of/IN
,/, <--punct-- soul/NN
jazz/NN <--conj-- soul/NN
,/, <--punct-- jazz/NN
and/CC <--cc-- jazz/NN
rhythm/NN <--conj-- j

In [162]:
displacy.render(newdoc, style = 'dep', jupyter=True, options={'distance':100})

In [164]:
!python3 -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.1.0/en_core_web_lg-3.1.0-py3-none-any.whl (777.1 MB)
[K     |████████████████████████████████| 777.1 MB 24 kB/s s eta 0:00:014     |██████████████████████████      | 628.9 MB 45.0 MB/s eta 0:00:04     |██████████████████████████████▍ | 739.3 MB 6.4 MB/s eta 0:00:06
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.1.0
You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.7/bin/python3 -m pip install --upgrade pip' command.[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [201]:
nlp = spacy.load('en_core_web_lg')

In [202]:
funk = nlp.vocab['funk']
hiphop = nlp.vocab['hiphop']

In [203]:
hiphop.similarity(funk)

0.5844027

In [204]:
from scipy.spatial.distance import cosine

In [205]:
1-cosine(funk.vector, hiphop.vector)

0.5844026803970337

In [206]:
def vector_similarity(x, y):
    return 1 - cosine(x, y)

In [207]:
vector_similarity(funk.vector, hiphop.vector)

0.5844026803970337

In [229]:
def make_guess_word(words):
    [first, second, third] = words
    return nlp.vocab[first].vector - nlp.vocab[second].vector + nlp.vocab[third].vector
def get_similar_word(words, scope=nlp.vocab):
    guess_word = make_guess_word(words)
    similarities = []
    for word in scope:
        if not word.has_vector:
            continue
        similarity = vector_similarity(guess_word, word.vector)
        similarities.append((word, similarity))
    similarities = sorted(similarities, key=lambda item: -item[1])
    print([word[0].text for word in similarities[:20]])

In [238]:
# ? - woman = king - queen
words = ["king", "queen", "woman"]
get_similar_word(words)

['man', 'woman', 'He', 'he', 'Who', 'who', 'king', 'When', 'when', 'she', 'She', 'That', 'that', 'Was', 'was', 'had', 'Had', 'What', 'what', 'Not']


In [240]:
# ? - England = Paris - London
words = ["Paris","London","England"]
get_similar_word(words)

['Paris', 'England', 'Mr', 'St', 'Va', 'Mont', 'Ky', 'Oct', 'Jr', 'Tenn', 'Dr', 'Mrs', 'La', 'Sept', 'Md', 'Miss', 'Mr.', 'I', 'i', 'St.']


In [241]:
# ? - red = mandarin - apple
words = ["mandarin","apple","red"]
get_similar_word(words)

['mandarin', 'red', 'Wash', 'tea', 'hot', 'La.', 'thick', 'Sen', 'woman', 'thin', 'Cos', 'cos', 'she', 'She', 'Paris', 'Miss', 'and', 'London', 'w.', 'or']
