In [None]:
import re
import nltk
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

In [None]:
#different tokenizer
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize.casual import casual_tokenize
from nltk import ngrams, bigrams
import nltk
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')

In [None]:
#tokenizer normalization
normalized_tokens = [x.lower() for x in tokens]
#stemmer vs lemmatizer
#Two of the most popular stemming algorithms are the Porter and Snowball stemmers
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('english')

Lemmatization is a potentially more accurate way to normalize a word than stemming or case normalization because it takes into account a word’s meaning. A lemmatizer uses a knowledge base of word synonyms and word endings to ensure that only words that mean similar things are consolidated into a single token.

difference between stemmer and lemmatizer
1. Stemmers are generally faster to compute and require less-complex code and datasets. But stemmers will make more errors and stem a far greater number of words, reducing the information content or meaning of your text much more than a lemmatizer would. 
2. Both stemmers and lemmatizers will reduce your vocabulary size and increase the ambiguity of the text.
3. Lemmatizers do a better job retaining as much of the information content as possible based on how the word was used within the text and its intended meaning. 

In [None]:
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


## Part II: EDA for 'review' column in the training data set

### 1. review ...

In [None]:
nltk.download(['punkt','stopwords'])
stops = set(stopwords.words('english'))
not_stop = ["aren't","couldn't","didn't","doesn't","don't","hadn't","hasn't","haven't","isn't","mightn't","mustn't","needn't","no","nor","not","shan't","shouldn't","wasn't","weren't","wouldn't"]
for i in not_stop:
    stops.remove(i)

In [None]:
stemmer = SnowballStemmer('english')

def review_to_words(raw_review):
    # 1. Delete HTML 
    review_text = BeautifulSoup(raw_review, 'html.parser').get_text()
    # 2. Make a space
    letters_only = re.sub('[^a-zA-Z]', ' ', review_text)
    # 3. lower letters
    words = letters_only.lower().split()
    # 5. Stopwords 
    meaningful_words = [w for w in words if not w in stops]
    # 6. Stemming
    stemming_words = [stemmer.stem(w) for w in meaningful_words]
    # 7. space join words
    return( ' '.join(stemming_words))

In [None]:
df_all['review_clean'] = df_all['review'].apply(review_to_words)

In [None]:
#wordcloud analysis for the reviews,
#positive review, negative review, 
#positive high usefulCount review, negative high usefulCount review
#birth control positive high usefulCount review, birthcontrol negative high usefulcount review
#pain
#depression
#anxiety
#single word cloud
#bigram wordcloud
#ngram wordcloud

## Part II: subset data to better serve question-driven data analysis

### 1. reliable_reviews

   **Qs:**
   Can we find the good drugs based on high quality reviews for customers with different conditions?
   
   reliable_reviews with usefulCount > 100

In [282]:
usefulCount_count=pd.DataFrame(df_new.usefulCount.value_counts())
usefulCount_count.loc[usefulCount_count.index>400].sum()
#Is this the proper way to find the best drug? filter high quality reviews and use them to find the answer

usefulCount    21
dtype: int64

In [283]:
usefulCount_count.loc[(usefulCount_count.index>300) & (usefulCount_count.index<400)].sum()

usefulCount    39
dtype: int64

In [284]:
usefulCount_count.loc[(usefulCount_count.index>200) & (usefulCount_count.index<300)].sum()

usefulCount    228
dtype: int64

In [285]:
usefulCount_count.loc[(usefulCount_count.index>100) & (usefulCount_count.index<200)].sum()

usefulCount    2118
dtype: int64

In [286]:
usefulCount_count.loc[usefulCount_count.index<100].sum()

usefulCount    51309
dtype: int64

In [287]:
usefulCount=df_new.usefulCount.unique()
usefulCount.max()

949

   <span style='color:Blue'>Therefore, we choose reviews with usefulCount >= 100. </span>

In [288]:
reliable_review=df_new[df_new.usefulCount >= 100]
reliable_review.shape

(2457, 9)

In [289]:
reliable_review.condition.value_counts()
# We will study depression, anxiety and pain in this data set.

Depression                                      422
Anxiety                                         230
Pain                                            131
Weight Loss                                     106
Anxiety and Stress                              106
                                               ... 
Tinnitus                                          1
105</span> users found this comment helpful.      1
lic Acid Deficiency                               1
Psoriasis                                         1
Obstructive Sleep Apnea/Hypopnea Syndrome         1
Name: condition, Length: 161, dtype: int64

### 2. top_4_condition data

    
   

In [290]:
top_4_condition=df_new[df_new.condition.isin(top_4_con.index)]

In [291]:
top_4_condition.shape
#we will use this data set to do the sentiment analysis

(16751, 9)

In [292]:
#Let's check top_4_condition data first
top_4_condition.drugName.nunique()

446

In [293]:
#The top 20 drugs with most reviews for birth control, depression, pain and anxiety
top_4_condition.drugName.value_counts().head(20)

Etonogestrel                          1080
Levonorgestrel                         755
Ethinyl estradiol / norethindrone      744
Nexplanon                              734
Ethinyl estradiol / norgestimate       517
Ethinyl estradiol / levonorgestrel     507
Implanon                               401
Mirena                                 355
Escitalopram                           320
Skyla                                  252
Lo Loestrin Fe                         229
Bupropion                              218
Lexapro                                213
NuvaRing                               204
Sertraline                             204
Ethinyl estradiol / etonogestrel       196
Copper                                 193
Tramadol                               185
Venlafaxine                            183
Desvenlafaxine                         179
Name: drugName, dtype: int64

#### 2.1 birth control drugs in top_4_condition data 

   **Qs:**
   <span style='color:Blue'>What's the top 10 birth control drugs with most reviews? Are all of them hormone related? Are those drugs always the popular one from 2008 to 2019? How about their ratings? </span>

In [294]:
#top 10 drugs for birth control with most reviews
top_10_birth_control=top_4_condition[top_4_condition.condition=='Birth Control'].drugName.value_counts().head(10)
top_10_birth_control

Etonogestrel                          1080
Levonorgestrel                         755
Ethinyl estradiol / norethindrone      744
Nexplanon                              734
Ethinyl estradiol / norgestimate       517
Ethinyl estradiol / levonorgestrel     507
Implanon                               401
Mirena                                 355
Skyla                                  252
Lo Loestrin Fe                         229
Name: drugName, dtype: int64

According to the Food and Drug Administration (FDA), the effects of continuously raised estrogen levels in the female body due to taking birth control pills may include an increased risk of breast cancer, blood clotting, migraines, liver problems, increased blood pressure, weight gain, and spotting between periods.

different birth control pills and their mode of action and side effects

In [295]:
#rating, usefulCount, and review time for the top_10_most reviewed drugs for birth control
top_10_birth_control_reviews=top_4_condition[top_4_condition.drugName.isin(top_10_birth_control.index)]

In [296]:
top_10_birth_control_reviews.year.value_counts()

2016    1511
2015    1206
2017    1116
2014     380
2012     349
2011     330
2013     320
2010     211
2009     135
2008      16
Name: year, dtype: int64

In [297]:
birth_control_review=top_4_condition[top_4_condition.condition=='Birth Control']

In [298]:
set(birth_control_review.year)

{2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017}

In [299]:
birth_control_review.year.value_counts

<bound method IndexOpsMixin.value_counts of 97768     2015
215892    2016
71428     2011
79865     2017
60341     2011
          ... 
227273    2013
131040    2010
33495     2014
123432    2010
130945    2010
Name: year, Length: 9648, dtype: int64>

In [300]:
birth_control_review[birth_control_review.year==2017].drugName.value_counts().head(10)

Levonorgestrel                        217
Etonogestrel                          189
Nexplanon                             173
Ethinyl estradiol / norethindrone     158
Ethinyl estradiol / norgestimate      118
Ethinyl estradiol / levonorgestrel     90
Copper                                 68
Skyla                                  64
Kyleena                                54
ParaGard                               50
Name: drugName, dtype: int64

In [301]:
birth_control_review[birth_control_review.year==2015].drugName.value_counts().head(10)

Etonogestrel                          258
Nexplanon                             209
Levonorgestrel                        160
Ethinyl estradiol / norethindrone     152
Ethinyl estradiol / levonorgestrel     86
Ethinyl estradiol / norgestimate       80
Mirena                                 76
Implanon                               70
Skyla                                  68
Copper                                 53
Name: drugName, dtype: int64

In [302]:
birth_control_review[birth_control_review.year==2010].drugName.value_counts().head(10)

Implanon                              53
Etonogestrel                          47
Ethinyl estradiol / levonorgestrel    32
Ethinyl estradiol / norgestimate      28
Loestrin 24 Fe                        20
Ethinyl estradiol / norethindrone     19
Sprintec                              18
Levonorgestrel                        18
Mirena                                14
Drospirenone / ethinyl estradiol      10
Name: drugName, dtype: int64

In [303]:
birth_control_review[birth_control_review.year==2008].drugName.value_counts().head(10)

Levonorgestrel                        6
Ethinyl estradiol / levonorgestrel    3
Desogestrel / ethinyl estradiol       3
Drospirenone / ethinyl estradiol      2
Aviane                                2
Ethinyl estradiol / norelgestromin    2
Implanon                              2
Ethinyl estradiol / norgestimate      2
Apri                                  2
Ethinyl estradiol / norgestrel        1
Name: drugName, dtype: int64

Can we use altair draw time window for the change of top drugs in differnt year?????

#### 2.2. anti-depression drugs in top_4_condition data 

   **Qs:**
<span style='color:Blue'>What are the top anti-depression drugs from 2008 to 2017? Can we find any trends? Are the top drugs with good rating?</span>
   
 There are different kinds of medicine for depression.
 Selective Serotonin Reuptake Inhibitors (SSRIs)
 Serotonin and Norepinephrine Reuptake Inhibitors (SNRIs)
 Tricyclic and Tetracyclic Antidepressants
 Atypical Antidepressants
 Monoamine Oxidase Inhibitors (MAOIs)
 N-methyl D-aspartate (NMDA) Antagonist
 Neuroactive Steroid Gamma-Aminobutyric Acid (GABA)-A Receptor Positive Modulator

In [304]:
#top 10 drugs for depression with most reviews
top_10_depression=top_4_condition[top_4_condition.condition=='Depression'].drugName.value_counts().head(10)
top_10_depression

Sertraline        204
Bupropion         198
Desvenlafaxine    159
Venlafaxine       137
Pristiq           136
Escitalopram      135
Citalopram        125
Vortioxetine      113
Viibryd           112
Duloxetine        109
Name: drugName, dtype: int64

In [305]:
antidepressant_review=top_4_condition[top_4_condition.condition=='Depression']
antidepressant_review.shape

(3095, 9)

In [306]:
antidepressant_review[antidepressant_review.year==2017].drugName.value_counts().head(10)

Sertraline        39
Vortioxetine      36
Zoloft            20
Venlafaxine       19
Escitalopram      19
Pristiq           17
Bupropion         17
Desvenlafaxine    16
Mirtazapine       13
Duloxetine        12
Name: drugName, dtype: int64

In [307]:
antidepressant_review[antidepressant_review.year==2010].drugName.value_counts().head(10)

Desvenlafaxine    20
Pristiq           19
Bupropion         15
Sertraline        11
Cymbalta          11
Duloxetine        10
Escitalopram       9
Lexapro            8
Venlafaxine        8
Wellbutrin         6
Name: drugName, dtype: int64

In [308]:
antidepressant_review.year.value_counts()

2016    523
2015    413
2017    379
2013    324
2011    302
2012    295
2009    288
2014    262
2010    198
2008    111
Name: year, dtype: int64

#### 2.3 pain relieve drugs

   **Qs:**
   
   
The most common drugs involved in prescription opioid overdose deaths include:
1. Methadone
2. Oxycodone (such as OxyContin®)
3. Hydrocodone (such as Vicodin®)3
4. Tramadol 
and more
Are all these top 10 pain killer drugs all nacostic???

In [309]:
#top 10 drugs for pain with most reviews
top_10_pain=top_4_condition[top_4_condition.condition=='Pain'].drugName.value_counts().head(10)
top_10_pain

Tramadol                       144
Acetaminophen / hydrocodone    120
Acetaminophen / oxycodone      114
Oxycodone                       99
Tapentadol                      69
Hydromorphone                   66
Gabapentin                      63
Diclofenac                      59
Dilaudid                        53
Ketorolac                       53
Name: drugName, dtype: int64

In [310]:
pain_relif_review=top_4_condition[top_4_condition.condition=='Pain']
pain_relif_review.shape

(2100, 9)

In [311]:
#top_10_pain_rating
pain_relif_review.rating.value_counts()

10.0    714
9.0     449
8.0     312
1.0     212
7.0     124
3.0      64
6.0      63
5.0      59
2.0      55
4.0      48
Name: rating, dtype: int64

In [312]:
pain_relif_review.drugName.value_counts().head(10)

Tramadol                       144
Acetaminophen / hydrocodone    120
Acetaminophen / oxycodone      114
Oxycodone                       99
Tapentadol                      69
Hydromorphone                   66
Gabapentin                      63
Diclofenac                      59
Dilaudid                        53
Ketorolac                       53
Name: drugName, dtype: int64

In [313]:
#review time for the top_10_most reviewed drugs for pain
pain_relif_review.year.value_counts()

2009    425
2016    259
2011    253
2008    229
2010    208
2015    178
2017    168
2012    147
2013    139
2014     94
Name: year, dtype: int64

#### 2.4. anxiety control drugs 

   **questions:**

In [314]:
#top 10 drugs for anxiety with most reviews
top_10_anxiety=top_4_condition[top_4_condition.condition=='Anxiety'].drugName.value_counts().head(10)
top_10_anxiety

Escitalopram    185
Clonazepam      136
Alprazolam      132
Buspirone       131
Lorazepam       118
Lexapro         111
Hydroxyzine     107
Xanax            92
BuSpar           70
Gabapentin       67
Name: drugName, dtype: int64

In [315]:
anxiety_relif_review=top_4_condition[top_4_condition.condition=='Anxiety']
anxiety_relif_review.shape

(1908, 9)

In [316]:
anxiety_relif_review.year.value_counts()

2016    378
2015    290
2017    253
2014    170
2011    169
2013    166
2009    160
2012    142
2010    111
2008     69
Name: year, dtype: int64

In [317]:
import altair as alt
import numpy as np
anxiety=anxiety_relif_review.year.value_counts()
pain=pain_relif_review.year.value_counts()
depression=antidepressant_review.year.value_counts()
birth_control=birth_control_review.year.value_counts()
source=pd.DataFrame({'anxiety':anxiety, 'pain':pain, 'depression':depression, 'birth_control':birth_control})
source.columns

Index(['anxiety', 'pain', 'depression', 'birth_control'], dtype='object')

In [318]:
source.index.name='year'
source.head()

Unnamed: 0_level_0,anxiety,pain,depression,birth_control
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2008,69,229,111,38
2009,160,425,288,298
2010,111,208,198,369
2011,169,253,302,625
2012,142,147,295,655


In [319]:
source = source.reset_index().melt('year', var_name='condition', value_name='count')

In [323]:
nearest = alt.selection(type='single', nearest=True, on='mouseover',
                        fields=['year'], empty='none')
line = alt.Chart(source).mark_line(interpolate='basis').encode(
    x='year:N',
    y='count:Q',
    color='condition:N'
)
selectors = alt.Chart(source).mark_point().encode(
    x='year:N',
    opacity=alt.value(0),
).add_selection(
    nearest
)
points = line.mark_point().encode(
    opacity=alt.condition(nearest, alt.value(1), alt.value(0))
)
text = line.mark_text(align='left', dx=5, dy=-5).encode(
    text=alt.condition(nearest, 'count:Q', alt.value(' '))
)
rules = alt.Chart(source).mark_rule(color='gray').encode(
    x='year:N',
).transform_filter(
    nearest
)

# Put the five layers into a chart and bind the data
alt.layer(
    line, selectors, points, rules, text
).properties(
    width=800, height=500
)

Unnamed: 0,anxiety,pain,depression,birth_control
2008,69,229,111,38
2009,160,425,288,298
2010,111,208,198,369
2011,169,253,302,625
2012,142,147,295,655


In [9]:
#numerical features distribution
import altair as alt
alt.Chart(df).mark_point().encode(x='usefulCount', y='rating').interactive

<bound method Chart.interactive of alt.Chart(...)>

In [None]:
#sentiment analysis for the review with rating, can sentiment analysis give us the same result as the rating customer provided
#time scale analysis, the time trend for depression, pain and anxiety 
#drug review : birth control, depression, pain, If we use the sentiment results to classify the medicine or rating??