# import data

In [2]:
# A __future__ import has to be the first statement of its compilation unit;
# placed last it is a SyntaxError as soon as this cell is compiled as a whole
# (for example when the notebook is exported to a plain .py script).
from __future__ import print_function

# Standard library
import os
import re

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [3]:
# Render complete cell text and every row whenever a DataFrame is displayed.
for option_name in ("display.max_colwidth", "display.max_rows"):
    pd.set_option(option_name, None)
# Left disabled on purpose:
# pd.set_option('display.max_columns', None)
# display(df)

In [4]:
import sys

# The project root is addressed relative to the current working directory ("..").
PROJ_ROOT = os.pardir
src_dir = os.path.join(PROJ_ROOT, "src")

# Make the project's src/ importable. The membership check keeps the cell
# idempotent: re-running it no longer piles duplicate entries onto sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [5]:
from config import TRAIN_FILE_PATH, TEST_FILE_PATH
from features.build_features import read_tsv_file

# Read the train split first, then the test split, with the project helper,
# and report the dimensions of both frames.
train, test = (
    read_tsv_file(split_path) for split_path in (TRAIN_FILE_PATH, TEST_FILE_PATH)
)
print(train.shape, test.shape)

(161297, 7) (53766, 7)


Join the two datasets.

In [5]:
# Stack the train and test splits into a single frame with a fresh 0..n-1 index.
df = pd.concat([train, test], ignore_index=True)
# Preview 25 random rows; the fixed seed makes the preview reproducible, so the
# same rows are shown after a kernel restart.
df.sample(25, random_state=42)

Unnamed: 0,id,drugName,condition,review,rating,date,usefulCount
199058,35320,Levofloxacin,Sinusitis,"""I was very nervous when I was prescribed this medicine due to some other reviews I have read. I have had a sinus infection for the last 5 weeks and didn&#039;t clear up with the first round of Amoxicillin. I was given 500 mg of Levaquin for 10 days. I&#039;m on day 9 and the sinus infection is completely gone. It took about 5 days for it to really start working. The only side effects I had were low appetite and trouble sleeping. The trouble sleeping was hard to deal with but I offset it with Unisom and that seemed to do the trick. I also took the medicine after lunch instead of right before bed. No upset stomach like I normally get with antibiotics. Would definitely use again and recommend.""",8.0,"August 27, 2013",104
74041,44074,Ethinyl estradiol / norgestimate,,"""I recently switched from LoEstrin 21 to Trinessa. It has only been a month exactly, but I have zero complaints so far. Totally cleared all my acne that I continuously got from my last birth control pill and absolutely no breakthrough bleeding as well. I was nervous to switch after reading many negative reviews but it is the best switch I made. I am much happier on it, and haven&#039;t gained any weight on Trinessa.""",10.0,"December 4, 2011",2
183040,83949,Ethinyl estradiol / norgestimate,Abnormal Uterine Bleeding,"""I&#039;ve used Ortho for 17yrs off &amp; on. I have PCOs, PMDD &amp; painful uterine fibroids. This is the only pill that stops the most heaviest of bleeding within 24-32hr for me! Stops the horrid pain of my fibroids &amp; cysts. I&#039;ve never gained weight from it, it&#039;s always cleared up my skin &amp; periods are 60% less painful &amp; the flow is low &amp; short lived! This one has always been perfect for me.""",10.0,"August 8, 2017",1
144885,167689,Levonorgestrel,Birth Control,"""I had my IUD inserted over 6 months ago. The insertion process was debilitating, the clamp slipped off my cervix THREE times. I had excruciating cramps the rest of the day which I medicated with Aleve and a heating pad. After that, no symptoms! No period! No spotting, no cramps, no headaches etc. etc. Until last week, I&#039;ve started cramping and spotting REALLY dark blood. I&#039;m having headaches and breaking out. I&#039;m hoping this is temporary but I&#039;ve struggled to find reviews from anyone who has had it in as long as me or longer. Fingers crossed it doesn&#039;t persist or get any worse!""",9.0,"January 20, 2016",7
12626,201779,Alprazolam,Panic Disorde,"""i had real bad panic attacks so bad i thought i was going crazy...my doctor put me on zanax and i feel so much better ..i was afraid to leave my house .afraid to shop i would start to shop all of a sudden i had to run out the store.so much more things were happening..but thanks to zanax i am calm cool and collective...oh and sooooo happy""",10.0,"May 13, 2008",12
111247,84040,Ethinyl estradiol / norgestimate,Birth Control,"""Also forgot to mention that my boobs got bigger! I can&#039;t tell if I have mood swings because I&#039;ve always been so moody. No nausea. \r\n\r\nOne time though I had some pain on my leg and I was worried of a DVT so I got it checked out, turns out it was just a pulled muscle--thankfully. \r\n\r\nSo don&#039;t get scared of all the horror stories out there. Just be on the look out for dangerous side effects, obviously, but don&#039;t be afraid to try it out.""",8.0,"June 10, 2017",2
161994,3869,Bismuth subsalicylate,Indigestion,"""Very good, very fast.""",10.0,"March 26, 2012",11
151534,8306,Zolpidem,Insomnia,"""This medicine helped me to fall asleep after a solid week without sleep. However, I did not stay asleep. Going to try Lunesta tonight.""",6.0,"March 21, 2015",22
195179,89351,Celexa,Anxiety and Stress,"""DO NOT STOP taking this because you feel better. Husband been on this three years felt better stopped taking and passed out on the kitchen floor ended up in hospital did not remember not taking this so after a crap load of test and thousands of dollars it came down to the fact he stopped taking celexa and was detoxing to not try this scared the crap out of both of us. I thought I was going to have. To commit him to a mental hospital""",7.0,"March 2, 2015",34
91270,66954,Seroquel,Bipolar Disorde,"""This is magic""",10.0,"December 6, 2009",19


Several data wrangling steps need to be performed:
1. The column names are inconsistent; they should all be changed to lower case.
2. The reviews contain non-word characters, such as "&#039", "\r\n\r\n", and "+", as well as all-caps text (e.g. "YOU SHALL NOT PASS"). These need to be cleaned up.

# data wrangling

In [6]:
# Dimensions of the combined frame as (rows, columns).
df.shape

(215063, 7)

In [7]:
# Number of missing entries in each column.
df.isna().sum(axis="index")

id                0
drugName          0
condition      1194
review            0
rating            0
date              0
usefulCount       0
dtype: int64

There are missing values in the dataset: 1194 rows have no condition.

In [8]:
# Lower-case the two camelCase headers so all column names share one convention.
lowercase_names = {"drugName": "drugname", "usefulCount": "usefulcount"}
df = df.rename(columns=lowercase_names)
df.head()

Unnamed: 0,id,drugname,condition,review,rating,date,usefulcount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil""",9.0,"May 20, 2012",27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of Intuniv. We became concerned when he began this last week, when he started taking the highest dose he will be on. For two days, he could hardly get out of bed, was very cranky, and slept for nearly 8 hours on a drive home from school vacation (very unusual for him.) I called his doctor on Monday morning and she said to stick it out a few days. See how he did at school, and with getting up in the morning. The last two days have been problem free. He is MUCH more agreeable than ever. He is less emotional (a good thing), less cranky. He is remembering all the things he should. Overall his behavior is better. \r\nWe have tried many different medications and so far this is the most effective.""",8.0,"April 27, 2010",192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, which had 21 pill cycle, and was very happy- very light periods, max 5 days, no other side effects. But it contained hormone gestodene, which is not available in US, so I switched to Lybrel, because the ingredients are similar. When my other pills ended, I started Lybrel immediately, on my first day of period, as the instructions said. And the period lasted for two weeks. When taking the second pack- same two weeks. And now, with third pack things got even worse- my third period lasted for two weeks and now it&#039;s the end of the third week- I still have daily brown discharge.\r\nThe positive side is that I didn&#039;t have any other side effects. The idea of being period free was so tempting... Alas.""",5.0,"December 14, 2009",17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth control. I&#039;m glad I went with the patch, I have been on it for 8 months. At first It decreased my libido but that subsided. The only downside is that it made my periods longer (5-6 days to be exact) I used to only have periods for 3-4 days max also made my cramps intense for the first two days of my period, I never had cramps before using birth control. Other than that in happy with the patch""",8.0,"November 3, 2015",10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around. I feel healthier, I&#039;m excelling at my job and I always have money in my pocket and my savings account. I had none of those before Suboxone and spent years abusing oxycontin. My paycheck was already spent by the time I got it and I started resorting to scheming and stealing to fund my addiction. All that is history. If you&#039;re ready to stop, there&#039;s a good chance that suboxone will put you on the path of great life again. I have found the side-effects to be minimal compared to oxycontin. I&#039;m actually sleeping better. Slight constipation is about it for me. It truly is amazing. The cost pales in comparison to what I spent on oxycontin.""",9.0,"November 27, 2016",37


There are missing values in "condition". How should these missing values be handled?

In [9]:
# Boolean mask flagging the rows that have no condition recorded.
missing_values = df["condition"].isna()
# Peek at the first few affected rows.
df.loc[missing_values].head()

Unnamed: 0,id,drugname,condition,review,rating,date,usefulcount
30,51452,Azithromycin,,"""Very good response. It is so useful for me. """,10.0,"August 18, 2010",1
148,61073,Urea,,"""Accurate information.""",10.0,"July 2, 2011",13
488,132651,Doxepin,,"""So far so good. Good for me and I can take it everyday. Can&#039;t remember the last time I slept 7 hours straight.""",10.0,"October 20, 2010",25
733,44297,Ethinyl estradiol / norgestimate,,"""I haven&#039;t been on it for a long time and I have suffered from severe nausea and have thrown up twice. My appetite has lessened even more so than it was before. My acne has gone down and my boobs have grown a lot. Although I have thrown up, if it&#039;s for acne I would suggest it.""",8.0,"January 24, 2011",1
851,68697,Medroxyprogesterone,,"""I started the shot in July 2015 and ended in January 2017. Initially, I had pretty bad headaches, hot flashes, hair loss (nothing that anyone but me noticed), and quite a bit of bleeding (about 3 weeks after my first shot). After, about a month, the side affects started to wane - I stopped bleeding almost entirely (yasss!) and stopped losing hair. That said, while I did not have initial weight gain, I have gained about 15 pounds in the last year (I have never gained this much weight in this amount of time). As a result of the weight gain alone, I am discontinuing with this method of birth control.""",6.0,"March 23, 2017",1


In [10]:
# Percentage of missing entries per column: null count / row count * 100.
null_counts = df.isna().sum()
missing_value_ratio = null_counts.div(len(df)).mul(100)
print(missing_value_ratio.round(2))

id             0.00
drugname       0.00
condition      0.56
review         0.00
rating         0.00
date           0.00
usefulcount    0.00
dtype: float64


Only 0.56% of the rows have a missing value (all in "condition"), so it is safe to drop them.

In [11]:
# Discard every row that contains at least one missing value.
df = df.dropna(axis="index", how="any")
print(df.shape)
# Count the rows whose review text already appeared in an earlier row.
repeated_review_count = df.duplicated(subset=["review"]).sum()
print(repeated_review_count)

(213869, 7)
85420


In [12]:
# Flag repeat occurrences of a review text (the first occurrence is not flagged).
is_repeat = df.duplicated(subset="review")
duplicate_rows = df.loc[is_repeat]
duplicate_rows.head()

Unnamed: 0,id,drugname,condition,review,rating,date,usefulcount
524,109101,Nexplanon,Birth Control,"""First had implanon then got Nexplanon, had a period first month and I have not had one since. I&#039;m due to remove it next year. I do notice spotting sometimes for a day but it honestly usually coincides with when I&#039;m stressed. \r\nHad some weight gain also.\r\n\r\nSo far the best BC I&#039;ve had in all my years. I plan on trying for a baby next year then I will be back on it.""",9.0,"April 21, 2017",5
574,183531,Cymbalta,Anxiety,"""Prescribed via a Psychiatrist for severe Panic attacks for 2 years.\r\nIf I take dosage late or forget to take it the withdrawal symptoms kick in. \r\nGnawing physical pain, breathlessness, disorientation to time, difficulties in word finding while speaking, severe muscle pain and stiffness, nausea, labile emotions and panic.\r\n\r\n""",1.0,"September 5, 2010",27
726,5154,Orsythia,Birth Control,"""I have only been on orsythia for about 1 month and I just started my second week of my second month. I guess I didn&#039;t notice earlier but I started to get slight headaches and I didn&#039;t feel very well physically and mentally. It do help with my cramps and my period, not so much my acne but it&#039;s better. The worst part of orsythia has to be the mood swings and the sweating! I sweat a lot even if it&#039;s cold I&#039;ll start a light sweat, it&#039;s gross. But the mood swings are the worst I just started feeling this and it happened while I was talking to one of my friends I just blew up on him, for no reason. Then later on I started to feel really bad(mentally) and I cried for a while and I couldn&#039;t figure out why I was crying! I don&#039;t recommend!""",2.0,"October 8, 2015",7
1070,186190,Desvenlafaxine,Depression,"""I have suffered from severe anxiety (GAD) and was taking more and more Klonopin as time went on. I am very sensitive to medication and have tried many different SSRI/SNRI&#039;s through the year with horrible side effects. Finally, I had DNA testing to see what I would respond to and the result was Pristiq. I started it several months ago in a small dose (I split the pills even though they say don&#039;t do this) and within a few days my anxiety literally went away. I was able to cut my Klonopin in 1/2 over a two month period. The first week or two I was extremely tired but that passed. The only side effect I get from time to time is migraines. It still amazes me that my anxiety has disappeared. I no longer keep Klonopin in my pocket!""",8.0,"November 1, 2013",81
1375,73940,Ethinyl estradiol / norethindrone,Birth Control,"""I have been taking my first pack of Lo Loestrin Fe and I must say it really works for me. I was a little nervous at first because this is my first time taking birth control and I&#039;ve heard all the negative side effects of taking birth control. I have had spotting [brown-ish color] for three weeks after my period, but that&#039;s normal for the first month. I have breast tenderness and mood swings every now and then, then again it&#039;s expected for the first few months. I have not yet experienced any weight gain. So far I am satisfied, but I wish it wasn&#039;t so expensive.""",8.0,"February 1, 2012",7


There are 85420 duplicated reviews. Within each set of duplicates the "condition" is the same while the "drugname" varies. Therefore, the duplicated rows will be dropped, keeping the first occurrence of each review.

In [13]:
# Keep the first occurrence of every review text and discard later repeats.
is_repeat_review = df.duplicated(subset="review", keep="first")
df = df.loc[~is_repeat_review]
df.shape

(128449, 7)

In [14]:
# df.set_index("date",inplace = True)

check unique of conditions and reviews

In [15]:
# Distinct condition labels, in order of first appearance.
df["condition"].unique()

array(['Left Ventricular Dysfunction', 'ADHD', 'Birth Control',
       'Opiate Dependence', 'Benign Prostatic Hyperplasia',
       'Emergency Contraception', 'Bipolar Disorde', 'Epilepsy',
       'Migraine Prevention', 'Depression', "Crohn's Disease", 'Cough',
       'Obesity', 'Urinary Tract Infection', 'ibromyalgia',
       'Chronic Myelogenous Leukemia', 'HIV Infection', 'Insomnia',
       'Rheumatoid Arthritis', 'Vaginal Yeast Infection',
       'Chlamydia Infection', 'Hirsutism', 'Panic Disorde', 'Migraine',
       'Pain', 'Irritable Bowel Syndrome', 'Osteoarthritis',
       'Constipation', 'Bowel Preparation', 'Psychosis', 'Muscle Spasm',
       'Hepatitis C', 'Overactive Bladde', 'Diabetes, Type 2',
       'Asthma, Maintenance', 'Non-Small Cell Lung Cance',
       'Schizophrenia', 'Dysuria', 'Smoking Cessation', 'Anxiety', 'Acne',
       'emale Infertility', 'Constipation, Acute',
       'Constipation, Drug Induced', 'Erectile Dysfunction',
       'Trigeminal Neuralgia', 'Undera

Some entries in the condition column are comment text (e.g. "... users found this comment helpful") rather than real conditions, and should be removed from the dataset.

In [16]:
# Remove rows whose "condition" holds helpfulness-comment text instead of a
# real condition.
# regex=False matches the phrase literally rather than as a regular expression;
# na=False keeps the mask strictly boolean, so the `~` inversion below stays
# valid even if a missing condition is present.
condition_mask = df.condition.str.contains(
    "users found this comment helpful", regex=False, na=False
)

df = df[~condition_mask]
df.shape

(127283, 7)

In [17]:
# Inspect the rows labelled with the truncated condition 'Tic Disorde'.
is_tic_disorde = df["condition"].eq("Tic Disorde")
df.loc[is_tic_disorde]

Unnamed: 0,id,drugname,condition,review,rating,date,usefulcount
17811,160930,Risperidone,Tic Disorde,"""I have OCD and trichotillomania (compulsive hair pulling). I started taking Risperdal in Jan of 2016 (age 18), and was up to 1mg in March. I still had not noticed changes in my behavior as far as the pulling went or the obsessive thoughts. In a last ditch effort to see if the Risperdal would be effective, we went up to 2mg. After three weeks, I noticed a huge change in behavior. I had little urge to pull anymore. It was such a relief. But then came the long term side effects. I stopped getting periods and my eyes started twitching. I&#039;m now being weened off the drug because of the side effects. The drug seems to work great for OCD and trich, but the side effects ended up being too much for me to manage.""",9.0,"January 6, 2017",7
154737,161020,Risperidone,Tic Disorde,"""I have been taking this medication for over a month at the 0.25mg dose, once daily. I have moderate facial tics that become more severe with anxiety, with my eyes, nose, and mouth twitching at least several times a minute. My psychiatrist prescribed me risperidone and I read online that many people suffer weight gain from this medication. I have had no weight gain at all and no change in appetite, most likely due to my low dosage. From the first dose I took I noticed my tics were reduced significantly, and I remember thinking, &quot;this is what it feels like to be normal!&quot;. My tics are almost entirely gone and I used to be reminded of my tics every waking second but now half of the time I forget I even have tics. This drug saved my self-esteem.""",8.0,"July 30, 2015",34


In [18]:
# Condition labels that were found truncated or otherwise malformed in the data
# and are candidates for replacement with a corrected label.
# NOTE(review): "Mist" is listed here although it reads like an already-corrected
# label — confirm whether it belongs in this list.
invalid_conditions = [
    "emale Infertility",
    "Not Listed / Othe",
    "Breast Cance",
    "Pain/Feve",
    "Prostate Cance",
    "moterol)",
    "t Pac with Cyclobenzaprine (cyclobenzaprine)",
    "zen Shoulde",
    "mis",
    "Thyroid Cance",
    "tic (mycophenolic acid)",
    "ailure to Thrive",
    "m Pain Disorde",
    "Q Feve",
    "mist (",
    "Mist",
    "me",
    "lic Acid Deficiency",
    "min / saxagliptin)",
    "ge HCT (amlodipine / hydrochlorothiazide / valsartan)",
    "Salivary Gland Cance",
    "moterol / mometasone)",
    "eve",
    "Non-Small Cell Lung Cance",
    "Shift Work Sleep Disorde",
    "mance Anxiety",
    "Cance",
    "min)",
    "ge (amlodipine / valsartan)",
    "Testicular Cance",
    "min / rosiglitazone)",
    "llicular Lymphoma",
    "min / pioglitazone)",
    "Brain Tum",
    "Gastrointestinal Stromal Tum",
    "Pe",
    "t Care",
    "Stomach Cance",
    "Rat-bite Feve",
    "llicle Stimulation",
    "Tic Disorde",
    "Head And Neck Cance",
]

In [19]:
# Lookup from each malformed condition label to the label that should replace it.
# NOTE(review): several very short keys ("me", "mis", "Pe", "min)", "moterol)")
# map to values that look like guesses — confirm against the raw data.
corrected_conditions = {
    "emale Infertility": "Female Infertility",
    "Not Listed / Othe": "Not Listed or Other",
    "Breast Cance": "Breast Cancer",
    "Pain/Feve": "Pain or Fever",
    "Prostate Cance": "Prostate Cancer",
    "moterol)": "Formoterol or Mometasone",
    "t Pac with Cyclobenzaprine (cyclobenzaprine)": (
        "Comfort Pac with Cyclobenzaprine"
    ),
    "zen Shoulde": "Frozen Shoulder",
    "mis": "Mist",
    "Thyroid Cance": "Thyroid Cancer",
    "tic (mycophenolic acid)": "Mycophenolic Acid",
    "ailure to Thrive": "Failure To Thrive",
    "m Pain Disorde": "Pain Disorder",
    "Q Feve": "Q Fever",
    "mist (": "Mist",
    "me": "Mist",
    "lic Acid Deficiency": "Folic Acid Deficiency",
    "min / saxagliptin)": "Metformin or Saxagliptin",
    "ge HCT (amlodipine / hydrochlorothiazide / valsartan)": (
        "Amlodipine or Hydrochlorothiazide or Valsartan"
    ),
    "Salivary Gland Cance": "Salivary Gland Cancer",
    "moterol / mometasone)": "Formoterol or Mometasone",
    "eve": "Fever",
    "Non-Small Cell Lung Cance": "Non-Small Cell Lung Cancer",
    "Shift Work Sleep Disorde": "Shift Work Sleep Disorder",
    "mance Anxiety": "Performance Anxiety",
    "Cance": "Cancer",
    "min)": "Metformin or Saxagliptin",
    "ge (amlodipine / valsartan)": "Amlodipine or Valsartan",
    "Testicular Cance": "Testicular Cancer",
    "min / rosiglitazone)": "Metformin or Rosiglitazone",
    "llicular Lymphoma": "Follicular Lymphoma",
    "min / pioglitazone)": "Metformin or Pioglitazone",
    "Brain Tum": "Brain Tumor",
    "Gastrointestinal Stromal Tum": "Gastrointestinal Stromal Tumor",
    "Pe": "Performance Anxiety",
    "t Care": "Urgent Care",
    "Stomach Cance": "Stomach Cancer",
    "Rat-bite Feve": "Rat-bite Fever",
    "llicle Stimulation": "Follicle Stimulation",
    "Tic Disorde": "Tic Disorder",
    "Head And Neck Cance": "Head And Neck Cancer",
}

In [20]:
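# mask = df['condition'].isin(corrected_conditions.keys())
# df = df.loc[mask,'condition'].replace(corrected_conditions)
# NOTE(review): if re-enabled as written, the second line would overwrite `df`
# with a Series holding only the matched conditions. Assigning
# df["condition"] = df["condition"].replace(corrected_conditions) would keep the full frame.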


In [21]:
# df["condition"] = df["condition"].str.title()
# df["condition"].unique()

In [22]:
# #check what are the top conditions.
# top_conditions = df.condition.value_counts().head(30)
# top_conditions.plot(kind = "bar")

"Birth Control" is the most common condition, with over 35000 reviews; "Depression", "Pain", "Anxiety", "Acne", "Bipolar Disorde", "Insomnia", "Weight Loss", "Obesity", and "ADHD" each have over 5000.

How many unique drugname?

In [23]:
# df["drugname"] = df["drugname"].str.title()
# #check how many drugname
# drugname_list = df['drugname'].unique().tolist()
# print(len(drugname_list))

In [24]:
# #visualize the top 30 most reveiwed drug name
# df.drugname.value_counts().nlargest(30).plot(kind = "bar",figsize =(10,6))
# plt.title("The top 30 most reviewed drug name")
# plt.show()

In [25]:
# df.rating.value_counts().plot(kind= "bar", figsize =(8,6))
# plt.title("the counts of each rating")

How many unique rating values are there?

What are the rating and usefulcount for each condition? What is the distribution of ratings?

In [26]:
# df.rating.unique()

In [27]:
# df['rating'].hist(bins=10)
# plt.title('histogram of rating in drug review')
# plt.xlabel("rating")
# plt.ylabel("counts")
# plt.show()

The ratings cluster at the extremes: either very high (10) or very low (1). Overall, positive ratings (>= 7) are more common.

what is the drugname distribution per condition?

In [28]:
# df.groupby('condition').drugname.nunique().sort_values(ascending=False)

In [29]:
# chunk_size = 10000
# chunks = [df[i:i + chunk_size] for i in range(0, len(df), chunk_size)]
# result = pd.concat([chunk.groupby('condition').agg({'rating':'sum','usefulcount':'sum'}) for chunk in chunks])