# import data

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
from __future__ import print_function


In [4]:
#display the full dataframe for all cells
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# display(df)

In [5]:
PROJ_ROOT = os.pardir
import sys
src_dir = os.path.join(PROJ_ROOT, "src")
sys.path.append(src_dir)

In [6]:
from config import TRAIN_FILE_PATH, TEST_FILE_PATH
from features.build_features import read_tsv_file
train = read_tsv_file(TRAIN_FILE_PATH)
test = read_tsv_file(TEST_FILE_PATH)
print(train.shape, test.shape)

File not found, please check the file path.
File not found, please check the file path.


AttributeError: 'NoneType' object has no attribute 'shape'

join the two datasets 

In [None]:
df = pd.concat([train, test], ignore_index = True)
df.sample(2)

Several data wrangling steps need to perform:
1. the column names are inconsistant, should all change to lower cases.
2. the review contains no words characters, suach as "&#039","\r\n\r\n","+", and capitalize "YOU SHALL NOT PASS". They need to clean up.

# data wrangling

2.1 Know the basics of the datasets:

1. shape of dataset
2. data type
3. data distribution
4. missing value and the way to handle the missing value
5. any duplicates
6. any incorrect or manipulated data?

In [None]:
print(df.shape)
print(df.info())
print(df.isnull().sum())

there are 1194 missing values in "condition", also the data type for rating should be int instead of float, the date should change to time.

In [None]:
df['rating'] = df['rating'].astype('int')
df['date'] = pd.to_datetime(df['date'])
df.set_index('date',inplace = True)
df.head()

change the unique values of numeric columns.

In [None]:
print(df.rating.unique())
print(df.describe())

 There are 10 unique rating from 1 to 10.  The average rating is 6.99, with the 25% to 75% in 5 to 10, suggesing rating is skewed. 
 the mean "usefulCount" is 28 while the max can reach to 1291 suggesting the usefulCount is widespread. 

Data cleaning 

missing values

In [None]:
missing_values = df["condition"].isna()
df[missing_values].head()

In [None]:
missing_value_ratio = df.isna().sum()/len(df)*100
print(round(missing_value_ratio,2))

only 0.56% missing values, and the review of  it is safe to drop it.

In [None]:
df = df.dropna()
print(df.shape)
# check duplicate of data
print (df.duplicated(subset =["review"]).sum())
print (df.duplicated(subset =["review","condition","rating","usefulCount"]).sum())

In [None]:
duplicate_rows = df[df.duplicated(subset=["review","condition","rating","usefulCount"])]
duplicate_rows.head()

There are 85420 duplicated in "reviews", for each pair of duplicates, they share the same "condition", while varied in "drugname". Therefore, the duplicate data will be dropped.

In [None]:
df = df.drop_duplicates(subset=["review","condition","rating","usefulCount"], keep="first")
df.shape

In [None]:
# df.set_index("date",inplace = True)

Cleaning "condition" column

In [None]:
df.condition.unique()

some conditions list are comments which can't represent the real conditions, and should be removed form the dataset. Also, some typos such as "Cance", "Disorde", and incomplete information (e.g "eve", which should be "fever")

In [7]:
#remove the comments in conditions
condition_mask =df.condition.str.contains("users found this comment helpful")

df=df[~condition_mask]
df.shape
print(df)

NameError: name 'df' is not defined

In [None]:
df.condition.unique()

In [None]:
import string
def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text.lower()
    

In [None]:
df["condition"] = df['condition'].apply(remove_punctuations)
df.condition.unique()

In [None]:
corrected_conditions = {
    'emale Infertility': 'Female Infertility',
    'atigue':'Fatigue',
    'Not Listed / Othe': 'Not Listed Other',
    'moterol)':'Formoterol Mometasone',
    't Pac with Cyclobenzaprine (cyclobenzaprine)':
    'Comfort Pac with Cyclobenzaprine',
    'zen Shoulde': 'Frozen Shoulder',
    'mis': 'Mist',
    'tic (mycophenolic acid)': 'Mycophenolic Acid',
    'ailure to Thrive': 'Failure To Thrive',
    'm Pain Disorde': 'Pain Disorder',
    'mist (': 'Mist',
    'me': 'Mist',
    'lic Acid Deficiency': 'Folic Acid Deficiency',
    'min / saxagliptin)': 'Metformin Saxagliptin',
    'ge HCT (amlodipine / hydrochlorothiazide / valsartan)':
    'Amlodipine Hydrochlorothiazide Valsartan',
    'moterol / mometasone)':'Formoterol Mometasone',
    'eve':'Fever',
    'mance Anxiety':'Performance Anxiety',
    'min)':'Metformin Saxagliptin',
    'ge (amlodipine / valsartan)':'Amlodipine Valsartan',
    'min / rosiglitazone)':'Metformin Rosiglitazone',
    'llicular Lymphoma':'Follicular Lymphoma',
    'min / pioglitazone)':'Metformin Pioglitazone',
    'Pe':"Performance Anxiety",
    't Care':'Urgent Care',
    'llicle Stimulation':'Follicle Stimulation',
}

In [None]:
df.replace({'condition': corrected_conditions}, inplace = True)
df.condition.unique()

In [None]:
repl_dict = {" Disorde$":' Disorder', ' Cance$': 'Cancer',' Tum$':' Tumor', ' Feve$':' Fever',' Ulce$': ' Ulcer'}

In [None]:
df['condition'].replace({k : v for k, v in repl_dict.items()}, 
                           regex=True)                               
df.condition.unique()

how many review per year? how the rating and review change over time, etc.a year?

In [None]:
df.describe()

In [None]:
# df['date'] = pd.to_datetime(df['date'])
# df.info()

what is the correlation among numeric features?

## feature engineering and preprocessing

In [None]:
# #check the text of condition 
# df[df['condition']=='Tic Disorde']

In [None]:
import spacy
import nltk
import re
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize


In [None]:
import en_core_web_sm
nltk.download('stopwords')
from nltk.corpus import stopwords
nlp= spacy.load('en_core_web_sm')
#tokenizer = ToktokTokenizer()
stop_words = set(stopwords.words('english'))


In [None]:
import contractions 
def expand_contractions(text):
    cleaned_text = contractions.fix(text)
    return cleaned_text


In [None]:
raw_text = "&#039;ve been super irritable/moody, and I don&#039;t understand how the side effects can be so extreme for me when I previously was on Nor-Qd "
tx = expand_contractions(raw_text)
tx

### remove special characters

In [None]:
import re
def remove_special_characters(text):
    text = text.str.lower()
    text = text.str.replace('&#039', '').replace('\n','').replace('\r', '').replace('/', ' ')
    text = text.str.replace(r'[^\w\d\s]',' ')
    pattern = re.compile(r'[^a-zA-z0-9\s]+')
    cleaned_text = re.sub(pattern, '', str(text))
    cleaned_text =' '.join(word.strip() for word in cleaned_text.split())
    return cleaned_text                

In [None]:
raw_text = "&#039;ve been super irritable/moody, and I don&#039;t understand how the side effects can be so extreme for me when I previously was on Nor-Qd "

In [None]:
text = remove_special_characters(raw_text)


what are the top words in review? build a word count plot:

In [None]:
from collections import Counter
stop_words = set(stopwords.words('english'))
df['review'] = df['review'].str.lower()
text = df['review'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))
tokens = word_tokenize(text)
count = Counter(tokens)
v,counts = zip(count*.most_common(10))
plt.bar(v,count)
plt.xlabel("word")
plt.ylabel("counts")
plt.title("Top words in drug review")
plt.show()




In [None]:
#check what are the top conditions.
top_conditions = df.condition.value_counts().head(30)
top_conditions.plot(kind = "bar")

The "Birth Control" condition is the highest one, has over 35000 counts, "Depression","Pain","Anxiety","Acne","Bipolar Disorde", "Insomnia", "Weight Loss", "Obesity", "ADHD" has over 5000 counts.

How many unique drugname?

In [None]:
df["drugname"] = df["drugname"].str.title()
#check how many drugname
drugname_list = df['drugname'].unique().tolist()
print(len(drugname_list))

In [None]:
#visualize the top 30 most reveiwed drug name
df.drugname.value_counts().nlargest(30).plot(kind = "bar",figsize =(10,6))
plt.title("The top 30 most reviewed drug name")
plt.show()

In [None]:
df.rating.value_counts().plot(kind= "bar", figsize =(8,6))
plt.title("the counts of each rating")

Howm many unique rating values?

what's the rating and usefulcount of each condition?, what the distribution of rating?

In [None]:
df.rating.unique()

In [None]:
df['rating'].hist(bins=10)
plt.title('histogram of rating in drug review')
plt.xlabel("rating")
plt.ylabel("counts")
plt.show()

the rating indicate the rating either very high (rating 10), or rating very low(at 1), and overall, more positive rating (>=7).

what is the drugname distribution per condition?

In [None]:
df.groupby('condition').drugname.nunique().sort_values(ascending=False)

In [None]:
chunk_size = 50000
chunks = [df[i:i + chunk_size] for i in range(0, len(df), chunk_size)]
result = pd.concat([chunk.groupby('condition').agg({'rating':'sum','usefulcount':'sum'}) for chunk in chunks])