## Text-processing

Using the NLTK package, we examine what kind of text appears in the reviews of the birth control data set.

In [1]:
# note: conda environment data_review is set up for this notebook
import os

import IPython

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# these are scripts with functions made
from basic_functions import*
from text_process import*

import nltk

import plotly
from plotly import tools
import plotly.graph_objs as go

import string

In [2]:
# Stopword corpus and tokenizer from NLTK. The download call fetches the
# 'stopwords' corpus; per the log below it reports "already up-to-date" when
# the corpus is present.
from nltk.corpus import stopwords
nltk.download('stopwords')
# NOTE(review): word_tokenize presumably needs the 'punkt' tokenizer data,
# which this notebook never downloads — confirm on a fresh environment.
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\melis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# load the data
# NOTE(review): load_BCmerged is not defined in this notebook; presumably it
# comes from the star import of basic_functions — confirm.
bc_merged = load_BCmerged()

In [4]:
# Set stopwords
# Starts from the NLTK English stopword list; 'no' and 'not' are removed from
# it in a later cell.
STOPWORDS = stopwords.words('english')

In [5]:
# Show the default stopword list before any edits
print(STOPWORDS)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [6]:
# Keep the negations 'no' and 'not' out of the stopword list: they seem
# important for the meaning of a review (e.g. "did not gain extra weight").
# Filtering instead of list.remove() keeps this cell safe to re-run;
# list.remove() raises ValueError once the words are already gone.
STOPWORDS = [word for word in STOPWORDS if word not in ('no', 'not')]

In [7]:
# Show the list again to confirm 'no' and 'not' are no longer in it
print(STOPWORDS)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'nor', 'only', 'own', 'same', 'so', 'than', 'too', 'very',

# Pre-process the text

The previous analysis did not include any text processing, so here we preprocess the text and then reanalyze it.

In [8]:
# TODO: check whether a token is a number — should we convert it to the corresponding word?

In [9]:
# Pull one review to use as the running example in the cells below.
# NOTE(review): example_review is defined outside this notebook; the 2 looks
# like a positional row index (the text matches data['review'].iloc[2]) — confirm.
rev1 = example_review(bc_merged, 2)
print(rev1)

I had been on the pill for many years. When my doctor changed my RX to chateal, it was as effective. It really did help me by completely clearing my acne, this takes about 6 months though. I did not gain extra weight, or develop any emotional health issues. I stopped taking it bc I started using a more natural method of birth control, but started to take it bc I hate that my acne came back at age 28. I really hope symptoms like depression, or weight gain do not begin to affect me as I am older now. I'm also naturally moody, so this may worsen things. I was in a negative mental rut today. Also I hope this doesn't push me over the edge, as I believe I am depressed. Hopefully it'll be just like when I was younger.


In [10]:
# Tokenize the example review with NLTK; punctuation comes out as separate tokens
nltk_tokens = word_tokenize(rev1)
print(nltk_tokens)

['I', 'had', 'been', 'on', 'the', 'pill', 'for', 'many', 'years', '.', 'When', 'my', 'doctor', 'changed', 'my', 'RX', 'to', 'chateal', ',', 'it', 'was', 'as', 'effective', '.', 'It', 'really', 'did', 'help', 'me', 'by', 'completely', 'clearing', 'my', 'acne', ',', 'this', 'takes', 'about', '6', 'months', 'though', '.', 'I', 'did', 'not', 'gain', 'extra', 'weight', ',', 'or', 'develop', 'any', 'emotional', 'health', 'issues', '.', 'I', 'stopped', 'taking', 'it', 'bc', 'I', 'started', 'using', 'a', 'more', 'natural', 'method', 'of', 'birth', 'control', ',', 'but', 'started', 'to', 'take', 'it', 'bc', 'I', 'hate', 'that', 'my', 'acne', 'came', 'back', 'at', 'age', '28', '.', 'I', 'really', 'hope', 'symptoms', 'like', 'depression', ',', 'or', 'weight', 'gain', 'do', 'not', 'begin', 'to', 'affect', 'me', 'as', 'I', 'am', 'older', 'now', '.', 'I', "'m", 'also', 'naturally', 'moody', ',', 'so', 'this', 'may', 'worsen', 'things', '.', 'I', 'was', 'in', 'a', 'negative', 'mental', 'rut', 'today'

## Lemmatization

In [11]:
# Show the example review before and after lemmatization.
# lemmatize is a function in text_process.py.
print(
    'original review:',
    rev1,
    '\n lemmatized review:',
    lemmatize(rev1),
    sep='\n',
)

original review:
I had been on the pill for many years. When my doctor changed my RX to chateal, it was as effective. It really did help me by completely clearing my acne, this takes about 6 months though. I did not gain extra weight, or develop any emotional health issues. I stopped taking it bc I started using a more natural method of birth control, but started to take it bc I hate that my acne came back at age 28. I really hope symptoms like depression, or weight gain do not begin to affect me as I am older now. I'm also naturally moody, so this may worsen things. I was in a negative mental rut today. Also I hope this doesn't push me over the edge, as I believe I am depressed. Hopefully it'll be just like when I was younger.

 lemmatized review:
I had been on the pill for many year . When my doctor changed my RX to chateal , it wa a effective . It really did help me by completely clearing my acne , this take about 6 month though . I did not gain extra weight , or develop any emotion

In [12]:
# Take only the first 100 rows to speed up the code while experimenting.
# .copy() makes `data` an independent DataFrame, so the columns added in the
# cells below are written to `data` itself rather than to a slice of
# bc_merged (which triggers pandas' SettingWithCopyWarning).
data = bc_merged.iloc[:100].copy()

In [13]:
# WordNet lemmatizer from NLTK.
# NOTE(review): in the visible notebook `lemmatizer` is referenced only by a
# commented-out line in the next cell; the active code calls lemmatize() from
# text_process.py instead.
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [14]:
# Store the lemmatized text of each review in a new 'review_lemma' column,
# using lemmatize (the function in text_process.py).
# NOTE(review): the original note here said lemmatizing all of the data was
# "not working"; the stored output of the following cells shows 'review_lemma'
# populated — confirm, then drop the earlier whitespace-split attempt kept below.
#data['review_lemma'] = data['review'].apply(lambda x: ' '.join([lemmatizer.lemmatize(w) for w in x.split()]))
data['review_lemma'] = data['review'].apply(lemmatize)

In [15]:
# The data set still holds the original, unmodified review under 'review'
data['review'].iloc[2]

"I had been on the pill for many years. When my doctor changed my RX to chateal, it was as effective. It really did help me by completely clearing my acne, this takes about 6 months though. I did not gain extra weight, or develop any emotional health issues. I stopped taking it bc I started using a more natural method of birth control, but started to take it bc I hate that my acne came back at age 28. I really hope symptoms like depression, or weight gain do not begin to affect me as I am older now. I'm also naturally moody, so this may worsen things. I was in a negative mental rut today. Also I hope this doesn't push me over the edge, as I believe I am depressed. Hopefully it'll be just like when I was younger."

In [16]:
# The lemmatized version of the same review is stored under 'review_lemma'
data['review_lemma'].iloc[2]

"I had been on the pill for many year . When my doctor changed my RX to chateal , it wa a effective . It really did help me by completely clearing my acne , this take about 6 month though . I did not gain extra weight , or develop any emotional health issue . I stopped taking it bc I started using a more natural method of birth control , but started to take it bc I hate that my acne came back at age 28 . I really hope symptom like depression , or weight gain do not begin to affect me a I am older now . I 'm also naturally moody , so this may worsen thing . I wa in a negative mental rut today . Also I hope this doe n't push me over the edge , a I believe I am depressed . Hopefully it 'll be just like when I wa younger ."

## Track key information

Examples: word count, unique word count, number of letters, punctuation count, upper-case word count, title-case word count, number of stop words, average word length

Idea: we can use pandas to record things about each of the different reviews

In [17]:
# Split every review into whitespace-separated words once, then derive the
# word-based counts from that shared intermediate.
review_words = data["review"].apply(lambda text: str(text).split())

# Number of words in each review
data['count_word'] = review_words.apply(len)

# Number of distinct words in each review (case- and punctuation-sensitive)
data['count_unique_word'] = review_words.apply(lambda words: len(set(words)))

# Length of the review string; this counts every character, including
# spaces and punctuation, not only letters
data['count_letters'] = data["review"].apply(lambda text: len(str(text)))

In [18]:
# Report the word statistics for the example review (row position 2)
print(f"word count: {data['count_word'].iloc[2]}\n")
print(f"unique word count: {data['count_unique_word'].iloc[2]}\n")

# Everything now stored for that example row
data.iloc[2]

word count: 142

unique word count: 104



uniqueID                                                         48928
drugName                            Ethinyl estradiol / levonorgestrel
condition                                                Birth Control
review               I had been on the pill for many years. When my...
rating                                                               8
date                                                          8-Dec-16
usefulCount                                                          1
review_lemma         I had been on the pill for many year . When my...
count_word                                                         142
count_unique_word                                                  104
count_letters                                                      720
Name: 9, dtype: object

In [19]:
# Further per-review surface statistics, each stored as a new column of `data`.
# "Words" are whitespace-separated chunks, so punctuation attached to a word
# stays part of it (e.g. "it." does not match the stopword "it").

# Set copy of STOPWORDS for constant-time membership tests (STOPWORDS is a list).
stopword_set = set(STOPWORDS)


def _mean_word_length(text):
    """Return the mean length of the whitespace-separated words in ``text``.

    Returns NaN when the text contains no words. This is the value np.mean
    gives for an empty list, but without the RuntimeWarning it emits.
    """
    lengths = [len(w) for w in str(text).split()]
    return np.mean(lengths) if lengths else np.nan


#punctuation count (characters that appear in string.punctuation)
data["count_punctuations"] = data["review"].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))

#upper case words count
data["count_words_upper"] = data["review"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))

#title case words count
data["count_words_title"] = data["review"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))

#Number of stopwords (compared after lower-casing the review)
data["count_stopwords"] = data["review"].apply(lambda x: len([w for w in str(x).lower().split() if w in stopword_set]))

#Average length of the words
data["mean_word_len"] = data["review"].apply(_mean_word_length)

In [30]:
# Threshold handed to is_useful when labelling a review as useful or not
threshold = 5

# Look at the usefulCount of one example row first
ex1 = data.iloc[1]
print(ex1['usefulCount'])

# Add a binary 'useful' column: is_useful(usefulCount, threshold) for every review
data['useful'] = data['usefulCount'].apply(is_useful, args=(threshold,))

10


In [31]:
# First ten values of the new 'useful' flag
data['useful'].head(10)

2     1
3     1
9     0
14    1
22    1
59    0
61    1
63    0
64    1
68    0
Name: useful, dtype: int64

In [None]:
# All columns stored for one example row (row position 3)
data.iloc[3]

In [None]:
# A second example row (row position 57)
data.iloc[57]

In [None]:
# Spell checker instance; the cells below call spell.unknown(...) on word lists
# to experiment with detecting misspelled words.
from spellchecker import SpellChecker
spell = SpellChecker()

In [None]:
# Quick check on two deliberately misspelled words
spell.unknown(['halee', 'hapening'])

In [None]:
# Run the checker once on a short sentence with two misspellings.
# Only the last expression of a cell is displayed, so the unknown words are
# printed explicitly and their count is left as the cell's output.
sample_words = ['my', 'mom', 'weant', 'to', 'the', 'stoore']
unknown_words = spell.unknown(sample_words)
print(unknown_words)
len(unknown_words)

In [None]:
# count_misspelled is a function written in text_process.py.
#
# Observation: this spell checker does not seem to work very well; in this
# example it did not pick up 'nam', 'iss' or 'theyre'.
count_misspelled('hey theyre my nam iss Melissa')

In [None]:
# Same sentence again, this time tokenized with NLTK before being passed to
# the spell checker directly, so the tokens and the flagged words can be compared.
ex = 'hey theyre my nam iss Melissa'
word_list = nltk.word_tokenize(ex)
print(word_list)
spell.unknown(word_list)

In [None]:
# Pull a review from the reduced data set to try the spell checker on real text
text1 = example_review(data, 10)

In [None]:
# Apply count_misspelled (from text_process.py) to that review
count_misspelled(text1)

In [None]:
# Observation: the spell checker counts drug names as misspelled.
# TODO: add all the drug names to the spell checker's list of known words.
spell.unknown(nltk.word_tokenize(text1))