# Purpose of the notebook
* Use existing train data to get unique drug names
* Based on list of drug names to filter kaggle data for medical related tweets

In [1]:
import pandas as pd
import numpy as np
import nltk
from collections import Counter
import preprocessor as p

import re
import os

In [2]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('words')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/richardwang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/richardwang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/richardwang/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/richardwang/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [3]:
from nltk.corpus import stopwords
from nltk.corpus import words

In [4]:
stopwords = set(stopwords.words('english'))
words = set(words.words())

In [5]:
df = pd.read_csv('new_data/task2_en_training.tsv', sep='\t')
df.head()

Unnamed: 0,tweet_id,user_id,class,tweet
0,344266386467606528,809439366,0,"depression hurts, cymbalta can help"
1,349220537903489025,323112996,0,"@jessicama20045 right, but cipro can make thin..."
2,351421773079781378,713100330,0,@fibby1123 are you on paxil .. i need help
3,326594278472171520,543113070,0,@redicine the lamotrigine and sjs just made ch...
4,345567138376994816,138795534,0,have decided to skip my #humira shot today. my...


In [6]:
# clean the tweet and get part of speech tag
df['tweet_clean'] = df.tweet.apply(p.clean)
df['tweet_token'] = df.tweet_clean.apply(nltk.word_tokenize)
df['tweet_tag'] = df.tweet_token.apply(nltk.pos_tag)

In [1]:
def keep_noun(tag):
    '''only keep the part of speech that is noun'''
    nouns = []
    for i in tag:
        if i[1][0] == 'N':
            nouns.append(i[0])
    return nouns

In [8]:
# get nouns and rank noun based on occurrence
df['nouns'] = df.tweet_tag.apply(keep_noun)

all_nouns = df.nouns.to_list()
all_nouns = [item for sublist in all_nouns for item in sublist]

all_nouns = [i for i in all_nouns if i not in stopwords]
all_nouns = [i for i in all_nouns if i.lower() not in common_words]
all_nouns = [i for i in all_nouns if i not in words]



cnt = Counter()

for word in all_nouns:
    cnt[word] += 1

# Method to filter for medical related terms
* filter out common english stop words. However, there is still alot of common english words not filtered out
* get list of unique words from the book "Alice in Wonderland". Filter out any noun that exists in the book Alice in Wonderland

In [12]:
# read in the book Alice in wonder land
# get list of distinct words from the book

alice_file = 'alice.txt'
alice_raw = None

if not os.path.isfile(alice_file):
    from urllib import request
    url = 'http://www.gutenberg.org/cache/epub/19033/pg19033.txt'
    response = request.urlopen(url)
    alice_raw = response.read().decode('utf8')
    with open(alice_file, 'w', encoding='utf8') as f:
        f.write(alice_raw)
else:
    with open(alice_file, 'r', encoding='utf8') as f:
        alice_raw = f.read()
        

common_words = set(nltk.word_tokenize(alice_raw.lower()))

common_words = [i for i in common_words]

# filter out common words to only have list of words with drug name

In [22]:
med = [i[0] for i in cnt.most_common() if i[1]>=64]
med = [i for i in med if i not in ['lol', 'shit', '”', 'weeks', 'hours',  '+', 'fuck', 'im', 'mom', 'results', 'kids', 'women', '/', '’', 'sales', '..', 'issues', '=', 'haha', 'month', 'bananas', 'problems', 'lt', 'gt', 'vs', 'yrs', 'mg', 'amp', 'rt', 'months']]
med.sort()

# load in the kaggle dataset

In [17]:
# DATASET
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"

kaggle_df = pd.read_csv('new_data/kaggle/training.1600000.processed.noemoticon.csv', encoding =DATASET_ENCODING , names=DATASET_COLUMNS)
kaggle_df['tweet_token'] = kaggle_df.text.apply(nltk.word_tokenize)
kaggle_df.head()

Unnamed: 0,target,ids,date,flag,user,text,tweet_token
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","[@, switchfoot, http, :, //twitpic.com/2y1zl, ..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,"[is, upset, that, he, ca, n't, update, his, Fa..."
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,"[@, Kenichan, I, dived, many, times, for, the,..."
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its..."
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....","[@, nationwideclass, no, ,, it, 's, not, behav..."


### only keep the record contain drug names

In [18]:
def contain_med(tweet_token, med):
    words = []
    for i in tweet_token:
        if i.lower() in med:
            words.append(i)
    
    return words

kaggle_df['med_related_terms'] = kaggle_df['tweet_token'].apply(lambda x: contain_med(x, med) )
kaggle_df['med_related'] = kaggle_df['med_related_terms'].apply(lambda x: len(x) > 0 )

In [19]:
kaggle_df.med_related.mean()

0.03536125

In [24]:
kaggle_df[kaggle_df.med_related_terms.apply(len)>0]

Unnamed: 0,target,ids,date,flag,user,text,tweet_token,med_related_terms,med_related
87,0,1467835085,Mon Apr 06 22:26:06 PDT 2009,NO_QUERY,Ceejison,My tummy hurts. I wonder if the hypnosis has ...,"[My, tummy, hurts, ., I, wonder, if, the, hypn...",[hurts],True
144,0,1467843624,Mon Apr 06 22:28:24 PDT 2009,NO_QUERY,kellireneez,laid around too much today... now my head hurts,"[laid, around, too, much, today, ..., now, my,...",[hurts],True
287,0,1467882140,Mon Apr 06 22:38:32 PDT 2009,NO_QUERY,nyracat,feels like she slept the day away. Not look...,"[feels, like, she, slept, the, day, away, ., N...",[pills],True
345,0,1467898078,Mon Apr 06 22:42:49 PDT 2009,NO_QUERY,paulmoreton1978,I swear no matter how long I've been getting u...,"[I, swear, no, matter, how, long, I, 've, been...",[hurts],True
529,0,1467946592,Mon Apr 06 22:56:35 PDT 2009,NO_QUERY,Velvet_Rope,I am officially banning godaddy.com from my co...,"[I, am, officially, banning, godaddy.com, from...",[hurts],True
...,...,...,...,...,...,...,...,...,...
1596270,4,2192626085,Tue Jun 16 07:18:55 PDT 2009,NO_QUERY,snorklewacker,@Cadistra If it turns out not to be Celiac (I ...,"[@, Cadistra, If, it, turns, out, not, to, be,...",[symptoms],True
1597058,4,2192836657,Tue Jun 16 07:37:17 PDT 2009,NO_QUERY,princess967,says happy meds again weeeeeeeee! (haha) http...,"[says, happy, meds, again, weeeeeeeee, !, (, h...",[meds],True
1597557,4,2192959321,Tue Jun 16 07:47:55 PDT 2009,NO_QUERY,melbrehl,"@dunchinson good way to look at it, honestly. ...","[@, dunchinson, good, way, to, look, at, it, ,...",[meds],True
1598148,4,2193105539,Tue Jun 16 08:00:18 PDT 2009,NO_QUERY,faviDuzit,good morning!! designer drugs are the best thi...,"[good, morning, !, !, designer, drugs, are, th...",[drugs],True


In [25]:
kaggle_df_med_related = kaggle_df[kaggle_df.med_related].copy()

In [26]:
for i in kaggle_df_med_related.text[:10]:
    print(i)
    print('- - - - - ')

My tummy hurts.  I wonder if the hypnosis has anything to do with it? If so, it's working, I get it, STOP SMOKING!!!
- - - - - 
laid around too much today... now my head hurts 
- - - - - 
feels like she slept the day away.    Not looking forward to any more bouts with my gallbladder.  At least I have pills now for the pain.
- - - - - 
I swear no matter how long I've been getting up at 5am, it never gets any easier. Man my eyes hurts wah 
- - - - - 
I am officially banning godaddy.com from my comp. My head hurts from the small print AND I wasted $10 that could've happily gone to Boba 
- - - - - 
Not feeling well and back hurts 
- - - - - 
has hurt her ankle!! and is going to the dr 
- - - - - 
I so hate homeworks -.- My head hurts so bad 
- - - - - 
Back at work  @ John Muir Dr http://loopt.us/KoqAbg
- - - - - 
my little pinky finger hurts so much.. 
- - - - - 


In [27]:
kaggle_df_med_related['word'] = kaggle_df_med_related.med_related_terms.apply(lambda x: x[0])

In [28]:
agg_info = kaggle_df_med_related.groupby('word').agg({'ids': 'count', 'target':'mean' }).sort_values(by='ids', ascending=False)

In [29]:
for i in agg_info.index:
    
    print('term: {} | Count: {} | Avg sentiment: {}'.format(i, agg_info.loc[i, 'ids'], agg_info.loc[i, 'target']))

term: hurts | Count: 7305 | Avg sentiment: 0.23381245722108146
term: meds | Count: 572 | Avg sentiment: 0.8811188811188811
term: dr | Count: 336 | Avg sentiment: 1.0119047619047619
term: Dr | Count: 327 | Avg sentiment: 1.7492354740061162
term: pills | Count: 315 | Avg sentiment: 1.0412698412698413
term: drugs | Count: 306 | Avg sentiment: 1.6601307189542485
term: commercials | Count: 231 | Avg sentiment: 1.5064935064935066
term: symptoms | Count: 180 | Avg sentiment: 0.6444444444444445
term: tablets | Count: 123 | Avg sentiment: 0.8130081300813008
term: Hurts | Count: 102 | Avg sentiment: 0.47058823529411764
term: patients | Count: 95 | Avg sentiment: 1.3473684210526315
term: DR | Count: 55 | Avg sentiment: 1.3818181818181818
term: MG | Count: 29 | Avg sentiment: 2.206896551724138
term: Meds | Count: 25 | Avg sentiment: 0.32
term: mg | Count: 24 | Avg sentiment: 1.5
term: Drugs | Count: 14 | Avg sentiment: 1.7142857142857142
term: FDA | Count: 14 | Avg sentiment: 0.0
term: viagra | Co