## Idea
  
  By applying LDA on the dataset with only negative labels, we can get topic constructions and frequent words for this dataset and thus get some ideas about the blacklist. Similarly, we can also get a whitelist by applying LDA on the dataset with only positive labels. Then, we are planning to use these two lists to help us do some unsupervised classification and narrow down the original tweet dataset in order to get more useful information.

## Preprocess

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
tweets = pd.read_csv("tweets.csv")


In [3]:
tweets['query'].value_counts()

#jnj                     5033
ethicon                   832
#bwi                      368
"johnson & johnson"        95
biosense webster           90
"ETHICON"                  36
#mycompany                 12
"johnson and johnson"      11
#janssen                    8
"BIOSENSE WEBSTER"          5
actelion                    2
depuy                       2
Name: query, dtype: int64

In [4]:
queries = list(tweets['query'].value_counts().index)
queries

['#jnj',
 'ethicon',
 '#bwi',
 '"johnson & johnson"',
 'biosense webster',
 '"ETHICON"',
 '#mycompany',
 '"johnson and johnson"',
 '#janssen',
 '"BIOSENSE WEBSTER"',
 'actelion',
 'depuy']

In [5]:
query_length = len(queries)
query_length

12

In [6]:
tweets = tweets[['medical_device', 'text']]

In [7]:
#!pip install emoji
import emoji
def deEmoji(text):
    return emoji.get_emoji_regexp().sub(r'', text)

In [8]:
# Preprocess
import re
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

def text_preprocess(text):
    text = re.sub(r'#', '', text) #Replace the # symbol with '' in every tweet
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text) #Replace hyperlinks with '' in every tweet
    text = re.sub(r'[^\w\d\s]+', '', text) #Remove all punctuations
    text = deEmoji(text) #Remove emoji
    text = re.sub(r'\n', '', text) #Remove \n
    
    #Remove all stopwords
    stopwords_eng = stopwords.words('english')
    pattern = re.compile(r'\b(' + r'|'.join(stopwords_eng) + r')\b\s*')
    text = pattern.sub('', text)
    
    #Apply stem
    
    ps = PorterStemmer()
    tk = TweetTokenizer(preserve_case=False, strip_handles = True)
    tweet_tokens = tk.tokenize(text)
    tweet_finish = ''
    for token in tweet_tokens:
        stem = ps.stem(token)
        tweet_finish += (stem + ' ')
    

    return tweet_finish

In [9]:
#Split query tags into words
query_list = []
for i in range(len(queries)):
  tmp = text_preprocess(queries[i])
  tmp = tmp.split(' ')
  query_list.extend(tmp)

while("" in query_list) :
    query_list.remove("")
    
query_list

['jnj',
 'ethicon',
 'bwi',
 'johnson',
 'johnson',
 'biosens',
 'webster',
 'ethicon',
 'mycompani',
 'johnson',
 'johnson',
 'janssen',
 'biosens',
 'webster',
 'actelion',
 'depuy']

In [10]:
#Remove duplicates 
query_list = set(query_list)
query_list

{'actelion',
 'biosens',
 'bwi',
 'depuy',
 'ethicon',
 'janssen',
 'jnj',
 'johnson',
 'mycompani',
 'webster'}

In [11]:
length = len(tweets.text)
for i in range(length):
    tweets.text.iloc[i] = text_preprocess(tweets.text.iloc[i])

In [12]:
tweets.head()

Unnamed: 0,medical_device,text
0,False,yall know i know im never leav boy alon jnj
1,False,superkelli 24 such funni scene jnj
2,False,jampjohnson flagform confirm continu possibl j...
3,False,jnj daili rsi hasnt touch 69 sinc februari 202...
4,False,we outstand opportun join non clinic safeti te...


In [13]:
tweets_neg = tweets[tweets['medical_device'] == False]
tweets_pos = tweets[tweets['medical_device'] == True]

## LDA with Positive Labels

In [14]:
from gensim import corpora, models
import gensim
from pprint import pprint

In [15]:
clean_corp_pos = [tweet.split() for tweet in tweets_pos['text']]
dictionary_pos = corpora.Dictionary(clean_corp_pos)
corpus_pos = [dictionary_pos.doc2bow(text) for text in clean_corp_pos]

In [16]:
lda_pos = gensim.models.ldamodel.LdaModel(corpus_pos, num_topics = query_length, id2word = dictionary_pos, passes=20, random_state = 42)

In [17]:
pprint(lda_pos.print_topics())

[(0,
  '0.018*"ethicon" + 0.014*"jnj" + 0.010*"new" + 0.010*"amp" + 0.009*"product" '
  '+ 0.007*"studi" + 0.006*"mesh" + 0.005*"smoke" + 0.005*"compani" + '
  '0.005*"machin"'),
 (1,
  '0.024*"ethicon" + 0.019*"amp" + 0.018*"mesh" + 0.018*"jnj" + '
  '0.013*"implant" + 0.010*"johnson" + 0.007*"trump" + 0.007*"manufactur" + '
  '0.007*"ottava" + 0.007*"donor"'),
 (2,
  '0.018*"ethicon" + 0.012*"jnj" + 0.010*"surgic" + 0.009*"medic" + '
  '0.007*"market" + 0.007*"the" + 0.007*"johnson" + 0.006*"echelon" + '
  '0.006*"advanc" + 0.006*"learn"'),
 (3,
  '0.033*"amp" + 0.029*"johnson" + 0.022*"mesh" + 0.020*"ethicon" + '
  '0.018*"jnj" + 0.010*"market" + 0.007*"the" + 0.006*"trial" + 0.006*"surgic" '
  '+ 0.005*"regist"'),
 (4,
  '0.018*"ablat" + 0.017*"biosens" + 0.016*"fda" + 0.016*"cathet" + '
  '0.015*"webster" + 0.015*"ethicon" + 0.014*"use" + 0.010*"jampj" + '
  '0.009*"learn" + 0.009*"get"'),
 (5,
  '0.030*"ethicon" + 0.021*"mesh" + 0.013*"jampj" + 0.012*"day" + '
  '0.012*"johnson" 

In [18]:
tweet_topic_pos = lda_pos[corpus_pos]
topic=[]
for n in range(len(tweet_topic_pos)):
    dic = dict(tweet_topic_pos[n])
    topic.append(max(dic, key=dic.get))

In [19]:
tweets_pos['topic']=topic
tweets_pos.head()

Unnamed: 0,medical_device,text,topic
20,True,csat start research project uw sinc valid meth...,11
40,True,sambitswaraj need clear glass tri jnj acuvu le...,8
45,True,icymi huzefafn vice presid jnj digit solut gen...,0
66,True,dont miss regist complimentari csatsinc ethico...,3
69,True,biosens webster receiv ce mark approv qdot mic...,6


In [20]:
tweets_pos.topic.value_counts()

11    80
6     75
10    64
3     63
5     58
8     55
2     53
9     52
1     38
4     37
0     35
7     25
Name: topic, dtype: int64

In [21]:
#Construct whitelist
#With words whose probability >= 0.005 in each topic
whitelist = []
for index, topic in lda_pos.show_topics(formatted=False):
    tmp = [x[0].lower() for x in topic if x[1]>=0.005]
    whitelist.extend(tmp)
len(whitelist)

96

In [22]:
#remove duplicates
whitelist = set(whitelist)
len(whitelist)

47

In [23]:
whitelist

{'3d',
 'ablat',
 'access',
 'advanc',
 'amp',
 'appl',
 'biosens',
 'cathet',
 'day',
 'devic',
 'donor',
 'echelon',
 'ethicon',
 'fda',
 'health',
 'help',
 'i',
 'implant',
 'inhuman',
 'jampj',
 'jnj',
 'johnson',
 'learn',
 'line',
 'manufactur',
 'market',
 'medic',
 'medtech',
 'mesh',
 'million',
 'new',
 'ottava',
 'pain',
 'patient',
 'print',
 'product',
 'reduc',
 'stapl',
 'studi',
 'surgic',
 'technolog',
 'the',
 'transvagin',
 'trial',
 'trump',
 'use',
 'webster'}

## LDA with Negative Labels

In [24]:
clean_corp_neg = [tweet.split() for tweet in tweets_neg['text']]
dictionary_neg = corpora.Dictionary(clean_corp_neg)
corpus_neg = [dictionary_neg.doc2bow(text) for text in clean_corp_neg]

In [25]:
lda_neg = gensim.models.ldamodel.LdaModel(corpus_neg, num_topics = query_length, id2word = dictionary_neg, passes=20, random_state = 42)

In [26]:
pprint(lda_neg.print_topics())

[(0,
  '0.084*"jnj" + 0.035*"johnson" + 0.018*"amp" + 0.012*"chart" + '
  '0.009*"jnjnew" + 0.007*"buy" + 0.006*"look" + 0.006*"time" + 0.006*"thank" '
  '+ 0.006*"next"'),
 (1,
  '0.038*"jnj" + 0.016*"2" + 0.012*"the" + 0.010*"one" + 0.009*"take" + '
  '0.009*"hour" + 0.008*"video" + 0.008*"amp" + 0.007*"20" + 0.007*"year"'),
 (2,
  '0.085*"jnj" + 0.061*"i" + 0.019*"jack" + 0.019*"love" + 0.013*"24" + '
  '0.012*"superkelli" + 0.012*"get" + 0.011*"one" + 0.011*"im" + '
  '0.010*"jennif"'),
 (3,
  '0.030*"sol" + 0.020*"now" + 0.019*"jnj" + 0.019*"ive" + 0.018*"nkla" + '
  '0.017*"profit" + 0.017*"never" + 0.017*"mo" + 0.017*"coin" + 0.016*"dot"'),
 (4,
  '0.034*"johnson" + 0.032*"jnj" + 0.019*"amp" + 0.012*"fda" + 0.011*"studi" + '
  '0.007*"health" + 0.007*"know" + 0.007*"us" + 0.007*"mani" + 0.006*"batch"'),
 (5,
  '0.043*"ethicon" + 0.010*"market" + 0.008*"thank" + 0.007*"inc" + '
  '0.007*"donat" + 0.006*"biosens" + 0.006*"us" + 0.006*"webster" + 0.006*"et" '
  '+ 0.005*"learn"'),


In [27]:
tweet_topic_neg = lda_neg[corpus_neg]
topic=[]
for n in range(len(tweet_topic_neg)):
    dic = dict(tweet_topic_neg[n])
    topic.append(max(dic, key=dic.get))

In [28]:
tweets_neg['topic']=topic
tweets_neg.head()

Unnamed: 0,medical_device,text,topic
0,False,yall know i know im never leav boy alon jnj,2
1,False,superkelli 24 such funni scene jnj,2
2,False,jampjohnson flagform confirm continu possibl j...,11
3,False,jnj daili rsi hasnt touch 69 sinc februari 202...,7
4,False,we outstand opportun join non clinic safeti te...,5


In [29]:
tweets_neg.topic.value_counts()

2     1420
8     1020
0      575
9      534
11     458
5      335
4      318
1      318
7      240
3      226
10     217
6      198
Name: topic, dtype: int64

In [30]:
#Construct blacklist
#With words whose probability >= 0.005 in each topic
blacklist = []
for index, topic in lda_neg.show_topics(formatted=False):
    tmp = [x[0].lower() for x in topic if x[1]>=0.005]
    blacklist.extend(tmp)
len(blacklist)

100

In [31]:
#remove duplicates
blacklist = set(blacklist)
len(blacklist)

82

In [32]:
blacklist

{'1',
 '2',
 '20',
 '24',
 '2d',
 'airport',
 'amp',
 'appli',
 'atom',
 'bat',
 'batch',
 'biosens',
 'busi',
 'buy',
 'bwi',
 'chart',
 'coin',
 'donat',
 'done',
 'dose',
 'dot',
 'et',
 'ethicon',
 'fda',
 'get',
 'girl',
 'health',
 'hour',
 'i',
 'im',
 'inc',
 'ive',
 'jab',
 'jack',
 'jennif',
 'jnj',
 'jnjcare',
 'jnjnew',
 'johnson',
 'know',
 'learn',
 'look',
 'love',
 'mani',
 'market',
 'may',
 'meet',
 'microsoft',
 'mo',
 'never',
 'news',
 'next',
 'nkla',
 'now',
 'one',
 'peopl',
 'pfe',
 'polkadot',
 'profit',
 'program',
 'sa',
 'sol',
 'stem',
 'still',
 'studi',
 'superkelli',
 'take',
 'thank',
 'the',
 'there',
 'think',
 'time',
 'today',
 'top',
 'us',
 'video',
 'webster',
 'week',
 'wistem',
 'xrp',
 'year',
 'zec'}

## Corpus

In [33]:
blacklist

{'1',
 '2',
 '20',
 '24',
 '2d',
 'airport',
 'amp',
 'appli',
 'atom',
 'bat',
 'batch',
 'biosens',
 'busi',
 'buy',
 'bwi',
 'chart',
 'coin',
 'donat',
 'done',
 'dose',
 'dot',
 'et',
 'ethicon',
 'fda',
 'get',
 'girl',
 'health',
 'hour',
 'i',
 'im',
 'inc',
 'ive',
 'jab',
 'jack',
 'jennif',
 'jnj',
 'jnjcare',
 'jnjnew',
 'johnson',
 'know',
 'learn',
 'look',
 'love',
 'mani',
 'market',
 'may',
 'meet',
 'microsoft',
 'mo',
 'never',
 'news',
 'next',
 'nkla',
 'now',
 'one',
 'peopl',
 'pfe',
 'polkadot',
 'profit',
 'program',
 'sa',
 'sol',
 'stem',
 'still',
 'studi',
 'superkelli',
 'take',
 'thank',
 'the',
 'there',
 'think',
 'time',
 'today',
 'top',
 'us',
 'video',
 'webster',
 'week',
 'wistem',
 'xrp',
 'year',
 'zec'}

In [34]:
whitelist

{'3d',
 'ablat',
 'access',
 'advanc',
 'amp',
 'appl',
 'biosens',
 'cathet',
 'day',
 'devic',
 'donor',
 'echelon',
 'ethicon',
 'fda',
 'health',
 'help',
 'i',
 'implant',
 'inhuman',
 'jampj',
 'jnj',
 'johnson',
 'learn',
 'line',
 'manufactur',
 'market',
 'medic',
 'medtech',
 'mesh',
 'million',
 'new',
 'ottava',
 'pain',
 'patient',
 'print',
 'product',
 'reduc',
 'stapl',
 'studi',
 'surgic',
 'technolog',
 'the',
 'transvagin',
 'trial',
 'trump',
 'use',
 'webster'}

In [35]:
#intersection b/w whitelist and blacklist 
inter_black_white = blacklist.intersection(whitelist)
inter_black_white

{'amp',
 'biosens',
 'ethicon',
 'fda',
 'health',
 'i',
 'jnj',
 'johnson',
 'learn',
 'market',
 'studi',
 'the',
 'webster'}

In [36]:
#remove common words 
blacklist_final = blacklist.difference(inter_black_white)
blacklist_final

{'1',
 '2',
 '20',
 '24',
 '2d',
 'airport',
 'appli',
 'atom',
 'bat',
 'batch',
 'busi',
 'buy',
 'bwi',
 'chart',
 'coin',
 'donat',
 'done',
 'dose',
 'dot',
 'et',
 'get',
 'girl',
 'hour',
 'im',
 'inc',
 'ive',
 'jab',
 'jack',
 'jennif',
 'jnjcare',
 'jnjnew',
 'know',
 'look',
 'love',
 'mani',
 'may',
 'meet',
 'microsoft',
 'mo',
 'never',
 'news',
 'next',
 'nkla',
 'now',
 'one',
 'peopl',
 'pfe',
 'polkadot',
 'profit',
 'program',
 'sa',
 'sol',
 'stem',
 'still',
 'superkelli',
 'take',
 'thank',
 'there',
 'think',
 'time',
 'today',
 'top',
 'us',
 'video',
 'week',
 'wistem',
 'xrp',
 'year',
 'zec'}

In [37]:
whitelist_final = whitelist.difference(inter_black_white)
whitelist_final

{'3d',
 'ablat',
 'access',
 'advanc',
 'appl',
 'cathet',
 'day',
 'devic',
 'donor',
 'echelon',
 'help',
 'implant',
 'inhuman',
 'jampj',
 'line',
 'manufactur',
 'medic',
 'medtech',
 'mesh',
 'million',
 'new',
 'ottava',
 'pain',
 'patient',
 'print',
 'product',
 'reduc',
 'stapl',
 'surgic',
 'technolog',
 'transvagin',
 'trial',
 'trump',
 'use'}

In [41]:
#save sets
import json
blacklist_final = list(blacklist_final)
whitelist_final = list(whitelist_final)

with open('lda_blacklist.txt', 'w') as f:
    f.write(json.dumps(blacklist_final))

with open('lda_whitelist.txt', 'w') as f:
    f.write(json.dumps(whitelist_final))

In [42]:
#read sets
with open('lda_blacklist.txt', 'r') as f:
    bl = json.loads(f.read())
bl = set(bl)
bl

{'1',
 '2',
 '20',
 '24',
 '2d',
 'airport',
 'appli',
 'atom',
 'bat',
 'batch',
 'busi',
 'buy',
 'bwi',
 'chart',
 'coin',
 'donat',
 'done',
 'dose',
 'dot',
 'et',
 'get',
 'girl',
 'hour',
 'im',
 'inc',
 'ive',
 'jab',
 'jack',
 'jennif',
 'jnjcare',
 'jnjnew',
 'know',
 'look',
 'love',
 'mani',
 'may',
 'meet',
 'microsoft',
 'mo',
 'never',
 'news',
 'next',
 'nkla',
 'now',
 'one',
 'peopl',
 'pfe',
 'polkadot',
 'profit',
 'program',
 'sa',
 'sol',
 'stem',
 'still',
 'superkelli',
 'take',
 'thank',
 'there',
 'think',
 'time',
 'today',
 'top',
 'us',
 'video',
 'week',
 'wistem',
 'xrp',
 'year',
 'zec'}

## Problems
- Size of dataset too small, lack of representative meaning
- Hard to decide num_topics (hyperparameter in the LDA model) : now using the length of the list of query tags
- Tricky to decide the threshold of the probability of each word in topics representation (now probability>= 0.005 is chosen)