## Load data

In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the preprocessed tweets and the raw tweets.
# NOTE(review): a later cell joins the two frames on their index, so the two
# files are presumably row-aligned — confirm.
tweet_nolabel = pd.read_csv('tweet_no_label_afterpreprocess.csv')
tweet_original = pd.read_csv('tweets-no-labels.csv')
# Show the column names of the preprocessed frame.
tweet_nolabel.columns

Index(['Unnamed: 0', 'text', 'query'], dtype='object')

In [None]:
# Rename the preprocessed text column so it cannot be confused with the raw
# text, and drop the 'Unnamed: 0' column (presumably an index written out with
# the CSV). Reassigning instead of using inplace=True, together with
# errors='ignore', keeps the cell safe to re-run: the in-place drop would raise
# KeyError on a second run because the column is already gone.
tweet_nolabel = (
    tweet_nolabel
    .rename(columns={'text': 'text_preprocess'})
    .drop(columns='Unnamed: 0', errors='ignore')
)
tweet_nolabel.head()

Unnamed: 0,text_preprocess,query
0,timcast joerogan isnt goal beat viru let say a...,#jnj
1,ema_new frustrat ema_new jnj ship vaccin made ...,#jnj
2,alexandrosandr yorkteachingnh ethicon abby_lan...,ethicon
3,jack jennif roll today i love sinc i first lai...,#jnj
4,xotonironixo they alway jnj alway hold special...,#jnj


In [None]:
# Attach the raw tweet text; join() aligns the rows on the index.
tweet_nolabel = tweet_nolabel.join(tweet_original['text'])

In [None]:
# Label the raw text column explicitly.
tweet_nolabel = tweet_nolabel.rename(columns={'text': 'text_original'})

In [None]:
# Preview the first rows, now holding both preprocessed and original text.
tweet_nolabel.head()

Unnamed: 0,text_preprocess,query,text_original
0,timcast joerogan isnt goal beat viru let say a...,#jnj,@Timcast @joerogan Isn’t the goal to beat the ...
1,ema_new frustrat ema_new jnj ship vaccin made ...,#jnj,@EMA_News Frustrating @EMA_News #JnJ ships vac...
2,alexandrosandr yorkteachingnh ethicon abby_lan...,ethicon,@AlexandrosAndre @YorkTeachingNHS @Ethicon @ab...
3,jack jennif roll today i love sinc i first lai...,#jnj,Jack and Jennifer had me rolling today. I have...
4,xotonironixo they alway jnj alway hold special...,#jnj,@xotonironixo They will always be #JnJ to me a...


In [None]:
# Number of tweets per value of the 'query' column.
tweet_nolabel['query'].value_counts()

#jnj                  22598
ethicon                 678
#bwi                    406
"ETHICON"               391
biosense webster         61
"BIOSENSE WEBSTER"       29
Name: query, dtype: int64

### Null values

In [None]:
# text_preprocess has null values: in the summary below its non-null count is
# lower than the number of rows.
tweet_nolabel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24163 entries, 0 to 24162
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   text_preprocess  24130 non-null  object
 1   query            24163 non-null  object
 2   text_original    24163 non-null  object
dtypes: object(3)
memory usage: 566.4+ KB


In [None]:
# Boolean mask: True where text_preprocess is NaN
bool_series = tweet_nolabel['text_preprocess'].isna()

# Display only the rows whose text_preprocess is NaN
tweet_nolabel.loc[bool_series]

Unnamed: 0,text_preprocess,query,text_original
1295,,#jnj,https://t.co/lVBp7nUQir early spoilers #JNJ #...
3105,,#jnj,https://t.co/1IXgFNFGp3 #StayHome no #cure no ...
4254,,#jnj,https://t.co/4gd1p2uZhN: Johnson and Johnson s...
5751,,"""BIOSENSE WEBSTER""",https://t.co/Dl7Viunqhg. Cardiac Mapping Market
6209,,#jnj,https://t.co/c2eDqi5SD8 So proud of all the wo...
6541,,#jnj,https://t.co/FfjvWmuFXK The Basic Foolishness ...
6745,,#jnj,https://t.co/BWZy3LqGfA #DrFauci what about #j...
8532,,#jnj,https://t.co/h9ygE9mU7p totally! #JnJ #Johnson...
8534,,#jnj,https://t.co/rxf9X8fK64 Thanking God young tee...
8545,,#jnj,https://t.co/ag4OQ1LURJ PLEASE PPL:VAX! YOU'RE...


In [None]:
# Drop every row that contains a null value. The index is not reset, so the
# surviving rows keep their original labels.
tweet_nolabel = tweet_nolabel.dropna()
tweet_nolabel.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24130 entries, 0 to 24162
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   text_preprocess  24130 non-null  object
 1   query            24130 non-null  object
 2   text_original    24130 non-null  object
dtypes: object(3)
memory usage: 754.1+ KB


## Load Blacklist and Whitelist



### LDA

In [None]:
# LDA blacklist: parse the JSON file and keep the distinct entries as a set
import json
with open('lda_blacklist.txt', 'r') as f:
    lda_bl = set(json.load(f))
lda_bl

{'1',
 '2',
 '20',
 '24',
 '2d',
 'airport',
 'appli',
 'atom',
 'bat',
 'batch',
 'busi',
 'buy',
 'bwi',
 'chart',
 'coin',
 'donat',
 'done',
 'dose',
 'dot',
 'et',
 'get',
 'girl',
 'hour',
 'im',
 'inc',
 'ive',
 'jab',
 'jack',
 'jennif',
 'jnjcare',
 'jnjnew',
 'know',
 'look',
 'love',
 'mani',
 'may',
 'meet',
 'microsoft',
 'mo',
 'never',
 'news',
 'next',
 'nkla',
 'now',
 'one',
 'peopl',
 'pfe',
 'polkadot',
 'profit',
 'program',
 'sa',
 'sol',
 'stem',
 'still',
 'superkelli',
 'take',
 'thank',
 'there',
 'think',
 'time',
 'today',
 'top',
 'us',
 'video',
 'week',
 'wistem',
 'xrp',
 'year',
 'zec'}

In [None]:
# LDA whitelist: parse the JSON file and keep the distinct entries as a set
with open('lda_whitelist.txt', 'r') as f:
    lda_wl = set(json.load(f))
lda_wl

{'3d',
 'ablat',
 'access',
 'advanc',
 'appl',
 'cathet',
 'day',
 'devic',
 'donor',
 'echelon',
 'help',
 'implant',
 'inhuman',
 'jampj',
 'line',
 'manufactur',
 'medic',
 'medtech',
 'mesh',
 'million',
 'new',
 'ottava',
 'pain',
 'patient',
 'print',
 'product',
 'reduc',
 'stapl',
 'surgic',
 'technolog',
 'transvagin',
 'trial',
 'trump',
 'use'}

### Cluster

In [None]:
# CLUSTER
# K-means blacklist: parse the JSON file and keep the distinct entries as a set
with open('kmeans_blacklist.txt', 'r') as f:
    cluster_bl = set(json.load(f))
cluster_bl

{'1765',
 '20',
 '2020',
 '24',
 '2d',
 '36',
 '5m',
 '72',
 'abbi',
 'abigail',
 'abnb',
 'action',
 'addict',
 'ak',
 'akita',
 'akt',
 'also',
 'alway',
 'amp',
 'and',
 'anyth',
 'as',
 'athforth',
 'atom',
 'aur',
 'back',
 'bat',
 'beforesnt',
 'believ',
 'best',
 'better',
 'big',
 'brav',
 'break',
 'btt',
 'but',
 'buy',
 'cant',
 'cap',
 'care',
 'carlivatiron',
 'cathyckobisk',
 'celebr',
 'chang',
 'charact',
 'chart',
 'check',
 'chr',
 'clip',
 'close',
 'coin',
 'collabor',
 'come',
 'commit',
 'commun',
 'compani',
 'continu',
 'could',
 'coupl',
 'cours',
 'cri',
 'day',
 'daysofour_l',
 'dayzedconfuz',
 'dego',
 'dent',
 'develop',
 'didnt',
 'dont',
 'dose',
 'dot',
 'effort',
 'elonmusk',
 'els',
 'elsewher',
 'elsewhereabroad',
 'elud',
 'elwoodmartinmd',
 'ema',
 'emad',
 'email',
 'embalm',
 'embank',
 'embargo',
 'embark',
 'embed',
 'embodi',
 'embrac',
 'embroideri',
 'emea',
 'emeraldrobinson',
 'emerg',
 'emergencymanag',
 'emergencyread',
 'emergingworldld'

In [None]:
# K-means whitelist: parse the JSON file and keep the distinct entries as a set
with open('kmeans_whitelist.txt', 'r') as f:
    cluster_wl = set(json.load(f))
cluster_wl

{'10',
 '3d',
 'ablat',
 'af',
 'amaz',
 'approv',
 'atrial',
 'biosens',
 'biosensewebst',
 'ca',
 'cardiovascular',
 'carto',
 'cathet',
 'clear',
 'clinialresearch',
 'clinic',
 'clinicalstudi',
 'clinicaltri',
 'confer',
 'design',
 'devic',
 'donat',
 'dr',
 'epeep',
 'ethicon',
 'europ',
 'fibril',
 'inc',
 'includ',
 'industri',
 'irvin',
 'jnjinstitut',
 'jnjmeddeviceuki',
 'litig',
 'local',
 'medic',
 'medicaldevic',
 'medtron',
 'mesh',
 'nod',
 'persist',
 'print',
 'prisma',
 'session',
 'sf',
 'smarttouch',
 'surgeon',
 'surgeri',
 'surgic',
 'sutur',
 'system',
 'thermocool',
 'train',
 'treat',
 'treatment',
 'varipuls',
 'ventil',
 'virtual',
 'webster',
 'wound'}

### Frequent words

In [None]:
# FREQUENT WORDS
# Frequent-words blacklist: parse the JSON file and keep the distinct entries as a set
with open('frequency_blacklist.txt', 'r') as f:
    frequent_bl = set(json.load(f))
frequent_bl

{'24',
 'abnb',
 'akita',
 'akt',
 'atom',
 'bat',
 'bwi',
 'chr',
 'coin',
 'cri',
 'dego',
 'dose',
 'eo',
 'euro',
 'evfm',
 'fcgd',
 'financialinclus',
 'gmr',
 'ht',
 'intc',
 'inu',
 'ipof',
 'jack',
 'jpm',
 'juv',
 'lgtt',
 'mo',
 'mre',
 'nftart',
 'nkla',
 'nnvc',
 'ogn',
 'olt',
 'pfe',
 'pola',
 'polkadot',
 'qfin',
 'ramp',
 'rdd',
 'safeicaru',
 'shll',
 'sol',
 'superkelli',
 'tblt',
 'tsnp',
 'uon',
 'wink',
 'wsb',
 'xtrm',
 'zec'}

In [None]:
# Frequent-words whitelist: parse the JSON file and keep the distinct entries as a set
with open('frequency_whitelist.txt', 'r') as f:
    frequent_wl = set(json.load(f))
frequent_wl

{'2327',
 'agbecerra',
 'antimicrobi',
 'arthroplasti',
 'asgbieg',
 'awschat',
 'bard',
 'bellweth',
 'cdh',
 'circular',
 'commonli',
 'defect',
 'deriv',
 'echelon',
 'endopath',
 'ergonom',
 'evacu',
 'experiment',
 'getolympu',
 'hip',
 'humanright',
 'icijorg',
 'lens',
 'linda',
 'linx',
 'manual',
 'mdl',
 'mesh',
 'microwav',
 'missurgeri',
 'optic',
 'petroleum',
 'physiomesh',
 'prolaps',
 'prolift',
 'qdot',
 'sf',
 'sharjah',
 'smarttouch',
 'stapler',
 'surgonom',
 'thermocool',
 'transvagin',
 'tvm',
 'vagin',
 'varipuls',
 'verdict',
 'vicryl',
 'weaponri',
 'wearabl'}

## Algorithm
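- initial score = 0
- For each tweet:
  - For each whitelist (LDA, cluster, frequent words) that contains at least one of the tweet's words: score + 1
  - For each blacklist (LDA, cluster, frequent words) that contains at least one of the tweet's words: score - 1
- score range: -3 to 3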

In [None]:
def calculate_score(tweet, whitelists=None, blacklists=None):
    """Score a tweet by how many word lists it touches.

    The tweet is split on whitespace into a set of distinct words. Each
    whitelist sharing at least one word with the tweet adds 1 and each
    blacklist sharing at least one word subtracts 1. With three lists of each
    kind the result lies between -3 and 3.

    Parameters
    ----------
    tweet : str
        Whitespace-separated tweet text.
    whitelists, blacklists : iterable of word collections, optional
        Lists to test against. When omitted, the notebook-level sets
        ``lda_wl``/``cluster_wl``/``frequent_wl`` and
        ``lda_bl``/``cluster_bl``/``frequent_bl`` are used.

    Returns
    -------
    int
        Number of matching whitelists minus number of matching blacklists.
    """
    # Defaults are resolved at call time, so re-loaded word lists are picked up.
    if whitelists is None:
        whitelists = (lda_wl, cluster_wl, frequent_wl)
    if blacklists is None:
        blacklists = (lda_bl, cluster_bl, frequent_bl)

    words = set(tweet.split())
    # A list counts once no matter how many of its words appear in the tweet.
    whitelist_hits = sum(1 for wl in whitelists if not words.isdisjoint(wl))
    blacklist_hits = sum(1 for bl in blacklists if not words.isdisjoint(bl))
    return whitelist_hits - blacklist_hits

In [None]:
# Score every preprocessed tweet, in row order
length = len(tweet_nolabel)
score = [calculate_score(text) for text in tweet_nolabel.text_preprocess]

In [None]:
# Store the scores as a new column and preview the result.
tweet_nolabel['score']=score
tweet_nolabel.head()

Unnamed: 0,text_preprocess,query,text_original,score
0,timcast joerogan isnt goal beat viru let say a...,#jnj,@Timcast @joerogan Isn’t the goal to beat the ...,-2
1,ema_new frustrat ema_new jnj ship vaccin made ...,#jnj,@EMA_News Frustrating @EMA_News #JnJ ships vac...,0
2,alexandrosandr yorkteachingnh ethicon abby_lan...,ethicon,@AlexandrosAndre @YorkTeachingNHS @Ethicon @ab...,-1
3,jack jennif roll today i love sinc i first lai...,#jnj,Jack and Jennifer had me rolling today. I have...,-2
4,xotonironixo they alway jnj alway hold special...,#jnj,@xotonironixo They will always be #JnJ to me a...,-1


In [None]:
# Distribution of scores across all tweets.
tweet_nolabel['score'].value_counts()

-1    10227
-2     7787
 0     3815
-3     1542
 1      604
 2      147
 3        8
Name: score, dtype: int64

In [None]:
# Highest scores first. 8 is the number of tweets with the top score of 3 in
# the value counts above.
tweet_nolabel = tweet_nolabel.sort_values(by=['score'], ascending=False)
tweet_nolabel.head(8)

Unnamed: 0,text_preprocess,query,text_original,score
13021,jim crumbley receiv ethicon physiomesh implant...,ethicon,Jim Crumbley received an Ethicon Physiomesh im...,3
15838,nearli 15000 hernia mesh claim file against ba...,ethicon,"Nearly 15,000 Hernia Mesh Claims Filed Against...",3
15839,ethicon physiomesh case select for bellweth tr...,ethicon,Ethicon Physiomesh Cases Selected For Bellweth...,3
16909,biosens webster receiv ce mark approv qdot mic...,#jnj,Biosense Webster receives CE mark approval for...,3
15840,transvagin mesh lawsuit file after ethicon gyn...,"""ETHICON""",Transvaginal Mesh Lawsuit Filed After Ethicon ...,3
13275,carrollodea lawyer billmadden pelvic mesh liti...,ethicon,"@carrollodea Lawyers' @BillMadden : ""Pelvic M...",3
16649,vigna law group vagin mesh ethicon multidistri...,ethicon,Vigna Law Group: Vaginal Mesh Ethicon Multidis...,3
3639,ethicon royal surrey smoke evacu trial kick jn...,ethicon,Ethicon Royal Surrey Smoke Evacuation trial ki...,3


In [None]:
# Print the full original text of the first rows (the highest-scored tweets
# once the frame is sorted by score)
length = 8
for i in range(length):
    print(str(i) + ': ', tweet_nolabel.text_original.iloc[i], '', sep='\n')

0: 
Jim Crumbley received an Ethicon Physiomesh implant to repair his incision ventral hernia in July 2014 at a Georgia hospital.

Read more 👉 https://t.co/IEdKvQ6U9O

 #Herniamesh #Hernia #Bellwether https://t.co/GvqCJknPPc

1: 
Nearly 15,000 Hernia Mesh Claims Filed Against Bard, Ethicon and Atrium in Federal Courts
https://t.co/tLIJv0qP5k

2: 
Ethicon Physiomesh Cases Selected For Bellwether Trials Set To Begin in March, June and Sept. 2021
 https://t.co/wJQ1ftCs2D

3: 
Biosense Webster receives CE mark approval for QDOT MICRO ablation catheter https://t.co/HPvKF8izzT #mycompany #jnj

4: 
Transvaginal Mesh Lawsuit Filed After Ethicon Gynecare Prosima Eroded Into Surrounding Tissue
 https://t.co/TPQQM6oZ0F

5: 
@carrollodea Lawyers'  @BillMadden : "Pelvic Mesh Litigation – Outcome of the Federal Court of Australia Trial in Gill v Ethicon Sàrl (No 5)" Read in link.
https://t.co/vMHB2YWaem
#auslaw https://t.co/7UptB48oxH

6: 
Vigna Law Group: Vaginal Mesh Ethicon Multidistrict Litigati

## Algorithm2
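- initial score = 0
- For each word in the tweet (a repeated word counts every time it appears): score + 1 for every whitelist that contains it, score - 1 for every blacklist that contains it.
- Unlike the first algorithm, the score is not limited to the range -3 to 3; it grows with the number of matching words.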

In [None]:
def calculate_score_v2(tweet, whitelists=None, blacklists=None):
    """Score a tweet word by word against the whitelists and blacklists.

    Every whitespace-separated word is checked against every list: each
    whitelist containing the word adds 1 and each blacklist containing it
    subtracts 1. Repeated words are counted each time they occur, so the
    score is not bounded by the number of lists.

    Parameters
    ----------
    tweet : str
        Whitespace-separated tweet text.
    whitelists, blacklists : iterable of word collections, optional
        Lists to test against. When omitted, the notebook-level sets
        ``lda_wl``/``cluster_wl``/``frequent_wl`` and
        ``lda_bl``/``cluster_bl``/``frequent_bl`` are used.

    Returns
    -------
    int
        Total whitelist memberships minus total blacklist memberships,
        summed over all words of the tweet.
    """
    # Defaults are resolved at call time, so re-loaded word lists are picked up.
    if whitelists is None:
        whitelists = (lda_wl, cluster_wl, frequent_wl)
    if blacklists is None:
        blacklists = (lda_bl, cluster_bl, frequent_bl)

    score = 0
    for word in tweet.split():
        score += sum(1 for wl in whitelists if word in wl)
        score -= sum(1 for bl in blacklists if word in bl)
    return score

In [None]:
# Per-word score for every tweet, stored next to the first score
length = len(tweet_nolabel)
score2 = [calculate_score_v2(text) for text in tweet_nolabel.text_preprocess]
tweet_nolabel['score2'] = score2
tweet_nolabel.head(20)

Unnamed: 0,text_preprocess,query,text_original,score,score2
13021,jim crumbley receiv ethicon physiomesh implant...,ethicon,Jim Crumbley received an Ethicon Physiomesh im...,3,4
15838,nearli 15000 hernia mesh claim file against ba...,ethicon,"Nearly 15,000 Hernia Mesh Claims Filed Against...",3,5
15839,ethicon physiomesh case select for bellweth tr...,ethicon,Ethicon Physiomesh Cases Selected For Bellweth...,3,4
16909,biosens webster receiv ce mark approv qdot mic...,#jnj,Biosense Webster receives CE mark approval for...,3,8
15840,transvagin mesh lawsuit file after ethicon gyn...,"""ETHICON""",Transvaginal Mesh Lawsuit Filed After Ethicon ...,3,6
13275,carrollodea lawyer billmadden pelvic mesh liti...,ethicon,"@carrollodea Lawyers' @BillMadden : ""Pelvic M...",3,6
16649,vigna law group vagin mesh ethicon multidistri...,ethicon,Vigna Law Group: Vaginal Mesh Ethicon Multidis...,3,6
3639,ethicon royal surrey smoke evacu trial kick jn...,ethicon,Ethicon Royal Surrey Smoke Evacuation trial ki...,3,5
246,who lead key player suction irrig marketkey pl...,ethicon,Who are the leading key players in the Suction...,2,6
12569,realdonaldtrump fda 2012 ethiconproduct recal ...,ethicon,@realDonaldTrump @FDA 2012 - ETHICON\nProduct ...,2,11


In [None]:
# Look up one tweet by its index label (.loc is label-based, so this is not
# the row's position in the sorted frame).
tweet_nolabel.loc[10613,'text_original']

"$JNJ's #SerialKillerCEO Alex Gorsky should be in federal prison for 31 children that died on Risperdal, the 100,000 women injured or died due to Gynecare Mesh &amp; the women who got cancer from Johnson &amp; Johnson Baby Powder.\n\n#TheSociopathicBusinessModel \n#FraudFormula #JNJ https://t.co/S0p7i0TY9L"