## Load data

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
tweet_nolabel = pd.read_csv('tweet_no_label_afterpreprocess.csv')
tweet_original = pd.read_csv('tweets-no-labels.csv')
tweet_nolabel.columns

Index(['Unnamed: 0', 'text', 'query'], dtype='object')

In [3]:
tweet_nolabel.rename(columns = {'text': 'text_preprocess'},inplace = True)
tweet_nolabel.drop(columns = 'Unnamed: 0', inplace = True)
tweet_nolabel.head()

Unnamed: 0,text_preprocess,query
0,timcast joerogan isnt goal beat viru let say a...,#jnj
1,ema_new frustrat ema_new jnj ship vaccin made ...,#jnj
2,alexandrosandr yorkteachingnh ethicon abby_lan...,ethicon
3,jack jennif roll today i love sinc i first lai...,#jnj
4,xotonironixo they alway jnj alway hold special...,#jnj


In [4]:
#Combine two tables
tweet_nolabel = tweet_nolabel.join(tweet_original.text)

In [5]:
tweet_nolabel.rename(columns = {'text': 'text_original'},inplace = True)

In [6]:
tweet_nolabel.head()

Unnamed: 0,text_preprocess,query,text_original
0,timcast joerogan isnt goal beat viru let say a...,#jnj,@Timcast @joerogan Isn’t the goal to beat the ...
1,ema_new frustrat ema_new jnj ship vaccin made ...,#jnj,@EMA_News Frustrating @EMA_News #JnJ ships vac...
2,alexandrosandr yorkteachingnh ethicon abby_lan...,ethicon,@AlexandrosAndre @YorkTeachingNHS @Ethicon @ab...
3,jack jennif roll today i love sinc i first lai...,#jnj,Jack and Jennifer had me rolling today. I have...
4,xotonironixo they alway jnj alway hold special...,#jnj,@xotonironixo They will always be #JnJ to me a...


In [7]:
tweet_nolabel['query'].value_counts()

#jnj                  22598
ethicon                 678
#bwi                    406
"ETHICON"               391
biosense webster         61
"BIOSENSE WEBSTER"       29
Name: query, dtype: int64

### Null values

In [8]:
#text_preprocess has null values
tweet_nolabel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24163 entries, 0 to 24162
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   text_preprocess  24130 non-null  object
 1   query            24163 non-null  object
 2   text_original    24163 non-null  object
dtypes: object(3)
memory usage: 566.4+ KB


In [9]:
# creating bool series True for NaN values 
bool_series = pd.isnull(tweet_nolabel.text_preprocess) 

# filtering data 
# displaying data only with text_preprocess = NaN 
tweet_nolabel[bool_series] 

Unnamed: 0,text_preprocess,query,text_original
1295,,#jnj,https://t.co/lVBp7nUQir early spoilers #JNJ #...
3105,,#jnj,https://t.co/1IXgFNFGp3 #StayHome no #cure no ...
4254,,#jnj,https://t.co/4gd1p2uZhN: Johnson and Johnson s...
5751,,"""BIOSENSE WEBSTER""",https://t.co/Dl7Viunqhg. Cardiac Mapping Market
6209,,#jnj,https://t.co/c2eDqi5SD8 So proud of all the wo...
6541,,#jnj,https://t.co/FfjvWmuFXK The Basic Foolishness ...
6745,,#jnj,https://t.co/BWZy3LqGfA #DrFauci what about #j...
8532,,#jnj,https://t.co/h9ygE9mU7p totally! #JnJ #Johnson...
8534,,#jnj,https://t.co/rxf9X8fK64 Thanking God young tee...
8545,,#jnj,https://t.co/ag4OQ1LURJ PLEASE PPL:VAX! YOU'RE...


In [10]:
#Drop null values
tweet_nolabel = tweet_nolabel.dropna()
tweet_nolabel.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24130 entries, 0 to 24162
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   text_preprocess  24130 non-null  object
 1   query            24130 non-null  object
 2   text_original    24130 non-null  object
dtypes: object(3)
memory usage: 754.1+ KB


## Load Blacklist and Whitelist



### LDA

In [11]:
# LDA
import json
with open('lda_blacklist.txt', 'r') as f:
    lda_bl = json.loads(f.read())
lda_bl = set(lda_bl)
len(lda_bl)

69

In [12]:
with open('lda_whitelist.txt', 'r') as f:
    lda_wl = json.loads(f.read())
lda_wl = set(lda_wl)
len(lda_wl)

34

### Cluster

In [13]:
# CLUSTER
with open('kmeans_blacklist.txt', 'r') as f:
    cluster_bl = json.loads(f.read())
cluster_bl = set(cluster_bl)
len(cluster_bl)

437

In [14]:
with open('kmeans_whitelist.txt', 'r') as f:
    cluster_wl = json.loads(f.read())
cluster_wl = set(cluster_wl)
len(cluster_wl)

60

### Frequent words

In [15]:
# FREQUENT WORDS
with open('frequency_blacklist.txt', 'r') as f:
    frequent_bl = json.loads(f.read())
frequent_bl = set(frequent_bl)
len(frequent_bl)

50

In [16]:
with open('frequency_whitelist.txt', 'r') as f:
    frequent_wl = json.loads(f.read())
frequent_wl = set(frequent_wl)
len(frequent_wl)

50

## Algorithm
- initial score = 0
- For each tweet:
  - Have words in any whitelist: score+1
  - Have words in any blacklist: score-1
- score range: -3 to 3

In [27]:
def calculate_score(tweet):
    score = 0
    tweet = set(tweet.split())
    
    #whitelist
    if lda_wl.intersection(tweet):
        score += 1
    if cluster_wl.intersection(tweet):
        score += 1
    if frequent_wl.intersection(tweet):
        score += 1

    #blacklist
    if lda_bl.intersection(tweet):
        score -= 1
    if cluster_bl.intersection(tweet):
        score -= 1
    if frequent_bl.intersection(tweet):
        score -= 1    
    
    return score

In [28]:
length = len(tweet_nolabel)
score = []
for i in range(length):
    score.append(calculate_score(tweet_nolabel.text_preprocess.iloc[i]))

In [29]:
tweet_nolabel['score']=score
tweet_nolabel.head()

Unnamed: 0,text_preprocess,query,text_original,score
15840,transvagin mesh lawsuit file after ethicon gyn...,"""ETHICON""",Transvaginal Mesh Lawsuit Filed After Ethicon ...,3
3639,ethicon royal surrey smoke evacu trial kick jn...,ethicon,Ethicon Royal Surrey Smoke Evacuation trial ki...,3
13275,carrollodea lawyer billmadden pelvic mesh liti...,ethicon,"@carrollodea Lawyers' @BillMadden : ""Pelvic M...",3
15839,ethicon physiomesh case select for bellweth tr...,ethicon,Ethicon Physiomesh Cases Selected For Bellweth...,3
15838,nearli 15000 hernia mesh claim file against ba...,ethicon,"Nearly 15,000 Hernia Mesh Claims Filed Against...",3


In [30]:
tweet_nolabel['score'].value_counts().sort_index(ascending = False)

 3        8
 2      147
 1      604
 0     3815
-1    10227
-2     7787
-3     1542
Name: score, dtype: int64

In [31]:
tweet_nolabel['score'].describe()

count    24130.000000
mean        -1.222752
std          0.920381
min         -3.000000
25%         -2.000000
50%         -1.000000
75%         -1.000000
max          3.000000
Name: score, dtype: float64

In [32]:
tweet_nolabel = tweet_nolabel.sort_values(by=['score'], ascending=False)
tweet_nolabel.head(8)

Unnamed: 0,text_preprocess,query,text_original,score
15840,transvagin mesh lawsuit file after ethicon gyn...,"""ETHICON""",Transvaginal Mesh Lawsuit Filed After Ethicon ...,3
16649,vigna law group vagin mesh ethicon multidistri...,ethicon,Vigna Law Group: Vaginal Mesh Ethicon Multidis...,3
13021,jim crumbley receiv ethicon physiomesh implant...,ethicon,Jim Crumbley received an Ethicon Physiomesh im...,3
16909,biosens webster receiv ce mark approv qdot mic...,#jnj,Biosense Webster receives CE mark approval for...,3
3639,ethicon royal surrey smoke evacu trial kick jn...,ethicon,Ethicon Royal Surrey Smoke Evacuation trial ki...,3
15838,nearli 15000 hernia mesh claim file against ba...,ethicon,"Nearly 15,000 Hernia Mesh Claims Filed Against...",3
13275,carrollodea lawyer billmadden pelvic mesh liti...,ethicon,"@carrollodea Lawyers' @BillMadden : ""Pelvic M...",3
15839,ethicon physiomesh case select for bellweth tr...,ethicon,Ethicon Physiomesh Cases Selected For Bellweth...,3


In [33]:
#Tweets with highest score
length = 100
for i in range(length):
    print(str(i) + ': ')
    print(tweet_nolabel.text_original.iloc[i])
    print()

0: 
Transvaginal Mesh Lawsuit Filed After Ethicon Gynecare Prosima Eroded Into Surrounding Tissue
 https://t.co/TPQQM6oZ0F

1: 
Vigna Law Group: Vaginal Mesh Ethicon Multidistrict Litigation Update https://t.co/FMSZ7tCJbz

2: 
Jim Crumbley received an Ethicon Physiomesh implant to repair his incision ventral hernia in July 2014 at a Georgia hospital.

Read more 👉 https://t.co/IEdKvQ6U9O

 #Herniamesh #Hernia #Bellwether https://t.co/GvqCJknPPc

3: 
Biosense Webster receives CE mark approval for QDOT MICRO ablation catheter https://t.co/HPvKF8izzT #mycompany #jnj

4: 
Ethicon Royal Surrey Smoke Evacuation trial kick off.... @JNJMedDeviceUKI @Ethicon #smokeevacuation #orsafety #surgicalsmoke https://t.co/37gf71tLxQ

5: 
Nearly 15,000 Hernia Mesh Claims Filed Against Bard, Ethicon and Atrium in Federal Courts
https://t.co/tLIJv0qP5k

6: 
@carrollodea Lawyers'  @BillMadden : "Pelvic Mesh Litigation – Outcome of the Federal Court of Australia Trial in Gill v Ethicon Sàrl (No 5)" Read in lin

## Algorithm2
- initial score = 0
- For each word, if it is in any whitelist, score+1. If it is in any blacklist, score-1.

In [40]:
def calculate_score_v2(tweet):
    score = 0
    tweet = tweet.split() 
    for t in tweet:
        if t in lda_wl:
            score += 1
        if t in cluster_wl:
            score += 1
        if t in frequent_wl:
            score += 1
        if t in frequent_bl:
            score -= 1
        if t in cluster_bl:
            score -= 1
        if t in lda_bl:
            score -= 1
        #score = score/len(tweet)
    return score

In [41]:
length = len(tweet_nolabel)
score2 = []
for i in range(length):
    score2.append(calculate_score_v2(tweet_nolabel.text_preprocess.iloc[i]))
tweet_nolabel['score2']=score2
tweet_nolabel.head(20)

Unnamed: 0,text_preprocess,query,text_original,score,score2
15840,transvagin mesh lawsuit file after ethicon gyn...,"""ETHICON""",Transvaginal Mesh Lawsuit Filed After Ethicon ...,3,6
16649,vigna law group vagin mesh ethicon multidistri...,ethicon,Vigna Law Group: Vaginal Mesh Ethicon Multidis...,3,6
13021,jim crumbley receiv ethicon physiomesh implant...,ethicon,Jim Crumbley received an Ethicon Physiomesh im...,3,4
16909,biosens webster receiv ce mark approv qdot mic...,#jnj,Biosense Webster receives CE mark approval for...,3,8
3639,ethicon royal surrey smoke evacu trial kick jn...,ethicon,Ethicon Royal Surrey Smoke Evacuation trial ki...,3,5
15838,nearli 15000 hernia mesh claim file against ba...,ethicon,"Nearly 15,000 Hernia Mesh Claims Filed Against...",3,5
13275,carrollodea lawyer billmadden pelvic mesh liti...,ethicon,"@carrollodea Lawyers' @BillMadden : ""Pelvic M...",3,6
15839,ethicon physiomesh case select for bellweth tr...,ethicon,Ethicon Physiomesh Cases Selected For Bellweth...,3,4
14954,liver surgeri right hepatectomi microwav ablat...,ethicon,Liver Surgery – Right Hepatectomy with Microwa...,2,4
23415,peterpitt usatoday i assur million johnson amp...,#jnj,@PeterPitts @USATODAY I can assure you the MIL...,2,1


In [43]:
tweet_nolabel.sort_values(by=['score2'], ascending=False)

Unnamed: 0,text_preprocess,query,text_original,score,score2
23364,janurban 12 could mesh weapon implant san dieg...,ethicon,@JanUrban12 Could it be that mesh was a WEAPON...,2,20
23510,fda clear biosens webster thermocool smarttouc...,biosense webster,FDA Clears Biosense Webster ThermoCool SmartTo...,2,17
21464,biosens webster receiv fda approv thermocool s...,biosense webster,Biosense Webster Receives FDA Approval for THE...,2,16
3705,radiofrequ base cathet ablat hermocool smartto...,biosense webster,Radio-frequency based catheter ablation HERMO...,2,15
21472,biosensewebst receiv fda approv thermocool sma...,biosense webster,@BiosenseWebster Receives FDA Approval for THE...,2,15
...,...,...,...,...,...
8189,ive never profit now time snt pola coin inu in...,#jnj,I've never been profitable before. Now is the ...,-3,-88
18557,ive never profit now time snt pola coin inu in...,#jnj,I've never been profitable before. Now is the ...,-3,-88
18555,ive never profit now time snt pola coin inu in...,#jnj,I've never been profitable before. Now is the ...,-3,-88
16573,ive never profit now time snt pola coin inu in...,#jnj,I've never been profitable before. Now is the ...,-3,-88


In [44]:
tweet_nolabel.score2.value_counts().sort_index(ascending = False)

 20     1
 17     1
 16     1
 15     4
 14     2
       ..
-81     3
-84     8
-86    84
-87     1
-88    26
Name: score2, Length: 76, dtype: int64

In [45]:
tweet_nolabel.score2.describe()

count    24130.000000
mean        -6.446871
std          8.780440
min        -88.000000
25%         -8.000000
50%         -4.000000
75%         -2.000000
max         20.000000
Name: score2, dtype: float64

In [51]:
#tweet_nolabel.to_csv('corpus.csv')