In [11]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline
import pandas as pd
from tqdm import tqdm
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Load the FinBERT tone classifier (3 output labels) together with its
# tokenizer, plus a VADER analyzer that later cells use on the same headlines.
checkpoint = 'yiyanghkust/finbert-tone'
finbert = BertForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=3,
)
tokenizer = BertTokenizer.from_pretrained(checkpoint)
analyzer = SentimentIntensityAnalyzer()

In [2]:
# Bundle the loaded model and tokenizer into a single callable classifier.
nlp = pipeline(
    task="sentiment-analysis",
    model=finbert,
    tokenizer=tokenizer,
)

In [6]:
# Read the headlines to classify; later cells use the 'title' column.
df = pd.read_csv(filepath_or_buffer="test.csv")
df

Unnamed: 0,title
0,Pinnacle West Reports Lower 2022 Full-Year and...
1,Occidental edges lower after missing on Q4 ear...
2,Largest NFT Dump Ever; NFT Market To Crash?
3,Psychemedics Co. (PMD) To Go Ex-Dividend on Ma...
4,Novartis (NYSE:NVS) Shares Gap Up to $83.59
...,...
1995,Five9 Empowers Agents to Deliver More Fluid Ex...
1996,Arbitrum-based DEX ArbiSwap rugs users days af...
1997,Amcor: Shareholder Friendly But Overvalued (NY...
1998,"Catalyst Biosciences, Inc. (NASDAQ:CBIO) Sees ..."


In [7]:
# Classify every headline with FinBERT, one call per title.
# Each call's raw return value is stored unchanged, because the next cell
# reads `[0]['label']` from every entry.
res = [nlp(headline) for headline in tqdm(df.title.values)]

100%|██████████████████████████████████████████████████████████████████████████████████| 2000/2000 [01:22<00:00, 24.11it/s]


In [8]:
# Translate each FinBERT label into a numeric score:
# Negative -> 0, Neutral -> 0.5, Positive -> 1.
label_to_score = {'Negative': 0, 'Neutral': 0.5, 'Positive': 1}

modelpred = []
for prediction in res:
    label = prediction[0]['label']
    if label not in label_to_score:
        # Fail loudly: silently skipping a row would leave modelpred shorter
        # than df and shift every later prediction onto the wrong headline.
        raise ValueError(f"Unexpected FinBERT label: {label!r}")
    modelpred.append(label_to_score[label])

In [9]:
# Attach the numeric FinBERT scores as a new column; modelpred must hold
# exactly one entry per row of df for this assignment to succeed.
df['model'] = modelpred

In [10]:
# Display the frame to check the newly added 'model' column.
df

Unnamed: 0,title,model
0,Pinnacle West Reports Lower 2022 Full-Year and...,0.0
1,Occidental edges lower after missing on Q4 ear...,0.0
2,Largest NFT Dump Ever; NFT Market To Crash?,0.5
3,Psychemedics Co. (PMD) To Go Ex-Dividend on Ma...,0.5
4,Novartis (NYSE:NVS) Shares Gap Up to $83.59,1.0
...,...,...
1995,Five9 Empowers Agents to Deliver More Fluid Ex...,1.0
1996,Arbitrum-based DEX ArbiSwap rugs users days af...,0.5
1997,Amcor: Shareholder Friendly But Overvalued (NY...,0.5
1998,"Catalyst Biosciences, Inc. (NASDAQ:CBIO) Sees ...",1.0


## VADER: resolving headlines that FinBERT scored as neutral

In [21]:
def compound_to_label(value):
    """Turn a sentiment score into a binary label.

    Args:
        value: Numeric score; callers in this notebook pass VADER's
            'compound' value.

    Returns:
        1 when ``value`` is greater than or equal to 0, otherwise 0.
    """
    return 1 if value >= 0 else 0

In [27]:
# Build the final binary label for every headline:
#   * FinBERT score 0 or 1   -> kept as-is (stored as int)
#   * FinBERT score 0.5      -> decided by the sign of VADER's compound score
# The columns are iterated directly instead of using df.model[i] / df.title[i],
# which are label-based lookups and only work on a default 0..n-1 index.
final = []
for title, val in zip(df['title'], df['model']):
    if val == 0.5:
        # `vs` stays a notebook-level name because a later cell inspects
        # vs['compound'].
        vs = analyzer.polarity_scores(title)
        final.append(compound_to_label(vs['compound']))
    else:
        # Store a plain int so the list is not a mix of ints and floats.
        final.append(int(val))
    

In [30]:
# Store the combined FinBERT/VADER labels as an integer column.
final_labels = pd.Series(final, index=df.index)
df['final'] = final_labels.astype('int')

In [40]:
# Add the two columns written to the output file: the row index as 'Id'
# and a copy of the final label as 'Predicted'.
df['Id'] = df.index
df['Predicted'] = df['final']
df

Unnamed: 0,title,model,final,Id,Predicted
0,Pinnacle West Reports Lower 2022 Full-Year and...,0.0,0,0,0
1,Occidental edges lower after missing on Q4 ear...,0.0,0,1,0
2,Largest NFT Dump Ever; NFT Market To Crash?,0.5,0,2,0
3,Psychemedics Co. (PMD) To Go Ex-Dividend on Ma...,0.5,1,3,1
4,Novartis (NYSE:NVS) Shares Gap Up to $83.59,1.0,1,4,1
...,...,...,...,...,...
1995,Five9 Empowers Agents to Deliver More Fluid Ex...,1.0,1,1995,1
1996,Arbitrum-based DEX ArbiSwap rugs users days af...,0.5,1,1996,1
1997,Amcor: Shareholder Friendly But Overvalued (NY...,0.5,1,1997,1
1998,"Catalyst Biosciences, Inc. (NASDAQ:CBIO) Sees ...",1.0,1,1998,1


In [41]:
# Write only the Id/Predicted pair, without the DataFrame index.
df.to_csv('vader_combined.csv', columns=['Id', 'Predicted'], index=False)

In [22]:
# NOTE(review): inspects the compound score of whichever VADER result was
# last stored in `vs`; `vs` must already exist from an earlier cell. This
# looks like a leftover debugging cell — confirm it can be removed.
vs['compound']

0.0

In [39]:
# NOTE(review): this display has execution count 39, i.e. it ran before the
# cell (count 40) that sets 'Id' and 'Predicted' above, so its saved output
# appears to be stale — consider removing this cell.
df

Unnamed: 0,title,model,final,Id,Predicted
0,Pinnacle West Reports Lower 2022 Full-Year and...,0.0,0,0,0.0
1,Occidental edges lower after missing on Q4 ear...,0.0,0,1,0.0
2,Largest NFT Dump Ever; NFT Market To Crash?,0.5,0,2,0.0
3,Psychemedics Co. (PMD) To Go Ex-Dividend on Ma...,0.5,1,3,1.0
4,Novartis (NYSE:NVS) Shares Gap Up to $83.59,1.0,1,4,1.0
...,...,...,...,...,...
1995,Five9 Empowers Agents to Deliver More Fluid Ex...,1.0,1,1995,1.0
1996,Arbitrum-based DEX ArbiSwap rugs users days af...,0.5,1,1996,1.0
1997,Amcor: Shareholder Friendly But Overvalued (NY...,0.5,1,1997,1.0
1998,"Catalyst Biosciences, Inc. (NASDAQ:CBIO) Sees ...",1.0,1,1998,1.0


In [47]:
# Score every headline with VADER alone.
# `texts` was not assigned in any visible cell, so this cell raised NameError
# on a fresh kernel. It is defined here from the headline column.
# NOTE(review): assumes `texts` was meant to be df['title'] (the recorded run
# processed 2000 items, the same count as the FinBERT loop) — confirm.
texts = df['title'].tolist()

# NOTE: this rebinds `res`, which previously held the FinBERT outputs.
res = []
for sentence in tqdm(texts):
    vs = analyzer.polarity_scores(sentence)
    res.append(vs)

100%|███████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:00<00:00, 20650.71it/s]


In [49]:
# Pull the 'compound' value out of every stored VADER result.
c = [scores['compound'] for scores in res]

In [51]:
# Binarise the compound scores with the shared helper instead of repeating
# its threshold inline: scores >= 0 become 1, everything else becomes 0.
d = [compound_to_label(score) for score in c]