In [203]:
pip install transformers


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [204]:
from tqdm import tqdm
import datetime,holidays
import numpy as np
import yfinance as yf

In [205]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

In [206]:
data1 = pd.read_csv('/content/drive/MyDrive/CSE572/analyst_ratings_processed.csv', index_col=0)
datar = pd.read_csv('/content/drive/MyDrive/CSE572/raw_analyst_ratings.csv', index_col=0)

In [207]:
data1['stock'].value_counts()

MRK      3334
MS       3242
MU       3144
NVDA     3133
QQQ      3100
         ... 
HAWKB       1
GYEN        1
GWX         1
GWL         1
FOIL        1
Name: stock, Length: 6192, dtype: int64

In [208]:
tickerSymbol = "MRK"
data1.dropna(inplace = True)
data1['date'] = data1['date'].apply(lambda x : x.split()[0])
data2 = data1[data1['stock'] == tickerSymbol]

In [209]:
data2

Unnamed: 0,title,date,stock
850715.0,Shares of several healthcare companies are tra...,2020-06-11,MRK
850716.0,Johnson & Johnson To Start Coronavirus Vaccine...,2020-06-11,MRK
850717.0,The Daily Biotech Pulse: Keytruda Setback For ...,2020-06-10,MRK
850718.0,Merck Announces That The Phase 3 KEYNOTE-361 T...,2020-06-09,MRK
850719.0,"The Week Ahead In Biotech: Viela FDA Decision,...",2020-06-07,MRK
...,...,...,...
854069.0,BenchmarkJournal.com Free Analyst Review for A...,2009-08-17,MRK
854070.0,Trends in the U.K. and Irish Pharmaceutical an...,2009-08-17,MRK
854071.0,ParagonReport.com Complimentary Market Update ...,2009-08-10,MRK
854072.0,ParagonReport.com Complimentary Market Update ...,2009-08-07,MRK


In [210]:
newData = {}
total = data2['date'].nunique()
for i in tqdm(data2[data2['stock']==tickerSymbol]['date'].unique()):
    newData[i] = data2.loc[(data2['stock']==tickerSymbol) & (data2['date'] == i)]['title'].tolist()

100%|██████████| 1600/1600 [00:04<00:00, 392.99it/s]


In [None]:
newData

In [211]:
import torch.nn.functional as F
def Sentiment(doc):
    pt_batch = tokenizer(doc,padding=True,truncation=True,max_length=512,return_tensors="pt")
    outputs = model(**pt_batch)
    pt_predictions = F.softmax(outputs.logits, dim=-1)
    return pt_predictions.detach().cpu().numpy()

In [212]:
day = datetime.timedelta(days=1)
hol = holidays.US()

def next_trade(date_string):
    date_obj = datetime.datetime.strptime(date_string, '%Y-%m-%d')
    next_day = date_obj + day
    while next_day.weekday() in holidays.WEEKEND or next_day in hol:
        next_day += day
    return next_day

In [215]:
def findSentence(sentence_list):
    sentiment_arr = np.mean(Sentiment(sentence_list), axis=0)
    return {
        'num_articles': len(sentence_list),
        'positive': sentiment_arr[0],
        'negative': sentiment_arr[1],
        'neutral': sentiment_arr[2]
    }

In [216]:
dateSentimentGroups = {}
for i in tqdm(newData):
    scores = findSentence(newData[i])
    dateSentimentGroups[i] = scores

100%|██████████| 1600/1600 [08:34<00:00,  3.11it/s]


In [None]:
ticker = yf.Ticker(tickerSymbol)
hist = ticker.history(period="max")
hist

In [254]:

data = []
ticker = yf.Ticker(tickerSymbol)
hist = ticker.history(period="max")
for i in tqdm(dateSentimentGroups):
  start = i
  nextDay = next_trade(start).strftime("%Y-%m-%d")
  try:
    prevDay = hist.loc[start]
    nextDay = hist.loc[nextDay]
    row = [ 
            start,
            dateSentimentGroups[i]['num_articles'], 
            dateSentimentGroups[i]['neutral'], 
            dateSentimentGroups[i]['positive'], 
            dateSentimentGroups[i]['negative'], 
            prevDay['Open'], 
            prevDay['Close'],
    ]
    data.append(row)
  except:
    pass

100%|██████████| 1600/1600 [00:01<00:00, 1174.08it/s]


In [255]:
df = pd.DataFrame(columns =['Date','numArticles', 'neutral', 'positive','negative','Open', 'Close'], data=data)

In [257]:
df['Price Change'] = ['Up' if df['Close'][i] > df['Open'][i] else 'Down' for i in range(len(df))]

In [258]:
df

Unnamed: 0,Date,numArticles,neutral,positive,negative,Open,Close,Price Change
0,2020-06-11,2,0.423122,0.083263,0.493615,69.961703,66.966187,Down
1,2020-06-10,1,0.134605,0.092935,0.772460,71.372880,70.792824,Down
2,2020-06-09,1,0.443555,0.543354,0.013090,72.022187,71.217033,Down
3,2020-06-05,2,0.160842,0.825178,0.013980,71.087172,71.217033,Up
4,2020-06-04,2,0.494893,0.491230,0.013877,70.359947,70.602364,Up
...,...,...,...,...,...,...,...,...
1439,2009-09-18,1,0.820001,0.169416,0.010583,19.305168,19.125195,Down
1440,2009-08-17,2,0.924749,0.045792,0.029459,18.137875,18.303843,Up
1441,2009-08-10,1,0.938043,0.034597,0.027360,17.954133,18.137882,Up
1442,2009-08-07,1,0.937031,0.042642,0.020328,17.610342,17.841511,Up
