In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the merged table; the first CSV column is used as the row index.
df = pd.read_csv("Data/merged_data.csv", index_col=0)

# Parse the 'Date' strings into datetime values.
df['Date'] = pd.to_datetime(df['Date'])

# Rename 'Predicted_Price_Diff' to 'arima_prediction' (non-inplace form, rebinding df).
df = df.rename(columns={'Predicted_Price_Diff': 'arima_prediction'})

df.head()

Unnamed: 0.1,Date,Article_title,Stock_symbol,Url,close,1d_open,1d_close,3d_close,Unnamed: 0,arima_prediction,3d_change,3d_direction,arima_3d_direction
0,2020-06-10,Tech Stocks And FAANGS Strong Again To Start D...,AAPL,https://www.benzinga.com/government/20/06/1622...,352.839996,349.309998,335.899994,342.98999,59.0,1.373906,-1.809283,-1,1
1,2020-06-10,10 Biggest Price Target Changes For Wednesday,AAPL,https://www.benzinga.com/analyst-ratings/price...,352.839996,349.309998,335.899994,342.98999,59.0,1.373906,-1.809283,-1,1
2,2020-06-09,"Benzinga Pro's Top 5 Stocks To Watch For Wed.,...",AAPL,https://www.benzinga.com/short-sellers/20/06/1...,343.98999,347.899994,352.839996,338.799988,58.0,-1.947308,-2.615696,-1,-1
3,2020-06-09,"Deutsche Bank Maintains Buy on Apple, Raises P...",AAPL,https://www.benzinga.com/news/20/06/16219873/d...,343.98999,347.899994,352.839996,338.799988,58.0,-1.947308,-2.615696,-1,-1
4,2020-06-09,Apple To Let Users Trade In Their Mac Computer...,AAPL,https://www.benzinga.com/news/20/06/16218697/a...,343.98999,347.899994,352.839996,338.799988,58.0,-1.947308,-2.615696,-1,-1


In [3]:
import gensim
from gensim.models import Word2Vec
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import nltk
# Fetch the NLTK resources used below; each call is a no-op when the package
# is already present in the local nltk_data directory.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases (3.9+) make word_tokenize load the 'punkt_tab' resource
# instead of the pickled 'punkt' models, so a fresh environment needs it too.
nltk.download('punkt_tab')

# Set for O(1) membership tests inside preprocess().
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Tokenise ``text`` and keep only alphabetic, non-stop-word tokens.

    The text is lower-cased before tokenisation. A token is kept when
    ``str.isalpha()`` is true for it and it is not in the module-level
    ``stop_words`` set.

    Args:
        text: String to tokenise (must support ``.lower()``).

    Returns:
        List of the surviving tokens, in their original order.
    """
    kept = []
    for token in word_tokenize(text.lower()):
        # Drop numbers, punctuation and mixed tokens such as "q3".
        if not token.isalpha():
            continue
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# Training corpus: one token list per non-null headline.
sentences = df['Article_title'].dropna().apply(preprocess).tolist()

# Train Word2Vec on the headlines themselves.
# min_count=1 keeps every token in the vocabulary.
# seed + workers=1: multi-threaded training is not deterministic (thread
# scheduling changes the update order), so the vectors saved later would
# differ on every run. For identical results across interpreter launches,
# PYTHONHASHSEED must also be fixed in the environment.
word2vec_model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    seed=42,
)

def sentence_to_vector(sentence, model):
    """Embed ``sentence`` as the mean of its in-vocabulary Word2Vec vectors.

    The sentence is tokenised with ``preprocess``; tokens missing from
    ``model.wv`` are ignored.

    Args:
        sentence: Raw text to embed.
        model: Trained Word2Vec model exposing ``wv`` and ``vector_size``.

    Returns:
        1-D array of length ``model.vector_size``. An all-zero vector is
        returned when no token of the sentence is in the vocabulary.
    """
    tokens = preprocess(sentence)
    word_vectors = [model.wv[token] for token in tokens if token in model.wv]
    if not word_vectors:
        # np.zeros defaults to float64, while gensim word vectors (and hence
        # their mean) are float32. Keep the fallback in the same dtype so the
        # column does not silently mix precisions.
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.mean(word_vectors, axis=0)

# One vector per headline. Null titles get a zero vector; it is created as
# float32 because np.zeros defaults to float64 while the word-vector means are
# float32, and mixing the two would upcast the whole matrix when stacked.
df['word2vec'] = df['Article_title'].apply(
    lambda title: sentence_to_vector(str(title), word2vec_model)
    if pd.notnull(title)
    else np.zeros(word2vec_model.vector_size, dtype=np.float32)
)

df[['Article_title', 'word2vec']].head()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jerry\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jerry\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,Article_title,word2vec
0,Tech Stocks And FAANGS Strong Again To Start D...,"[0.22613467, 0.120725006, -0.0344015, -0.03363..."
1,10 Biggest Price Target Changes For Wednesday,"[0.7269928, 0.45623422, 0.16811378, 0.4158756,..."
2,"Benzinga Pro's Top 5 Stocks To Watch For Wed.,...","[0.16575599, 0.11278104, 0.14067568, 0.1441355..."
3,"Deutsche Bank Maintains Buy on Apple, Raises P...","[0.77714425, 0.22397597, -0.2422117, 0.4820196..."
4,Apple To Let Users Trade In Their Mac Computer...,"[0.21648331, 0.03065869, -0.09960988, 0.049770..."


In [5]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Load the tokenizer and encoder for the same checkpoint name.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModel.from_pretrained(model_name)
bert_model = bert_model.to(device)

def sentence_to_cls_embedding(sentence, model, tokenizer, device):
    """Return the first-token ([CLS]) vector of ``sentence`` as a NumPy array.

    Args:
        sentence: Text to encode.
        model: Callable taking the tokenizer output as keyword arguments and
            returning an object with a ``last_hidden_state`` tensor.
        tokenizer: Callable producing the model inputs; its result must
            support ``.to(device)``.
        device: Device the encoded inputs are moved to before the forward pass.

    Returns:
        1-D NumPy array taken from ``last_hidden_state[0, 0, :]``.
    """
    tokenizer_options = dict(
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=128,
    )
    encoded = tokenizer(sentence, **tokenizer_options).to(device)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # First batch item, first token position (the CLS token).
    first_token = hidden_states[0, 0]
    return first_token.cpu().numpy()

tqdm.pandas(desc="Generating BERT Embeddings")
# One [CLS] vector per headline. Null titles get a 768-long zero vector; it is
# created as float32 because np.zeros defaults to float64 while the model
# output converted by .numpy() is float32 (PyTorch's default dtype), and mixing
# the two would upcast the whole matrix when the column is stacked.
df['bert'] = df['Article_title'].progress_apply(
    lambda title: sentence_to_cls_embedding(str(title), bert_model, tokenizer, device)
    if pd.notnull(title)
    else np.zeros(768, dtype=np.float32)
)

df[['Article_title', 'bert']].head()


Using device: cuda


Generating BERT Embeddings: 100%|███████████████████████████████████████████████| 16724/16724 [02:28<00:00, 112.50it/s]


Unnamed: 0,Article_title,bert
0,Tech Stocks And FAANGS Strong Again To Start D...,"[-0.22223899, -0.21574433, 0.7697829, -0.25863..."
1,10 Biggest Price Target Changes For Wednesday,"[-0.5261865, -0.26204786, 0.08753802, -0.09534..."
2,"Benzinga Pro's Top 5 Stocks To Watch For Wed.,...","[-0.51947445, -0.5419372, 0.20520952, 0.046122..."
3,"Deutsche Bank Maintains Buy on Apple, Raises P...","[-0.663332, -0.025526464, 0.37389, 0.24639344,..."
4,Apple To Let Users Trade In Their Mac Computer...,"[-0.18243252, 0.09596918, -0.1363424, 0.259096..."


In [12]:
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load tokenizer and model for the FinBERT checkpoint.
# Note: this rebinds `tokenizer` and `model_name` from here on.
model_name = "yiyanghkust/finbert-pretrain"
tokenizer = AutoTokenizer.from_pretrained(model_name)
finbert_model = AutoModel.from_pretrained(model_name)
finbert_model = finbert_model.to(device)

def sentence_to_finbert_cls_embedding(sentence, model, tokenizer, device):
    """Return the first-token ([CLS]) vector(s) of ``sentence`` as a NumPy array.

    Args:
        sentence: Text to encode.
        model: Callable taking the tokenizer output as keyword arguments and
            returning an object with a ``last_hidden_state`` tensor.
        tokenizer: Callable producing the model inputs; its result must
            support ``.to(device)``.
        device: Device the encoded inputs are moved to before the forward pass.

    Returns:
        ``last_hidden_state[:, 0, :]`` with all size-1 axes squeezed out,
        converted to NumPy (a 1-D vector for a single input sentence).
    """
    tokenizer_options = dict(
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=128,
    )
    encoded = tokenizer(sentence, **tokenizer_options).to(device)

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # Token position 0 of every batch row, then drop singleton axes.
    first_tokens = torch.squeeze(hidden_states[:, 0, :])
    return first_tokens.cpu().numpy()

# Generate embeddings for each row.
# Null titles get a 768-long zero vector; it is created as float32 because
# np.zeros defaults to float64 while the model output converted by .numpy() is
# float32 (PyTorch's default dtype), and mixing the two would upcast the whole
# matrix when the column is stacked.
tqdm.pandas(desc="Generating FinBERT [CLS] Embeddings")
df['finBert'] = df['Article_title'].progress_apply(
    lambda title: sentence_to_finbert_cls_embedding(str(title), finbert_model, tokenizer, device)
    if pd.notnull(title)
    else np.zeros(768, dtype=np.float32)
)

df[['Article_title', 'finBert']].head()


Generating FinBERT [CLS] Embeddings: 100%|██████████████████████████████████████| 16724/16724 [02:30<00:00, 111.23it/s]


Unnamed: 0,Article_title,finBert
0,Tech Stocks And FAANGS Strong Again To Start D...,"[-0.26630303, -0.6620456, -0.7148093, -0.27177..."
1,10 Biggest Price Target Changes For Wednesday,"[0.17167464, -0.15455426, -0.38101625, 0.53613..."
2,"Benzinga Pro's Top 5 Stocks To Watch For Wed.,...","[-0.4587289, -0.883282, -0.9349482, 0.84453505..."
3,"Deutsche Bank Maintains Buy on Apple, Raises P...","[-0.090272084, -0.32809806, -0.115412325, 0.36..."
4,Apple To Let Users Trade In Their Mac Computer...,"[0.28189945, -0.43821844, -1.1588358, 0.484726..."


In [10]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from tqdm import tqdm

# NOTE: `torch` is not imported in this cell; it has to be in the kernel
# namespace already from an earlier cell.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

sbert_model_name = 'sentence-transformers/all-MiniLM-L6-v2'
# The device is handed over as a plain string.
sbert_model = SentenceTransformer(sbert_model_name, device=str(device))

def sentence_to_sbert_embedding(sentence, model):
    """Encode ``sentence`` with ``model`` and return the result unchanged.

    Args:
        sentence: Text passed straight to ``model.encode``.
        model: Object exposing an ``encode(sentence)`` method.

    Returns:
        Whatever ``model.encode(sentence)`` returns.
    """
    embedding = model.encode(sentence)
    return embedding

# Generate embeddings.
# Null titles get a 384-long zero vector; it is created as float32 because
# np.zeros defaults to float64 while the encoded sentence embeddings are
# float32, and mixing the two would upcast the whole matrix when the column is
# stacked.
tqdm.pandas(desc="Generating SBERT Embeddings")
df['sBert'] = df['Article_title'].progress_apply(
    lambda title: sentence_to_sbert_embedding(str(title), sbert_model)
    if pd.notnull(title)
    else np.zeros(384, dtype=np.float32)
)

df[['Article_title', 'sBert']].head()


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Generating SBERT Embeddings: 100%|██████████████████████████████████████████████| 16724/16724 [01:57<00:00, 142.81it/s]


Unnamed: 0,Article_title,sBert
0,Tech Stocks And FAANGS Strong Again To Start D...,"[-0.005246668, -0.060425527, 0.07172174, 0.062..."
1,10 Biggest Price Target Changes For Wednesday,"[-0.0076999157, -0.036944382, 0.051584832, -0...."
2,"Benzinga Pro's Top 5 Stocks To Watch For Wed.,...","[-0.11140143, -0.052677345, -0.057600692, -0.0..."
3,"Deutsche Bank Maintains Buy on Apple, Raises P...","[0.03324856, -0.05314027, 0.014023768, 0.01497..."
4,Apple To Let Users Trade In Their Mac Computer...,"[-0.023000775, 0.0014990732, 0.027343368, -0.0..."


In [13]:
# Preview the first five rows, now including the embedding columns.
df.head(n=5)

Unnamed: 0.1,Date,Article_title,Stock_symbol,Url,close,1d_open,1d_close,3d_close,Unnamed: 0,arima_prediction,3d_change,3d_direction,arima_3d_direction,word2vec,bert,sBert,finBert
0,2020-06-10,Tech Stocks And FAANGS Strong Again To Start D...,AAPL,https://www.benzinga.com/government/20/06/1622...,352.839996,349.309998,335.899994,342.98999,59.0,1.373906,-1.809283,-1,1,"[0.22613467, 0.120725006, -0.0344015, -0.03363...","[-0.22223899, -0.21574433, 0.7697829, -0.25863...","[-0.005246668, -0.060425527, 0.07172174, 0.062...","[-0.26630303, -0.6620456, -0.7148093, -0.27177..."
1,2020-06-10,10 Biggest Price Target Changes For Wednesday,AAPL,https://www.benzinga.com/analyst-ratings/price...,352.839996,349.309998,335.899994,342.98999,59.0,1.373906,-1.809283,-1,1,"[0.7269928, 0.45623422, 0.16811378, 0.4158756,...","[-0.5261865, -0.26204786, 0.08753802, -0.09534...","[-0.0076999157, -0.036944382, 0.051584832, -0....","[0.17167464, -0.15455426, -0.38101625, 0.53613..."
2,2020-06-09,"Benzinga Pro's Top 5 Stocks To Watch For Wed.,...",AAPL,https://www.benzinga.com/short-sellers/20/06/1...,343.98999,347.899994,352.839996,338.799988,58.0,-1.947308,-2.615696,-1,-1,"[0.16575599, 0.11278104, 0.14067568, 0.1441355...","[-0.51947445, -0.5419372, 0.20520952, 0.046122...","[-0.11140143, -0.052677345, -0.057600692, -0.0...","[-0.4587289, -0.883282, -0.9349482, 0.84453505..."
3,2020-06-09,"Deutsche Bank Maintains Buy on Apple, Raises P...",AAPL,https://www.benzinga.com/news/20/06/16219873/d...,343.98999,347.899994,352.839996,338.799988,58.0,-1.947308,-2.615696,-1,-1,"[0.77714425, 0.22397597, -0.2422117, 0.4820196...","[-0.663332, -0.025526464, 0.37389, 0.24639344,...","[0.03324856, -0.05314027, 0.014023768, 0.01497...","[-0.090272084, -0.32809806, -0.115412325, 0.36..."
4,2020-06-09,Apple To Let Users Trade In Their Mac Computer...,AAPL,https://www.benzinga.com/news/20/06/16218697/a...,343.98999,347.899994,352.839996,338.799988,58.0,-1.947308,-2.615696,-1,-1,"[0.21648331, 0.03065869, -0.09960988, 0.049770...","[-0.18243252, 0.09596918, -0.1363424, 0.259096...","[-0.023000775, 0.0014990732, 0.027343368, -0.0...","[0.28189945, -0.43821844, -1.1588358, 0.484726..."


In [14]:
# Sanity-check the embedding widths on the first row.
# .iloc[0] selects by position; a plain [0] on an integer-valued index is a
# label lookup and raises KeyError when no row is labelled 0.
print(f'Bert vector size is {len(df["bert"].iloc[0])} and finbert is {len(df["finBert"].iloc[0])}, and sbert is {len(df["sBert"].iloc[0])}')

Bert vector size is 768 and finbert is 768, and sbert is 384


In [15]:
# Persist the full frame, including the array-valued embedding columns, with pickle.
embeddings_path = "Data/news_embeddings_csl_sbert.pkl"
df.to_pickle(embeddings_path)