# Load the Articles

## Imports

In [256]:
import pandas as pd
import datetime
import numpy as np
#-------USE BELOW CODE ON COLLAB, else comment it out-------
# from google.colab import drive
# drive.mount('/content/drive')

## Load Articles

In [257]:
fpath = "cnbc_news_datase.csv" # "/content/drive/My Drive/cnbc_news_datase.csv"

#-------Read in article data-------
# FILE MUST BE UPLOADED TO YOUR DRIVE; NOTE: if using Collab, also remember to use the second "/content" filepath
news = pd.read_csv(fpath, usecols = [1, 3, 6,  7, 10])
#-------Check data-------
news.head(100) 

Unnamed: 0,title,published_at,short_description,keywords,description
0,Santoli’s Wednesday market notes: Could Septem...,2021-09-29T17:09:39+0000,"This is the daily notebook of Mike Santoli, CN...","cnbc, Premium, Articles, Investment strategy, ...","This is the daily notebook of Mike Santoli, CN..."
1,My take on the early Brexit winners and losers,2016-06-24T13:50:48-0400,This commentary originally ran on Facebook. Bo...,"Articles, Politics, Europe News, European Cent...",
2,Europe&#039;s recovery depends on Renzi&#039;s...,2014-03-25T13:29:45-0400,"In spring, ambitious reforms began in Italy. U...","Articles, Business News, Economy, Europe Econo...",
3,US Moves Closer to Becoming A Major Shareholde...,2009-04-22T19:49:03+0000,The US government is increasingly likely to co...,"cnbc, Articles, General Motors Co, Business Ne...",The US government is increasingly likely to co...
4,Trump: 'Mission accomplished' on 'perfectly ex...,2018-04-14T14:59:04+0000,,"cnbc, Articles, George W. Bush, Vladimir Putin...",President Donald Trump hailed the U.S.-led int...
...,...,...,...,...,...
95,GLOBAL MARKETS-Euro rises on Spain speculation...,2012-10-02T18:23:00+0000,"(Adds comment, details, updates prices)* Spain...","cnbc, Articles, Caterpillar Inc, Europe, Washi...","(Adds comment, details, updates prices)* Spain..."
96,"'I come to bury Bitcoin, not to praise it': UBS",2018-11-30T11:38:30+0000,Cryptocurrencies are nearing the end of the ro...,"cnbc, Articles, Bitcoin/USD Bitfinex, Economy,...",Cryptocurrencies are nearing the end of the ro...
97,Jon Stewart joins Stephen Colbert to mock that...,2016-07-22T11:44:12+0000,It's been 351 days since Jon Stewart sat behin...,"cnbc, Articles, Donald Trump, Media, Elections...",It's been 351 days since Jon Stewart sat behin...
98,Will Stocks Resist 'Anything but Utmost Catast...,2011-11-17T11:50:06+0000,Stock markets have taken such a beating over t...,"cnbc, Articles, Business News, Economy, World ...",Stock markets have taken such a beating over t...


## Load Helper Classes [unused] and Functions

In [258]:
from helpers import to_time, clear_time

# Load news articles

In [259]:
#-------Globals-------
# rename to avoid changing all occurrences just for a different alias
articles: pd.DataFrame = news
del news

#-------Clean articles a bit-------
# fix date
articles['date'] = articles["published_at"].apply(lambda x: to_time(x))
articles = articles.drop("published_at", axis=1)
# fix caps
str_cols = ["description", "short_description", "title"]
articles[str_cols] = articles[str_cols].apply(lambda x: x.str.lower() if x.dtype == 'O' else x)
# fix caps in keywords and turn -> np.array
articles['keywords'] = articles['keywords'].map(lambda keyword_list: np.array([keyword.lower() for keyword in keyword_list.split(",")]))

# #-------TEST CODE-------
# print("Articles about Donald Trump:")
# ex_list = find_relevant_articles(articles, "Donald Trump", 0, 200)
# print(ex_list.head())
# print(f'Example keyword representation {ex_list.iloc[0]["keywords"]}')
# print("\nArticles about Bitcoin:")
# ex_list = find_relevant_articles(articles, "cnbc", 0, 300)
# print(ex_list.head())

## Filter Things outside of the Date Range

In [260]:
#-------Constants-------
# How many days to keep track of after any given article
DAYS_AFTER = 10

#-------Download stock market data -> ****hold****-------
%run load_stocks.ipynb
hold: pd.DataFrame = from_json("stocks.json")
# NOTE: Post condition: type(hold) = df.DataFrame(ticker: str, prices: list[float], dates: list[datetime.datetime], industry: str)

#--------Process hold/articles on new information-------

# find min and max stock data range and filter out articles not in that range
all_dates = np.concatenate(hold["dates"])
min_date = np.min(all_dates)
max_date = np.max(all_dates) - datetime.timedelta(days=DAYS_AFTER)
del all_dates

articles = articles[(articles['date'] >= min_date) & (articles['date'] <= max_date)].reset_index(drop=True)

min_date = np.min(articles['date'])
max_date = np.max(articles['date']) - datetime.timedelta(days=DAYS_AFTER)

hold['dates'] = hold['dates'].apply(lambda dates: dates if (np.min(dates) <= min_date) & (np.max(dates) >= max_date) else None)
hold = hold.dropna()
del min_date, max_date
# # TEST CODE
# hold
# len(hold.loc[hold["ticker"] == "ABNB"])

## Make Price Change Matrix across the Stocks

In [261]:
# speed processing
# makes len(stocks) x len(date|price entries) array filled with NaN values
max_len = max(len(prices) for prices in hold['prices']) 
dates_and_prices = np.full((len(hold['prices']), max_len), np.nan)
del max_len
# populate
for i, (dates, prices) in enumerate(zip(hold['dates'], hold['prices'])):
    dates_and_prices[i, :len(prices)] = prices
# -> df    
dates_and_prices = pd.DataFrame(dates_and_prices[:, 1:], columns=hold['dates'][0])

# Non-multi-threading attempt
def get_date_stocks(date: datetime.datetime) -> pd.DataFrame:
    '''Takes a date [from each article] and returns an array of the corresponding stocks until DAYS_AFTER'''
    return dates_and_prices.loc[:, 
            dates_and_prices.columns[
                (date <= dates_and_prices.columns) & 
                (dates_and_prices.columns <= clear_time(date + datetime.timedelta(days=DAYS_AFTER)))
                ]
            ].diff(axis=1).mean(axis=1)

prices = articles["date"].apply(get_date_stocks).reindex()
# prices.shape = (511, 442); articles x stocks
del dates_and_prices

# # Test code
# print(prices)
# print(get_date_stocks(articles.iloc[0]["date"]))

## Do Sentiment Analysis:
(From ChatGPT)

1. Feature Extraction
2. Machine Learning Model Selection:
3. Model Training
4. Model Evaluation
5. Inference:
6. Fine-tuning:

## Feature Extraction
### Process

1. Vectorize with CountVectorizer(preprocessor=lambda word: re.sub(r'\b0+', '', word))
    -  takes out unneeded 0's at the start of phrases (which the dataset has)
2. Run it through a TfidfTransformer to basically assign importance to significant words
3. Apply a HTML token filter on a string that gets all the important information from articles
4. Fit it the filtered string on the vectorizer and then the TfidfTransformer
5. Remove all other columns from articles, and make the columns the vectors for the words over the length of articles

Credit to https://kavita-ganesan.com/tfidftransformer-tfidfvectorizer-usage-differences/

### TfidfTransformer

In [240]:
# from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
# import re
# #-------Vectorize articles, then run a TF-IDF library over it-------
# # Make tokenizers
# vectorizer = CountVectorizer(preprocessor=lambda word: re.sub(r'\b0+', '', word))
# tfidf_transformer = TfidfTransformer()
# # Put everything into col[text] to get all data in one place. get rid of other data as needed
# text_row_func = lambda row: re.sub(r'&#[0-9]+;', '', 
#     f'{row["title"]} {row["short_description"]} {row["description"] if pd.notna(row["description"]) else ""} {" ".join(row["keywords"])}'
#     )
# # Tokenize
# tfidf_matrix = tfidf_transformer.fit_transform(vectorizer.fit_transform(articles.apply(text_row_func, axis=1)))
# # Extract column names (i.e., the words)
# feature_names = vectorizer.get_feature_names_out()
# # Update the articles dataset with the TF-IDF vector on each article
# articles = pd.DataFrame(tfidf_matrix.toarray().transpose(), index=vectorizer.get_feature_names_out())
# # test a single row (1 x ~20000) vs (~600 x ~20000)
# articles

### TfidfVectorizer

In [262]:
# TfidfVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer
import re
#-------Vectorize articles, then run a TF-IDF library over it-------
# Make tokenizer
tfidf_vectorizer = TfidfVectorizer(use_idf=True, preprocessor=lambda word: re.sub(r'\b0+', '', word), analyzer='word' , stop_words='english')
# Put everything into col[text] to get all data in one place. get rid of other data as needed
text_row_func = lambda row: re.sub(r'&#[0-9]+;', '', 
    f'{row["title"]} {row["short_description"]} {row["description"] if pd.notna(row["description"]) else ""} {" ".join(row["keywords"])}'
    )
# fit data
transform_fit = tfidf_vectorizer.fit_transform(articles.apply(text_row_func, axis=1))
articles = pd.DataFrame(transform_fit.toarray().transpose(), tfidf_vectorizer.get_feature_names_out())
# shape should be (~20000 x 500)
articles


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,501,502,503,504,505,506,507,508,509,510
10,0.018,0.013015,0.0,0.0,0.055794,0.000000,0.0,0.01776,0.0,0.0,...,0.014045,0.044891,0.033208,0.0,0.0,0.0,0.0,0.0,0.0,0.016235
100,0.000,0.000000,0.0,0.0,0.000000,0.036904,0.0,0.00000,0.0,0.0,...,0.017840,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.020623
10004,0.000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.00000,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1007,0.000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.00000,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
101,0.000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.00000,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zuckerberg,0.000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.00000,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
zuckerman,0.000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.00000,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
zurich,0.000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.00000,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
zyne,0.000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.00000,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000


## Get Relevancy of each Document

In [267]:
from sklearn.feature_extraction.text import CountVectorizer
# # result should be the same shape as prices
# def get_relevancy(keywords: list[str]):
#     '''Takes a date [from each article] and returns an array of the corresponding stocks until DAYS_AFTER'''
#     print(np.array(articles))
#     # print(np.array(articles.columns)[any(keyword in articles.columns for keyword in keywords)])
#     # print(articles.loc[:, 
#     #         articles.columns[any(keyword in articles.columns for keyword in keywords)]
#     #         ].mean(axis=1))
#     return 1
#     # return articles.loc[:, 
#     #         articles.columns[any(keyword in articles.columns for keyword in keywords)]
#     #         ].mean(axis=1)

# vectors = vectorizer.fit_transform(hold["keywords"].apply(lambda keywords_list: " ".join(keywords_list)))
# relevancy = tfidf_transformer.transform(vectors)
# relevancy = hold["keywords"].apply(lambda doc: transform_fit.transpose(tfidf_vectorizer.transform(doc)))
#relevancy = tfidf_vectorizer.transform(hold["keywords"].apply(lambda keywords: " ".join(keywords)))
count_docs = tfidf_vectorizer.transform(hold["keywords"].apply(lambda keywords: " ".join(keywords)))
# relevancy =  hold["keywords"].apply(lambda words: , CountVectorizer) transform_fit.dot()
print(f'transform_fit: {transform_fit.shape} count_docs: {count_docs.shape}')
relevancy = transform_fit.dot(count_docs.T)
relevancy = pd.DataFrame(relevancy.toarray(), columns=hold.index)
count_docs
relevancy

transform_fit: (511, 18156) count_docs: (442, 18156)


ticker,MMM,AOS,ABT,ACN,ADM,ADBE,ADP,AES,AFL,A,...,WHR,WMB,WTW,GWW,WYNN,XEL,YUM,ZBRA,ZBH,ZION
0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.001668,0.000000,0.0,0.000000,...,0.000000,0.006676,0.025762,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
1,0.005835,0.000000,0.000000,0.010569,0.003804,0.000000,0.010445,0.000000,0.0,0.000000,...,0.003157,0.000000,0.000000,0.004973,0.003176,0.000000,0.003824,0.000000,0.0,0.007647
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.007857,0.0,0.000000,...,0.000000,0.000000,0.017091,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.009150,0.000000,0.000000,0.0,0.008191,...,0.000000,0.037173,0.262874,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
4,0.000000,0.009777,0.000000,0.006273,0.010098,0.032811,0.000000,0.000000,0.0,0.024879,...,0.000000,0.015838,0.000000,0.000000,0.000000,0.000000,0.000000,0.037624,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
506,0.000000,0.000000,0.011471,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.049384,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
507,0.000000,0.000000,0.000000,0.008286,0.008334,0.060721,0.011253,0.006932,0.0,0.047223,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.003540
508,0.034551,0.000000,0.000000,0.000000,0.015016,0.000000,0.000000,0.000000,0.0,0.000000,...,0.051301,0.000000,0.000000,0.029447,0.051617,0.000000,0.062149,0.000000,0.0,0.013672
509,0.000000,0.000000,0.009310,0.020332,0.000000,0.019139,0.008586,0.097941,0.0,0.000000,...,0.000000,0.117704,0.006377,0.000000,0.054009,0.072312,0.000000,0.021369,0.0,0.000000


## Data Split

In [243]:
# Data Split. Import y value array from other
from sklearn.model_selection import train_test_split

# Define your input (X) and output (y) variables
X = articles[['keywords', 'title']]  # Assuming 'date' is a datetime column
# Replace 'actual_target_column' with the actual column name you want to predict
y = articles['date'].apply(lambda row: get_date_stocks(row))

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

KeyError: "None of [Index(['keywords', 'title'], dtype='object')] are in the [columns]"

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Machine Learning Model: Linear Regression
linear_reg = LinearRegression()
linear_reg.fit(x_train, y_train['price_change'].values.flatten())

# Make predictions on the test set
predictions = linear_reg.predict(x_test)

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')

KeyError: 'price_change'