In [256]:
#-------Imports-------
import pandas as pd
import datetime
import numpy as np
#-------USE BELOW CODE ON COLLAB, else comment it out-------
# from google.colab import drive
# drive.mount('/content/drive')


In [257]:
fpath = "cnbc_news_datase.csv" # "/content/drive/My Drive/cnbc_news_datase.csv"

#-------Read in article data-------
# FILE MUST BE UPLOADED TO YOUR DRIVE; NOTE: if using Collab, also remember to use the second "/content" filepath
news = pd.read_csv(fpath, usecols = [1, 3, 6,  7, 10])
#-------Check data-------
news.head(100) 



Unnamed: 0,title,published_at,short_description,keywords,description
0,Santoli’s Wednesday market notes: Could Septem...,2021-09-29T17:09:39+0000,"This is the daily notebook of Mike Santoli, CN...","cnbc, Premium, Articles, Investment strategy, ...","This is the daily notebook of Mike Santoli, CN..."
1,My take on the early Brexit winners and losers,2016-06-24T13:50:48-0400,This commentary originally ran on Facebook. Bo...,"Articles, Politics, Europe News, European Cent...",
2,Europe&#039;s recovery depends on Renzi&#039;s...,2014-03-25T13:29:45-0400,"In spring, ambitious reforms began in Italy. U...","Articles, Business News, Economy, Europe Econo...",
3,US Moves Closer to Becoming A Major Shareholde...,2009-04-22T19:49:03+0000,The US government is increasingly likely to co...,"cnbc, Articles, General Motors Co, Business Ne...",The US government is increasingly likely to co...
4,Trump: 'Mission accomplished' on 'perfectly ex...,2018-04-14T14:59:04+0000,,"cnbc, Articles, George W. Bush, Vladimir Putin...",President Donald Trump hailed the U.S.-led int...
...,...,...,...,...,...
95,GLOBAL MARKETS-Euro rises on Spain speculation...,2012-10-02T18:23:00+0000,"(Adds comment, details, updates prices)* Spain...","cnbc, Articles, Caterpillar Inc, Europe, Washi...","(Adds comment, details, updates prices)* Spain..."
96,"'I come to bury Bitcoin, not to praise it': UBS",2018-11-30T11:38:30+0000,Cryptocurrencies are nearing the end of the ro...,"cnbc, Articles, Bitcoin/USD Bitfinex, Economy,...",Cryptocurrencies are nearing the end of the ro...
97,Jon Stewart joins Stephen Colbert to mock that...,2016-07-22T11:44:12+0000,It's been 351 days since Jon Stewart sat behin...,"cnbc, Articles, Donald Trump, Media, Elections...",It's been 351 days since Jon Stewart sat behin...
98,Will Stocks Resist 'Anything but Utmost Catast...,2011-11-17T11:50:06+0000,Stock markets have taken such a beating over t...,"cnbc, Articles, Business News, Economy, World ...",Stock markets have taken such a beating over t...


In [258]:
#--------------Define some helper functions for next code block--------------

# NOTE: datetime.datetime.fromisoformat or datetime.date.fromisoformat is better
def to_time(date_string: str) -> datetime.datetime:
  '''Turns a str in desired format to datetime.datetime'''
  return datetime.datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%S%z").replace(tzinfo=None)

# NOTE: unused class. Consider removing.
class NewsArticle: # Class to represent and manipulate NewsArticle objects
  def __init__(self, id: int, title: str, date: str, keywords: list[str], description: str, short_description: str):
    '''Creates a stock object with the given fields'''
    self.id = id
    self.title = title
    self.keywords = keywords.split(", ")
    self.date = to_time(date)
    self.description = description
    self.short_description = short_description

  def __str__(self) -> str:
    '''Gives a string representation of the object using its id, title, short_description, and date fields'''
    return f'{str(self.id)}: {str(self.title)}: {str(self.short_description)}: ({str(self.date)})'

  def __repr__(self) -> str:
    '''Gives a string representation of the object using its id, title, short_description, date, and keywords fields'''
    return f'{str(self.title)}: {str(self.description)}: ({str(self.date)}); keywords: {str(self.keywords)}'

  def add_keyword(self, keyword) -> None:
    '''Adds a keyword to the list of the article's keywords'''
    self.keywords.append(keyword)

  def contains_keyword(self, keyword_to_find) -> bool:
    '''Checks if article contains given keyword'''
    for keyword in self.keywords:
      if keyword_to_find in keyword:
        return True
    return False


## Load news articles

In [282]:
#-------Globals-------
# rename to avoid changing all occurrences just for a different alias
articles = news

#-------Clean articles a bit-------
# fix date
articles['date'] = articles["published_at"].apply(lambda x: to_time(x))
articles = articles.drop("published_at", axis=1)
# fix caps
str_cols = ["description", "short_description", "title"]
articles[str_cols] = articles[str_cols].apply(lambda x: x.str.lower() if x.dtype == 'O' else x)
# fix caps in keywords and turn -> np.array
articles['keywords'] = articles['keywords'].map(lambda keyword_list: np.array([keyword.lower() for keyword in keyword_list.split(",")]))

##-------OUTDATED CODE-------
# articles = []
# for j in range(len(news)): # process news article objects
#   article = NewsArticle(j, news.iloc[j][0], news.iloc[j][1], news.iloc[j][3], news.iloc[j][4], news.iloc[j][2])
#   articles.append(article)
##-------TEST CODE-------
# articles = articles.drop(0, axis=1)
# print(articles['date'].min())
# print(articles['date'].max())

# print(articles.head())

# NOTE: this function is here, but has not been use
# Consider removing it.
def find_relevant_articles(article_df: pd.DataFrame, keyword: str, range_start: int = 0, range_end: int = 500) -> pd.DataFrame:
    '''function to find relevant articles for a given keyword'''
    if range_start < 0:
        return "Invalid start index"

    # convert keyword to normal format
    # doing this later is costly
    keyword = keyword.lower()

    # inner -> outer
    # 1. make a vector of true/false if the keyword is found anywhere in that row's keywords
    # 2. get every row where that vector holds true
    # 3. get every row limited by the given range
    return article_df.loc[
            range_start:min(range_end, len(article_df)) - 1
            ].loc[
                article_df['keywords'].apply(lambda keyword_list: any(keyword in current_keyword.casefold() for current_keyword in keyword_list))
              ]

# #-------TEST CODE-------
# print("Articles about Donald Trump:")
# ex_list = find_relevant_articles(articles, "Donald Trump", 0, 200)
# print(ex_list.head())
# print(f'Example keyword representation {ex_list.iloc[0]["keywords"]}')
# print("\nArticles about Bitcoin:")
# ex_list = find_relevant_articles(articles, "cnbc", 0, 300)
# print(ex_list.head())

In [260]:
# # Entity recognition demo
# import spacy

# nlp = spacy.load("en_core_web_sm")

# # print entities of first ten articles
# # for i in range(10):
# #   doc = nlp(articles[i].description) # process article text
# #   print(articles[i])
# #   for ent in doc.ents:
# #       # Print the entity text and its label
# #       print(ent.text, ent.label_)

# doc = nlp('''Alphabet is a holding company. Internet media giant Google is a wholly owned subsidiary. Google generates 99% of Alphabet revenue, of which more than 85% is from online ads. Google's other revenue is from sales of apps and content on Google Play and YouTube, as well as cloud service fees and other licensing revenue. Sales of hardware such as Chromebooks, the Pixel smartphone, and smart home products, which include Nest and Google Home, also contribute to other revenue. Alphabet's moonshot investments are in its other bets segment, where it bets on technology to enhance health (Verily), faster internet access to homes (Google Fiber), self-driving cars (Waymo), and more. Alphabet's operating margin has been 25%-30%, with Google at 30% and other bets operating at a loss.
# ''')
# for ent in doc.ents:
#   print(ent.text, ent.label_)

## Filter Articles outside of the Date Range

In [284]:
#-------Constants-------
# How many days to keep track of after any given article
DAYS_AFTER = 10

#-------Download stock market data -> ****hold****-------
%run load_stocks.ipynb
hold = from_json("stocks.json")
# NOTE: Post condition: type(hold) = df.DataFrame(ticker: str, prices: list[float], dates: list[datetime.datetime], industry: str)

#--------Process hold/articles on new information-------

# find min and max stock data range and filter out articles not in that range
all_dates = np.concatenate(hold["dates"])
min_stock_date = np.min(all_dates)
max_stock_date = np.max(all_dates)
del all_dates
articles = articles[(articles['date'] >= min_stock_date) & (articles['date'] <= max_stock_date - datetime.timedelta(days=DAYS_AFTER))].reset_index(drop=True)


# dates_and_prices = zip(hold['ticker'].values, hold['dates'].values, hold['prices'].values)

def get_date_stocks(date: datetime.datetime) -> pd.DataFrame:
    date_range_end = (date + datetime.timedelta(days=DAYS_AFTER)).replace(tzinfo=None)

    stocks = {}
    for entry_ticker, entry_dates, entry_prices in dates_and_prices:
        mask = entry_dates <= date_range_end
        prices = entry_prices[mask]
        # Calculate daily price changes
        price_changes = np.diff(prices)
        # Calculate average change using numpy.mean
        stocks[entry_ticker] = np.mean(price_changes)

    return pd.DataFrame(list(stocks.items()), columns=['ticker', 'price_change'])

# # TEST CODE
# print(articles)
# hold

## Do Sentiment Analysis:
(From ChatGPT)

1. Feature Extraction
2. Machine Learning Model Selection:
3. Model Training
4. Model Evaluation
5. Inference:
6. Fine-tuning:

## Feature Extraction

In [285]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import re
#-------Vectorize articles, then run a TF-IDF library over it-------
# Make tokenizers
vectorizer = CountVectorizer(preprocessor=lambda word: re.sub(r'\b0+', '', word))
tfidf_transformer = TfidfTransformer()
# Put everything into col[text] to get all data in one place. get rid of other data as needed
text_row_func = lambda row: re.sub(r'&#[0-9]+;', '', 
    f'{row["title"]} {row["short_description"]} {row["description"] if pd.notna(row["description"]) else ""} {" ".join(row["keywords"])}'
    )
# Tokenize
tfidf_matrix = tfidf_transformer.fit_transform(vectorizer.fit_transform(articles.apply(text_row_func, axis=1)))
# Extract column names (i.e., the words)
feature_names = vectorizer.get_feature_names_out()
# Update the articles dataset with the TF-IDF vector on each article
# NOTE: all other information thrown away; only the date and the TF-IDF vector kept
#   - the date will be thrown away later as well, and there will be a parallel vector of the prices
articles = articles.rename(columns={"date": "date_"})
articles = pd.concat([articles["date_"], pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)], axis=1)
# test a single row (1 x ~20000) vs (~600 x ~20000)
print(articles.head(1))
len(articles)

                date_        10  100  10004  1007  101  1015  102  103  1035  \
0 2016-06-24 13:50:48  0.015653  0.0    0.0   0.0  0.0   0.0  0.0  0.0   0.0   

   ...  zoning  zoom  zoranradosavljevic  zscaler  zsolt  zuckerberg  \
0  ...     0.0   0.0                 0.0      0.0    0.0         0.0   

   zuckerman  zurich  zyne  zynga  
0        0.0     0.0   0.0    0.0  

[1 rows x 18447 columns]


In [264]:
# Data Split. Import y'value array from other
from sklearn.model_selection import train_test_split

# Define your input (X) and output (y) variables
X = articles[['keywords', 'title']]  # Assuming 'date' is a datetime column
# Replace 'actual_target_column' with the actual column name you want to predict
y = articles['date'].apply(lambda row: get_date_stocks(row))

# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

TypeError: unsupported operand type(s) for +: 'float' and 'datetime.timedelta'

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Machine Learning Model: Linear Regression
linear_reg = LinearRegression()
linear_reg.fit(x_train, y_train['price_change'].values.flatten())

# Make predictions on the test set
predictions = linear_reg.predict(x_test)

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')

KeyError: 'price_change'