## Library imports

In [1]:
import pandas as pd
import collections
import nltk
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('stopwords')
nltk.download('universal_tagset')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')
from sklearn.feature_extraction.text import CountVectorizer



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tomva\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\tomva\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\tomva\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tomva\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\tomva\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Data Import

In [2]:
apple_df = pd.read_csv("Data/AAPL.csv")
news_df = pd.read_csv("Data/us_equities_news_dataset.csv")

In [3]:
news_df

Unnamed: 0,id,ticker,title,category,content,release_date,provider,url,article_id
0,221515,NIO,Why Shares of Chinese Electric Car Maker NIO A...,news,What s happening\nShares of Chinese electric c...,2020-01-15,The Motley Fool,https://invst.ly/pigqi,2060327
1,221516,NIO,NIO only consumer gainer Workhorse Group amon...,news,Gainers NIO NYSE NIO 7 \nLosers MGP Ingr...,2020-01-18,Seeking Alpha,https://invst.ly/pje9c,2062196
2,221517,NIO,NIO leads consumer gainers Beyond Meat and Ma...,news,Gainers NIO NYSE NIO 14 Village Farms In...,2020-01-15,Seeking Alpha,https://invst.ly/pifmv,2060249
3,221518,NIO,NIO NVAX among premarket gainers,news,Cemtrex NASDAQ CETX 85 after FY results \n...,2020-01-15,Seeking Alpha,https://invst.ly/picu8,2060039
4,221519,NIO,PLUG NIO among premarket gainers,news,aTyr Pharma NASDAQ LIFE 63 on Kyorin Pharm...,2020-01-06,Seeking Alpha,https://seekingalpha.com/news/3529772-plug-nio...,2053096
...,...,...,...,...,...,...,...,...,...
221508,443024,T,Crude And Steel Still In Sync,opinion,We have been reporting on the trade off betwee...,2012-10-04,Ivan Kitov,https://www.investing.com/analysis/crude-and-s...,138733
221509,443025,T,Forget AT T This Is The Telecom Stock You Sho...,opinion,It s the largest cell phone provider in the wo...,2012-05-30,StreetAuthority,https://www.investing.com/analysis/forget-at-t...,124829
221510,443026,T,Wall Street Exposed Part 3 How Dividends C...,opinion,Before we dicuss how the mechanism of dividend...,2012-07-16,Portfolio Cafe,https://www.investing.com/analysis/wall-street...,129651
221511,443027,T,Weighing The Week Ahead It s All About Jobs,opinion,From start to finish the coming week will hav...,2012-09-02,Jeff Miller,https://www.investing.com/analysis/weighing-th...,134926


## Preprocessing

In [4]:
#Preprocessing Steps

# Apple dataframe
# Add price_higher column based on:
# Close > Open = 1
# Close <= Open = 0
apple_df['price_higher'] = apple_df.apply(lambda row: 1 if row['Close'] > row['Open'] else 0, axis=1)

# News dataframe
# Find and delete duplicate content
print(f"Duplicate rows in news_df 'content' column: {news_df['content'].duplicated().sum()}")
news_df.drop_duplicates('content', inplace=True)

# Filter news content based on Apple stock
news_df = news_df[news_df['ticker'] == 'AAPL']

# Join price_higher column with news dataframe based on date
news_df = news_df.merge(apple_df[['Date', 'price_higher']], left_on='release_date', right_on='Date')
news_df = news_df.drop(columns=['Date'])

# Convert release_date to datetime type
news_df['release_date'] = pd.to_datetime(news_df['release_date'])

news_df['content'] = news_df['content'].apply(lambda x: ' '.join(x.split(' \n')))
news_df['content'] = news_df['content'].apply(lambda x: ' '.join(list(filter(None, [i.strip(' ') for i in x.split('\r\n')]))))
news_df['words_amount'] = news_df.apply(lambda row: len(row['content'].split(" ")), axis=1)

Duplicate rows in news_df 'content' column: 516


In [5]:
print(f"The amount of Apple articles is: {str(len(news_df['article_id'].unique()))}")
print(f"The total amount of words for all articles is: {str(len(' '.join(news_df['content']).split(' ')))}")
print(f"The amount unique words for all articles is: {str(len(set(' '.join(news_df['content']).split(' '))))}")
print(f"The average amount of words per article is: {str(round(news_df['words_amount'].mean()))}")

# TEST: Distribution of price_higher
days_higher = news_df[news_df['price_higher'] == 1]['price_higher'].count()
days_lower = news_df[news_df['price_higher'] == 0]['price_higher'].count()

print(days_higher, days_lower)
#Lexical richness bekijk https://pypi.org/project/lexicalrichness/

The amount of Apple articles is: 17624
The total amount of words for all articles is: 12740260
The amount unique words for all articles is: 126590
The average amount of words per article is: 723
9529 8095


In [6]:
# Group and aggregate content by date
news_df.groupby('release_date').agg({'content': '.'.join})
news_df['content'] = news_df['content'].str.lower()

In [7]:
english_stopwords = set(stopwords.words('english'))

test_news_df = news_df.copy()

def tokenize(text):
    tokens = text.split(' ')
    return tokens

def clean_tokens(tokens):
    test = [token for token in tokens if (token != '' and token not in english_stopwords) and token.isalpha()]
    return test

def join_tokens(tokens):
    return ' '.join(tokens)

def calc_lexical_div(tokens):
    return len(set(tokens)) / len(tokens)

# TODO
words_to_be_removed = ['apple', 'aapl']

# Tokenize content string
test_news_df['content'] = test_news_df['content'].apply(tokenize)
# Remove stopwords, empty tokens and punctuation/numbers
test_news_df['tokens'] = test_news_df['content'].apply(clean_tokens)
# Join tokens into string
test_news_df['content'] = test_news_df['tokens'].apply(join_tokens)
# Calculate lexical diversity per date
test_news_df['lexical_div'] = test_news_df['tokens'].apply(calc_lexical_div)

test_news_df[['tokens', 'lexical_div']]

Unnamed: 0,tokens,lexical_div
0,"[jpmorgan, lifts, apple, aapl, target, ahead, ...",0.857143
1,"[kim, khan, investing, com, faang, stocks, pre...",0.729167
2,"[chuck, mikolajczak, new, york, reuters, u, st...",0.717224
3,"[two, best, performing, tech, stocks, set, rep...",0.684601
4,"[yasin, ebrahim, kim, khan, apple, readies, ea...",0.702532
...,...,...
17619,"[stock, market, difficult, one, traders, inves...",0.659432
17620,"[tsx, index, leading, canadian, stocks, outper...",0.557616
17621,"[europe, flares, summer, heat, continues, summ...",0.705521
17622,"[last, quarter, apple, aapl, reported, best, q...",0.482667


In [8]:
unique_token_set = set(" ".join(test_news_df['content']).split(" "))
unique_token_tagged = pos_tag(unique_token_set, tagset='universal')
tag_abbrev_dict = {"NOUN": "n",
                   "VERB": "v",
                   "ADJ": "a",
                   "ADV": "r"}

unique_token_tagged_abbrev = [(token, tag_abbrev_dict[pos_tag]) for (token, pos_tag) in unique_token_tagged if pos_tag in tag_abbrev_dict.keys()]

lemmatizer = WordNetLemmatizer()
lemmatized_tokens_dict = {token: lemmatizer.lemmatize(token, pos=pos_tag) for (token, pos_tag) in unique_token_tagged_abbrev}
lemmatized_tokens_dict

{'pervades': 'pervades',
 'fetishism': 'fetishism',
 'drumroll': 'drumroll',
 'achieves': 'achieves',
 'confrontations': 'confrontation',
 'synovial': 'synovial',
 'fist': 'fist',
 'agx': 'agx',
 'detailsthis': 'detailsthis',
 'aixgn': 'aixgn',
 'pharmacodynamic': 'pharmacodynamic',
 'trrpx': 'trrpx',
 'dvmt': 'dvmt',
 'dalvin': 'dalvin',
 'stringency': 'stringency',
 'hiroshima': 'hiroshima',
 'peerlogix': 'peerlogix',
 'hearty': 'hearty',
 'luncheon': 'luncheon',
 'hpi': 'hpi',
 'cancerper': 'cancerper',
 'moise': 'moise',
 'antimicrobial': 'antimicrobial',
 'minivans': 'minivan',
 'dieselgate': 'dieselgate',
 'fafdx': 'fafdx',
 'miners': 'miner',
 'tightens': 'tightens',
 'withhold': 'withhold',
 'dozing': 'doze',
 'standardized': 'standardized',
 'scholars': 'scholar',
 'orderly': 'orderly',
 'propelling': 'propel',
 'undermentioned': 'undermentioned',
 'noncompetitive': 'noncompetitive',
 'linelodging': 'linelodging',
 'curious': 'curious',
 'pomegranate': 'pomegranate',
 'signifi

In [9]:
def lemmatize_content(tokens, token_dict):
    lemmatized_tokens = [lemmatized_tokens_dict[token] if token in token_dict.keys() else token for token in tokens ]
    return " ".join(lemmatized_tokens)

In [10]:
test_news_df['content'] = test_news_df['tokens'].apply(lemmatize_content, token_dict=lemmatized_tokens_dict)

In [11]:
test_news_df[test_news_df['content'].str.contains("qualcommyear")]['content']

12197    qualcomm inc nasdaq qcom large mobile chipset ...
Name: content, dtype: object

In [12]:
vectorizer = CountVectorizer(analyzer='word', ngram_range=(2, 2))
m2 = vectorizer.fit_transform(test_news_df['content'])

In [13]:
vectorizer.get_feature_names_out()

array(['aa aa', 'aa aaa', 'aa add', ..., 'zzamoi eamazmfpaydqapaojbet',
       'zzsixxfnfuhcdofbu style', 'zzz lhmpphaewuk'], dtype=object)

In [14]:
test_news_df[test_news_df['price_higher']==0]

Unnamed: 0,id,ticker,title,category,content,release_date,provider,url,article_id,price_higher,words_amount,tokens,lexical_div
69,270719,AAPL,Apple Set To Beat Q1 Earnings Estimates Tech ...,opinion,technology giant apple nasdaq aapl set release...,2020-01-27,Zacks Investment Research,https://www.investing.com/analysis/apple-set-t...,200501897,0,709,"[technology, giant, apple, nasdaq, aapl, set, ...",0.598101
70,270720,AAPL,Tech Daily Intel Results Netflix Surge Appl...,opinion,top story digest intel nyse earnings netflix n...,2020-01-27,Zacks Investment Research,https://www.investing.com/analysis/tech-daily-...,200501955,0,2098,"[top, stories, digest, intel, nyse, earnings, ...",0.665410
71,270722,AAPL,7 Monster Stock Market Predictions For The Wee...,opinion,p spy week pack economic data earnings extent ...,2020-01-27,Michael Kramer,https://www.investing.com/analysis/7-monster-s...,200501656,0,508,"[p, spy, week, packed, economic, data, earning...",0.648148
72,270723,AAPL,Apple Earnings Preview 5G Launch Expanding S...,opinion,report result tuesday jan revenue expectation ...,2020-01-27,Haris Anwar/Investing.com,https://www.investing.com/analysis/apple-earni...,200501661,0,647,"[reports, results, tuesday, jan, revenue, expe...",0.711599
73,270725,AAPL,Buy Surging Apple Microsoft Stock Before Qua...,opinion,today episode full court finance zacks dive ev...,2020-01-27,Zacks Investment Research,https://www.investing.com/analysis/buy-surging...,200501950,0,361,"[today, episode, full, court, finance, zacks, ...",0.793296
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17608,290911,AAPL,Chart Review Apple Breaks Out,opinion,recovery post earnings dip july officially com...,2012-08-07,Dr. Duru,https://www.investing.com/analysis/chart-revie...,132227,0,147,"[recovery, post, earnings, dip, july, official...",0.855263
17610,290913,AAPL,VIX Is Under 14 Now What,opinion,ok trader buddy bet something think would happ...,2012-08-14,ETF Prophet,https://www.investing.com/analysis/vix-is-unde...,132828,0,263,"[ok, trader, buddies, bet, something, think, w...",0.859375
17611,290914,AAPL,Largest USA Tech Companies Earnings Plunge Bu...,opinion,quarterly net incomethe large usa tech company...,2012-08-14,David Dyer,https://www.investing.com/analysis/largest-usa...,132868,0,619,"[quarterly, net, incomethe, largest, usa, tech...",0.500000
17612,290915,AAPL,Why Bearish Short Term Still Buying Stocks Lo...,opinion,understand bearish market rest year yet contin...,2012-08-15,Charles Biderman,https://www.investing.com/analysis/why-bearish...,133009,0,648,"[understand, bearish, markets, rest, year, yet...",0.648208


In [15]:
test_news_df['tokens'][2]

['chuck',
 'mikolajczak',
 'new',
 'york',
 'reuters',
 'u',
 'stocks',
 'suffered',
 'worst',
 'day',
 'three',
 'months',
 'monday',
 'china',
 'extended',
 'lunar',
 'new',
 'year',
 'holiday',
 'due',
 'virus',
 'outbreak',
 'fueling',
 'worries',
 'economic',
 'impact',
 'containment',
 'efforts',
 'world',
 'second',
 'largest',
 'economy',
 'benchmark',
 'p',
 'suffered',
 'worst',
 'weekly',
 'performance',
 'since',
 'september',
 'last',
 'week',
 'china',
 'locked',
 'several',
 'cities',
 'curbed',
 'travel',
 'reminding',
 'investors',
 'deadly',
 'sars',
 'virus',
 'killed',
 'nearly',
 'people',
 'cost',
 'global',
 'economy',
 'billions',
 'still',
 'investors',
 'viewed',
 'long',
 'term',
 'economic',
 'impact',
 'unlikely',
 'given',
 'past',
 'experiences',
 'viral',
 'outbreaks',
 'whole',
 'thing',
 'way',
 'overblown',
 'said',
 'stephen',
 'massocca',
 'senior',
 'vice',
 'president',
 'wedbush',
 'securities',
 'san',
 'francisco',
 'seems',
 'chinese',
 'much'

In [16]:
test_news_df

Unnamed: 0,id,ticker,title,category,content,release_date,provider,url,article_id,price_higher,words_amount,tokens,lexical_div
0,270698,AAPL,JPMorgan cautious ahead of Apple earnings,news,jpmorgan lift apple aapl target ahead tomorrow...,2020-01-28,Seeking Alpha,https://invst.ly/pnjv8,2068762,1,102,"[jpmorgan, lifts, apple, aapl, target, ahead, ...",0.857143
1,270699,AAPL,FAANG s Fall but Get Some Wall Street Love,news,kim khan invest com faang stock predictably st...,2020-01-28,Investing.com,https://www.investing.com/news/stock-market-ne...,2068765,1,290,"[kim, khan, investing, com, faang, stocks, pre...",0.729167
2,270700,AAPL,Wall Street tumbles as virus fuels economic worry,news,chuck mikolajczak new york reuters u stock suf...,2020-01-28,Reuters,https://www.investing.com/news/stock-market-ne...,2068311,1,866,"[chuck, mikolajczak, new, york, reuters, u, st...",0.717224
3,270701,AAPL,Earnings Watch Apple and AMD to take earnings...,news,two best perform tech stock set report result ...,2020-01-28,MarketWatch,https://invst.ly/pnlbs,2068906,1,1306,"[two, best, performing, tech, stocks, set, rep...",0.684601
4,270702,AAPL,Day Ahead Top 3 Things to Watch for Jan 28,news,yasin ebrahim kim khan apple ready earnings in...,2020-01-28,Investing.com,https://www.investing.com/news/stock-market-ne...,2068907,1,628,"[yasin, ebrahim, kim, khan, apple, readies, ea...",0.702532
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17619,290924,AAPL,Waiting For Direction On The Markets,opinion,stock market difficult one trader investor ali...,2012-07-16,Cam Hui,https://www.investing.com/analysis/waiting-for...,129680,1,1287,"[stock, market, difficult, one, traders, inves...",0.659432
17620,290925,AAPL,Mid Year Update U S And Canadian Stock Marke...,opinion,tsx index lead canadian stock outperform p ind...,2012-07-19,Baskin Financial Blog,https://www.investing.com/analysis/mid-year-up...,130056,1,1601,"[tsx, index, leading, canadian, stocks, outper...",0.557616
17621,290926,AAPL,Summer Heat Scorches Europe And U S,opinion,europe flare summer heat continue summer heat ...,2012-07-23,John Nyaradi,https://www.investing.com/analysis/summer-heat...,130439,1,647,"[europe, flares, summer, heat, continues, summ...",0.705521
17622,290927,AAPL,Apple Earnings Preview Quarterly Dip On Deck,opinion,last quarter apple aapl report best quarter co...,2012-07-23,David Dyer,https://www.investing.com/analysis/apple-earni...,130458,1,877,"[last, quarter, apple, aapl, reported, best, q...",0.482667


In [20]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from gensim.models import Word2Vec
import pandas as pd
import numpy as np

# Train word2vec model
word2vec_model = Word2Vec(sentences=test_news_df['tokens'], vector_size=500, window=5, min_count=1, sg=1)

def document_vector(tokens):
    tokens = [token for token in tokens if token in word2vec_model.wv]
    
    if len(tokens) > 0:
        return np.mean(word2vec_model.wv[tokens], axis=0)
    else:
        return np.zeros(word2vec_model.vector_size)

test_news_df['doc_vector'] = test_news_df['tokens'].apply(document_vector)

In [None]:
X = np.vstack(test_news_df['doc_vector'])
y = test_news_df['price_higher']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
# Train Naive Bayes classifier
gnb_classifier = GaussianNB()
gnb_classifier.fit(X_train, y_train)

# Make predictions on the test set
gnb_predictions = gnb_classifier.predict(X_test)


print("Actual Labels:")
print(y_test)

print("\nPredicted Labels:")
print(gnb_predictions)

# Evaluate the model
accuracy = accuracy_score(y_test, gnb_predictions)
print(f"Gaussian Naive Bayes Accuracy: {accuracy}")

# Print classification report
print(classification_report(y_test, gnb_predictions))

Actual Labels:
7188     1
12094    1
101      1
1039     1
313      1
        ..
2236     1
15970    1
12279    0
4230     0
1747     0
Name: price_higher, Length: 3525, dtype: int64

Predicted Labels:
[1 1 0 ... 0 0 1]
Gaussian Naive Bayes Accuracy: 0.5253900709219859
              precision    recall  f1-score   support

           0       0.49      0.62      0.55      1612
           1       0.58      0.44      0.50      1913

    accuracy                           0.53      3525
   macro avg       0.53      0.53      0.52      3525
weighted avg       0.54      0.53      0.52      3525



In [25]:
from sklearn.linear_model import LogisticRegression

# Train Logistic Regression classifier
lg = LogisticRegression(random_state=42)
lg.fit(X_train, y_train)

# Make predictions on the test set
lg_pred = lg.predict(X_test)

print("\nActual Labels:")
print(y_test)

print("\nPredicted Labels:")
print(lg_pred)

# Evaluate the model
accuracy = accuracy_score(y_test, lg_pred)
print(f"Logistic Regression Accuracy: {accuracy}")

# Print classification report
print(classification_report(y_test, lg_pred))


Actual Labels:
7188     1
12094    1
101      1
1039     1
313      1
        ..
2236     1
15970    1
12279    0
4230     0
1747     0
Name: price_higher, Length: 3525, dtype: int64

Predicted Labels:
[1 1 0 ... 0 0 1]
Logistic Regression Accuracy: 0.5492198581560284
              precision    recall  f1-score   support

           0       0.51      0.27      0.35      1612
           1       0.56      0.78      0.65      1913

    accuracy                           0.55      3525
   macro avg       0.54      0.53      0.50      3525
weighted avg       0.54      0.55      0.52      3525



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
