## Library imports

In [47]:
import pandas as pd
import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gaastra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Data Import

In [36]:
apple_df = pd.read_csv("Data/AAPL.csv")
news_df = pd.read_csv("Data/us_equities_news_dataset.csv")

## Preprocessing

In [37]:
#Preprocessing Steps

# Apple dataframe
# Add price_higher column based on:
# Close > Open = 1
# Close <= Open = 0
apple_df['price_higher'] = apple_df.apply(lambda row: 1 if row['Close'] > row['Open'] else 0, axis=1)

# News dataframe
# Find and delete duplicate content
print(f"Duplicate rows in news_df 'content' column: {news_df['content'].duplicated().sum()}")
news_df.drop_duplicates('content', inplace=True)

# Filter news content based on Apple stock
news_df = news_df[news_df['ticker'] == 'AAPL']

# Join price_higher column with news dataframe based on date
news_df = news_df.merge(apple_df[['Date', 'price_higher']], left_on='release_date', right_on='Date')
news_df = news_df.drop(columns=['Date'])

# Convert release_date to datetime type
news_df['release_date'] = pd.to_datetime(news_df['release_date'])

#
news_df['content'] = news_df['content'].apply(lambda x: '. '.join(x.split(' \n')))
news_df['content'] = news_df['content'].apply(lambda x: '. '.join(list(filter(None, [i.strip(' ') for i in x.split('\r\n')]))))
news_df['words_amount'] = news_df.apply(lambda row: len(row['content'].split(" ")), axis=1)

Duplicate rows in news_df 'content' column: 516


In [7]:
news_df#.head(2)

Unnamed: 0,id,ticker,title,category,content,release_date,provider,url,article_id,price_higher,words_amount
0,270698,AAPL,JPMorgan cautious ahead of Apple earnings,news,JPMorgan lifts its Apple AAPL 2 9 target f...,2020-01-28,Seeking Alpha,https://invst.ly/pnjv8,2068762,1,102
1,270699,AAPL,FAANG s Fall but Get Some Wall Street Love,news,By Kim Khan. Investing com The FAANG stocks ...,2020-01-28,Investing.com,https://www.investing.com/news/stock-market-ne...,2068765,1,290
2,270700,AAPL,Wall Street tumbles as virus fuels economic worry,news,By Chuck Mikolajczak NEW YORK Reuters U S ...,2020-01-28,Reuters,https://www.investing.com/news/stock-market-ne...,2068311,1,866
3,270701,AAPL,Earnings Watch Apple and AMD to take earnings...,news,Two of the best performing tech stocks of 2019...,2020-01-28,MarketWatch,https://invst.ly/pnlbs,2068906,1,1306
4,270702,AAPL,Day Ahead Top 3 Things to Watch for Jan 28,news,By Yasin Ebrahim and Kim Khan. 1 Apple Readie...,2020-01-28,Investing.com,https://www.investing.com/news/stock-market-ne...,2068907,1,628
...,...,...,...,...,...,...,...,...,...,...,...
17643,290924,AAPL,Waiting For Direction On The Markets,opinion,This stock market has been a difficult one for...,2012-07-16,Cam Hui,https://www.investing.com/analysis/waiting-for...,129680,1,1287
17644,290925,AAPL,Mid Year Update U S And Canadian Stock Marke...,opinion,From 2004 to 2010 the TSX 300 Index of leading...,2012-07-19,Baskin Financial Blog,https://www.investing.com/analysis/mid-year-up...,130056,1,1601
17645,290926,AAPL,Summer Heat Scorches Europe And U S,opinion,Europe flares as summer heat continues Summer...,2012-07-23,John Nyaradi,https://www.investing.com/analysis/summer-heat...,130439,1,647
17646,290927,AAPL,Apple Earnings Preview Quarterly Dip On Deck,opinion,Last quarter Apple AAPL reported the 2nd bes...,2012-07-23,David Dyer,https://www.investing.com/analysis/apple-earni...,130458,1,877


In [44]:
print("The amount of Apple articles is: " + str(len(news_df['article_id'].unique())))
print("The average amount of words per article is: " + str(round(news_df['words_amount'].mean())))
print("The amount unique words for all articles are: " + str(len(set(' '.join(news_df['content']).split(' ')))))


# TEST: Distribution of price_higher
days_higher = news_df[news_df['price_higher'] == 1]['price_higher'].count()
days_lower = news_df[news_df['price_higher'] == 0]['price_higher'].count()

print(days_higher, days_lower)
#Lexical richness bekijk https://pypi.org/project/lexicalrichness/

The amount of Apple articles is: 17624
The average amount of words per article is: 723
The amount unique words for all articles are: 137077
9529 8095


In [None]:
news_content_test = news_df['content']
apple_news_lower = news_content_test.str.lower()

In [87]:
news_df['content'][3]

'Two of the best performing tech stocks of 2019 are set to report results Tuesday  but they ll do so with a looming shadow   Stocks suffered their worst day in more than three months Monday  amid fears of a virus spreading in and out of China  Apple Inc. AAPL   2 94. and Advanced Micro Devices Inc. AMD   2 16. aren t immune to the panic over the coronavirus  as investors fret about the companies  Asian supply chains   For Apple  there are possible impacts on both the supply and demand sides  according to Evercore ISI analyst Amit Daryanani  He said that of Apple s 381 manufacturing or component facilities  two are in Wuhan  China  the epicenter of the outbreak  and 69 are in Suzhou  which isn t under lockdown but has extended its holiday shutdown by an extra week  On the demand side  about 20  of Apple s revenue is  China centric   he said    For more  The Dow is about to face its stiffest test in years  Whether virus concerns make their way into Apple s forecast is an open question  G

In [84]:
test = [i.strip(' ') for i in news_df['content'][3].split('\r\n')]
test = list(filter(None, test))
test

['Two of the best performing tech stocks of 2019 are set to report results Tuesday  but they ll do so with a looming shadow   Stocks suffered their worst day in more than three months Monday  amid fears of a virus spreading in and out of China  Apple Inc',
 'AAPL   2 94',
 'and Advanced Micro Devices Inc',
 'AMD   2 16',
 'aren t immune to the panic over the coronavirus  as investors fret about the companies  Asian supply chains   For Apple  there are possible impacts on both the supply and demand sides  according to Evercore ISI analyst Amit Daryanani  He said that of Apple s 381 manufacturing or component facilities  two are in Wuhan  China  the epicenter of the outbreak  and 69 are in Suzhou  which isn t under lockdown but has extended its holiday shutdown by an extra week  On the demand side  about 20  of Apple s revenue is  China centric   he said    For more  The Dow is about to face its stiffest test in years  Whether virus concerns make their way into Apple s forecast is an ope