## Library imports

In [33]:
import pandas as pd
import nltk

nltk.download('stopwords')

from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gaastra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Data Import

In [49]:
apple_df = pd.read_csv("Data/AAPL.csv")
news_df = pd.read_csv("Data/us_equities_news_dataset.csv")

## Preprocessing

In [50]:
#Preprocessing Steps

# Apple dataframe
# Add price_higher column based on:
# Close > Open = 1
# Close <= Open = 0
apple_df['price_higher'] = apple_df.apply(lambda row: 1 if row['Close'] > row['Open'] else 0, axis=1)

# News dataframe
# Find and delete duplicate content
print(f"Duplicate rows in news_df 'content' column: {news_df['content'].duplicated().sum()}")
news_df.drop_duplicates('content', inplace=True)

# Filter news content based on Apple stock
news_df = news_df[news_df['ticker'] == 'AAPL']

# Join price_higher column with news dataframe based on date
news_df = news_df.merge(apple_df[['Date', 'price_higher']], left_on='release_date', right_on='Date')
news_df = news_df.drop(columns=['Date'])

# Convert release_date to datetime type
news_df['release_date'] = pd.to_datetime(news_df['release_date'])

#
news_df['content'] = news_df['content'].apply(lambda x: '. '.join(x.split(' \n')))
news_df['content'] = news_df['content'].apply(lambda x: '. '.join(list(filter(None, [i.strip(' ') for i in x.split('\r\n')]))))
news_df['words_amount'] = news_df.apply(lambda row: len(row['content'].split(" ")), axis=1)

Duplicate rows in news_df 'content' column: 516


In [52]:
print("The amount of Apple articles is: " + str(len(news_df['article_id'].unique())))
print("The average amount of words per article is: " + str(round(news_df['words_amount'].mean())))
print("The amount unique words for all articles are: " + str(len(set(' '.join(news_df['content']).split(' ')))))


# TEST: Distribution of price_higher
days_higher = news_df[news_df['price_higher'] == 1]['price_higher'].count()
days_lower = news_df[news_df['price_higher'] == 0]['price_higher'].count()

print(days_higher, days_lower)
#Lexical richness bekijk https://pypi.org/project/lexicalrichness/

The amount of Apple articles is: 17624
The average amount of words per article is: 723
The amount unique words for all articles are: 137077
9529 8095


In [53]:
# Group and aggregate content by date
news_df.groupby('release_date').agg({'content': '.'.join})
news_df['content'] = news_df['content'].str.lower()

In [60]:
english_stopwords = set(stopwords.words('english'))

test_news_df = news_df.copy()

def tokenize(text):
    tokens = text.split(' ')
    return tokens

def clean_tokens(tokens):
    test = [token for token in tokens if (token != '' and token not in english_stopwords) and token.isalpha()]
    return test

def join_tokens(tokens):
    return ' '.join(tokens)

words_to_be_removed = ['apple', 'aapl']

# Tokenize content string
test_news_df['content'] = test_news_df['content'].apply(tokenize)
# Remove stopwords, empty tokens and punctuation/numbers
test_news_df['content'] = test_news_df['content'].apply(clean_tokens)
# Join tokens into string
test_news_df['content'] = test_news_df['content'].apply(join_tokens)
test_news_df['content']

0        jpmorgan lifts apple aapl target ahead tomorro...
1        kim investing com faang stocks predictably str...
2        chuck mikolajczak new york reuters u stocks su...
3        two best performing tech stocks set report res...
4        yasin ebrahim kim apple readies earnings inves...
                               ...                        
17619    stock market difficult one traders investors a...
17620    tsx index leading canadian stocks outperformed...
17621    europe flares summer heat continues summer hea...
17622    last quarter apple aapl reported best quarter ...
17623    may look like spider web mishmash trendlines m...
Name: content, Length: 17624, dtype: object

In [None]:
test = [i.strip(' ') for i in news_df['content'][3].split('\r\n')]
test = list(filter(None, test))
test