## Library imports

In [30]:
import pandas as pd
import nltk

nltk.download('stopwords')
nltk.download('universal_tagset')
nltk.download('averaged_perceptron_tagger')


from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gaastra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\Gaastra\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Gaastra\AppData\Roaming\nltk_data...


## Data Import

In [2]:
apple_df = pd.read_csv("Data/AAPL.csv")
news_df = pd.read_csv("Data/us_equities_news_dataset.csv")

## Preprocessing

In [3]:
#Preprocessing Steps

# Apple dataframe
# Add price_higher column based on:
# Close > Open = 1
# Close <= Open = 0
apple_df['price_higher'] = apple_df.apply(lambda row: 1 if row['Close'] > row['Open'] else 0, axis=1)

# News dataframe
# Find and delete duplicate content
print(f"Duplicate rows in news_df 'content' column: {news_df['content'].duplicated().sum()}")
news_df.drop_duplicates('content', inplace=True)

# Filter news content based on Apple stock
news_df = news_df[news_df['ticker'] == 'AAPL']

# Join price_higher column with news dataframe based on date
news_df = news_df.merge(apple_df[['Date', 'price_higher']], left_on='release_date', right_on='Date')
news_df = news_df.drop(columns=['Date'])

# Convert release_date to datetime type
news_df['release_date'] = pd.to_datetime(news_df['release_date'])

#
news_df['content'] = news_df['content'].apply(lambda x: '. '.join(x.split(' \n')))
news_df['content'] = news_df['content'].apply(lambda x: '. '.join(list(filter(None, [i.strip(' ') for i in x.split('\r\n')]))))
news_df['words_amount'] = news_df.apply(lambda row: len(row['content'].split(" ")), axis=1)

Duplicate rows in news_df 'content' column: 516


In [16]:
print(f"The amount of Apple articles is: {str(len(news_df['article_id'].unique()))}")
print(f"The total amount of words for all articles is: {str(len(' '.join(news_df['content']).split(' ')))}")
print(f"The amount unique words for all articles is: {str(len(set(' '.join(news_df['content']).split(' '))))}")
print(f"The average amount of words per article is: {str(round(news_df['words_amount'].mean()))}")

print(f"{+}")

# TEST: Distribution of price_higher
days_higher = news_df[news_df['price_higher'] == 1]['price_higher'].count()
days_lower = news_df[news_df['price_higher'] == 0]['price_higher'].count()

print(days_higher, days_lower)
#Lexical richness bekijk https://pypi.org/project/lexicalrichness/

The amount of Apple articles is: 17624
The total amount of words for all articles is: 12740274
The amount unique words for all articles is: 118926
The average amount of words per article is: 723
9529 8095


In [5]:
# Group and aggregate content by date
news_df.groupby('release_date').agg({'content': '.'.join})
news_df['content'] = news_df['content'].str.lower()

In [13]:
english_stopwords = set(stopwords.words('english'))

test_news_df = news_df.copy()

def tokenize(text):
    tokens = text.split(' ')
    return tokens

def clean_tokens(tokens):
    test = [token for token in tokens if (token != '' and token not in english_stopwords) and token.isalpha()]
    return test

def join_tokens(tokens):
    return ' '.join(tokens)

def calc_lexical_div(tokens):
    return len(set(tokens)) / len(tokens)

words_to_be_removed = ['apple', 'aapl']

# Tokenize content string
test_news_df['content'] = test_news_df['content'].apply(tokenize)
# Remove stopwords, empty tokens and punctuation/numbers
test_news_df['tokens'] = test_news_df['content'].apply(clean_tokens)
# Join tokens into string
test_news_df['content'] = test_news_df['tokens'].apply(join_tokens)
# Calculate lexical diversity per date
test_news_df['lexical_div'] = test_news_df['tokens'].apply(calc_lexical_div)

test_news_df[['tokens', 'lexical_div']]

Unnamed: 0,tokens,lexical_div
0,"[jpmorgan, lifts, apple, aapl, target, ahead, ...",0.866667
1,"[kim, investing, com, faang, stocks, predictab...",0.729927
2,"[chuck, mikolajczak, new, york, reuters, u, st...",0.717224
3,"[two, best, performing, tech, stocks, set, rep...",0.695817
4,"[yasin, ebrahim, kim, apple, readies, earnings...",0.720000
...,...,...
17619,"[stock, market, difficult, one, traders, inves...",0.659432
17620,"[tsx, index, leading, canadian, stocks, outper...",0.557616
17621,"[europe, flares, summer, heat, continues, summ...",0.705521
17622,"[last, quarter, apple, aapl, reported, best, q...",0.482667


In [49]:
unique_token_set = set(" ".join(test_news_df['content']).split(" "))
unique_token_tagged = pos_tag(unique_token_set, tagset='universal')
tag_abbrev_dict = {"NOUN": "n",
                   "VERB": "v",
                   "ADJ": "a",
                   "ADV": "r"}

unique_token_tagged_abbrev = [(token, tag_abbrev_dict[pos_tag]) for (token, pos_tag) in unique_token_tagged if pos_tag in tag_abbrev_dict.keys()]
unique_token_tagged_abbrev

[('advice', 'n'),
 ('playduring', 'v'),
 ('assemblies', 'n'),
 ('acquisitionon', 'v'),
 ('cabp', 'a'),
 ('pstg', 'n'),
 ('headwindnotably', 'r'),
 ('pandey', 'a'),
 ('unitsysco', 'a'),
 ('vegas', 'n'),
 ('seo', 'n'),
 ('bahia', 'n'),
 ('ervin', 'n'),
 ('contingencies', 'n'),
 ('hesitating', 'v'),
 ('podcastevery', 'n'),
 ('rasthofer', 'n'),
 ('anew', 'a'),
 ('cornelius', 'n'),
 ('buzzy', 'n'),
 ('kvaal', 'n'),
 ('spread', 'n'),
 ('mesmerized', 'v'),
 ('align', 'a'),
 ('sulisto', 'n'),
 ('tom', 'n'),
 ('marginorders', 'n'),
 ('rivet', 'v'),
 ('elonmusk', 'a'),
 ('mime', 'n'),
 ('evaporation', 'n'),
 ('detailsibm', 'n'),
 ('enchilada', 'a'),
 ('misconduct', 'n'),
 ('malta', 'n'),
 ('photronics', 'n'),
 ('funerals', 'n'),
 ('robby', 'v'),
 ('zulresso', 'n'),
 ('pcar', 'n'),
 ('suction', 'n'),
 ('directly', 'r'),
 ('rorsted', 'v'),
 ('uco', 'a'),
 ('quadrupling', 'n'),
 ('bunches', 'n'),
 ('ph', 'v'),
 ('quigley', 'a'),
 ('reversing', 'v'),
 ('lookskechers', 'n'),
 ('caretaker', 'v'),
 ('s

In [51]:
lemmatizer = WordNetLemmatizer()

lemmatized_tokens = [lemmatizer.lemmatize(token, pos=pos_tag) for (token, pos_tag) in unique_token_tagged_abbrev]
lemmatized_tokens




['advice',
 'playduring',
 'assembly',
 'acquisitionon',
 'cabp',
 'pstg',
 'headwindnotably',
 'pandey',
 'unitsysco',
 'vega',
 'seo',
 'bahia',
 'ervin',
 'contingency',
 'hesitate',
 'podcastevery',
 'rasthofer',
 'anew',
 'cornelius',
 'buzzy',
 'kvaal',
 'spread',
 'mesmerize',
 'align',
 'sulisto',
 'tom',
 'marginorders',
 'rivet',
 'elonmusk',
 'mime',
 'evaporation',
 'detailsibm',
 'enchilada',
 'misconduct',
 'malta',
 'photronics',
 'funeral',
 'robby',
 'zulresso',
 'pcar',
 'suction',
 'directly',
 'rorsted',
 'uco',
 'quadrupling',
 'bunch',
 'ph',
 'quigley',
 'reverse',
 'lookskechers',
 'caretaker',
 'slyngstad',
 'chill',
 'specify',
 'vxvf',
 'detract',
 'wozniacki',
 'vpghjcmpy',
 'repurchaseon',
 'wacom',
 'subreddits',
 'takk',
 'scrupulous',
 'fundraiser',
 'herald',
 'anticompetition',
 'pennant',
 'sg',
 'cp',
 'kerry',
 'guidanceallergan',
 'decitabine',
 'pail',
 'southborough',
 'carcache',
 'replace',
 'lardner',
 'pipelinenearly',
 'wfd',
 'electrify',
 

In [None]:
from itertools import groupby

test = sorted(set(lemmatized_tokens), key = lambda ele: lemmatized_tokens.count(ele))
test