## Library imports

In [27]:
import pandas as pd
import collections
import nltk
from nltk.corpus import stopwords
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('stopwords')
nltk.download('universal_tagset')
nltk.download('averaged_perceptron_tagger')

from sklearn.feature_extraction.text import CountVectorizer



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gaastra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\Gaastra\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Gaastra\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Data Import

In [53]:
apple_df = pd.read_csv("Data/AAPL.csv")
news_df = pd.read_csv("Data/us_equities_news_dataset.csv")

In [54]:
news_df[news_df['article_id'] == 200191463]

Unnamed: 0,id,ticker,title,category,content,release_date,provider,url,article_id
65467,286982,AAPL,Qualcomm QCOM Updates Lawsuit To Counter Att...,opinion,Qualcomm Inc NASDAQ QCOM the largest mob...,2017-05-24,Zacks Investment Research,https://www.investing.com/analysis/qualcomm-(q...,200191463


In [55]:
news_df.iloc[65467]['content']

'Qualcomm Inc    NASDAQ QCOM    the largest mobile chipset manufacturer using baseband technology globally  recently updated one of its lawsuits providing more evidences that Apple Inc    NASDAQ AAPL   is interfering with its existing arrangements with the contract manufacturing firms Notably  in Jan 2017  Qualcomm was slapped with a  1 billion lawsuit related to licensing royalty payments by tech giant Apple  In the suit filed in the U S  District Court for the Southern District of California  Apple accused Qualcomm of overcharging for chips and refusing to pay some  1 billion in promised rebates It s interesting to note that  just a couple of days before the filing  Qualcomm had faced an anti trust lawsuit from the U S  Federal Trade Commission  FTC   The regulator filed a case with the U S  District Court for the Northern District of California claiming that the company has used anti competitive measures to maintain a monopoly in the baseband chipset market The FTC said Qualcomm had

## Preprocessing

In [41]:
#Preprocessing Steps

# Apple dataframe
# Add price_higher column based on:
# Close > Open = 1
# Close <= Open = 0
apple_df['price_higher'] = apple_df.apply(lambda row: 1 if row['Close'] > row['Open'] else 0, axis=1)

# News dataframe
# Find and delete duplicate content
print(f"Duplicate rows in news_df 'content' column: {news_df['content'].duplicated().sum()}")
news_df.drop_duplicates('content', inplace=True)

# Filter news content based on Apple stock
news_df = news_df[news_df['ticker'] == 'AAPL']

# Join price_higher column with news dataframe based on date
news_df = news_df.merge(apple_df[['Date', 'price_higher']], left_on='release_date', right_on='Date')
news_df = news_df.drop(columns=['Date'])

# Convert release_date to datetime type
news_df['release_date'] = pd.to_datetime(news_df['release_date'])

# news_df['content'] = news_df['content'].apply(lambda x: '. '.join(x.split(' \n')))
# news_df['content'] = news_df['content'].apply(lambda x: '. '.join(list(filter(None, [i.strip(' ') for i in x.split('\r\n')]))))
news_df['words_amount'] = news_df.apply(lambda row: len(row['content'].split(" ")), axis=1)

Duplicate rows in news_df 'content' column: 516


In [42]:
print(f"The amount of Apple articles is: {str(len(news_df['article_id'].unique()))}")
print(f"The total amount of words for all articles is: {str(len(' '.join(news_df['content']).split(' ')))}")
print(f"The amount unique words for all articles is: {str(len(set(' '.join(news_df['content']).split(' '))))}")
print(f"The average amount of words per article is: {str(round(news_df['words_amount'].mean()))}")

# TEST: Distribution of price_higher
days_higher = news_df[news_df['price_higher'] == 1]['price_higher'].count()
days_lower = news_df[news_df['price_higher'] == 0]['price_higher'].count()

print(days_higher, days_lower)
#Lexical richness bekijk https://pypi.org/project/lexicalrichness/

The amount of Apple articles is: 17624
The total amount of words for all articles is: 12838089
The amount unique words for all articles is: 132535
The average amount of words per article is: 728
9529 8095


In [44]:
news_df.iloc[12197]

id                                                         286982
ticker                                                       AAPL
title           Qualcomm  QCOM  Updates Lawsuit To Counter Att...
category                                                  opinion
content         Qualcomm Inc    NASDAQ QCOM    the largest mob...
release_date                                  2017-05-24 00:00:00
provider                                Zacks Investment Research
url             https://www.investing.com/analysis/qualcomm-(q...
article_id                                              200191463
price_higher                                                    0
words_amount                                                  720
Name: 12197, dtype: object

In [6]:
# Group and aggregate content by date
news_df.groupby('release_date').agg({'content': '.'.join})
news_df['content'] = news_df['content'].str.lower()

In [7]:
english_stopwords = set(stopwords.words('english'))

test_news_df = news_df.copy()

def tokenize(text):
    tokens = text.split(' ')
    return tokens

def clean_tokens(tokens):
    test = [token for token in tokens if (token != '' and token not in english_stopwords) and token.isalpha()]
    return test

def join_tokens(tokens):
    return ' '.join(tokens)

def calc_lexical_div(tokens):
    return len(set(tokens)) / len(tokens)

# TODO
words_to_be_removed = ['apple', 'aapl']

# Tokenize content string
test_news_df['content'] = test_news_df['content'].apply(tokenize)
# Remove stopwords, empty tokens and punctuation/numbers
test_news_df['tokens'] = test_news_df['content'].apply(clean_tokens)
# Join tokens into string
test_news_df['content'] = test_news_df['tokens'].apply(join_tokens)
# Calculate lexical diversity per date
test_news_df['lexical_div'] = test_news_df['tokens'].apply(calc_lexical_div)

test_news_df[['tokens', 'lexical_div']]

Unnamed: 0,tokens,lexical_div
0,"[jpmorgan, lifts, apple, aapl, target, ahead, ...",0.866667
1,"[kim, investing, com, faang, stocks, predictab...",0.729927
2,"[chuck, mikolajczak, new, york, reuters, u, st...",0.717224
3,"[two, best, performing, tech, stocks, set, rep...",0.695817
4,"[yasin, ebrahim, kim, apple, readies, earnings...",0.720000
...,...,...
17619,"[stock, market, difficult, one, traders, inves...",0.659432
17620,"[tsx, index, leading, canadian, stocks, outper...",0.557616
17621,"[europe, flares, summer, heat, continues, summ...",0.705521
17622,"[last, quarter, apple, aapl, reported, best, q...",0.482667


In [8]:
unique_token_set = set(" ".join(test_news_df['content']).split(" "))
unique_token_tagged = pos_tag(unique_token_set, tagset='universal')
tag_abbrev_dict = {"NOUN": "n",
                   "VERB": "v",
                   "ADJ": "a",
                   "ADV": "r"}

unique_token_tagged_abbrev = [(token, tag_abbrev_dict[pos_tag]) for (token, pos_tag) in unique_token_tagged if pos_tag in tag_abbrev_dict.keys()]

lemmatizer = WordNetLemmatizer()
lemmatized_tokens_dict = {token: lemmatizer.lemmatize(token, pos=pos_tag) for (token, pos_tag) in unique_token_tagged_abbrev}
lemmatized_tokens_dict

{'atms': 'atms',
 'imprecise': 'imprecise',
 'garner': 'garner',
 'readjusting': 'readjust',
 'adrny': 'adrny',
 'englesson': 'englesson',
 'qualcommyear': 'qualcommyear',
 'platinum': 'platinum',
 'unsavory': 'unsavory',
 'rao': 'rao',
 'unsustainable': 'unsustainable',
 'ockene': 'ockene',
 'bluebay': 'bluebay',
 'plungesouthwest': 'plungesouthwest',
 'nazanin': 'nazanin',
 'rcexfo': 'rcexfo',
 'whet': 'whet',
 'radarso': 'radarso',
 'deluard': 'deluard',
 'selloffadding': 'selloffadding',
 'communiction': 'communiction',
 'costa': 'costa',
 'drown': 'drown',
 'peos': 'peos',
 'disappointingly': 'disappointingly',
 'dustbin': 'dustbin',
 'superhighway': 'superhighway',
 'syh': 'syh',
 'climax': 'climax',
 'bmps': 'bmps',
 'quadrupled': 'quadruple',
 'cris': 'cris',
 'low': 'low',
 'fixing': 'fix',
 'mcconnel': 'mcconnel',
 'intravitreal': 'intravitreal',
 'rdm': 'rdm',
 'considertelus': 'considertelus',
 'chee': 'chee',
 'object': 'object',
 'avearge': 'avearge',
 'glamglow': 'glamgl

In [9]:
def lemmatize_content(tokens, token_dict):
    lemmatized_tokens = [lemmatized_tokens_dict[token] for token in tokens if token if token in token_dict.keys()]
    return " ".join(lemmatized_tokens)

In [10]:
test_news_df['content'] = test_news_df['tokens'].apply(lemmatize_content, token_dict=lemmatized_tokens_dict)

In [16]:
test_news_df[test_news_df['content'].str.contains("qualcommyear")]['content']

'12197    qualcomm inc nasdaq qcom large mobile chipset ...\nName: content, dtype: object'

In [None]:
vectorizer = CountVectorizer(analyzer='word', ngram_range=(2, 2))
m2 = vectorizer.fit_transform(test_news_df['content'])

In [50]:
vectorizer.get_feature_names_out()

array(['aa aa', 'aa aaa', 'aa add', ..., 'zzamoi eamazmfpaydqapaojbet',
       'zzsixxfnfuhcdofbu style', 'zzz lhmpphaewuk'], dtype=object)