In [40]:
import pandas
import os
import pandas as pd
import nltk
import json
from datetime import datetime, timezone

In [41]:
from nltk.corpus import stopwords, words
from nltk.stem import LancasterStemmer, PorterStemmer, SnowballStemmer

In [42]:
NEWS_DIRECTORY = '../data/News/'

In [43]:
# Download libraries
nltk.download('punkt')
nltk.download('words')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/local/ASUAD/falhinda/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /home/local/ASUAD/falhinda/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/local/ASUAD/falhinda/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [44]:
def get_news_articles(path):
    news_text = []
    news_publish_time = []
    news_source = []
    domains_to_select = ['ae', 'au', 'bb', 'biz', 'ca', 'in', 'io', 'net', 'uk', 'com']
    
    
    with os.scandir(path) as news_directories:
        for directory in news_directories:
            with os.scandir(os.path.join(path, directory.name)) as folder:
                for article in folder:
                    with open(os.path.join(path, directory.name, article.name), encoding='latin1') as f:
                        try:
                            news_data = json.load(f)
                        except json.JSONDecodeError as e:
                            print(f"Error loading JSON from {article.name}: {e}")
                            continue
                    if 'site' in news_data['thread'] and news_data['thread']['site']:
                        source = news_data['thread']['site'] 
                        domain = source.split('.')[-1]
                        # Skip news from domain not in the list
                        if domain not in domains_to_select:
                            continue
                        news_source.append(source)
                    else:
                        news_source.append(None)
                    if 'published' in news_data and news_data['published']:
                        news_publish_time.append(news_data['published'])
                    else:
                        news_publish_time.append(None)
                    if 'text' in news_data and news_data['text']:
                        news_text.append(news_data['text'])
                    else:
                        news_text.append(None)

    df = pd.DataFrame({
        'timestamp': news_publish_time,
        'text': news_text,
        'source': news_source,
    })




    return df

news_df = get_news_articles(NEWS_DIRECTORY)

Error loading JSON from 2018_07_d157b48c57be246ec7dd80e7af4388a2.zip: Expecting value: line 1 column 1 (char 0)


In [45]:
news_df

Unnamed: 0,timestamp,text,source
0,2018-11-14T14:12:00.000+02:00,Reddit\nThereâs been no shortage of AAPL pes...,9to5mac.com
1,2018-11-05T02:00:00.000+02:00,"By Mark DeCambre, MarketWatch\nU.S. stock futu...",morningstar.com
2,2018-11-05T14:45:00.000+02:00,"Last month, I posted my quarterly results and ...",seekingalpha.com
3,2018-11-26T02:00:00.000+02:00,By Cristina Roca\nKering (KER.FR) is implement...,morningstar.com
4,2018-11-23T02:00:00.000+02:00,Wall Street closed mostly higher on Wednesday ...,zacks.com
...,...,...,...
73496,2019-01-24T09:19:00.000+02:00,Shutterstock photo Top Tech Stocks\nMSFT -1.08...,nasdaq.com
73497,2019-01-14T22:18:00.000+02:00,"As Microsoft (MSFT) Stock Declined, Shareholde...",moveefy.com
73498,2018-12-31T02:00:00.000+02:00,Â© Reuters. US STOCKS-Wall Street falters afte...,investing.com
73499,2019-01-28T21:00:00.000+02:00,Apple Might Challenge Microsoft and Amazon Wit...,yahoo.com


In [46]:
news_df.isnull().sum()

timestamp    0
text         0
source       0
dtype: int64

In [47]:
# Convert timezone to UTC and drop the timezone
news_df['timestamp'] = news_df['timestamp'].apply(lambda x: datetime.fromisoformat(x).astimezone(tz=timezone.utc))
news_df['timestamp'] = news_df['timestamp'].dt.tz_localize(None)

In [48]:
# news_df['time'] = news_df['publish_timestamp'].apply(lambda x: x.time())
# news_df['date'] = news_df['publish_timestamp'].apply(lambda x: x.date())
news_df['text'] = news_df['text'].apply(lambda x: x.lower())

In [49]:
news_df.to_csv('RawNewsData.csv')

In [50]:
news_df

Unnamed: 0,timestamp,text,source
0,2018-11-14 12:12:00,reddit\nthereâs been no shortage of aapl pes...,9to5mac.com
1,2018-11-05 00:00:00,"by mark decambre, marketwatch\nu.s. stock futu...",morningstar.com
2,2018-11-05 12:45:00,"last month, i posted my quarterly results and ...",seekingalpha.com
3,2018-11-26 00:00:00,by cristina roca\nkering (ker.fr) is implement...,morningstar.com
4,2018-11-23 00:00:00,wall street closed mostly higher on wednesday ...,zacks.com
...,...,...,...
73496,2019-01-24 07:19:00,shutterstock photo top tech stocks\nmsft -1.08...,nasdaq.com
73497,2019-01-14 20:18:00,"as microsoft (msft) stock declined, shareholde...",moveefy.com
73498,2018-12-31 00:00:00,â© reuters. us stocks-wall street falters afte...,investing.com
73499,2019-01-28 19:00:00,apple might challenge microsoft and amazon wit...,yahoo.com


In [51]:
news_df.columns

Index(['timestamp', 'text', 'source'], dtype='object')

In [52]:
columns = ['timestamp', 'source', 'sentences']
processed_amzn_news_df = pd.DataFrame(columns=columns)
processed_aapl_news_df = pd.DataFrame(columns=columns)

In [53]:
# Extract and separate sentences containing AAPL and AMZN
for index, row in news_df.iterrows():
    text = row['text']
    aapl_sentences, amzn_sentences = [], []
    for sentence in nltk.sent_tokenize(text):
        if 'amazon' in sentence or 'amzn' in sentence:
            amzn_sentences.append(sentence)
        if 'apple' in sentence or 'aapl' in sentence:
            aapl_sentences.append(sentence)
    if aapl_sentences:
        processed_aapl_news_df.loc[len(processed_aapl_news_df)] = [row['timestamp'], row['source'], aapl_sentences]
    if amzn_sentences:
        processed_amzn_news_df.loc[len(processed_amzn_news_df)] = [row['timestamp'], row['source'], amzn_sentences]
        
print(len(processed_amzn_news_df), len(processed_aapl_news_df))

20236 73175


In [None]:
# del news_df
processed_amzn_news_df.to_csv('AmznExtractedSentences.csv')
processed_aapl_news_df.to_csv('AaplExtractedSentences.csv')

In [38]:
import string
import re


def extract_words(input_words):
    from nltk.corpus import words
    
    # Remove all non-ascii words
    processed_words = [w for w in input_words if w.isascii()]
    
    # Remove punctuation words
    tr_dict = str.maketrans(dict.fromkeys(string.punctuation))
    processed_words = [w.translate(tr_dict) for w in processed_words if w]
    
    # Remove links
    final_words = []
    for word in processed_words:
        if not re.match('[www]', word):
            final_words.append(word)
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    processed_words = [w for w in final_words if w not in stop_words]
    
    # Stem words and return unique words
    stemmer = SnowballStemmer('english')
    seen = set()
    processed_words = [stemmer.stem(word) for word in processed_words if word]
    processed_words = [x for x in processed_words if not (x in seen or seen.add(x))]
    del seen
    
    # Keep only words from English dictionary
    english_words = set([w.lower() for w in words.words()])
    processed_words = [w for w in processed_words if w in english_words]
    
    return processed_words

In [None]:
tokenized_df_columns = ['timestamp', 'source', 'tokens']
tokenized_amzn_news_df = pd.DataFrame(columns=tokenized_df_columns)
tokenized_aapl_news_df = pd.DataFrame(columns=tokenized_df_columns)

print("\n\nProcessing %d records" % len(processed_amzn_news_df))
for index, row in processed_amzn_news_df.iterrows():
    # This break is only for testing purpose
#     if index >= 1000:
#         break
        
    if index % 500 == 0:
        print("Completed %d rows" % index)
    token_words = []
    for sentence in row['sentences']:
        token_words.extend(nltk.wordpunct_tokenize(sentence))
    token_words = extract_words(token_words)
    print(token_words)
    tokenized_amzn_news_df.loc[index] = [
        processed_amzn_news_df.loc[index]['timestamp'],
        processed_amzn_news_df.loc[index]['source'], 
        token_words
    ]
tokenized_amzn_news_df.to_csv('AmznExtractedTokens.csv')


print("\n\nProcessing %d records" % len(processed_aapl_news_df))
for index, row in processed_aapl_news_df.iterrows():
    # This break is only for testing purpose
#     if index >= 1000:
#         break
    if index % 500 == 0:
        print("Completed %d rows" % index)
    token_words = []
    for sentence in row['sentences']:
        token_words.extend(nltk.wordpunct_tokenize(sentence))
    token_words = extract_words(token_words)
    print(token_words)
    tokenized_aapl_news_df.loc[index] = [
        processed_aapl_news_df.loc[index]['timestamp'], 
        processed_aapl_news_df.loc[index]['source'],
        token_words
    ]
tokenized_aapl_news_df.to_csv('AaplExtractedTokens.csv')
        

In [59]:
tokenized_amzn_news_df = pd.read_csv("AmznExtractedTokens.csv")
tokenized_aapl_news_df = pd.read_csv('AaplExtractedTokens.csv')

tokenized_aapl_news_df.head()

Unnamed: 0.1,Unnamed: 0,timestamp,source,tokens
0,0,2018-11-14 12:12:00,9to5mac.com,"['thing', 'kick', 'monday', 'news', 'ming', 'c..."
1,1,2018-11-05 00:00:00,morningstar.com,"['share', 'trade', 'monday', 'follow', 'maker'..."
2,2,2018-11-05 12:45:00,seekingalpha.com,"['portfolio', 'current', 'sit', 'core', 'divid..."
3,3,2018-11-26 00:00:00,morningstar.com,"['cristina', 'ker', 'implement', 'sever', 'boo..."
4,4,2018-11-23 00:00:00,zacks.com,"['trade', 'high', 'three', 'major', 'stock', '..."


In [18]:
# Sort dataframes based on timestamps
# tokenized_amzn_news_df.sort_values(['day', 'time'], axis=0, ascending=(True, True), inplace=True)
# tokenized_aapl_news_df.sort_values(['day', 'time'], axis=0, ascending=(True, True), inplace=True)
tokenized_amzn_news_df.sort_values(['timestamp'], axis=0, ascending=True, inplace=True)
tokenized_aapl_news_df.sort_values(['timestamp'], axis=0, ascending=True, inplace=True)

In [60]:
import pickle

with open('AmznExtractedTokens.pkl', 'wb') as f:
    pickle.dump(tokenized_amzn_news_df, f)
with open('AaplExtractedTokens.pkl', 'wb') as f:
    pickle.dump(tokenized_aapl_news_df, f)