In [2]:
#Import Required Packages
import requests
import os
import json
import newspaper
from newspaper import fulltext
import pandas as pd
import numpy as np
import pickle

### Import URLs

In [None]:
# Set the chromedriver executable path.
# A raw string is used because a Windows path in a normal literal contains
# backslash sequences such as "\P" and "\G" that are invalid escapes; Python
# warns about those and a future version will reject them.
# NOTE(review): this is a machine-specific absolute path - adjust per host.
chromedriver = r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe" # path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver

In [None]:
# Read back the pickled list of news-article links from the data_frames folder.
# (pickle.load executes arbitrary code - only open pickles you created yourself.)
with open('./data_frames/news_article_links_15000_30000.pkl', mode='rb') as file:
    news_article_links = pickle.load(file)

In [None]:
# Flatten the nested list of links into a single flat list of URLs.
# Entries that are already strings are kept whole, so running this cell a
# second time is a no-op instead of splitting every URL into characters.
news_article_links = [
    link
    for entry in news_article_links
    for link in ([entry] if isinstance(entry, str) else entry)
]

### From the individual API files, download each news source link and extract its text using the `newspaper` Python package

In [8]:
def _dump_json(path, payload):
    """Write ``payload`` as JSON to ``path``, creating the parent folder if needed."""
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'w', encoding="utf8", newline='') as file:
        json.dump(payload, file)


def saving_json(urls, file_path_text_folder, file_path_dates_folder):
    """
    Scrape the text and publish date of each news article with the newspaper
    package and store them locally as JSON-encoded strings.

    Parameters
    ----------
    urls : sequence of str
        Article URLs. Surrounding whitespace is stripped from each one.
    file_path_text_folder : str
        Prefix (normally a folder path ending in '/') for the text files,
        which are written as '<prefix>article_<n>.txt'.
    file_path_dates_folder : str
        Prefix for the date files, written as '<prefix>article_<n>_date.txt'.
        The date is stored as ``str(publish_date)``.

    Notes
    -----
    * ``n`` starts at ``len(urls)`` and is decremented after every URL,
      whether or not that URL produced a file, so numbers may have gaps.
    * URLs containing the word 'video' are skipped before any download.
    * A failure on one URL never aborts the run; failures are counted per
      exception type and reported in a one-line summary. Only ``Exception``
      subclasses are caught, so the cell can still be interrupted.

    Returns
    -------
    None
    """
    num = len(urls)
    saved = 0
    skipped = 0
    failures = {}  # exception class name -> number of URLs that raised it
    print('saving ...')

    # Loop through the URL list and download article text via newspaper
    for url in urls:
        try:
            url = url.strip()

            # Disregard any urls with the word 'video' as they can't be
            # scraped via newspaper; checked first to avoid a wasted download.
            if 'video' in url:
                skipped += 1
            else:
                link = newspaper.Article(url)
                link.download()
                html = link.html  # captured before parse(), as fulltext() works on raw HTML

                # Parse article text and publish date from the page
                link.parse()
                text = fulltext(html)
                date = link.publish_date

                # Save article text and date locally on device
                _dump_json(file_path_text_folder + 'article_{0}.txt'.format(num), text)
                _dump_json(file_path_dates_folder + 'article_{0}_date.txt'.format(num), str(date))
                saved += 1
        except Exception as error:
            # Best-effort scrape: remember what went wrong and move on.
            kind = type(error).__name__
            failures[kind] = failures.get(kind, 0) + 1

        num -= 1

    print('saved: {0}, skipped (video): {1}, failed: {2} {3}'.format(
        saved, skipped, sum(failures.values()), failures))
    print('Finished')
    return None

In [None]:
# Run this cell to scrape every collected link and write the text/date files.
saving_json(
    news_article_links,
    './article_data/text/article_text_15000_30000/',
    './article_data/dates/article_dates_15000_30000/',
)

## Create Initial DataFrames: run the following cell only once, then comment it out!

In [69]:
# # Create initial modeling DataFrame - Only Ran Once then commented out
# modeling_text = pd.DataFrame(columns=['text'])

# # Save initial DataFrame - Only Ran Once then commented out
# with open('./data_frames/bitcoin_news_raw_text_df.pickle', 'wb') as file:
#            pickle.dump(modeling_text, file)

# # Create initial modeling DataFrame - Only Ran Once then commented out
# modeling_dates = pd.DataFrame(columns=['date'])

# # Save initial DataFrame - Only Ran Once then commented out
# with open('./data_frames/bitcoin_news_raw_dates_df.pickle', 'wb') as file:
#            pickle.dump(modeling_dates, file)

In [70]:
# Load the accumulated corpus of Bitcoin news article text ...
with open('./data_frames/bitcoin_news_raw_text_df.pickle', mode='rb') as file:
    bitcoin_news_text = pickle.load(file)

# ... and the companion frame of article publish dates.
with open('./data_frames/bitcoin_news_raw_dates_df.pickle', mode='rb') as file:
    bitcoin_news_dates = pickle.load(file)

In [71]:
def add_text_data_to_dateframe(df, file_directory):
    """
    Append the JSON content of every '.txt' file found one level below
    ``file_directory`` to ``df``, one row per file.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It does not have to be empty: new rows are added in
        place starting at label ``df.shape[0]``.
    file_directory : str
        Folder whose sub-folders hold the '.txt' files. Entries that are not
        directories are ignored.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new rows added.

    Notes
    -----
    Sub-folders and files are visited in natural order (digit runs compare
    as numbers, so 'article_2' comes before 'article_10'). ``os.listdir``
    gives no ordering guarantee, and a deterministic order makes repeated
    runs over the same files produce the same rows.
    NOTE(review): files that fail to decode or parse are skipped, so two
    directories processed separately only stay row-aligned if the same
    files succeed in both - confirm before merging frames on the index.
    """
    import re  # function-scope import so the notebook's import cell needs no change

    def natural_key(name):
        # re.split with a capturing group yields text, digits, text, ...;
        # odd positions are therefore always digit runs.
        return [int(piece) if pos % 2 else piece
                for pos, piece in enumerate(re.split(r'(\d+)', name))]

    i = df.shape[0]  # Will start adding at the last row of the dataframe
    for source in sorted(os.listdir(file_directory), key=natural_key):
        source_dir = os.path.join(file_directory, source)
        if not os.path.isdir(source_dir):
            continue  # stray file next to the sub-folders; nothing to list
        for file in sorted(os.listdir(source_dir), key=natural_key):
            if not file.endswith(".txt"):
                continue
            curr_file = os.path.join(source_dir, file)
            # Open explicitly as utf8 rather than relying on the platform
            # default encoding.
            with open(curr_file, encoding="utf8") as json_file:
                try:
                    data = json.load(json_file)
                    df.loc[i] = data
                    i = i + 1
                except ValueError:
                    # Invalid JSON or undecodable bytes: skip this file.
                    continue
    return df

In [72]:
# Append every scraped text file, then every scraped date file, to its frame.
bitcoin_news_text = add_text_data_to_dateframe(
    bitcoin_news_text, './article_data/text/'
)
bitcoin_news_dates = add_text_data_to_dateframe(
    bitcoin_news_dates, './article_data/dates/'
)

In [73]:
# Persist the article-text frame back to disk
with open('./data_frames/bitcoin_news_raw_text_df.pickle', mode='wb') as file:
    pickle.dump(bitcoin_news_text, file)

# Persist the article-dates frame back to disk
with open('./data_frames/bitcoin_news_raw_dates_df.pickle', mode='wb') as file:
    pickle.dump(bitcoin_news_dates, file)

In [74]:
# Reload the saved article-text frame
with open('./data_frames/bitcoin_news_raw_text_df.pickle', mode='rb') as file:
    bitcoin_news_raw_text_df = pickle.load(file)

# Reload the saved article-dates frame
with open('./data_frames/bitcoin_news_raw_dates_df.pickle', mode='rb') as file:
    bitcoin_news_raw_dates_df = pickle.load(file)

In [75]:
# Join text and dates row-for-row by matching their index labels
bitcoin_news_text_dates_combined_df = bitcoin_news_raw_text_df.merge(
    bitcoin_news_raw_dates_df,
    left_index=True,
    right_index=True,
)

In [76]:
# Sample the merged dataframe. The frame created by the merge cell is named
# bitcoin_news_text_dates_combined_df; the name without the "_df" suffix is
# not defined anywhere and raises NameError on a fresh kernel.
bitcoin_news_text_dates_combined_df.sample(10)

Unnamed: 0,text,date
20240,We’ve been a little ‘crypto crazy’ the last fe...,2017-06-09 00:00:00
4726,"Top influential organizations: Blockchain, Coi...",2016-06-06 13:43:55+00:00
8666,Bitcoin Europe Lowers Card Interchange Fees – ...,2015-12-15 00:00:00
13427,"Can Blockchain break Facebook, Uber, Airbnb, e...",2016-05-11 16:12:24+00:00
9074,Bitcoin: The End of Money As We Know It is a 2...,2015-12-26 01:02:10+00:00
13023,With a new wave of volatility hitting the cryp...,2016-05-01 15:59:38+00:00
14582,"Mar 08, 2018 at 12:29 // News\n\nCoin Idol Aut...",2018-03-12 21:55:10+00:00
7614,Bitcoin enthusiasts look for new ways to safeg...,
11556,Bitcoin is collapsing. Martin Hunter/Getty Ima...,2016-03-04 15:13:08+00:00
6060,"Cybercriminal group DD4BC, which stands for “D...",2015-09-09 16:19:58+00:00


In [77]:
# Persist the merged text + dates frame for the downstream notebooks
with open('./data_frames/bitcoin_news_text_dates_combined_df.pickle', mode='wb') as file:
    pickle.dump(bitcoin_news_text_dates_combined_df, file)