In [2]:
# Initial imports
import os
import pandas as pd
from dotenv import load_dotenv
import nltk as nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

%matplotlib inline

import json
import time
import requests
import datetime
import dateutil
from dateutil.relativedelta import relativedelta

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\prabh\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [3]:
end = datetime.date.today()
start = end - relativedelta(years=1)

In [4]:
months_in_range = [x.split(' ') for x in pd.date_range(start, end, freq='MS').strftime("%Y %#m").tolist()]
months_in_range

[['2021', '4'],
 ['2021', '5'],
 ['2021', '6'],
 ['2021', '7'],
 ['2021', '8'],
 ['2021', '9'],
 ['2021', '10'],
 ['2021', '11'],
 ['2021', '12'],
 ['2022', '1'],
 ['2022', '2'],
 ['2022', '3']]

In [5]:
# Read your api key environment variable
load_dotenv()
api_key = os.getenv("NEWS_API_KEY")

In [6]:
def send_request(date):
    '''Sends a request to the NYT Archive API for given date.'''
    base_url = 'https://api.nytimes.com/svc/archive/v1/'
    url = base_url + '/' + date[0] + '/' + date[1] + '.json?api-key=' + api_key
    response = requests.get(url).json()
    time.sleep(6)
    return response


def is_valid(article, date):
    '''An article is only worth checking if it is in range, and has a headline.'''
    is_in_range = date > start and date < end
    has_headline = type(article['headline']) == dict and 'main' in article['headline'].keys()
    return is_in_range and has_headline


def parse_response(response):
    '''Parses and returns response as pandas data frame.'''
    data = {'headline': [],  
        'date': [], 
        'doc_type': [],
        'material_type': [],
        'section': [],
        'keywords': []}
    
    articles = response['response']['docs'] 
    for article in articles: # For each article, make sure it falls within our date range
        date = dateutil.parser.parse(article['pub_date']).date()
        if is_valid(article, date):
            data['date'].append(date)
            data['headline'].append(article['headline']['main']) 
            if 'section' in article:
                data['section'].append(article['section_name'])
            else:
                data['section'].append(None)
            data['doc_type'].append(article['document_type'])
            if 'type_of_material' in article: 
                data['material_type'].append(article['type_of_material'])
            else:
                data['material_type'].append(None)
            keywords = [keyword['value'] for keyword in article['keywords'] if keyword['name'] == 'subject']
            data['keywords'].append(keywords)
    return pd.DataFrame(data) 


def get_data(dates):
    '''Sends and parses request/response to/from NYT Archive API for given dates.'''
    total = 0
    print('Date range: ' + str(dates[0]) + ' to ' + str(dates[-1]))
    if not os.path.exists('headlines'):
        os.mkdir('headlines')
    for date in dates:
        response = send_request(date)
        df = parse_response(response)
        total += len(df)
        df.to_csv('headlines/' + date[0] + '-' + date[1] + '.csv', index=False)
        print('Saving headlines/' + date[0] + '-' + date[1] + '.csv...')
    print('Number of articles collected: ' + str(total))

In [7]:
get_data(months_in_range)

Date range: ['2021', '4'] to ['2022', '3']
Saving headlines/2021-4.csv...
Saving headlines/2021-5.csv...
Saving headlines/2021-6.csv...
Saving headlines/2021-7.csv...
Saving headlines/2021-8.csv...
Saving headlines/2021-9.csv...
Saving headlines/2021-10.csv...
Saving headlines/2021-11.csv...
Saving headlines/2021-12.csv...
Saving headlines/2022-1.csv...
Saving headlines/2022-2.csv...
Saving headlines/2022-3.csv...
Number of articles collected: 47697


In [8]:
import glob
os.chdir("/Users/prabh/Desktop/Bootcamp/Natural-Language-Processing - Copy/headlines")

extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
#combine all files in the list
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames ])
#export to csv
combined_csv.to_csv( "combined_csv.csv", index=False, encoding='utf-8-sig')

In [9]:
# Fetch the Oil news articles
#Crude_news_articles = news_api.get_everything(q = "Crude" and "Canada", language = "en")
from pathlib import Path
file_path = Path('/Users/prabh/Desktop/Bootcamp/Natural-Language-Processing - Copy/headlines/combined_csv.csv')
Crude_news_articles = pd.read_csv(file_path)


In [25]:
Crude_news_article = Crude_news_articles.set_index('date')
Crude_news_article.head()

Unnamed: 0_level_0,headline,doc_type,material_type,section,keywords
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-10-01,When the Doctor’s Notes Hurt Your Feelings,article,News,,"['Electronic Health Records', 'Doctors', 'internal-sub-only']"
2021-10-01,House Delays Vote on Infrastructure Bill as Democrats Feud,article,News,,"['Infrastructure (Public Works)', 'American Jobs Plan (2021)', 'Federal Budget (US)', 'Law and Legislation', 'United States Politics and Government']"
2021-10-01,Tirzah’s Genre-Less Pop Embraces the Beauty of Uncertainty,article,Review,,['Music']
2021-10-01,Senate Confirms Biden’s Pick to Lead the Bureau of Land Management,article,News,,"['Global Warming', 'Greenhouse Gas Emissions', 'Federal Lands', 'Oil (Petroleum) and Gasoline', 'Drilling and Boring']"
2021-10-01,UMass Amherst Hires Cybersecurity Firm to Investigate Racist Emails,article,News,,"['Colleges and Universities', 'Discrimination', 'Black People', 'Blacks', 'E-Mail']"


In [26]:
crude_news_article = Crude_news_article[Crude_news_article["keywords"].str.contains("Petroleum")==True]
pd.set_option('display.max_colwidth', None)
crude_news_article.head()

Unnamed: 0_level_0,headline,doc_type,material_type,section,keywords
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-10-01,Senate Confirms Biden’s Pick to Lead the Bureau of Land Management,article,News,,"['Global Warming', 'Greenhouse Gas Emissions', 'Federal Lands', 'Oil (Petroleum) and Gasoline', 'Drilling and Boring']"
2021-10-01,Britain Is Heading Into a Nightmarish Winter,article,Op-Ed,,"['Great Britain Withdrawal from EU (Brexit)', 'Politics and Government', 'Shortages', 'Oil (Petroleum) and Gasoline', 'Trucks and Trucking', 'Labor and Jobs', 'Coronavirus (2019-nCoV)', 'Foreign Workers']"
2021-10-02,Harvard and Other Schools Make a Choice on Fossil Fuels,article,Op-Ed,,"['Global Warming', 'Greenhouse Gas Emissions', 'Endowments', 'Colleges and Universities', 'Oil (Petroleum) and Gasoline']"
2021-10-03,‘Major’ Oil Spill Off California Coast Threatens Wetlands and Wildlife,article,News,,"['Orange County, Calif, Oil Spill (2021)', 'Oil (Petroleum) and Gasoline', 'Oil Spills', 'Offshore Drilling and Exploration', 'Pipelines']"
2021-10-04,Oil prices hit a seven-year high as OPEC and its allies stick with a modest increase.,article,News,,"['Oil (Petroleum) and Gasoline', 'Prices (Fares, Fees and Rates)']"


In [27]:
crude_news_article = crude_news_article.drop(['doc_type',"material_type",'section', 'keywords' ],axis=1)

In [34]:
# Sentiment score columns
crude_df = crude_news_article
crude_df["compound"] = 0.0000
crude_df["positive"] = 0.0000
crude_df["neutral"] = 0.0000
crude_df["negative"] = 0.0000
crude_df.head()


Unnamed: 0_level_0,headline,compound,positive,neutral,negative
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-10-01,Senate Confirms Biden’s Pick to Lead the Bureau of Land Management,0.0,0.0,0.0,0.0
2021-10-01,Britain Is Heading Into a Nightmarish Winter,0.0,0.0,0.0,0.0
2021-10-02,Harvard and Other Schools Make a Choice on Fossil Fuels,0.0,0.0,0.0,0.0
2021-10-03,‘Major’ Oil Spill Off California Coast Threatens Wetlands and Wildlife,0.0,0.0,0.0,0.0
2021-10-04,Oil prices hit a seven-year high as OPEC and its allies stick with a modest increase.,0.0,0.0,0.0,0.0


In [35]:
# Get sentiment for the text and the title
for index, row in crude_news_article.iterrows():
    try:
        # Sentiment scoring with VADER
        title_sentiment = analyzer.polarity_scores(row["headline"])
        crude_df["compound"][index] = title_sentiment["compound"]
        crude_df["positive"][index] = title_sentiment["pos"]
        crude_df["neutral"][index] = title_sentiment["neu"]
        crude_df["negative"][index] = title_sentiment["neg"]
        
    except AttributeError:
        pass
 
crude_df.head()



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Unnamed: 0_level_0,headline,compound,positive,neutral,negative
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-10-01,Senate Confirms Biden’s Pick to Lead the Bureau of Land Management,0.0,0.0,1.0,0.0
2021-10-01,Britain Is Heading Into a Nightmarish Winter,0.0,0.0,1.0,0.0
2021-10-02,Harvard and Other Schools Make a Choice on Fossil Fuels,0.0,0.0,1.0,0.0
2021-10-03,‘Major’ Oil Spill Off California Coast Threatens Wetlands and Wildlife,-0.3818,0.0,0.776,0.224
2021-10-04,Oil prices hit a seven-year high as OPEC and its allies stick with a modest increase.,-0.5574,0.0,0.735,0.265


In [40]:
mean_crude_df = crude_df.groupby(crude_df.index).mean()

In [41]:
mean_crude_df.to_csv( "oil_sentiments.csv")

In [43]:
mean_crude_df.describe()

Unnamed: 0,compound,positive,neutral,negative
count,179.0,179.0,179.0,179.0
mean,-0.101658,0.057844,0.817335,0.124832
std,0.340041,0.106161,0.169368,0.156445
min,-0.8807,0.0,0.329,0.0
25%,-0.3612,0.0,0.7,0.0
50%,0.0,0.0,0.813,0.0
75%,0.0,0.0375,1.0,0.2365
max,0.6597,0.425,1.0,0.671
