## **New York Times Data Collection**

### Importing New York Times Archive API Articles and Running Sentiment Analysis

In [3]:
pip install textblob

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
pip install pandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
import json
import os
import time
from datetime import datetime

import pandas as pd
import requests
from textblob import TextBlob

# --- Configuration ---------------------------------------------------------
# The API key is read from the environment only: a key must never be committed
# in the notebook source (any key that was previously embedded here should be
# considered leaked and rotated).
apikey = os.getenv('NYTIMES_APIKEY')
if not apikey:
    raise RuntimeError(
        "NYTIMES_APIKEY is not set; export a valid NYT developer API key "
        "before running this cell."
    )

filename = "ny_stories.json"
sections_of_interest = ['Business Day', 'Technology', 'Real Estate']

# Inclusive publication-date window for the analysis.
start_date = datetime.strptime('2022-09-13', "%Y-%m-%d").date()
end_date = datetime.strptime('2023-03-13', "%Y-%m-%d").date()

# --- Download --------------------------------------------------------------
# The archive endpoint is addressed by /{year}/{month}.json, i.e. one calendar
# month per request, so every month that overlaps the window must be fetched.
# (Fetching only the first month would leave the rest of the window empty.)
months_to_fetch = []
year, month = start_date.year, start_date.month
while (year, month) <= (end_date.year, end_date.month):
    months_to_fetch.append((year, month))
    year, month = (year + 1, 1) if month == 12 else (year, month + 1)

ny_docs = []
for request_number, (year, month) in enumerate(months_to_fetch):
    if request_number:
        # Pause between calls to stay under the API's per-minute rate limit.
        # NOTE(review): 12 s follows NYT's published guidance (5 requests/minute)
        # -- confirm against the current limits for your key.
        time.sleep(12)

    query_url = f"https://api.nytimes.com/svc/archive/v1/{year}/{month}.json"
    # Pass the key via `params` so it is not baked into the URL string.
    r = requests.get(query_url, params={'api-key': apikey}, timeout=60)
    # Fail loudly on 401/429/5xx instead of hitting a KeyError on 'response' later.
    r.raise_for_status()
    ny_docs.extend(r.json()['response']['docs'])

# Keep the raw articles on disk in the same {'response': {'docs': [...]}} shape
# the rest of the notebook indexes into, now covering every fetched month.
ny_data = {'response': {'docs': ny_docs}}
with open(filename, 'w') as f:
    json.dump(ny_data, f)

# --- Filter and score ------------------------------------------------------
cleaned_ny_data = {
    'headline': [],
    'abstract': [],
    'pub_date': [],
    'section': [],
    'sentiment_analysis': []
}

for doc in ny_docs:
    # Some documents may lack a section; .get() makes them fall out of the filter
    # instead of raising KeyError.
    sec_name = doc.get('section_name')
    if sec_name not in sections_of_interest:
        continue

    # pub_date is a timestamp string; only the calendar-date part before 'T' is used.
    date = datetime.strptime(doc['pub_date'].split('T')[0], "%Y-%m-%d").date()
    if not (start_date <= date <= end_date):
        continue

    headline = doc['headline']['main']
    # Treat a missing or null abstract as empty text so TextBlob still gets a string.
    abstract = doc.get('abstract') or ''

    cleaned_ny_data['headline'].append(headline)
    cleaned_ny_data['abstract'].append(abstract)
    cleaned_ny_data['pub_date'].append(str(date))
    cleaned_ny_data['section'].append(sec_name)

    # Sentiment analysis: TextBlob polarity of the abstract.
    cleaned_ny_data['sentiment_analysis'].append(TextBlob(abstract).sentiment.polarity)

cleaned_ny_data_df = pd.DataFrame(cleaned_ny_data)

# Save the results to a CSV file
cleaned_ny_data_df.to_csv("ny_times_sentiment_analysis.csv", index=False)

print(f"Time frame: {start_date} to {end_date}")
cleaned_ny_data_df.head(10)

Time frame: 2022-09-13 to 2023-03-13


Unnamed: 0,headline,abstract,pub_date,section,sentiment_analysis
0,A $100 Million Bet on Finding the Next ‘Mr. Be...,"Reed Duchscher, the manager of the YouTube meg...",2022-09-13,Business Day,-0.2
1,Why You Should Decorate Your Hallway (and How ...,"When you’re furnishing your home, it’s easy to...",2022-09-13,Real Estate,0.216667
2,Markets Plunge as Inflation Data Undercuts Wal...,"Stocks plunged, government bond yields soared ...",2022-09-13,Business Day,0.08
3,"Inflation Remained Stubbornly High in August, ...",Overall inflation moderated less than anticipa...,2022-09-13,Business Day,-0.288889
4,"Inflation Explained: The Good, the Bad and the...",Amid hopeful signs that U.S. inflation was abo...,2022-09-13,Business Day,0.255
5,U.S. Gas Prices Have Fallen for 91 Straight Da...,The steady decline is also a welcome developme...,2022-09-13,Business Day,0.438889
6,Twitter Whistle-blower Is Set to Testify in Wa...,Elon Musk’s legal team will follow closely the...,2022-09-13,Business Day,0.1
7,‘The Future of Hospitals’: Flexible Space for ...,After struggling to respond to a crushing Covi...,2022-09-13,Business Day,0.38
8,Pandemic Aid Cut U.S. Poverty to New Low in 20...,A measure that accounts for all federal subsid...,2022-09-13,Business Day,-0.166667
9,How Is the Economy Doing?,"Most of the economy appears to be doing well, ...",2022-09-13,Business Day,0.148611


In [6]:
# cleaned_ny_data_df is expected to carry a 'pub_date' column of date strings.

# Parse the column once, then write the datetime values back over the strings.
parsed_pub_dates = pd.to_datetime(cleaned_ny_data_df['pub_date'])
cleaned_ny_data_df['pub_date'] = parsed_pub_dates

# Preview the first rows.
cleaned_ny_data_df.head()

Unnamed: 0,headline,abstract,pub_date,section,sentiment_analysis
0,A $100 Million Bet on Finding the Next ‘Mr. Be...,"Reed Duchscher, the manager of the YouTube meg...",2022-09-13,Business Day,-0.2
1,Why You Should Decorate Your Hallway (and How ...,"When you’re furnishing your home, it’s easy to...",2022-09-13,Real Estate,0.216667
2,Markets Plunge as Inflation Data Undercuts Wal...,"Stocks plunged, government bond yields soared ...",2022-09-13,Business Day,0.08
3,"Inflation Remained Stubbornly High in August, ...",Overall inflation moderated less than anticipa...,2022-09-13,Business Day,-0.288889
4,"Inflation Explained: The Good, the Bad and the...",Amid hopeful signs that U.S. inflation was abo...,2022-09-13,Business Day,0.255


In [8]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download the VADER lexicon, then build the analyzer used for scoring below.
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()


def _compound_score(text):
    """Return the 'compound' entry of VADER's polarity scores for `text`."""
    return sia.polarity_scores(text)['compound']


# cleaned_ny_data_df is expected to provide 'pub_date', 'headline' and 'abstract'.
# Score each headline and abstract; the scores are added as new columns.
cleaned_ny_data_df['headline_sentiment'] = cleaned_ny_data_df['headline'].apply(_compound_score)
cleaned_ny_data_df['abstract_sentiment'] = cleaned_ny_data_df['abstract'].apply(_compound_score)

# Per-date mean and median of both scores. Named aggregation yields the final
# flat column labels directly, so no multi-level flattening step is needed.
daily_sentiment = cleaned_ny_data_df.groupby('pub_date').agg(**{
    'Headline Mean': ('headline_sentiment', 'mean'),
    'Headline Median': ('headline_sentiment', 'median'),
    'Body Mean': ('abstract_sentiment', 'mean'),
    'Body Median': ('abstract_sentiment', 'median'),
})

# Move the grouping key back into a regular column called 'Date'.
grouped_data = daily_sentiment.reset_index().rename(columns={'pub_date': 'Date'})

# Write the per-date table to CSV (no index column).
grouped_data.to_csv("ny_times_grouped_data.csv", index=False)

grouped_data.head()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


Unnamed: 0,Date,Headline Mean,Headline Median,Body Mean,Body Median
0,2022-09-13,-0.051256,0.0,-0.029289,-0.0258
1,2022-09-14,-0.146156,0.0,-0.0406,-0.0129
2,2022-09-15,0.034504,0.0,0.072886,0.0
3,2022-09-16,-0.089775,0.0,0.059435,0.0
4,2022-09-17,-0.07615,0.0,0.07625,0.16085


In [9]:
# Use the 'Date' column as the index of grouped_data.
# Guarded so the cell can be re-run: once 'Date' has become the index it is no
# longer a column, and calling set_index('Date') again would raise KeyError.
if 'Date' in grouped_data.columns:
    grouped_data = grouped_data.set_index('Date')

# Save the DataFrame to a CSV file; the index is written as the first column,
# so the dates appear once per row.
grouped_data.to_csv("ny_times_grouped_data.csv")

In [10]:
# NOTE(review): assumes grouped_data is indexed by datetime values ('Date') --
# confirm the indexing cell has been run before this one.
# Inclusive window bounds, as pandas timestamps so they compare against the index.
start_date = pd.to_datetime('2022-09-13')
end_date = pd.to_datetime('2023-03-13')

# Build the row mask in two readable steps, then select the matching rows.
row_dates = grouped_data.index
in_window = (row_dates >= start_date) & (row_dates <= end_date)
filtered_grouped_data = grouped_data.loc[in_window]

# Write the filtered table to CSV; the index is emitted as the first column.
filtered_grouped_data.to_csv("ny_times_filtered_grouped_data.csv")

filtered_grouped_data.head()


Unnamed: 0_level_0,Headline Mean,Headline Median,Body Mean,Body Median
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-09-13,-0.051256,0.0,-0.029289,-0.0258
2022-09-14,-0.146156,0.0,-0.0406,-0.0129
2022-09-15,0.034504,0.0,0.072886,0.0
2022-09-16,-0.089775,0.0,0.059435,0.0
2022-09-17,-0.07615,0.0,0.07625,0.16085


In [15]:
# Write the complete (unfiltered) grouped_data table to disk, index included.
grouped_csv_path = "ny_times_grouped_data.csv"
grouped_data.to_csv(grouped_csv_path)