## **New York Times Data Collection**

# Importing New York Times Archive Articles and Running Sentiment Analysis

In [1]:
pip install textblob

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
pip install pandas

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import requests
import os
import json
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from datetime import datetime, timedelta

# Purpose: download every NYT Archive month that overlaps the study window,
# keep the articles from the selected sections, score headline and abstract
# with VADER, and write per-day mean/median sentiment to ny_times_data.csv.

# The VADER lexicon must be present before the analyzer can be constructed.
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

# The key is read from the environment only: a key embedded in the notebook
# is exposed to everyone who can read the file or its history.
apikey = os.getenv('NYTIMES_APIKEY')
if not apikey:
    raise RuntimeError(
        "Set the NYTIMES_APIKEY environment variable to a New York Times API key "
        "before running this cell."
    )

start_date = datetime.strptime('2022-03-13', "%Y-%m-%d")
end_date = datetime.strptime('2022-09-13', "%Y-%m-%d")

# ny_data accumulates the top-level keys of the fetched payloads (later months
# overwrite earlier ones); all_articles collects the docs of every month.
ny_data = {}
all_articles = []

# Iterate through months in the date range.
# Start on day 1 so the last month is still visited when end_date's day is
# earlier than start_date's day, and so replace(month=...) can never land on
# a day that does not exist in the target month.
current_date = start_date.replace(day=1)
while current_date <= end_date:
    year = current_date.year
    month = current_date.month

    query_url = f"https://api.nytimes.com/svc/archive/v1/{year}/{month}.json"

    # Pass the key as a query parameter, bound the wait, and fail loudly on an
    # HTTP error status instead of trying to parse an error body as articles.
    r = requests.get(query_url, params={'api-key': apikey}, timeout=60)
    r.raise_for_status()
    month_payload = r.json()

    # Read the docs from this month's payload directly. Reading them back from
    # the accumulated ny_data would re-append the previous month's docs
    # whenever a payload arrives without a 'response' key.
    all_articles.extend(month_payload['response']['docs'])
    ny_data.update(month_payload)

    # Move to the next month
    if current_date.month == 12:
        current_date = current_date.replace(year=current_date.year + 1, month=1)
    else:
        current_date = current_date.replace(month=current_date.month + 1)

# Filter articles and calculate sentiment
sections_of_interest = ('Business Day', 'Technology', 'Real Estate')
filtered_articles = []
for doc in all_articles:
    sec_name = doc['section_name']
    if sec_name not in sections_of_interest:
        continue

    # Whole months are downloaded, so trim to the exact day window here.
    pub_date = doc['pub_date']
    date = datetime.strptime(pub_date.split('T')[0], "%Y-%m-%d").date()
    if date < start_date.date() or date > end_date.date():
        continue

    headline = doc['headline']['main']
    abstract = doc['abstract']

    # 'compound' is the single summary score in VADER's polarity dictionary.
    headline_sentiment = sia.polarity_scores(headline)['compound']
    abstract_sentiment = sia.polarity_scores(abstract)['compound']

    filtered_articles.append({
        'pub_date': str(date),
        'headline': headline,
        'abstract': abstract,
        'section': sec_name,
        'headline_sentiment': headline_sentiment,
        'abstract_sentiment': abstract_sentiment
    })

# Create DataFrame
cleaned_ny_data_df = pd.DataFrame(filtered_articles)

# Group by date and calculate mean and median sentiment scores
grouped_data = cleaned_ny_data_df.groupby('pub_date').agg({
    'headline_sentiment': ['mean', 'median'],
    'abstract_sentiment': ['mean', 'median']
}).reset_index()

# Replace the two-level column labels with the final flat names in one step.
grouped_data.columns = ['Date', 'Headline Mean', 'Headline Median', 'Body Mean', 'Body Median']

# Use the 'Date' column as the index of the grouped_data DataFrame
grouped_data = grouped_data.set_index('Date')

# Round the sentiment scores to two decimal places
grouped_data_rounded = grouped_data.round(2)

# Reindex a copy onto every calendar day of the window. The grouped index
# holds 'YYYY-MM-DD' strings, so it is converted to datetimes first; matching
# string labels against a DatetimeIndex would leave every row empty.
start_date = pd.Timestamp(start_date)
end_date = pd.Timestamp(end_date)
date_range = pd.date_range(start_date, end_date, freq='D')
grouped_data_rounded_reindexed = grouped_data_rounded.copy()
grouped_data_rounded_reindexed.index = pd.to_datetime(grouped_data_rounded_reindexed.index)
grouped_data_rounded_reindexed = grouped_data_rounded_reindexed.reindex(date_range)
grouped_data_rounded_reindexed.index.name = 'Date'

# Save the rounded grouped_data DataFrame (days with articles only) to a CSV file
grouped_data_rounded.to_csv("ny_times_data.csv")

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [4]:
# Purpose: score every downloaded article that falls inside the study window
# and write per-day max/min sentiment to ny_times_data_max_min.csv.
# Relies on all_articles, sia, start_date and end_date from the earlier cell.
# NOTE(review): the mean/median cell keeps only three sections, while this
# cell keeps every section - confirm that the difference is intended.
window_start = start_date.date()
window_end = end_date.date()

filtered_articles = []
for article in all_articles:
    # The day is the part of the timestamp before the 'T' separator.
    day_text = article['pub_date'].partition('T')[0]
    day = datetime.strptime(day_text, "%Y-%m-%d").date()
    if not (window_start <= day <= window_end):
        continue

    headline = article['headline']['main']
    abstract = article['abstract']

    # Keep only VADER's 'compound' summary score for each text.
    headline_score = sia.polarity_scores(headline)['compound']
    abstract_score = sia.polarity_scores(abstract)['compound']

    record = {
        'pub_date': str(day),
        'headline': headline,
        'abstract': abstract,
        'section': article['section_name'],
        'headline_sentiment': headline_score,
        'abstract_sentiment': abstract_score,
    }
    filtered_articles.append(record)

# One row per kept article.
cleaned_ny_data_df = pd.DataFrame(filtered_articles)

# Per-day max and min of both scores; the result is indexed by pub_date with
# columns ordered headline max, headline min, abstract max, abstract min.
score_columns = ['headline_sentiment', 'abstract_sentiment']
grouped_data = cleaned_ny_data_df.groupby('pub_date')[score_columns].agg(['max', 'min'])

# Swap the two-level column labels for the flat report names and name the index.
grouped_data.columns = ['Headline Max', 'Headline Min', 'Body Max', 'Body Min']
grouped_data.index.name = 'Date'

# Two decimal places are enough for the report.
grouped_data_rounded = grouped_data.round(2)

# Persist the rounded table.
grouped_data_rounded.to_csv("ny_times_data_max_min.csv")

In [11]:
# Purpose: combine the per-day mean/median and max/min tables into one CSV.

# Load both tables written by the earlier cells.
df1 = pd.read_csv('ny_times_data.csv')
df2 = pd.read_csv('ny_times_data_max_min.csv')

# Final column layout: the date first, then the statistics of each table.
column_order = [
    'Date',
    'Headline Mean', 'Headline Median', 'Body Mean', 'Body Median',
    'Headline Max', 'Headline Min', 'Body Max', 'Body Min',
]

# Default (inner) join on 'Date': only dates present in both tables survive.
merged_df = df1.merge(df2, on='Date')
merged_df = merged_df.loc[:, column_order]

# Write the combined table without the positional index.
merged_df.to_csv('ny_times_merged_data.csv', index=False)
