In [1]:
#Improt necessary libraries
import requests
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from datetime import datetime, timedelta
import time
from google.colab import files
import re

# Connect to Google Colab
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Read a CSV file from Google Drive
df = pd.read_csv('/content/drive/MyDrive/Masters/Dissertation/API_details.txt')

# Read a text file from Google Drive which contains API key and end point
with open('/content/drive/MyDrive/Masters/Dissertation/API_details.txt', 'r') as f:
  text = f.read()

first_line = text.split('\n')[0]
second_line = text.split('\n')[1]


In [3]:
# Download VADER lexicon
nltk.download('vader_lexicon')

# Initialize the Sentiment Intensity Analyzer
sia = SentimentIntensityAnalyzer()

# API configuration. These token taken from perigon
api_token = first_line
base_url = second_line

# Fetch news data for a specific date range and query
def fetch_news(start_date, end_date, query):
    headers = {
        'Authorization': f'Bearer {api_token}'
    }
    # param dictionary to define query parameters
    params = {
        'q': query,
        'from': start_date,
        'to': end_date,
        'language': 'en',
        'sort': 'date',
        'size': 100
    }
    response = requests.get(base_url, headers=headers, params=params)
    if response.status_code == 200:
        return response.json().get('articles', [])
    elif response.status_code == 400:
        print(f"Bad Request: Check parameters for {start_date} to {end_date}. Response: {response.json()}")
        return []
    elif response.status_code == 402:
        print("Payment Required: You have exceeded your free tier usage limits or need to upgrade your plan.")
        return []
    else:
        print(f"Failed to fetch news for {start_date} to {end_date}: {response.status_code}")
        return []

# Function to clean text
def clean_text(text):
    return re.sub(r'[^\x00-\x7F]+', '', text)

# Function to analyze sentiment and save results to CSV files
def analyze_and_save(start_date, end_date, query, file_prefix):
    # Create DataFrames to hold results
    sentiment_columns = ["Date", "News", "Positive", "Neutral", "Negative", "Final Sentiment"]
    articles_df = pd.DataFrame(columns=["Date", "Title", "Content", "Sentiment"])
    sentiment_df = pd.DataFrame(columns=sentiment_columns)

    current_date = start_date

    while current_date <= end_date:
        next_date = current_date + timedelta(days=1)
        articles = fetch_news(current_date.strftime('%Y-%m-%d'), next_date.strftime('%Y-%m-%d'), query)
        for article in articles:
            content = article['content'] if article['content'] else article.get('summary', '')
            if content and 'pubDate' in article:
                content_clean = clean_text(content)
                scores = sia.polarity_scores(content_clean)
                final_sentiment = scores['pos'] - scores['neg']
                article_row = pd.DataFrame([{
                    "Date": article['pubDate'][:10],
                    "Title": article['title'],
                    "Content": content_clean,
                    "Sentiment": final_sentiment
                }])
                sentiment_row = pd.DataFrame([{
                    "Date": article['pubDate'][:10],
                    "News": content_clean,
                    "Positive": scores['pos'],
                    "Neutral": scores['neu'],
                    "Negative": scores['neg'],
                    "Final Sentiment": final_sentiment
                }])
                articles_df = pd.concat([articles_df, article_row], ignore_index=True)
                sentiment_df = pd.concat([sentiment_df, sentiment_row], ignore_index=True)

        # Save interim results every day
        articles_df.to_csv(f"{file_prefix}_news_articles_all.csv", index=False, encoding='utf-8-sig')
        sentiment_df.to_csv(f"{file_prefix}_sentiment_analysis_all.csv", index=False, encoding='utf-8-sig')

        current_date = next_date
        time.sleep(1)  # Sleep to avoid hitting the rate limit

    print(f"News articles saved to {file_prefix}_news_articles_all.csv")
    print(f"Sentiment analysis saved to {file_prefix}_sentiment_analysis_all.csv")

    # Download the files
    files.download(f'{file_prefix}_news_articles_all.csv')
    files.download(f'{file_prefix}_sentiment_analysis_all.csv')

    return articles_df, sentiment_df

user_dates = input("Please enter the number of days you need to look download: ")

while True:
    if user_dates.isnumeric():
        user_dates = int(user_dates)

        # Set up the date range
        start_date = datetime.now() - timedelta(days=user_dates)
        end_date = datetime.now()

        # Analyze and save news for S&P 500
        articles_df, sentiment_df = analyze_and_save(start_date, end_date, 'S&P 500', 'spx500')

        # Display dataframes
        print(articles_df.head())
        print(sentiment_df.head())
        break  # Exit the loop after processing
    else:
        user_dates = input("Invalid data. Please enter the number of days you need to look download: ")


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


Please enter the number of days you need to look download: 14


  articles_df = pd.concat([articles_df, article_row], ignore_index=True)
  sentiment_df = pd.concat([sentiment_df, sentiment_row], ignore_index=True)


News articles saved to spx500_news_articles_all.csv
Sentiment analysis saved to spx500_sentiment_analysis_all.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

         Date                                       Title  \
0  2024-08-13                     S&P 500 Momentum Report   
1  2024-08-13                  S&P 500 Analyst Moves: KEY   
2  2024-08-13             CRITICAL Week Ahead for S&P 500   
3  2024-08-13      S&P 500 Gains 1.7% in Everything Rally   
4  2024-08-14  The Highest Yielding Stocks In the S&P 500   

                                             Content  Sentiment  
0  All eyes on upcoming US inflation data this we...     -0.036  
1  The latest tally of analyst opinions from the ...      0.000  
2  In this video from StockCharts TV, Julius asse...      0.068  
3  The stock market rallied across the board on T...      0.125  
4  The Highest Yielding Stocks In the S&P 500\n\n...      0.000  
         Date                                               News  Positive  \
0  2024-08-13  All eyes on upcoming US inflation data this we...     0.000   
1  2024-08-13  The latest tally of analyst opinions from the ...     0.000   
2  