# Training Data Generator

The purpose of this file is to:
- Explain how the LLM training data is generated
- Describe the format the data is in

## Input Example

**Week Ending on:** 2022-12-02

**News Summary:**

- **Title of Article 1** (2022-11-28): Summary of Article 1. Author: Author Name, Source: News Source, Sentiment Score: 0.5

**Stock Closing Prices for Past 3 Months (Fridays):**

- 2022-09-02: $3000, ...



## Output Example

**Predicted Stock Closing Prices for Next 12 Weeks (Fridays):**

- 2022-12-09: $3050, ...

**Predicted Market Sentiment:**

- 0.5


## Processing an Example

### Imports

In [1]:
import json
import pandas as pd
from datetime import datetime, timedelta

### Get Friday Function

- Gets the Friday of the week (Monday–Sunday) that the input date falls in

In [2]:

def get_friday(date_str):
    """Return the Friday of the Monday-Sunday week containing ``date_str``.

    Args:
        date_str: Timestamp formatted as ``%Y%m%dT%H%M%S``
            (for example ``"20230102T120000"``).

    Returns:
        The Friday of that week as a ``YYYY-MM-DD`` string. Saturday and
        Sunday inputs resolve to the Friday immediately before them.
    """
    parsed = datetime.strptime(date_str, "%Y%m%dT%H%M%S")
    # weekday() is 0 for Monday and 4 for Friday, so the offset is negative
    # for weekend dates and steps back to the preceding Friday.
    offset_to_friday = timedelta(days=4 - parsed.weekday())
    friday = parsed + offset_to_friday
    return friday.strftime("%Y-%m-%d")


### Open News & Closing Price Datasets

In [3]:
ticker = "AAPL"

In [4]:

# Load the news data for the selected ticker.
# The encoding is given explicitly so the JSON is decoded the same way on
# every platform instead of depending on the locale's default encoding.
with open(f'./News/{ticker}_News/News_{ticker}_202301_to_202301.json', 'r', encoding='utf-8') as file:
    news_data = json.load(file)


In [6]:

# Load the weekly closing prices for the selected ticker.
# The encoding is given explicitly so the JSON is decoded the same way on
# every platform instead of depending on the locale's default encoding.
with open(f'./Weekly-Closing-Prices/{ticker}_Pricing/{ticker}_weekly_close.json', 'r', encoding='utf-8') as file:
    price_data = json.load(file)


### Group Articles by Week

In [7]:

# Bucket every article under the Friday of the week it was published in
weekly_articles = {}
for article in news_data['feed']:
    friday = get_friday(article['time_published'])
    # setdefault creates the week's list the first time that Friday is seen
    weekly_articles.setdefault(friday, []).append(article)


In [8]:

# Order each week's articles in place by their publication timestamp string
for articles_in_week in weekly_articles.values():
    articles_in_week.sort(key=lambda entry: entry['time_published'])


### Get Prices Function

- Gets the past 3 months (12 weeks) of Friday closing prices
- Gets the next 12 weeks of Friday closing prices

In [9]:

# Function to get the past 3 months and next 12 weeks of prices (Fridays only)
def get_prices(week_date, price_data, past_weeks=12, future_weeks=12):
    """Collect the closing prices surrounding ``week_date`` in 7-day steps.

    Args:
        week_date: Reference date as a ``YYYY-MM-DD`` string.
        price_data: Mapping from ``YYYY-MM-DD`` date strings to prices.
        past_weeks: Number of 7-day steps to look back.
        future_weeks: Number of 7-day steps to look ahead.

    Returns:
        A ``(past_prices, future_prices)`` tuple of dicts keyed by date
        string. Dates missing from ``price_data`` are skipped. Past entries
        are inserted most-recent first, future entries earliest first; the
        reference date itself is in neither dict.
    """
    anchor = datetime.strptime(week_date, "%Y-%m-%d")
    one_week = timedelta(days=7)

    # Candidate dates stepping backwards from the anchor
    past_keys = [
        (anchor - step * one_week).strftime("%Y-%m-%d")
        for step in range(1, past_weeks + 1)
    ]
    # Candidate dates stepping forwards from the anchor
    future_keys = [
        (anchor + step * one_week).strftime("%Y-%m-%d")
        for step in range(1, future_weeks + 1)
    ]

    past_prices = {key: price_data[key] for key in past_keys if key in price_data}
    future_prices = {key: price_data[key] for key in future_keys if key in price_data}
    return past_prices, future_prices


### Formatting the Data

In [10]:

# Build one [week, prompt, output] row per week of grouped articles
data = []

for week, articles in weekly_articles.items():
    # Numbered news lines; the sentiment total is accumulated alongside them
    news_lines = []
    total_sentiment = 0
    for position, article in enumerate(articles, start=1):
        published = datetime.strptime(article['time_published'], "%Y%m%dT%H%M%S")
        date_published = published.strftime("%Y-%m-%d")
        # Only the first listed author is shown
        if article['authors']:
            author = article['authors'][0]
        else:
            author = "Unknown Author"
        score = article['overall_sentiment_score']
        news_lines.append(
            f"{position}. {article['title']} - {date_published}: {article['summary']}, "
            f"Author: {author}, Source: {article['source']}, Sentiment Score: {score}\n"
        )
        total_sentiment += score

    past_prices, future_prices = get_prices(week, price_data)

    # Prices are listed in ascending date order on a single line
    past_text = "".join(f"{day}: {close}, " for day, close in sorted(past_prices.items()))
    future_text = "".join(f"{day}: {close}, " for day, close in sorted(future_prices.items()))

    # Mean sentiment over all of the week's articles
    if articles:
        mean_sentiment = total_sentiment / len(articles)
    else:
        mean_sentiment = 'N/A'

    prompt = (
        f"Week Ending on {week}:\n\nNews Summary:\n"
        + "".join(news_lines)
        + "\nStock Closing Prices for Past 3 Months (Fridays):\n"
        + past_text
    )
    output = (
        "\nPredicted Stock Closing Prices for Next 12 Weeks (Fridays):\n"
        + future_text
        + f"\nPredicted Market Sentiment: {mean_sentiment}"
    )

    data.append([week, prompt, output])


### Saving Data to Excel

In [11]:

# Turn the collected rows into a three-column table and write it to Excel
column_names = ['Week', 'Prompt', 'Output']
df = pd.DataFrame(data, columns=column_names)
df.to_excel('LLM-Training-Dataset/example_training_data.xlsx', index=False)


# Final Data Generator

In [13]:
import json
import pandas as pd
from datetime import datetime, timedelta
import os
import glob

def get_friday(date_str):
    """Map a ``%Y%m%dT%H%M%S`` timestamp to the Friday of its week.

    Weeks run Monday through Sunday, so a Saturday or Sunday timestamp maps
    to the Friday just before it.

    Args:
        date_str: Timestamp string such as ``"20230102T120000"``.

    Returns:
        The Friday's date formatted as ``YYYY-MM-DD``.
    """
    moment = datetime.strptime(date_str, "%Y%m%dT%H%M%S")
    # Friday is weekday 4; the difference is the signed day shift needed
    days_until_friday = 4 - moment.weekday()
    friday = moment + timedelta(days=days_until_friday)
    return friday.strftime("%Y-%m-%d")

# Initialize a dictionary to hold all weekly articles, keyed by Friday date
weekly_articles = {}

# Assuming all JSON news files are in the './News/' folder
ticker = "MSFT"

# glob returns matches in arbitrary order; sorting the paths keeps the order
# of weeks (and of articles sharing a timestamp) the same on every run.
news_files = sorted(glob.glob(f'./News/{ticker}_News/*.json'))
for file_path in news_files:
    # Explicit encoding so decoding does not depend on the locale default
    with open(file_path, 'r', encoding='utf-8') as file:
        news_data = json.load(file)
    # Process each article in the file
    for article in news_data['feed']:
        date = get_friday(article['time_published'])
        weekly_articles.setdefault(date, []).append(article)

# Sort articles within each week by their publication timestamp string
for week in weekly_articles:
    weekly_articles[week].sort(key=lambda x: x['time_published'])

# Load the weekly closing prices for the selected ticker.
# The encoding is given explicitly so the JSON is decoded the same way on
# every platform instead of depending on the locale's default encoding.
with open(f'./Weekly-Closing-Prices/{ticker}_Pricing/{ticker}_weekly_close.json', 'r', encoding='utf-8') as file:
    price_data = json.load(file)

def get_prices(week_date, price_data, past_weeks=12, future_weeks=12):
    """Look up prices in 7-day steps before and after ``week_date``.

    Args:
        week_date: Reference date as a ``YYYY-MM-DD`` string.
        price_data: Mapping from ``YYYY-MM-DD`` date strings to prices.
        past_weeks: How many 7-day steps to walk backwards.
        future_weeks: How many 7-day steps to walk forwards.

    Returns:
        A ``(past_prices, future_prices)`` tuple of dicts keyed by date
        string. Dates absent from ``price_data`` are left out, and the
        reference date itself is never included.
    """
    reference = datetime.strptime(week_date, "%Y-%m-%d")

    def walk(direction, steps):
        # Step away from the reference date one week at a time, keeping
        # only the dates that have a recorded price.
        found = {}
        cursor = reference
        for _ in range(steps):
            cursor = cursor + direction * timedelta(days=7)
            key = cursor.strftime("%Y-%m-%d")
            if key in price_data:
                found[key] = price_data[key]
        return found

    return walk(-1, past_weeks), walk(1, future_weeks)

# One [week, prompt, output] row is collected per week of grouped articles
data = []

# Instruction text describing the prediction task for the current ticker.
# The trailing newline is written as an escape: a raw line break inside a
# single-quoted string literal is a syntax error.
# NOTE(review): `start` is not referenced again in this cell - confirm whether
# it was meant to be prepended to each prompt.
start = (
    "Given the following information, including news summaries with sentiment scores "
    "and stock closing prices for the past 3 months, predict the next 12 weeks of "
    f"closing prices for {ticker}. "
    "The predictions should be returned in a date:price format, similar to the "
    "historical closing prices provided. "
    "Ensure each prediction is on a new line and follows the exact format as shown "
    "in the examples (YYYY-MM-DD: $price).\n"
)

for week, articles in weekly_articles.items():
    # Rank positive articles from most to least positive and negative articles
    # from most to least negative; articles scoring exactly 0 are left out.
    articles_sorted_positive = sorted(
        [a for a in articles if a['overall_sentiment_score'] > 0],
        key=lambda x: x['overall_sentiment_score'],
        reverse=True,
    )
    articles_sorted_negative = sorted(
        [a for a in articles if a['overall_sentiment_score'] < 0],
        key=lambda x: x['overall_sentiment_score'],
    )

    # Combine the 10 most positive and 10 most negative articles, if available
    top_articles = articles_sorted_positive[:10] + articles_sorted_negative[:10]

    prompt = f"Week Ending on {week}:\n\nNews Summary:\n"
    total_sentiment = 0
    seen_titles = set()

    for article in top_articles:
        # Skip articles whose title has already been listed for this week
        if article['title'] not in seen_titles:
            date_published = datetime.strptime(article['time_published'], "%Y%m%dT%H%M%S").strftime("%Y-%m-%d")
            # Only the first listed author is shown
            author = article['authors'][0] if article['authors'] else "Unknown Author"
            prompt += f"- Title: {article['title']}, Date: {date_published}, Author: {author}, Source: {article['source']}, Sentiment Score: {article['overall_sentiment_score']}\n"
            total_sentiment += article['overall_sentiment_score']
            seen_titles.add(article['title'])

    past_prices, future_prices = get_prices(week, price_data)

    # Past prices: ascending date order, one per line
    prompt += "\nStock Closing Prices for Past 3 Months (Fridays):\n"
    for date, price in sorted(past_prices.items()):
        prompt += f"{date}: ${price}, \n"

    # Future prices: ascending date order, all on a single line
    output = "\nPredicted Stock Closing Prices for Next 12 Weeks (Fridays):\n"
    for date, price in sorted(future_prices.items()):
        output += f"{date}: ${price}, "
    # Mean sentiment over the uniquely-titled articles that were listed
    output += f"\nPredicted Market Sentiment: {total_sentiment / len(seen_titles) if seen_titles else 'N/A'}"

    data.append([week, prompt, output])

# Write the collected rows to a per-ticker Excel workbook
output_path = f'LLM-Training-Dataset/{ticker}_training_data_influential_articles.xlsx'
df = pd.DataFrame(data, columns=['Week', 'Prompt', 'Output'])
df.to_excel(output_path, index=False)