In [None]:
# pyspark packages
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, BooleanType

#other needed packages
import re
import os

# PySpark launches a JVM, so JAVA_HOME is pointed at an OpenJDK 17 install first.
# NOTE(review): this is a machine-specific absolute path (looks like a Homebrew
# location) — confirm or adjust before running on another machine.
os.environ['JAVA_HOME'] = '/opt/homebrew/opt/openjdk@17'

# Build the session (or reuse an existing one) with the driver bound to loopback.
spark = (
    SparkSession.builder
    .appName("stock market preds")
    .config("spark.driver.host", "127.0.0.1")
    .getOrCreate()
)

# Only ERROR-level Spark log messages are emitted from this point on.
spark.sparkContext.setLogLevel("ERROR")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
26/01/16 04:05:35 WARN Utils: Your hostname, Jeffreys-MacBook-Air.local, resolves to a loopback address: 127.0.0.1; using 10.0.0.17 instead (on interface en0)
26/01/16 04:05:35 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/01/16 04:05:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# functions to import data, run SQL from file and save back to file
# function to import and clean columns
def import_csv_to_table(table_name, file, format_cols):
    """Read a CSV into a Spark DataFrame and register it as a temp view.

    Args:
        table_name: Name of the temporary SQL view to create or replace.
        file: Path of the CSV file, passed straight to ``spark.read.csv``.
        format_cols: When truthy, every column name has all characters other
            than ASCII letters, digits and whitespace removed, is lower-cased,
            and has its spaces replaced with underscores.

    Returns:
        The DataFrame that backs the registered view.
    """
    # The CSV is read with a header row, double-quote quoting/escaping,
    # multi-line fields and schema inference.
    reader_options = dict(
        header=True,
        quote="\"",
        escape="\"",
        multiLine=True,
        inferSchema=True,
    )
    df = spark.read.csv(file, **reader_options)

    if format_cols:
        def _tidy(col_name):
            # Strip punctuation first, then normalise case and spaces.
            stripped = re.sub(r"[^a-zA-Z0-9\s]", "", col_name)
            return stripped.lower().replace(" ", "_")

        df = df.toDF(*map(_tidy, df.columns))

    # Expose the (possibly renamed) frame to spark.sql under table_name.
    df.createOrReplaceTempView(f"{table_name}")
    return df

#run a SQL step
def sql_step(file):
    """Execute the SQL stored in a UTF-8 text file and return the result.

    Args:
        file: Path of the file whose entire contents are sent to ``spark.sql``.

    Returns:
        Whatever ``spark.sql`` returns for that statement.
    """
    with open(file, mode='r', encoding='utf-8') as sql_file:
        query = sql_file.read()
    return spark.sql(query)

#run SQL and view output inline
def run_sql(file, rowstoshow=20, print_sql=False):
    """Execute the SQL stored in a UTF-8 text file and display the result inline.

    Args:
        file: Path of the file whose entire contents are sent to ``spark.sql``.
        rowstoshow: Number of rows passed to ``show``. Defaults to 20.
        print_sql: When truthy, the SQL text is printed before the result table.
            Defaults to False.

    Returns:
        None. The result is displayed with ``show(..., truncate=False)`` so
        long cell values are not cut off.
    """
    # The handle gets its own name so the `file` argument is not rebound.
    with open(file, 'r', encoding='utf-8') as sql_file:
        sql_text = sql_file.read()

    # The statement is executed before anything is printed, so a failing query
    # raises without echoing the SQL.
    results = spark.sql(sql_text)
    if print_sql:
        print(sql_text)
    results.show(rowstoshow, truncate=False)

# export data frame to csv
def export_csv(df, output_dir, final_file_name):
    """Write a DataFrame as one CSV file and give that file a stable name.

    The frame is coalesced to a single partition and written (with a header,
    overwriting any previous output) into ``output_dir``. The resulting
    ``part-*.csv`` file is then renamed to ``final_file_name`` inside the same
    directory.

    Args:
        df: DataFrame to export.
        output_dir: Directory handed to the CSV writer.
        final_file_name: File name (not a path) for the renamed part file.

    Returns:
        None. A success or error message is printed.
    """
    df.coalesce(1).write.csv(output_dir, header=True, mode="overwrite")

    # Default to None so a missing part file reaches the error message below
    # instead of raising on an unassigned local.
    part_file_path = next(
        (
            os.path.join(output_dir, name)
            for name in sorted(os.listdir(output_dir))
            if name.startswith("part-") and name.endswith(".csv")
        ),
        None,
    )

    if part_file_path is None:
        print("Error: Part file not found.")
        return

    # os.replace overwrites an existing target on every platform.
    os.replace(part_file_path, os.path.join(output_dir, final_file_name))
    print(f"CSV saved as: {final_file_name}")

In [3]:
# Load both raw CSVs and register them as the temp views "news" and "stocks".
# format_cols=False keeps the column names exactly as they appear in the files.
news = import_csv_to_table("news", "raw_data/news_data.csv", False)
stocks = import_csv_to_table("stocks", "raw_data/stock_data.csv", False)

In [4]:
# Run the SQL stored in sql/sentiment_data_prep.sql and keep the result as feature_set,
# then preview 20 rows without truncating long cell values.
feature_set = sql_step("sql/sentiment_data_prep.sql")
feature_set.show(20, truncate=False)

+------+----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
|symbol|news_date |daily_news_text                                                                                                                                                                                                                                                                                                                                                                                               |percent_daily_price_change|
+------+----------+-----------------------------------------------------------------------------------------

In [16]:
# Preview the raw news table using show()'s default row count and truncation.
news.show()

+--------+--------------------+--------------------+-----------------+-------------------+-------------------+--------------------+--------------------+--------+
|      id|            headline|             summary|           author|         created_at|         updated_at|                 url|             symbols|  source|
+--------+--------------------+--------------------+-----------------+-------------------+-------------------+--------------------+--------------------+--------+
|49701666|Evercore ISI Grou...|                NULL|Benzinga Newsdesk|2026-01-05 10:50:55|2026-01-05 10:50:56|https://www.benzi...|                   A|benzinga|
|49391324|Barclays Upgrades...|                NULL|Benzinga Newsdesk|2025-12-15 06:49:28|2025-12-15 06:49:29|https://www.benzi...|                   A|benzinga|
|49342760|What's Driving th...|                    |Benzinga Insights|2025-12-11 11:00:38|2025-12-11 11:00:39|https://www.benzi...|                   A|benzinga|
|49276887|Goldman Sachs Ini.

In [20]:
# Pull the feature set into pandas and split it 80/20 by article, so every row
# that shares a news_article_id lands on the same side (no article leakage).
# NOTE(review): the feature_set.show() output earlier in this notebook lists only
# symbol / news_date / daily_news_text / percent_daily_price_change — confirm the
# current SQL still returns news_article_id, otherwise the lookups below fail.
import pandas as pd

df = feature_set.toPandas()
unique_ids = df['news_article_id'].unique()

# A fixed random_state keeps the sampled article ids reproducible between runs.
train_ids = pd.Series(unique_ids).sample(frac=0.8, random_state=42).values

# One membership mask drives both halves of the split.
in_train = df['news_article_id'].isin(train_ids)
train_df = df[in_train]
test_df = df[~in_train]
print(f"Train: {len(train_df)}, Test: {len(test_df)}")

Train: 28264, Test: 7166


In [21]:
# Peek at the first rows of the training split.
train_df.head()

Unnamed: 0,news_article_id,symbol,news_date,article_text,percent_daily_price_change
0,48346820,LII,2025-10-22,"Headline: Earnings Scheduled For October 22, 2...",-0.013263
1,48346820,SF,2025-10-22,"Headline: Earnings Scheduled For October 22, 2...",-0.000508
2,48931230,VRT,2025-11-18,Headline: 10 Industrials Stocks With Whale Ale...,-0.097841
3,47070626,AAON,2025-08-12,Headline: Mercury Systems Posts Better-Than-Ex...,-0.04196
4,49699047,TSLA,2026-01-05,Headline: EXCLUSIVE: Top 20 Most-Searched Tick...,-0.010301


In [None]:
import json

# Read the local secrets file; the token itself never appears in the notebook.
with open('api_keys.json', 'r') as key_file:
    api_keys = json.load(key_file)

# Hugging Face access token, used later for login().
HUGGINGFACE_KEY = api_keys['HUGGINGFACE_KEY']

In [None]:
# ==========================================
# FinGPT-Forecaster Integration
# ==========================================
# Note: Using Apple Silicon MPS for acceleration
#
# PREREQUISITES:
# 1. Request access to Llama-2 at: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
# 2. Run: huggingface-cli login (in terminal) with your HF token
#    OR set token below

from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from huggingface_hub import login
import torch
import re
from tqdm import tqdm

# ========== HUGGING FACE LOGIN ==========
# Authenticates with the token read from api_keys.json in an earlier cell.
login(token=HUGGINGFACE_KEY)

# Check MPS availability and set device
# NOTE(review): mps_device is only printed in this cell; actual placement is
# decided by device_map="auto" below — confirm whether it is used elsewhere.
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    print("✅ MPS (Metal Performance Shaders) is available!")
    print(f"   Using device: {mps_device}")
else:
    mps_device = torch.device("cpu")
    print("⚠️ MPS not available, falling back to CPU")

# Load FinGPT-Forecaster model with MPS support
print("\nLoading FinGPT-Forecaster model...")
print("(Using float16 to reduce memory - requires ~14GB RAM)\n")

# Clear any cached memory first
import gc
gc.collect()
if torch.backends.mps.is_available():
    torch.mps.empty_cache()

# trust_remote_code is intentionally NOT enabled: Llama-2 is a built-in
# transformers architecture, so no code from the Hub needs to be executed.
# NOTE(review): the captured output of this cell reports `torch_dtype` as
# deprecated in favour of `dtype`; kept here for compatibility with older
# transformers releases — switch once the minimum version is pinned.
base_model = AutoModelForCausalLM.from_pretrained(
    'meta-llama/Llama-2-7b-chat-hf',
    torch_dtype=torch.float16,  # Use float16 to halve memory usage
    low_cpu_mem_usage=True,
    device_map="auto",  # Let it handle device placement
)

# Tokenizer comes from the base checkpoint; the LoRA adapter is layered on top
# of the base model and the result is switched to eval mode for inference.
forecaster_tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-chat-hf')
forecaster_model = PeftModel.from_pretrained(base_model, 'FinGPT/fingpt-forecaster_dow30_llama2-7b_lora')
forecaster_model = forecaster_model.eval()

print("\n✅ Model loaded successfully!")
print(f"   Model dtype: {next(forecaster_model.parameters()).dtype}")
print(f"   Model device: {next(forecaster_model.parameters()).device}")

✅ MPS (Metal Performance Shaders) is available!
   Using device: mps

Loading FinGPT-Forecaster model...
(This requires ~14GB RAM and may take 1-2 minutes)



'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 4a8f654c-e33c-4374-93a1-fcc484100134)')' thrown while requesting GET https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/resolve/main/config.json
Retrying in 1s [Retry 1/5].
'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 3747d447-c878-4684-9691-f5e0c178e43e)')' thrown while requesting GET https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/resolve/main/config.json
Retrying in 2s [Retry 2/5].
`torch_dtype` is deprecated! Use `dtype` instead!
Fetching 2 files: 100%|██████████| 2/2 [02:40<00:00, 80.10s/it] 
Loading checkpoint shards: 100%|██████████| 2/2 [03:11<00:00, 95.56s/it] 


: 

In [None]:
# ==========================================
# Create Forecaster Prompt Function
# ==========================================

# System prompt for FinGPT-Forecaster.
# create_forecaster_prompt() places this text between the <<SYS>> ... <</SYS>> markers.
# This is a plain (non-f) string, so "{Up/Down}" below reaches the model literally.
# NOTE(review): this prompt asks for a prediction for "the upcoming week", while the
# user prompt built in create_forecaster_prompt() asks about "tomorrow" — confirm
# which horizon is intended.
SYSTEM_PROMPT = """You are a seasoned stock market analyst. Your task is to list the positive developments and potential concerns for companies based on relevant news and basic financials from the past weeks, then provide an analysis and prediction for the companies' stock price movement for the upcoming week. Your answer format should be as follows:

[Positive Developments]:
1. ...

[Potential Concerns]:
1. ...

[Prediction & Analysis]:
Prediction: {Up/Down}
Analysis: ...
"""

def create_forecaster_prompt(symbol, news_date, article_text):
    """
    Assemble the full Llama-2 chat prompt for one (symbol, date, news text) row.

    The result is "[INST]", then SYSTEM_PROMPT wrapped in <<SYS>> markers, then
    the user request built from the arguments, then "[/INST]".
    """
    # User-side request, built piece by piece.
    header = f"\n[Company Symbol]: {symbol}\n[Date]: {news_date}\n\n"
    request = f"From the news articles below, analyze {symbol}'s stock outlook:\n\n"
    articles = f"{article_text}\n\n"
    closing = (
        "Based on the information above, analyze the positive developments "
        f"and potential concerns for {symbol}. Then predict whether the stock "
        "price will go Up or Down tomorrow.\n"
    )
    user_block = header + request + articles + closing

    # System section in Llama-2 chat markup.
    system_block = "<<SYS>>\n" + SYSTEM_PROMPT + "\n<</SYS>>\n\n"

    return "[INST]" + system_block + user_block + "[/INST]"


def _parse_prediction(answer):
    """Map the first explicit 'prediction: <direction>' label in answer to Up/Down/Neutral."""
    # One combined pattern means the earliest label in the text decides;
    # searching for "up" and "down" separately lets a later "prediction: up"
    # override an earlier "Prediction: Down".
    match = re.search(
        r'prediction[:\s]*(?:(up|increase|rise|positive)|(down|decrease|fall|negative|decline))',
        answer,
        re.IGNORECASE,
    )
    if match is None:
        return "Neutral"
    return "Up" if match.group(1) else "Down"


def get_fingpt_prediction(symbol, news_date, article_text, max_length=2048):
    """
    Get a price movement prediction from FinGPT-Forecaster.

    Args:
        symbol: Ticker inserted into the prompt.
        news_date: Date string inserted into the prompt.
        article_text: News text inserted into the prompt.
        max_length: Passed to generate() as max_length.

    Returns:
        dict with 'prediction' ("Up", "Down", or "Neutral" when no explicit
        label is found), 'analysis' (decoded text after the last "[/INST]")
        and 'raw_output' (the full decoded sequence).

    Generation samples (do_sample=True), so repeated calls with the same
    inputs can return different text.
    """
    prompt = create_forecaster_prompt(symbol, news_date, article_text)

    # The tokenized prompt is capped at 1024 tokens.
    inputs = forecaster_tokenizer(prompt, return_tensors='pt', truncation=True, max_length=1024)
    inputs = {key: value.to(forecaster_model.device) for key, value in inputs.items()}

    with torch.no_grad():
        output_ids = forecaster_model.generate(
            **inputs,
            max_length=max_length,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            eos_token_id=forecaster_tokenizer.eos_token_id,
            use_cache=True
        )

    output = forecaster_tokenizer.decode(output_ids[0], skip_special_tokens=True)
    # Greedy match: everything up to and including the last "[/INST]" is dropped.
    answer = re.sub(r'.*\[/INST\]\s*', '', output, flags=re.DOTALL)

    return {
        'prediction': _parse_prediction(answer),
        'analysis': answer,
        'raw_output': output
    }

print("Forecaster functions ready!")

In [None]:
# ==========================================
# Apply FinGPT-Forecaster to feature_set
# ==========================================

# Convert to pandas if needed
df_predictions = feature_set.toPandas()

# Limit to a sample for testing (remove/increase for full dataset)
sample_size = 10  # Adjust as needed - each prediction takes ~10-30 seconds
df_sample = df_predictions.head(sample_size).copy()

print(f"Running FinGPT-Forecaster on {len(df_sample)} samples...")
print("="*60)

# Store predictions
predictions = []
analyses = []

for idx, row in tqdm(df_sample.iterrows(), total=len(df_sample), desc="Predicting"):
    # Only the model call is guarded. Exactly one entry is appended per row,
    # so the two lists always line up with df_sample when assigned below.
    try:
        result = get_fingpt_prediction(
            symbol=row['symbol'],
            news_date=str(row['news_date']),
            article_text=row['daily_news_text'][:2000]  # Truncate very long articles
        )
    except Exception as e:
        # Best effort: record the failure and keep going with the next row.
        print(f"Error on {row['symbol']}: {e}")
        predictions.append("Error")
        analyses.append(str(e))
        continue

    predictions.append(result['prediction'])
    analyses.append(result['analysis'][:500])  # Store truncated analysis

    # Print progress
    # A change of exactly 0 (or NaN) is labelled "Down" by this comparison.
    actual_direction = "Up" if row['percent_daily_price_change'] > 0 else "Down"
    print(f"\n{row['symbol']} ({row['news_date']}): Predicted={result['prediction']}, Actual={actual_direction}")

# Add predictions to dataframe
df_sample['fingpt_prediction'] = predictions
df_sample['fingpt_analysis'] = analyses

# Calculate actual direction for comparison
df_sample['actual_direction'] = df_sample['percent_daily_price_change'].apply(
    lambda x: "Up" if x > 0 else "Down"
)

print("\n" + "="*60)
print("Predictions complete!")

In [None]:
# ==========================================
# Evaluate FinGPT-Forecaster Performance
# ==========================================
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd

# Filter out errors
df_eval = df_sample[df_sample['fingpt_prediction'] != 'Error'].copy()

if len(df_eval) > 0:
    y_true = df_eval['actual_direction']
    y_pred = df_eval['fingpt_prediction']

    # Calculate accuracy
    # "Neutral" predictions (no parsable Up/Down label) never equal the actual
    # direction, so they count as misses here.
    accuracy = accuracy_score(y_true, y_pred)
    neutral_count = int((y_pred == 'Neutral').sum())

    print("="*60)
    print("FinGPT-Forecaster Evaluation Results")
    print("="*60)
    print(f"\nAccuracy: {accuracy:.2%}")
    print(f"Samples evaluated: {len(df_eval)}")
    print(f"Neutral (unparsed) predictions counted as misses: {neutral_count}")

    print("\nClassification Report:")
    # zero_division=0 keeps the report quiet for classes with no true samples.
    print(classification_report(y_true, y_pred, zero_division=0))

    print("\nConfusion Matrix:")
    # "Neutral" is included as a predicted label so each row sums to the number
    # of evaluated samples with that actual direction. The actual direction is
    # only ever Up or Down, so only those two rows are shown.
    cm_full = confusion_matrix(y_true, y_pred, labels=['Up', 'Down', 'Neutral'])
    cm = cm_full[:2, :2]  # Up/Down-only matrix, same shape as before
    cm_df = pd.DataFrame(
        cm_full[:2],
        index=['Actual Up', 'Actual Down'],
        columns=['Pred Up', 'Pred Down', 'Pred Neutral'],
    )
    print(cm_df)

    # Show sample predictions with actual results
    print("\n" + "="*60)
    print("Sample Predictions vs Actual:")
    print("="*60)
    display_cols = ['symbol', 'news_date', 'fingpt_prediction', 'actual_direction', 'percent_daily_price_change']
    print(df_eval[display_cols].to_string(index=False))
else:
    print("No successful predictions to evaluate.")