# Silver layer processing



## Startup cells

In [0]:
# Publish the DataZone identifiers as environment variables so that
# sagemaker_studio imports can read them.

import os

os.environ.update({
    'DataZoneProjectId': '4hril53mejrp2f',
    'DataZoneDomainId': 'dzd-5w47wlphwxsdev',
    'DataZoneEnvironmentId': 'dcza2emsroy8br',
    'DataZoneDomainRegion': 'us-east-1',
})

# Metadata is reachable both through a function and through a plain variable.
# The module-level cache starts empty and is filled on the first call.
_resource_metadata = None


def _get_resource_metadata():
    """Return the resource metadata dict, building and caching it on first use."""
    global _resource_metadata
    if _resource_metadata is not None:
        return _resource_metadata

    additional_metadata = {
        "DataZoneProjectId": "4hril53mejrp2f",
        "DataZoneDomainId": "dzd-5w47wlphwxsdev",
        "DataZoneEnvironmentId": "dcza2emsroy8br",
        "DataZoneDomainRegion": "us-east-1",
    }
    _resource_metadata = {"AdditionalMetadata": additional_metadata}
    return _resource_metadata


metadata = _get_resource_metadata()

In [0]:
"""
Logging Configuration

Purpose:
--------
This sets up the logging framework for code executed in the user namespace.
"""

from typing import Optional


def _set_logging(log_dir: str, log_file: str, log_name: Optional[str] = None):
    """Attach a single rotating file handler to a logger, replacing existing handlers.

    Args:
        log_dir: Directory that should hold the log file. If it cannot be
            created, "/tmp/kernels/" is used instead.
        log_file: Name of the log file inside the chosen directory.
        log_name: Name of the logger to configure. When empty or None the
            root logger is configured.

    Side effects:
        Creates the log directory, removes and closes every handler already
        attached to the logger, sets its level to INFO and emits one
        "Logging initialized" record.
    """
    import os
    import logging
    from logging.handlers import RotatingFileHandler

    level = logging.INFO
    max_bytes = 5 * 1024 * 1024  # rotate once the file reaches 5 MiB
    backup_count = 5

    # fallback to /tmp dir on access, helpful for local dev setup
    try:
        os.makedirs(log_dir, exist_ok=True)
    except Exception:
        log_dir = "/tmp/kernels/"
        os.makedirs(log_dir, exist_ok=True)

    log_path = os.path.join(log_dir, log_file)

    logger = logging.getLogger(log_name) if log_name else logging.getLogger()

    # Close handlers as they are detached: simply dropping the list would leave
    # the previous RotatingFileHandler's file descriptor open on every re-run.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    logger.setLevel(level)

    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")

    # Rotating file handler
    fh = RotatingFileHandler(filename=log_path, maxBytes=max_bytes, backupCount=backup_count, encoding="utf-8")
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    # Report the root logger as "root" rather than the literal "None".
    logger.info("Logging initialized for %s.", log_name or "root")


# Root logger -> kernel.log
_set_logging(
    log_dir="/var/log/computeEnvironments/kernel/",
    log_file="kernel.log",
)
# Dedicated "metrics" logger -> metrics.log
_set_logging(
    log_dir="/var/log/studio/data-notebook-kernel-server/",
    log_file="metrics.log",
    log_name="metrics",
)

In [0]:
import logging
from sagemaker_studio import ClientConfig, sqlutils, sparkutils, dataframeutils

# Logger named after the current module; the two info records below bracket
# the sparkutils initialisation so its duration is visible in the log.
logger = logging.getLogger(__name__)
logger.info("Initializing sparkutils")
# `spark` is used by later cells as the Spark session
# (createDataFrame / sql / read.table / stop).
spark = sparkutils.init()
logger.info("Finished initializing sparkutils")

In [0]:
def _reset_os_path():
    """
    Reset the process's working directory to handle mount timing issues.

    The original author's stated purpose: the Python process can start before
    the filesystem mount is complete and keep referencing old mount paths and
    inodes. Changing explicitly to /home/sagemaker-user makes the process use
    the current mount point.

    The device ID and inode of '.' and of /home/sagemaker-user are logged
    before and after the directory change so the switch can be verified from
    the log. Any failure is logged with its traceback and swallowed; the
    function never raises.

    Note:
        The function is called immediately after its definition so the fix is
        applied as early as possible during kernel initialization.
    """
    import os
    import logging

    # Bound outside the try block so the except handler can always use it.
    logger = logging.getLogger(__name__)
    home_dir = "/home/sagemaker-user"

    def _log_mount_state(banner):
        """Log the cwd plus device/inode of '.' and of the home directory."""
        logger.info(banner)
        logger.info("CWD: %s", os.getcwd())
        # One stat() per path so the device and inode come from the same
        # snapshot; two separate calls could straddle a remount.
        cwd_stat = os.stat('.')
        logger.info("stat('.'): %s %s", cwd_stat.st_dev, cwd_stat.st_ino)
        home_stat = os.stat(home_dir)
        logger.info("stat('/home/sagemaker-user'): %s %s", home_stat.st_dev, home_stat.st_ino)

    try:
        _log_mount_state("---------Before------")
        os.chdir(home_dir)
        _log_mount_state("---------After------")
    except Exception as e:
        # Best-effort fix: record the failure but do not break kernel startup.
        logger.exception("Failed to reset working directory: %s", e)

_reset_os_path()

## Notebook

In [0]:
import boto3
import json

# Initialize S3 client
s3 = boto3.client('s3')

# Bucket name
bucket_name = 'raw-us-east-1-jngai-dev'

# List all objects in the bucket.
# list_objects_v2 returns at most 1,000 keys per call, so a paginator is used;
# otherwise the "most recent" file could be missed on larger buckets.
paginator = s3.get_paginator('list_objects_v2')
bucket_objects = [
    object_summary
    for page in paginator.paginate(Bucket=bucket_name)
    # 'Contents' is absent from the response when there are no keys
    for object_summary in page.get('Contents', [])
]

if not bucket_objects:
    raise FileNotFoundError(f"No objects found in bucket '{bucket_name}'")

# Find the most recent file based on LastModified
latest_file = max(bucket_objects, key=lambda x: x['LastModified'])

print(f"Most recent file: {latest_file['Key']}")
print(f"Last modified: {latest_file['LastModified']}")
print(f"Size: {latest_file['Size']} bytes")
print("-" * 80)

# Download and read the JSON file
obj = s3.get_object(Bucket=bucket_name, Key=latest_file['Key'])
json_data = json.loads(obj['Body'].read().decode('utf-8'))

# Display the JSON data
print(f"\nJSON Content ({len(json_data)} items):\n")
json_data

Most recent file: fmp_articles_2025-12-12_13-43-11.json
Last modified: 2025-12-12 13:43:12+00:00
Size: 8348 bytes
--------------------------------------------------------------------------------

JSON Content (3 items):



[{'title': 'Ciena Corporation (NYSE:CIEN) Stock Update and Financial Performance',
  'date': '2025-12-11 20:06:31',
  'content': '<ul>\n<li><strong>Ciena Corporation</strong> (<a href="https://site.financialmodelingprep.com/financial-summary/CIEN">NYSE:CIEN</a>) has been identified as a global leader in networking systems, competing closely with giants like Cisco Systems and Nokia.</li>\n<li>Evercore ISI set a price target of <strong>$240</strong> for CIEN, closely aligning with its current stock price, indicating market confidence in the company\'s valuation.</li>\n<li>Ciena reported a <strong>20% year-over-year revenue growth</strong> and an improved operating margin of <strong>11.2%</strong>, signaling strong financial health and market position.</li>\n</ul>\n\n<p>Ciena Corporation (<a href="https://site.financialmodelingprep.com/financial-summary/CIEN">NYSE:CIEN</a>) stands out as a global leader in the networking systems, services, and software industry. The company is renowned fo

In [0]:
import pandas as pd

# One row per article record from the downloaded JSON payload
df = pd.DataFrame(json_data)

# Structure report: shape, column names and dtypes, followed by a divider
print(
    f"DataFrame shape: {df.shape}",
    f"\nColumns: {list(df.columns)}",
    f"\nData types:\n{df.dtypes}",
    "\n" + "="*80,
    sep="\n",
)

# Rich display of the frame as the cell output
df

DataFrame shape: (3, 8)

Columns: ['title', 'date', 'content', 'tickers', 'image', 'link', 'author', 'site']

Data types:
title      object
date       object
content    object
tickers    object
image      object
link       object
author     object
site       object
dtype: object



Unnamed: 0,title,date,content,tickers,image,link,author,site
0,Ciena Corporation (NYSE:CIEN) Stock Update and...,2025-12-11 20:06:31,<ul>\n<li><strong>Ciena Corporation</strong> (...,NYSE:CIEN,https://portal.financialmodelingprep.com/posit...,https://financialmodelingprep.com/market-news/...,Alex Lavoie,Financial Modeling Prep
1,EssilorLuxottica (OTC:ESLOY) Maintains Strong ...,2025-12-11 20:00:06,"<ul>\n<li>Citigroup has maintained its ""Buy"" r...",OTC:ESLOY,https://portal.financialmodelingprep.com/posit...,https://financialmodelingprep.com/market-news/...,Alex Lavoie,Financial Modeling Prep
2,"General Mills, Inc. (GIS) Price Target Adjuste...",2025-12-11 19:03:40,"<p><a href=""https://site.financialmodelingprep...",NYSE:GIS,https://portal.financialmodelingprep.com/posit...,https://financialmodelingprep.com/market-news/...,Stuart Mooney,Financial Modeling Prep


In [0]:
import boto3
import json
from botocore.exceptions import ClientError

# Bedrock Runtime client (us-east-1) shared by the analysis helper below
bedrock_runtime = boto3.client(
    service_name='bedrock-runtime',
    region_name='us-east-1',
)

# Function to analyze content with a small LLM
def analyze_content_with_llm(content_text):
    """Use AWS Bedrock to generate metadata for article content.

    Sends the first 2000 characters of ``content_text`` to the
    ``amazon.titan-text-express-v1`` model and extracts the JSON object found
    in the generated text.

    Args:
        content_text: Article text to analyze.

    Returns:
        dict: Always contains the keys ``summary``, ``topic``,
        ``key_entities`` and ``sentiment`` with string values. Lists returned
        by the model are joined with ", " and missing or null values become
        "". Any extra keys produced by the model are kept as-is. Placeholder
        values are returned when the Bedrock call fails with ``ClientError``
        or when the response cannot be parsed.
    """
    expected_keys = ("summary", "topic", "key_entities", "sentiment")

    def _placeholder(summary, topic):
        """Build the fallback record used when no usable metadata is available."""
        return {
            "summary": summary,
            "topic": topic,
            "key_entities": "None",
            "sentiment": "neutral"
        }

    def _as_text(value):
        """Coerce a model-produced value to a plain string."""
        if value is None:
            return ""
        if isinstance(value, (list, tuple)):
            return ", ".join(str(item) for item in value)
        return value if isinstance(value, str) else str(value)

    prompt = f"""Analyze the following article content and provide:
1. A brief summary (1-2 sentences)
2. The main topic (a few words)
3. Key entities mentioned (comma-separated list)
4. Sentiment (positive, negative, or neutral)

Format your response as JSON with keys: summary, topic, key_entities, sentiment

Article content:
{content_text[:2000]}

JSON response:"""

    try:
        # Using Amazon Titan Text Express model (small, cost-effective)
        response = bedrock_runtime.invoke_model(
            modelId='amazon.titan-text-express-v1',
            body=json.dumps({
                "inputText": prompt,
                "textGenerationConfig": {
                    "maxTokenCount": 500,
                    "temperature": 0.3,
                    "topP": 0.9
                }
            })
        )
    except ClientError as e:
        print(f"Error calling Bedrock: {e}")
        return _placeholder("Error generating summary", "Error")

    # Parsing is guarded separately from the service call: an unexpected
    # response shape or invalid JSON yields the placeholder instead of an
    # exception that would abort processing of the remaining articles.
    try:
        response_body = json.loads(response['body'].read())
        generated_text = response_body['results'][0]['outputText']

        # The model may wrap the JSON in extra prose; take the outermost {...}
        start_idx = generated_text.find('{')
        end_idx = generated_text.rfind('}') + 1
        if start_idx == -1 or end_idx <= start_idx:
            return _placeholder("Unable to generate summary", "Unknown")
        parsed = json.loads(generated_text[start_idx:end_idx])
    except (KeyError, IndexError, TypeError, AttributeError, ValueError):
        # json.JSONDecodeError is a ValueError subclass
        return _placeholder("Unable to generate summary", "Unknown")

    if not isinstance(parsed, dict):
        return _placeholder("Unable to generate summary", "Unknown")

    # Normalise the four expected fields to strings so callers can slice them
    # and so each resulting DataFrame column has a single type.
    result = dict(parsed)
    for key in expected_keys:
        result[key] = _as_text(parsed.get(key))
    return result

# Process each row and add metadata columns
# Imported once here rather than on every loop iteration.
from html import unescape
import re

print("Analyzing content with LLM...")
print("=" * 80)

metadata_columns = ('summary', 'topic', 'key_entities', 'sentiment')
article_metadata_records = []
total_articles = len(df)

# enumerate() gives a 1-based progress counter that does not depend on the
# DataFrame index being 0..n-1 integers.
for position, (_, row) in enumerate(df.iterrows(), start=1):
    print(f"Processing article {position}/{total_articles}: {row['title'][:60]}...")

    # Strip HTML tags from content for cleaner analysis
    clean_content = unescape(re.sub('<[^<]+?>', '', row['content']))

    # Get metadata from LLM.
    # Named `article_metadata` so the startup cell's module-level `metadata`
    # (resource metadata) is not overwritten.
    article_metadata = analyze_content_with_llm(clean_content)
    article_metadata_records.append(article_metadata)

    # str() guards the preview against a non-string summary from the model
    print(f"  ✓ Summary: {str(article_metadata.get('summary', ''))[:80]}...")
    print()

# Add new columns to DataFrame
for column_name in metadata_columns:
    df[column_name] = [record.get(column_name, '') for record in article_metadata_records]

print("=" * 80)
print(f"✓ Metadata generation complete! Added 4 new columns.")
print(f"New DataFrame shape: {df.shape}")

# Display the enhanced DataFrame
df

Analyzing content with LLM...
Processing article 1/3: Ciena Corporation (NYSE:CIEN) Stock Update and Financial Per...


  ✓ Summary: Ciena Corporation (NYSE:CIEN) is a global leader in networking systems, competin...

Processing article 2/3: EssilorLuxottica (OTC:ESLOY) Maintains Strong Position in Ey...


  ✓ Summary: Citigroup maintains "Buy" rating for EssilorLuxottica, increasing price target f...

Processing article 3/3: General Mills, Inc. (GIS) Price Target Adjusted by Jefferies...


  ✓ Summary: General Mills, Inc. (NYSE:GIS) is a leading global food company known for its po...

✓ Metadata generation complete! Added 4 new columns.
New DataFrame shape: (3, 12)


Unnamed: 0,title,date,content,tickers,image,link,author,site,summary,topic,key_entities,sentiment
0,Ciena Corporation (NYSE:CIEN) Stock Update and...,2025-12-11 20:06:31,<ul>\n<li><strong>Ciena Corporation</strong> (...,NYSE:CIEN,https://portal.financialmodelingprep.com/posit...,https://financialmodelingprep.com/market-news/...,Alex Lavoie,Financial Modeling Prep,Ciena Corporation (NYSE:CIEN) is a global lead...,Ciena Corporation (NYSE:CIEN),"Ciena Corporation, Evercore ISI, Cisco Systems...",positive
1,EssilorLuxottica (OTC:ESLOY) Maintains Strong ...,2025-12-11 20:00:06,"<ul>\n<li>Citigroup has maintained its ""Buy"" r...",OTC:ESLOY,https://portal.financialmodelingprep.com/posit...,https://financialmodelingprep.com/market-news/...,Alex Lavoie,Financial Modeling Prep,"Citigroup maintains ""Buy"" rating for EssilorLu...",EssilorLuxottica,"Citigroup, EssilorLuxottica, Ray-Ban Meta glas...",positive
2,"General Mills, Inc. (GIS) Price Target Adjuste...",2025-12-11 19:03:40,"<p><a href=""https://site.financialmodelingprep...",NYSE:GIS,https://portal.financialmodelingprep.com/posit...,https://financialmodelingprep.com/market-news/...,Stuart Mooney,Financial Modeling Prep,"General Mills, Inc. (NYSE:GIS) is a leading gl...",General Mills,"General Mills, Inc., Jefferies, Scott Marks, T...",neutral


In [0]:
from pyspark.sql.types import StructType, StructField, StringType
import uuid

# Hand the enriched pandas frame to Spark
spark_df = spark.createDataFrame(df)

# Target location: catalog / namespace / table
catalog_name = "stage-us-east-1-jngai-dev"
namespace_name = "financial_articles"
table_name = "fmp_articles"

# Each identifier part is backtick-quoted (the catalog name contains hyphens)
full_table_name = ".".join(
    f"`{part}`" for part in (catalog_name, namespace_name, table_name)
)

# The namespace is created through the Glue API against the S3 Tables catalog
glue = boto3.client('glue')
catalog_id = f"629904435132:s3tablescatalog/{catalog_name}"

try:
    glue.create_database(
        CatalogId=catalog_id,
        DatabaseInput={'Name': namespace_name},
    )
except glue.exceptions.AlreadyExistsException:
    print(f"✓ Namespace already exists: {namespace_name}")
else:
    print(f"✓ Created namespace: {namespace_name}")

# Column DDL derived from the Spark schema: `name` type, `name` type, ...
schema_ddl = ", ".join(
    f"`{field.name}` {field.dataType.simpleString()}"
    for field in spark_df.schema.fields
)

# CREATE TABLE IF NOT EXISTS leaves an already-existing table untouched
create_table_sql = f"CREATE TABLE IF NOT EXISTS {full_table_name} ({schema_ddl}) USING iceberg"
spark.sql(create_table_sql)
print(f"✓ Created table: {full_table_name}")

# NOTE(review): append() adds the rows on every execution, so re-running this
# cell presumably duplicates the batch. Confirm whether a merge or overwrite
# is wanted for this layer.
spark_df.writeTo(full_table_name).append()
print(f"✓ Successfully wrote {spark_df.count()} rows to S3 Tables")

# Read the table back to confirm the write
result_df = spark.read.table(full_table_name)
print(f"\n✓ Verification: Table contains {result_df.count()} rows")
result_df

✓ Created namespace: financial_articles


✓ Created table: `stage-us-east-1-jngai-dev`.`financial_articles`.`fmp_articles`


✓ Successfully wrote 3 rows to S3 Tables



✓ Verification: Table contains 3 rows


DataFrame[title: string, date: string, content: string, tickers: string, image: string, link: string, author: string, site: string, summary: string, topic: string, key_entities: string, sentiment: string]

## Shutdown cells

In [0]:
"""
Stop spark session and associated Athena Spark session
"""

from IPython import get_ipython as _get_ipython

# Fetch the object stored under "spark" in the kernel's user namespace and stop it
_active_spark = _get_ipython().user_ns["spark"]
_active_spark.stop()