# sagemaker_sentiment_inference



## Startup cells

In [0]:
# Set environment variables for sagemaker_studio imports

import os
os.environ['DataZoneProjectId'] = '4h8bblx1wxdajr'
os.environ['DataZoneDomainId'] = 'dzd-di92v7iln9ewdj'
os.environ['DataZoneEnvironmentId'] = 'd2o5rqoayfwyef'
os.environ['DataZoneDomainRegion'] = 'us-east-1'

# create both a function and variable for metadata access
_resource_metadata = None

def _get_resource_metadata():
    global _resource_metadata
    if _resource_metadata is None:
        _resource_metadata = {
            "AdditionalMetadata": {
                "DataZoneProjectId": "4h8bblx1wxdajr",
                "DataZoneDomainId": "dzd-di92v7iln9ewdj",
                "DataZoneEnvironmentId": "d2o5rqoayfwyef",
                "DataZoneDomainRegion": "us-east-1",
            }
        }
    return _resource_metadata
metadata = _get_resource_metadata()

In [0]:
"""
Logging Configuration

Purpose:
--------
This sets up the logging framework for code executed in the user namespace.
"""

from typing import Optional


def _set_logging(log_dir: str, log_file: str, log_name: Optional[str] = None):
    import os
    import logging
    from logging.handlers import RotatingFileHandler

    level = logging.INFO
    max_bytes = 5 * 1024 * 1024
    backup_count = 5

    # fallback to /tmp dir on access, helpful for local dev setup
    try:
        os.makedirs(log_dir, exist_ok=True)
    except Exception:
        log_dir = "/tmp/kernels/"

    os.makedirs(log_dir, exist_ok=True)
    log_path = os.path.join(log_dir, log_file)

    logger = logging.getLogger() if not log_name else logging.getLogger(log_name)
    logger.handlers = []
    logger.setLevel(level)

    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")

    # Rotating file handler
    fh = RotatingFileHandler(filename=log_path, maxBytes=max_bytes, backupCount=backup_count, encoding="utf-8")
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    logger.info(f"Logging initialized for {log_name}.")


_set_logging("/var/log/computeEnvironments/kernel/", "kernel.log")
_set_logging("/var/log/studio/data-notebook-kernel-server/", "metrics.log", "metrics")

In [0]:
import logging
from sagemaker_studio import ClientConfig, sqlutils, sparkutils, dataframeutils

logger = logging.getLogger(__name__)
logger.info("Initializing sparkutils")
spark = sparkutils.init()
logger.info("Finished initializing sparkutils")

In [0]:
def _reset_os_path():
    """
    Reset the process's working directory to handle mount timing issues.
    
    This function resolves a race condition where the Python process starts
    before the filesystem mount is complete, causing the process to reference
    old mount paths and inodes. By explicitly changing to the mounted directory
    (/home/sagemaker-user), we ensure the process uses the correct, up-to-date
    mount point.
    
    The function logs stat information (device ID and inode) before and after
    the directory change to verify that the working directory is properly
    updated to reference the new mount.
    
    Note:
        This is executed at module import time to ensure the fix is applied
        as early as possible in the kernel initialization process.
    """
    try:
        import os
        import logging

        logger = logging.getLogger(__name__)
        logger.info("---------Before------")
        logger.info("CWD: %s", os.getcwd())
        logger.info("stat('.'): %s %s", os.stat('.').st_dev, os.stat('.').st_ino)
        logger.info("stat('/home/sagemaker-user'): %s %s", os.stat('/home/sagemaker-user').st_dev, os.stat('/home/sagemaker-user').st_ino)

        os.chdir("/home/sagemaker-user")

        logger.info("---------After------")
        logger.info("CWD: %s", os.getcwd())
        logger.info("stat('.'): %s %s", os.stat('.').st_dev, os.stat('.').st_ino)
        logger.info("stat('/home/sagemaker-user'): %s %s", os.stat('/home/sagemaker-user').st_dev, os.stat('/home/sagemaker-user').st_ino)
    except Exception as e:
        logger.exception(f"Failed to reset working directory: {e}")

_reset_os_path()

## Notebook

In [0]:
import pandas as pd
import boto3
from io import StringIO

# S3 bucket and prefix
bucket_name = "aws-nlp-sentiment-tarun"
prefix = "processed/"

# Initialize S3 client
s3_client = boto3.client('s3')

# List all files in the processed directory
response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=prefix)

# Read and combine all part files
dataframes = []
for obj in response.get('Contents', []):
    key = obj['Key']
    if key != prefix:  # Skip the folder itself
        # Read each file
        obj_data = s3_client.get_object(Bucket=bucket_name, Key=key)
        content = obj_data['Body'].read().decode('utf-8')
        
        # Parse as CSV
        temp_df = pd.read_csv(StringIO(content))
        dataframes.append(temp_df)

# Combine all dataframes
df = pd.concat(dataframes, ignore_index=True)

print(f"Total rows loaded: {len(df)}")
df.head()

Total rows loaded: 28332


Unnamed: 0,name,reviews.text,reviews.rating
0,AmazonBasics AAA Performance Alkaline Batterie...,I order 3 of them and one of the item is bad q...,3
1,AmazonBasics AAA Performance Alkaline Batterie...,Bulk is always the less expensive way to go fo...,4
2,AmazonBasics AAA Performance Alkaline Batterie...,Well they are not Duracell but for the price i...,5
3,AmazonBasics AAA Performance Alkaline Batterie...,Seem to work as well as name brand batteries a...,5
4,AmazonBasics AAA Performance Alkaline Batterie...,These batteries are very long lasting the pric...,5


In [0]:
from transformers import pipeline
import torch

# Load pretrained sentiment model with PyTorch backend
sentiment_model = pipeline("sentiment-analysis", framework="pt", device=-1)

# Run on first 50 reviews (start small)
df_sample = df.head(50).copy()

# Apply sentiment analysis - note the column name is 'reviews.text'
df_sample["sentiment"] = df_sample["reviews.text"].apply(
    lambda x: sentiment_model(str(x)[:512])[0]['label'] if pd.notna(x) else None
)

df_sample[["reviews.text", "reviews.rating", "sentiment"]].head(10)

  from .autonotebook import tqdm as notebook_tqdm


2026-02-21 14:26:51.183214: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2026-02-21 14:26:52.381936: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.




2026-02-21 14:26:59.527651: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Device set to use cpu


Unnamed: 0,reviews.text,reviews.rating,sentiment
0,I order 3 of them and one of the item is bad q...,3,NEGATIVE
1,Bulk is always the less expensive way to go fo...,4,NEGATIVE
2,Well they are not Duracell but for the price i...,5,POSITIVE
3,Seem to work as well as name brand batteries a...,5,POSITIVE
4,These batteries are very long lasting the pric...,5,POSITIVE
5,Bought a lot of batteries for Christmas and th...,5,POSITIVE
6,ive not had any problame with these batteries ...,5,NEGATIVE
7,Well if you are looking for cheap non-recharge...,5,POSITIVE
8,These do not hold the amount of high power jui...,3,NEGATIVE
9,AmazonBasics AA AAA batteries have done well b...,4,POSITIVE


In [0]:
# Run sentiment on full dataset in batches
batch_size = 500
results = []

for i in range(0, len(df), batch_size):
    batch = df.iloc[i:i+batch_size].copy()
    
    batch["sentiment"] = batch["reviews.text"].apply(
        lambda x: sentiment_model(str(x)[:512])[0]['label'] if pd.notna(x) else None
    )
    
    results.append(batch)

# Combine all batches
df_final = pd.concat(results, ignore_index=True)

print("Final dataset size:", len(df_final))
df_final.head()

Final dataset size: 28332


Unnamed: 0,name,reviews.text,reviews.rating,sentiment
0,AmazonBasics AAA Performance Alkaline Batterie...,I order 3 of them and one of the item is bad q...,3,NEGATIVE
1,AmazonBasics AAA Performance Alkaline Batterie...,Bulk is always the less expensive way to go fo...,4,NEGATIVE
2,AmazonBasics AAA Performance Alkaline Batterie...,Well they are not Duracell but for the price i...,5,POSITIVE
3,AmazonBasics AAA Performance Alkaline Batterie...,Seem to work as well as name brand batteries a...,5,POSITIVE
4,AmazonBasics AAA Performance Alkaline Batterie...,These batteries are very long lasting the pric...,5,POSITIVE


In [0]:
import io
import boto3
import pandas as pd

# Check if df_final exists, if not load from previous cell execution
try:
    df_final
    print(f"Using existing df_final with {len(df_final)} rows")
except NameError:
    print("df_final not found. Please re-run the previous cells first (cells d6gd1h36vgwvxj, 6qjaowpqobynmv, and 55iuke3lba8lk7)")
    raise

# Initialize S3 client
s3_client = boto3.client('s3')

# Save to CSV in memory
csv_buffer = io.StringIO()
df_final.to_csv(csv_buffer, index=False)

# Upload to S3
s3_client.put_object(
    Bucket="aws-nlp-sentiment-tarun",
    Key="predictions/sentiment_results.csv",
    Body=csv_buffer.getvalue()
)

print("✓ Predictions uploaded to S3 successfully!")
print(f"✓ Location: s3://aws-nlp-sentiment-tarun/predictions/sentiment_results.csv")
print(f"✓ Total rows: {len(df_final)}")

Using existing df_final with 28332 rows


✓ Predictions uploaded to S3 successfully!
✓ Location: s3://aws-nlp-sentiment-tarun/predictions/sentiment_results.csv
✓ Total rows: 28332


## Shutdown cells

In [0]:
"""
Stop spark session and associated Athena Spark session
"""

from IPython import get_ipython as _get_ipython
_get_ipython().user_ns["spark"].stop()