In [None]:
# Setup and Imports

import os
import zlib

import boto3
import numpy as np
import pandas as pd
import sagemaker
from sagemaker import get_execution_role
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput

In [None]:
# Define Environment

# S3 location used for the training input (and, further down, the model output).
bucket = "naija-sentiment-data"  # Replace with your actual S3 bucket name
prefix = "input"

# SageMaker session and the execution role later handed to the estimator.
sagemaker_session = sagemaker.Session()
role = get_execution_role()

In [None]:
# Load and Prepare Data
#
# The SageMaker built-in XGBoost algorithm only accepts numeric CSV input:
# label in the first column, numeric features after it, no header row.
# Raw tweet text cannot be parsed by the training container, so every tweet is
# first converted to a fixed-width bag-of-words count vector (hashing trick).

N_HASH_FEATURES = 512  # number of hash buckets, i.e. feature columns in train.csv


def hash_text_features(texts, n_features=N_HASH_FEATURES):
    """Turn an iterable of strings into a dense token-count matrix.

    Each text is lower-cased and split on whitespace; every token is assigned
    to column ``zlib.crc32(token) % n_features`` and that column's count is
    incremented. CRC32 is used instead of the built-in ``hash`` because the
    latter is salted per process, which would make the features differ between
    kernel restarts.

    Args:
        texts: iterable of strings (other values are converted with ``str``).
        n_features: number of hash buckets / output columns; must be >= 1.

    Returns:
        ``numpy.ndarray`` of shape ``(len(texts), n_features)`` and dtype
        ``int32`` holding the per-bucket token counts.

    Raises:
        ValueError: if ``n_features`` is smaller than 1.
    """
    if n_features < 1:
        raise ValueError("n_features must be >= 1")
    texts = list(texts)
    counts = np.zeros((len(texts), n_features), dtype=np.int32)
    for row, text in enumerate(texts):
        for token in str(text).lower().split():
            counts[row, zlib.crc32(token.encode("utf-8")) % n_features] += 1
    return counts


df = pd.read_csv("labeled_tweets.csv")

# Map sentiment labels to numeric class ids; sentiments that are not a key of
# label_map become NaN here and are removed by the dropna below.
label_map = {"POSITIVE": 0, "NEGATIVE": 1, "NEUTRAL": 2}
df["label"] = df["sentiment"].map(label_map)

# Drop rows with missing text or missing/unknown labels
df = df.dropna(subset=["clean_text", "label"]).reset_index(drop=True)

# The mapping yields floats whenever NaNs were present; store integer class ids
# so the CSV contains 0/1/2 rather than 0.0/1.0/2.0.
df["label"] = df["label"].astype(int)

if df.empty:
    raise ValueError(
        "No usable rows in labeled_tweets.csv after dropping missing text/labels"
    )

# Save as CSV for SageMaker: label first, then the hashed numeric features.
# NOTE: anything scored by the trained model must be vectorised with the same
# hash_text_features(..., n_features=N_HASH_FEATURES) call.
train_df = pd.DataFrame(hash_text_features(df["clean_text"]), index=df.index)
train_df.insert(0, "label", df["label"])
train_df.to_csv("train.csv", index=False, header=False)

In [None]:
# Upload to S3

# Push the local training file to s3://<bucket>/<prefix>/ and keep the returned
# S3 URI for the training step.
s3_train_path = sagemaker_session.upload_data(
    path="train.csv",
    bucket=bucket,
    key_prefix=prefix,
)
print(f"Training data uploaded to: {s3_train_path}")

In [None]:
# Define XGBoost Estimator

# Look up the XGBoost 1.3-1 container image for the session's region.
container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=sagemaker_session.boto_region_name,
    version="1.3-1",
)

# Training job configuration: a single ml.m5.large instance, with artifacts
# written under the bucket's "output" prefix.
estimator_settings = {
    "image_uri": container,
    "role": role,
    "instance_count": 1,
    "instance_type": "ml.m5.large",
    "output_path": f"s3://{bucket}/output",
    "sagemaker_session": sagemaker_session,
}
xgb = Estimator(**estimator_settings)

# Three-class softprob objective plus the boosting settings.
xgb_hyperparameters = {
    "objective": "multi:softprob",
    "num_class": 3,
    "num_round": 100,
    "max_depth": 5,
    "eta": 0.2,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
}
xgb.set_hyperparameters(**xgb_hyperparameters)

In [None]:
# Train the Model

# Bind the uploaded CSV to the "train" channel and launch the training job.
train_input = TrainingInput(s3_data=s3_train_path, content_type="csv")
xgb.fit(inputs={"train": train_input})