# Churn Rate using SKLearn Custom Script in Sagemaker

## Let's divide the workload
- Initialize Boto3 SDK and create S3 bucket.
- Upload data in Sagemaker Local Storage.
- Data Exploration and Understanding.
- Split the data into Train/Test CSV File.
- Upload data into the S3 Bucket.
- Create Training Script
- Train the script inside a SageMaker container.
- Store Model Artifacts(model.tar.gz) into the S3 Bucket.
- Deploy Sagemaker Endpoint(API) for trained model, and test it.

In [1]:
import sklearn

In [2]:
sklearn.__version__

'1.7.2'

## 1. Initialize Boto3 SDK and create S3 bucket

In [3]:
import numpy as np
from sagemaker import get_execution_role
import sagemaker
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
import datetime
import time
import tarfile
import boto3
import pandas as pd

# Name of the already-created S3 bucket that will hold the datasets; change
# this to the bucket that exists in your own account.
bucket = "predict-churn-rate"

# High-level SageMaker session (used for the S3 uploads) and its region.
sess = sagemaker.Session()
region = sess.boto_session.region_name

# Low-level SageMaker API client, used later for describe/delete calls.
sm_boto3 = boto3.client("sagemaker")

print(f"Using bucket {bucket}")

sagemaker.config INFO - Not applying SDK defaults from location: C:\ProgramData\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\Users\GIA DAT\AppData\Local\sagemaker\sagemaker\config.yaml
Using bucket predict-churn-rate


## 3. EDA

In [4]:
# Load the raw Telco customer-churn CSV from the local machine for a first look.
# Note: `df` is rebound to the cleaned frame once load_data() is called below.
df = pd.read_csv(
        r"C:\Users\GIA DAT\ML Zoomcamp\3. Classification\WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [5]:
def load_data(
    path=r"C:\Users\GIA DAT\ML Zoomcamp\3. Classification\WA_Fn-UseC_-Telco-Customer-Churn.csv",
):
    """Load the Telco churn CSV and return a cleaned DataFrame.

    Cleaning steps:
      * column names are lower-cased and spaces become underscores;
      * every text column gets the same lower-case/underscore treatment;
      * ``totalcharges`` is converted to a number, with unparseable entries
        (for example blank strings) replaced by 0;
      * ``churn`` becomes an integer flag: 1 where the value is "yes", else 0.

    Args:
        path: Location of the CSV file. Defaults to the original local path,
            so existing ``load_data()`` calls keep working.

    Returns:
        The cleaned ``pandas.DataFrame``.
    """
    df = pd.read_csv(path)

    df.columns = df.columns.str.lower().str.replace(' ', '_')

    # Detect text columns through is_string_dtype (called on the dtype, not
    # the data) so both the legacy ``object`` dtype and pandas' dedicated
    # string dtype are recognised.
    text_columns = [
        col for col in df.columns
        if pd.api.types.is_string_dtype(df[col].dtype)
    ]
    for col in text_columns:
        df[col] = df[col].str.lower().str.replace(' ', '_')

    # Non-numeric entries turn into NaN under errors='coerce', then into 0.
    df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce').fillna(0)

    df['churn'] = (df['churn'] == 'yes').astype(int)
    return df

In [6]:
# Build the cleaned frame, then choose the model inputs and the target.
df = load_data()

# Continuous features.
numerical = ['tenure', 'monthlycharges', 'totalcharges']

# Discrete features.
categorical = [
    'gender', 'seniorcitizen', 'partner', 'dependents', 'phoneservice',
    'multiplelines', 'internetservice', 'onlinesecurity', 'onlinebackup',
    'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies',
    'contract', 'paperlessbilling', 'paymentmethod',
]

# Feature matrix keeps numerical columns first, followed by the categorical ones.
x = df[[*numerical, *categorical]]
y = df['churn']

In [7]:
# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# churn ratio the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

In [8]:
# Report the dimensions of each piece of the split, features first.
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

(5634, 19)
(1409, 19)
(5634,)
(1409,)


## 4. Split the data into train/test CSV files

In [9]:
# Re-attach the target to each feature split so that every exported CSV
# carries both the inputs and the `churn` label.
trainX = X_train.assign(churn=y_train)
testX = X_test.assign(churn=y_test)

In [10]:
# Each frame should now have one more column than its feature split (the target).
for frame in (trainX, testX):
    print(frame.shape)

(5634, 20)
(1409, 20)


## 5. Upload data into S3 bucket

In [11]:
# Write both splits to the working directory, leaving out the pandas index.
for frame, filename in ((trainX, "train-V-1.csv"), (testX, "test-V-1.csv")):
    frame.to_csv(filename, index=False)

In [12]:
# Push both CSVs to S3; the SageMaker training job reads its data from there.
sk_prefix = "sagemaker/churn_rate_prediction/sklearncontainer"

# Each split is stored under <prefix>/<split>/ and upload_data hands back the
# resulting S3 URI. The train file is uploaded first, then the test file.
trainpath, testpath = (
    sess.upload_data(
        path=f"{split_name}-V-1.csv",
        bucket=bucket,
        key_prefix=f"{sk_prefix}/{split_name}",
    )
    for split_name in ("train", "test")
)


In [13]:
testpath

's3://predict-churn-rate/sagemaker/churn_rate_prediction/sklearncontainer/test/test-V-1.csv'

In [14]:
trainpath

's3://predict-churn-rate/sagemaker/churn_rate_prediction/sklearncontainer/train/train-V-1.csv'

## 6. Create Training Script

In [19]:
%%writefile script.py
import argparse
import os
import pickle
import pandas as pd
import numpy as np
import sklearn
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import make_pipeline
import sys


def model_fn(model_dir):
    """Return the object unpickled from ``model.bin`` inside *model_dir*.

    Presumably this is the model-loading hook the SageMaker scikit-learn
    serving container looks for in the entry-point script (confirm against
    the container documentation).

    Args:
        model_dir: Directory that contains the ``model.bin`` file.

    Returns:
        Whatever object was pickled into ``model.bin``.
    """
    # NOTE(review): pickle.load can execute code embedded in the file, so
    # this must only ever be pointed at artifacts you produced yourself.
    artifact_path = os.path.join(model_dir, "model.bin")
    with open(artifact_path, "rb") as artifact:
        return pickle.load(artifact)


def _prepare_frame(df):
    """Normalise one data split in place and return it.

    Column names and text values are lower-cased with spaces replaced by
    underscores, and ``totalcharges`` is coerced to a number (entries that
    cannot be parsed become 0).
    """
    df.columns = df.columns.str.lower().str.replace(" ", "_")

    # Text columns are detected on *this* frame, so a dtype difference between
    # the train and test files cannot break the ``.str`` accessor.
    for col in df.columns:
        if pd.api.types.is_string_dtype(df[col].dtype):
            df[col] = df[col].str.lower().str.replace(" ", "_")

    df["totalcharges"] = pd.to_numeric(df["totalcharges"], errors="coerce").fillna(0)
    return df


if __name__ == "__main__":
    # Training entry point: parse arguments, load and clean both splits, fit a
    # DictVectorizer + LogisticRegression pipeline, save it as model.bin and
    # print test-set metrics.

    print("[INFO] Extracting arguments...")
    parser = argparse.ArgumentParser()

    # Hyperparameters
    parser.add_argument("--solver", type=str, default="liblinear")
    parser.add_argument("--C", type=float, default=1.0)

    # Folder arguments default to the SM_* environment variables, so they may
    # be omitted when those variables are set and passed explicitly otherwise.
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="train-V-1.csv")
    parser.add_argument("--test-file", type=str, default="test-V-1.csv")

    # parse_known_args ignores any extra flags instead of failing on them.
    args, _ = parser.parse_known_args()

    print("SKLearn Version:", sklearn.__version__)
    print()

    # ===================== READ DATA =====================
    print("[INFO] Loading data...")

    train_df = _prepare_frame(pd.read_csv(os.path.join(args.train, args.train_file)))
    test_df = _prepare_frame(pd.read_csv(os.path.join(args.test, args.test_file)))

    # The `churn` column is used as-is; the input CSVs are expected to carry it
    # already encoded as 0/1.

    # Features
    numerical = ['tenure', 'monthlycharges', 'totalcharges']
    categorical = [
        'gender', 'seniorcitizen', 'partner', 'dependents', 'phoneservice',
        'multiplelines', 'internetservice', 'onlinesecurity', 'onlinebackup',
        'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies',
        'contract', 'paperlessbilling', 'paymentmethod'
    ]
    feature_columns = categorical + numerical

    # DictVectorizer consumes one dict per row.
    X_train = train_df[feature_columns].to_dict(orient="records")
    y_train = train_df["churn"]

    unique_classes, counts = np.unique(y_train, return_counts=True)
    print(f"[INFO] Classes in training data: {dict(zip(unique_classes, counts))}")

    # Stop early with a clear message rather than attempting to fit a
    # classifier on a single class.
    if len(unique_classes) < 2:
        print("[ERROR] Training data must have at least 2 classes. Exiting...")
        sys.exit(1)

    X_test = test_df[feature_columns].to_dict(orient="records")
    y_test = test_df["churn"]

    print("[INFO] Training Logistic Regression model...")
    pipeline = make_pipeline(
        DictVectorizer(),
        LogisticRegression(solver=args.solver, C=args.C)
    )

    pipeline.fit(X_train, y_train)

    # ===================== SAVE MODEL =====================
    # The file name must stay in sync with the one model_fn() loads.
    model_path = os.path.join(args.model_dir, "model.bin")
    with open(model_path, "wb") as f_out:
        pickle.dump(pipeline, f_out)

    print(f"[INFO] Model saved at {model_path}")

    # ===================== EVALUATION =====================
    y_pred = pipeline.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    print()
    print("===== TEST METRICS =====")
    print(f"Accuracy: {acc:.4f}")
    print("Classification report:")
    print(classification_report(y_test, y_pred))


Overwriting script.py


In [20]:
! python script.py --model-dir ./ \
                   --train ./ \
                   --test ./ \
                   --train-file train-V-1.csv \
                   --test-file test-V-1.csv \
                   --solver liblinear \
                   --C 1.0


[INFO] Extracting arguments...
SKLearn Version: 1.7.2

[INFO] Loading data...
[INFO] Classes in training data: {np.int64(0): np.int64(4139), np.int64(1): np.int64(1495)}
[INFO] Training Logistic Regression model...
[INFO] Model saved at ./model.bin

===== TEST METRICS =====
Accuracy: 0.8027
Classification report:
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1035
           1       0.65      0.56      0.60       374

    accuracy                           0.80      1409
   macro avg       0.75      0.72      0.73      1409
weighted avg       0.80      0.80      0.80      1409



## 7. Train script inside Sagemaker container

In [28]:
from sagemaker.sklearn.estimator import SKLearn
from sagemaker import get_execution_role

# Version tag of the scikit-learn framework image requested for training.
# NOTE(review): the local dry run of script.py reported scikit-learn 1.7.2,
# which is far newer than this tag — confirm the script behaves the same on both.
FRAMEWORK_VERSION = "0.23-1"

sklearn_estimator = SKLearn(
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
    role=get_execution_role(),
    instance_type="ml.m5.large",
    instance_count=1,
    base_job_name="LogReg-churn-sklearn",
    # script.py reads these through its --solver / --C arguments.
    hyperparameters={"solver": "liblinear", "C": 1.0},
)



In [30]:
import tarfile

import sagemaker
from sagemaker.sklearn.model import SKLearnModel

sagemaker_session = sagemaker.Session()
role = "arn:aws:iam::193635814548:role/service-role/AmazonSageMaker-ExecutionRole-20251101T214260"

# SageMaker expects `model_data` to point at a gzipped tarball, so wrap the
# locally trained model.bin in model.tar.gz instead of uploading the bare file.
# model.bin sits at the archive root, where script.py's model_fn looks for it.
with tarfile.open("model.tar.gz", "w:gz") as archive:
    archive.add("model.bin")

model_artifact = sagemaker_session.upload_data("model.tar.gz", key_prefix="churn-model")

# Create SageMaker model
# NOTE(review): model.bin was pickled by the local run (scikit-learn 1.7.2 per
# its log output) while the serving image is tagged 0.23-1; unpickling across
# such a version gap may fail — confirm, or use an artifact trained in-container.
model = SKLearnModel(
    model_data=model_artifact,
    role=role,
    entry_point="script.py",  # supplies model_fn, which loads model.bin
    framework_version="0.23-1",
)

# Deploy endpoint
endpoint_name = "LogReg-churn-endpoint"
predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    endpoint_name=endpoint_name
)

try:
    # Make predictions on the first two test rows, one dict per row.
    input_data = testX[categorical + numerical][0:2].to_dict(orient="records")
    predictions = predictor.predict(input_data)
    print(predictions)
finally:
    # Delete the endpoint even when the prediction call fails, so a failed
    # request does not leave it running.
    predictor.delete_endpoint()


INFO:sagemaker:Creating model with name: sagemaker-scikit-learn-2025-11-02-03-02-20-552
INFO:sagemaker:Creating endpoint-config with name LogReg-churn-endpoint
INFO:sagemaker:Creating endpoint with name LogReg-churn-endpoint
ERROR:sagemaker:Please check the troubleshooting guide for common errors: https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-python-sdk-troubleshooting.html#sagemaker-python-sdk-troubleshooting-create-endpoint


In [25]:
# Launch the training job with the uploaded CSVs as the "train" and "test"
# input channels, and block until the job has finished.
channels = {"train": trainpath, "test": testpath}
sklearn_estimator.fit(channels, wait=True)

INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker:Creating training-job with name: LogReg-churn-sklearn-2025-11-02-02-08-13-033
ERROR:sagemaker:Please check the troubleshooting guide for common errors: https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-python-sdk-troubleshooting.html#sagemaker-python-sdk-troubleshooting-create-training-job


## 8. Store Model Artifacts into S3

In [None]:
# Block until the most recent training job has finished.
latest_job = sklearn_estimator.latest_training_job
latest_job.wait(logs="None")

# Ask the SageMaker API where the job stored its model archive.
job_description = sm_boto3.describe_training_job(TrainingJobName=latest_job.name)
artifact = job_description["ModelArtifacts"]["S3ModelArtifacts"]

print(f"Model artifact persisted at {artifact}")

## 9. Deploy Sagemaker Endpoint for trained model, and test it

In [27]:
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

# A UTC timestamp suffix gives every run its own model name.
model_stamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model_name = f"LogReg-churn-model-{model_stamp}"

# Wrap the trained artifact together with script.py as the entry point,
# using the same framework version as the training estimator.
model = SKLearnModel(
    name=model_name,
    entry_point="script.py",
    model_data=artifact,
    framework_version=FRAMEWORK_VERSION,
    role=get_execution_role(),
)


In [26]:
# Timestamped endpoint name, so repeated deployments do not collide.
deploy_stamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
endpoint_name = f"LogReg-churn-endpoint-{deploy_stamp}"
print(f"EndpointName={endpoint_name}")

# Stand up a single-instance endpoint serving the model.
predictor = model.deploy(
    endpoint_name=endpoint_name,
    instance_type="ml.m4.xlarge",
    initial_instance_count=1,
)


In [None]:
# The trained pipeline begins with a DictVectorizer, so send the first two
# test rows as a list with one feature dict per row.
feature_frame = testX[categorical + numerical]
input_data = feature_frame.iloc[:2].to_dict(orient="records")

predictions = predictor.predict(input_data)
print(predictions)


## Delete endpoint

In [None]:
# Tear down the endpoint named by `endpoint_name` through the low-level client.
sm_boto3.delete_endpoint(EndpointName=endpoint_name)
print("Endpoint {} deleted.".format(endpoint_name))