In [1]:
!pip install sagemaker boto3 pandas numpy scikit-learn matplotlib seaborn --upgrade

Collecting numpy
  Using cached numpy-2.3.4-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (62 kB)


In [2]:
import sagemaker
import boto3
import pandas as pd
import numpy as np

raw_bucket = 'mihir-raw-data'       
artifact_bucket = 'mihir-churn-project-artifacts'
# ---------------------------------

data_key = 'WA_Fn-UseC_-Telco-Customer-Churn.csv'
data_location = f's3://mihir-raw-data/WA_Fn-UseC_-Telco-Customer-Churn.csv'

# Load data directly from S3
df = pd.read_csv(data_location)

print("Data loaded successfully!")
print(df.head())

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
Data loaded successfully!
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  .

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [4]:
# 1. Clean 'TotalCharges': Convert to numeric, forcing empty strings to 'NaN' (Not a Number)
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# 2. Drop missing values: Remove the few rows that had empty 'TotalCharges'
df = df.dropna()

# 3. Drop 'customerID'
df = df.drop(['customerID'], axis=1)

print("Data cleaned!")
df.info()


Data cleaned!
<class 'pandas.core.frame.DataFrame'>
Index: 7032 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7032 non-null   object 
 1   SeniorCitizen     7032 non-null   int64  
 2   Partner           7032 non-null   object 
 3   Dependents        7032 non-null   object 
 4   tenure            7032 non-null   int64  
 5   PhoneService      7032 non-null   object 
 6   MultipleLines     7032 non-null   object 
 7   InternetService   7032 non-null   object 
 8   OnlineSecurity    7032 non-null   object 
 9   OnlineBackup      7032 non-null   object 
 10  DeviceProtection  7032 non-null   object 
 11  TechSupport       7032 non-null   object 
 12  StreamingTV       7032 non-null   object 
 13  StreamingMovies   7032 non-null   object 
 14  Contract          7032 non-null   object 
 15  PaperlessBilling  7032 non-null   object 
 16  PaymentMethod     7032 non-null  

In [5]:
# 1. Convert target variable 'Churn' (Yes/No) to binary (1/0)
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

# 2. Convert all other categorical (string) columns into numbers
# pd.get_dummies() does this automatically for us (one-hot encoding)
df_processed = pd.get_dummies(df, drop_first=True)

# 3. IMPORTANT: Move the 'Churn' column to be the *first* column.
# This is a requirement for SageMaker's built-in algorithms.
df_processed = pd.concat(
    [df_processed['Churn'], df_processed.drop(['Churn'], axis=1)], 
    axis=1
)

print("Data transformed!")
print(df_processed.head())

Data transformed!
   Churn  SeniorCitizen  tenure  MonthlyCharges  TotalCharges  gender_Male  \
0      0              0       1           29.85         29.85        False   
1      0              0      34           56.95       1889.50         True   
2      1              0       2           53.85        108.15         True   
3      0              0      45           42.30       1840.75         True   
4      1              0       2           70.70        151.65        False   

   Partner_Yes  Dependents_Yes  PhoneService_Yes  \
0         True           False             False   
1        False           False              True   
2        False           False              True   
3        False           False             False   
4        False           False              True   

   MultipleLines_No phone service  ...  StreamingTV_No internet service  \
0                            True  ...                            False   
1                           False  ...            

In [21]:
# This prints the final list of columns *without* the 'Churn' target
# This is the exact order your model expects.
print(list(df_processed.drop('Churn', axis=1).columns))

['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges', 'gender_Male', 'Partner_Yes', 'Dependents_Yes', 'PhoneService_Yes', 'MultipleLines_No phone service', 'MultipleLines_Yes', 'InternetService_Fiber optic', 'InternetService_No', 'OnlineSecurity_No internet service', 'OnlineSecurity_Yes', 'OnlineBackup_No internet service', 'OnlineBackup_Yes', 'DeviceProtection_No internet service', 'DeviceProtection_Yes', 'TechSupport_No internet service', 'TechSupport_Yes', 'StreamingTV_No internet service', 'StreamingTV_Yes', 'StreamingMovies_No internet service', 'StreamingMovies_Yes', 'Contract_One year', 'Contract_Two year', 'PaperlessBilling_Yes', 'PaymentMethod_Credit card (automatic)', 'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check']


In [6]:
from sklearn.model_selection import train_test_split

# Split data: 70% for train, 15% for validation, 15% for test
train_val_df, test_df = train_test_split(
    df_processed, 
    test_size=0.15, 
    random_state=42, 
    stratify=df_processed['Churn']
)

train_df, val_df = train_test_split(
    train_val_df, 
    test_size=0.15, # 0.15 of the 85% is ~15% of the total
    random_state=42, 
    stratify=train_val_df['Churn']
)

print(f"Train shape: {train_df.shape}")
print(f"Validation shape: {val_df.shape}")
print(f"Test shape: {test_df.shape}")

Train shape: (5080, 31)
Validation shape: (897, 31)
Test shape: (1055, 31)


In [7]:
# 1. Save as CSVs (without headers or index, as required by SageMaker)
train_df.to_csv('train.csv', header=False, index=False)
val_df.to_csv('validation.csv', header=False, index=False)
test_df.to_csv('test.csv', header=False, index=False)

# 2. Get the S3 client
s3_client = boto3.client('s3')

# 3. Upload to S3, organizing them in a 'data' folder
# Note: We use artifact_bucket, which you defined in cell 2
s3_client.upload_file('train.csv', artifact_bucket, 'data/train.csv')
s3_client.upload_file('validation.csv', artifact_bucket, 'data/validation.csv')
s3_client.upload_file('test.csv', artifact_bucket, 'data/test.csv')

print("Files uploaded to S3!")

Files uploaded to S3!


In [8]:
import sagemaker

# 1. Get your SageMaker session and execution role
sess = sagemaker.Session()
role = sagemaker.get_execution_role()

# 2. This is your artifact bucket name from Phase 2
# This variable should still be in memory, but we set it again to be safe
artifact_bucket = 'mihir-churn-project-artifacts' 

# 3. Get the ECR (Elastic Container Registry) path for the XGBoost algorithm
container = sagemaker.image_uris.retrieve(
    framework='xgboost',
    region=sess.boto_region_name,
    version='1.7-1' # Use a recent, stable version
)

print(f"Using XGBoost container: {container}")

Using XGBoost container: 683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.7-1


In [9]:
# Define the S3 paths to our training and validation data
s3_train_path = f's3://{artifact_bucket}/data/train.csv'
s3_val_path = f's3://{artifact_bucket}/data/validation.csv'

# Create SageMaker TrainingInput objects
train_input = sagemaker.inputs.TrainingInput(
    s3_data=s3_train_path, 
    content_type='csv'
)

validation_input = sagemaker.inputs.TrainingInput(
    s3_data=s3_val_path, 
    content_type='csv'
)

print("Data inputs defined.")

Data inputs defined.


In [10]:
!pip install xgboost



In [11]:
import xgboost as xgb
import joblib
import tarfile   # <-- NEW
import boto3     # <-- Added for safety

# 1. Get the data (it should still be in memory from Phase 2)
X_train = train_df.iloc[:, 1:] 
y_train = train_df.iloc[:, 0]  
X_val = val_df.iloc[:, 1:]
y_val = val_df.iloc[:, 0]

# 2. Define and train the XGBoost model locally
print("Training model locally...")
model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='auc',
    n_estimators=150,
    early_stopping_rounds=10
)
model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
print("Model trained!")

# 3. Save the model to a file
model_file_name = 'churn-model.joblib'
joblib.dump(model, model_file_name)
print(f"Model saved to {model_file_name}")

# 4. NEW: Create the model.tar.gz archive
model_archive_name = 'model.tar.gz' 
with tarfile.open(model_archive_name, 'w:gz') as tar:
    tar.add(model_file_name)
print(f"Model archive created at {model_archive_name}")

# 5. Upload the NEW .tar.gz archive to S3
# (artifact_bucket should still be in memory from Phase 2)
s3_client = boto3.client('s3')
model_s3_path_key = f'models/{model_archive_name}' # <-- Use the new archive name

s3_client.upload_file(
    Filename=model_archive_name, # <-- Upload the .tar.gz
    Bucket=artifact_bucket,
    Key=model_s3_path_key
)
print(f"Model archive uploaded to s3://{artifact_bucket}/{model_s3_path_key}")

Training model locally...
Model trained!
Model saved to churn-model.joblib
Model archive created at model.tar.gz
Model archive uploaded to s3://mihir-churn-project-artifacts/models/model.tar.gz


In [12]:
from sagemaker.sklearn.model import SKLearnModel
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import CSVDeserializer

# 1. Get the container URI (No change)
sklearn_container = sagemaker.image_uris.retrieve(
    framework="sklearn",
    region=sess.boto_region_name,
    version="1.2-1"
)

# 2. Get the S3 path to our model (No change)
model_s3_uri = f"s3://{artifact_bucket}/models/model.tar.gz"

# 3. Create a SageMaker Model object -- THIS PART IS NEW
# We now add the `source_dir` argument to explicitly point to our folder
sklearn_model = SKLearnModel(
    model_data=model_s3_uri,
    role=role,
    image_uri=sklearn_container,
    entry_point="churn-script.py",   # The name of the script *inside* the folder
    source_dir="source_code"        # <-- THIS IS THE FIX
)
print("Model object created.")

# 4. DEPLOY THE MODEL (with a new name)
print("Deploying endpoint... this will take 5-10 minutes.")
predictor = sklearn_model.deploy(
    initial_instance_count=1,
    instance_type='ml.t2.medium',
    endpoint_name='churn-prediction-endpoint-v-final' # <-- New name
)

print("Endpoint is LIVE!")

Model object created.
Deploying endpoint... this will take 5-10 minutes.


In [17]:
# test_df should still be in memory from Phase 2
# We grab the first row, skip the 'Churn' column [:,1:], and get its values
test_customer_features = test_df.iloc[0, 1:].values

# Convert the array of numbers into a single comma-separated string
payload_string = ",".join([str(x) for x in test_customer_features])

print("Your test payload string is:")
print(payload_string)

Your test payload string is:
0,4,77.85,299.2,True,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False


In [18]:
!curl -X POST 'https://5rh41ecvfd.execute-api.us-east-1.amazonaws.com/predict' \
-H 'Content-Type: application/json' \
-d '{"features": "0,4,77.85,299.2,True,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False"}'

{"churn_probability": 0.69683635}