#### Load the data from s3Bucket

In [45]:
import boto3
import pandas as pd
from io import StringIO

# Define S3 bucket and file key for processed data
bucket_name = "dataville-traffic-data"
s3_key = "processed/updated_traffic_data.csv"  # Updated to point to processed data

# Initialize S3 client
s3_client = boto3.client("s3")

# Download the processed file from S3
response = s3_client.get_object(Bucket=bucket_name, Key=s3_key)

# Read CSV into Pandas DataFrame
df = pd.read_csv(response["Body"])

# Display first few rows
print(df.head())

   sensor_id  vehicle_count  avg_speed  incident  weather_condition_Fog  \
0          5       4.685296   8.369228         1                    0.0   
1          3       4.015968   8.849047         1                    1.0   
2          1       5.131515   8.273052         0                    1.0   
3          1       6.023952   8.418683         1                    0.0   
4          2       5.354624   8.455215         1                    0.0   

   weather_condition_Rain  weather_condition_Clear  weather_condition_Snow  \
0                     0.0                      0.0                     1.0   
1                     0.0                      0.0                     0.0   
2                     0.0                      0.0                     0.0   
3                     0.0                      1.0                     0.0   
4                     1.0                      0.0                     0.0   

            event_time  
0  2023-01-26 16:25:10  
1  2023-01-26 19:11:24  
2  20

#### Scaled the data

In [None]:
from sklearn.preprocessing import StandardScaler
# List of numerical columns that need to be scaled
numerical_columns = ['vehicle_count', 'avg_speed']
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
print(df.head())

In [None]:
import boto3
import sagemaker
import pandas as pd
from sagemaker import get_execution_role
from sagemaker.session import Session
from io import StringIO
from sklearn.model_selection import train_test_split

# ✅ Replace with your SageMaker Execution Role ARN
role_arn = "arn:aws:iam::195275672112:role/SageMakerExecutionRole"

# ✅ Define S3 bucket and processed data file key
bucket_name = "dataville-traffic-data"
s3_key = "processed/updated_traffic_data.csv"

# ✅ Initialize S3 client
s3_client = boto3.client("s3")

# ✅ Download the processed file from S3
response = s3_client.get_object(Bucket=bucket_name, Key=s3_key)

# ✅ Read CSV into Pandas DataFrame
df = pd.read_csv(response["Body"])

# ✅ Convert event_time to datetime
df["event_time"] = pd.to_datetime(df["event_time"])

# ✅ Convert event_time to Unix timestamp (numeric format)
df["event_time"] = df["event_time"].astype(int) // 10**9  # Convert to seconds

# ✅ Feature Engineering: Extract useful time-based features
df["hour"] = df["event_time"] % 86400 // 3600  # Extract hour from timestamp
df["day_of_week"] = (df["event_time"] // 86400 + 4) % 7  # Monday=0, Sunday=6
df["is_weekend"] = df["day_of_week"].apply(lambda x: 1 if x >= 5 else 0)
df["is_peak_hour"] = df["hour"].apply(lambda x: 1 if (7 <= x <= 9) or (16 <= x <= 18) else 0)
df["month"] = (df["event_time"] // (30 * 86400)) % 12 + 1  # Approximate month extraction

# ✅ Drop original event_time column (since we now have time-based features)
df.drop(columns=["event_time"], inplace=True)

# ✅ Define target and features
target_column = "incident"
features = [col for col in df.columns if col != target_column]  # Keep all features

X = df[features]
y = df[target_column]  # Target column

# ✅ Split dataset into train, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.33, random_state=42)

# ✅ Convert datasets to CSV format
train_data = pd.concat([y_train, X_train], axis=1)
val_data = pd.concat([y_val, X_val], axis=1)

train_csv_buffer = StringIO()
val_csv_buffer = StringIO()

train_data.to_csv(train_csv_buffer, header=False, index=False)
val_data.to_csv(val_csv_buffer, header=False, index=False)

s3_train_key = "train_data.csv"
s3_val_key = "val_data.csv"

s3_train_path = f"s3://{bucket_name}/{s3_train_key}"
s3_val_path = f"s3://{bucket_name}/{s3_val_key}"

# ✅ Upload new training data to S3
s3_client.put_object(Bucket=bucket_name, Key=s3_train_key, Body=train_csv_buffer.getvalue())
s3_client.put_object(Bucket=bucket_name, Key=s3_val_key, Body=val_csv_buffer.getvalue())

# ✅ Initialize SageMaker session
sagemaker_session = sagemaker.Session()

# ✅ Define input data sources (Set content_type="text/csv")
train_input = sagemaker.inputs.TrainingInput(s3_train_path, content_type="text/csv")
val_input = sagemaker.inputs.TrainingInput(s3_val_path, content_type="text/csv")

# ✅ Define SageMaker Linear Learner model
linear_learner = sagemaker.estimator.Estimator(
    image_uri=sagemaker.image_uris.retrieve("linear-learner", sagemaker_session.boto_region_name),
    role=role_arn,
    instance_count=1,
    instance_type="ml.m5.large",
    output_path=f"s3://{bucket_name}/linear-learner-output",
    sagemaker_session=sagemaker_session,
)

# ✅ Set hyperparameters
linear_learner.set_hyperparameters(
    predictor_type="binary_classifier",
    mini_batch_size=10
)

# ✅ Train the model using SageMaker
linear_learner.fit({"train": train_input, "validation": val_input})

print("✅ Model training complete!")

#### Dataset is highly imbalanced, with 95.27% of the incidents being 0 (no incident) and only 4.73% being 1 (incident). This imbalance can negatively impact the model’s ability to correctly classify incidents because it might just predict 0 most of the time to achieve high accuracy.

### Random Oversampling

In [106]:
from imblearn.over_sampling import RandomOverSampler

# ✅ Apply Random Oversampling to balance the dataset
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

# ✅ Check new class distribution
print(y_resampled.value_counts(normalize=True))

incident
1    0.5
0    0.5
Name: proportion, dtype: float64


In [None]:
from sklearn.model_selection import train_test_split
from io import StringIO

# ✅ Split dataset into train (70%), validation (15%), and test (15%)
X_train, X_temp, y_train, y_temp = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)  # 15% each

# ✅ Convert datasets to CSV for SageMaker training
train_data = pd.concat([y_train, X_train], axis=1)
val_data = pd.concat([y_val, X_val], axis=1)

train_csv_buffer = StringIO()
val_csv_buffer = StringIO()

train_data.to_csv(train_csv_buffer, header=False, index=False)
val_data.to_csv(val_csv_buffer, header=False, index=False)

# ✅ Upload new training & validation data to S3
s3_train_key = "train_data_resampled.csv"
s3_val_key = "val_data_resampled.csv"

s3_train_path = f"s3://{bucket_name}/{s3_train_key}"
s3_val_path = f"s3://{bucket_name}/{s3_val_key}"

s3_client.put_object(Bucket=bucket_name, Key=s3_train_key, Body=train_csv_buffer.getvalue())
s3_client.put_object(Bucket=bucket_name, Key=s3_val_key, Body=val_csv_buffer.getvalue())

# ✅ Update input data paths for SageMaker training
train_input = sagemaker.inputs.TrainingInput(s3_train_path, content_type="text/csv")
val_input = sagemaker.inputs.TrainingInput(s3_val_path, content_type="text/csv")

# ✅ Train the model with the resampled data
linear_learner.fit({"train": train_input, "validation": val_input})

print("✅ Model training complete with resampled data!")

### Deploy the model

In [None]:
import boto3
import sagemaker
from sagemaker.model import Model
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

# ✅ Define variables
bucket_name = "dataville-traffic-data"
model_key = "linear-learner-output/linear-learner-2025-02-05-23-28-47-406/output/model.tar.gz"
role_arn = "arn:aws:iam::195275672112:role/SageMakerExecutionRole"
endpoint_name = "linear-learner-endpoint"

# ✅ Initialize SageMaker session
sagemaker_session = sagemaker.Session()

# ✅ Define the model using the trained artifact
model = Model(
    image_uri=sagemaker.image_uris.retrieve("linear-learner", sagemaker_session.boto_region_name),
    model_data=f"s3://{bucket_name}/{model_key}",
    role=role_arn,
    sagemaker_session=sagemaker_session,
)

# ✅ Deploy the model as an endpoint
predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.large",
    endpoint_name=endpoint_name
)

# ✅ Configure predictor serializer/deserializer
predictor.serializer = CSVSerializer()
predictor.deserializer = JSONDeserializer()

print(f"✅ Model deployed successfully! Endpoint name: {endpoint_name}")

In [116]:
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

# ✅ Reattach the predictor to the deployed endpoint
predictor = Predictor(endpoint_name="linear-learner-endpoint")

# ✅ Configure serializer/deserializer
predictor.serializer = CSVSerializer()
predictor.deserializer = JSONDeserializer()

print("✅ Predictor successfully reattached!")

✅ Predictor successfully reattached!


In [130]:
import numpy as np

# ✅ Convert test features to CSV format
X_test_csv = X_test.to_csv(header=False, index=False)

# ✅ Make predictions
response = predictor.predict(X_test_csv)

# ✅ Extract predictions from the JSON response
if isinstance(response, dict) and "predictions" in response:
    predictions = np.array([pred["score"] for pred in response["predictions"]])  # Extract scores
else:
    raise ValueError("Unexpected response format:", response)

# ✅ Convert probabilities to binary predictions (threshold = 0.5)
binary_predictions = (predictions > 0.5).astype(int)

print("✅ Predictions complete!")
print(binary_predictions[:10])  # Show first 10 predictions

✅ Predictions complete!
[0 0 1 1 0 1 0 1 0 0]


In [131]:
# Compute performance metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Compute scores
accuracy = accuracy_score(y_test, binary_predictions)
precision = precision_score(y_test, binary_predictions)
recall = recall_score(y_test, binary_predictions)
f1 = f1_score(y_test, binary_predictions)

# Compute confusion matrix
conf_matrix = confusion_matrix(y_test, binary_predictions)

# Print results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

print("\nConfusion Matrix:")
print(conf_matrix)

Accuracy: 0.6400
Precision: 0.6582
Recall: 0.5361
F1-score: 0.5909

Confusion Matrix:
[[76 27]
 [45 52]]


In [132]:
# Import required libraries
import numpy as np
import pandas as pd
import plotly.figure_factory as ff
import plotly.express as px
import plotly.graph_objects as go
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Compute performance metrics
accuracy = accuracy_score(y_test, binary_predictions)
precision = precision_score(y_test, binary_predictions)
recall = recall_score(y_test, binary_predictions)
f1 = f1_score(y_test, binary_predictions)

# Create a DataFrame for the metrics
metrics_df = pd.DataFrame({
    "Metric": ["Accuracy", "Precision", "Recall", "F1-score"],
    "Score": [accuracy, precision, recall, f1]
})

# Compute confusion matrix
conf_matrix = confusion_matrix(y_test, binary_predictions)
conf_matrix_df = pd.DataFrame(conf_matrix, 
                              index=["Actual Negative", "Actual Positive"], 
                              columns=["Predicted Negative", "Predicted Positive"])

# Visualization: Bar Chart of Performance Metrics
fig_metrics = px.bar(metrics_df, x="Metric", y="Score", title="Model Performance Metrics", text="Score")
fig_metrics.update_traces(texttemplate='%{text:.3f}', textposition='outside')

# Visualization: Confusion Matrix Heatmap
fig_conf_matrix = ff.create_annotated_heatmap(
    z=conf_matrix, 
    x=["Predicted Negative", "Predicted Positive"], 
    y=["Actual Negative", "Actual Positive"], 
    colorscale="blues",
    showscale=True
)

# Show plots
fig_metrics.show()
fig_conf_matrix.show()

In [133]:
# New threshold
threshold = 0.4
binary_predictions = (predictions > threshold).astype(int)

# Compute performance metrics
accuracy = accuracy_score(y_test, binary_predictions)
precision = precision_score(y_test, binary_predictions)
recall = recall_score(y_test, binary_predictions)
f1 = f1_score(y_test, binary_predictions)

print(f"Threshold: {threshold}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")


Threshold: 0.4
Accuracy: 0.6500
Precision: 0.6134
Recall: 0.7526
F1-score: 0.6759


In [137]:
from sagemaker import image_uris

# Get XGBoost container for version 1.5-1
xgboost_container = image_uris.retrieve(
    framework="xgboost",
    region=boto3.Session().region_name,
    version="1.5-1"
)

print(f"Using XGBoost container: {xgboost_container}")


Using XGBoost container: 683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-xgboost:1.5-1


In [138]:
# Define XGBoost estimator
xgboost = sagemaker.estimator.Estimator(
    image_uri=xgboost_container,  # Use the container URI we just retrieved
    role=role_arn,
    instance_count=1,
    instance_type="ml.m5.large",
    output_path=f"s3://{bucket_name}/xgboost-output",
    sagemaker_session=sagemaker_session,
)


In [139]:
# Set hyperparameters with cost-sensitive learning
xgboost.set_hyperparameters(
    objective="binary:logistic",  # Binary classification task
    num_round=100,                # Number of boosting rounds
    scale_pos_weight=20           # Handle class imbalance
)


In [None]:
# Train XGBoost model
xgboost.fit({"train": train_input, "validation": val_input})


In [141]:
# Deploy the model to an endpoint
predictor = xgboost.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.large",
    endpoint_name="xgboost-traffic-predictor"
)


-------!

In [145]:
# Convert test features to CSV format
X_test_csv = X_test.to_csv(header=False, index=False)

# Make predictions with correct content type
response = predictor.predict(X_test_csv, initial_args={"ContentType": "text/csv"})

# Decode the response from bytes to string and clean it
decoded_response = response.decode("utf-8").replace("\n", ",")

# Extract predictions from the cleaned response
predictions = np.array([float(score) for score in decoded_response.split(",") if score.strip()])

# Convert probabilities to binary predictions (threshold = 0.5)
binary_predictions = (predictions > 0.5).astype(int)


In [None]:

# Print predictions and binary classifications
print("Predicted Probabilities:", predictions)
print("Binary Predictions (Threshold=0.5):", binary_predictions)

In [147]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, binary_predictions)
precision = precision_score(y_test, binary_predictions)
recall = recall_score(y_test, binary_predictions)
f1 = f1_score(y_test, binary_predictions)

# Print evaluation results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")


Accuracy: 0.9750
Precision: 0.9510
Recall: 1.0000
F1-Score: 0.9749


In [148]:
from sklearn.metrics import confusion_matrix

# Compute the confusion matrix
cm = confusion_matrix(y_test, binary_predictions)

# Define class names
class_names = ["No Incident", "Incident"]


In [150]:
from sklearn.metrics import confusion_matrix

# Compute the confusion matrix
cm = confusion_matrix(y_test, binary_predictions)

# Define class names
class_names = ["No Incident", "Incident"]

# Print the confusion matrix with labels
print("Confusion Matrix:")
print(f"{'':<12}{class_names[0]:<12}{class_names[1]:<12}")
print(f"{class_names[0]:<12}{cm[0][0]:<12}{cm[0][1]:<12}")
print(f"{class_names[1]:<12}{cm[1][0]:<12}{cm[1][1]:<12}")


Confusion Matrix:
            No Incident Incident    
No Incident 98          5           
Incident    0           97          


In [151]:
import plotly.figure_factory as ff

# annotated heatmap
fig = ff.create_annotated_heatmap(
    z=cm,
    x=class_names,  # Predicted labels
    y=class_names,  # Actual labels
    colorscale="Viridis",
    showscale=True
)

fig.update_layout(
    title="Confusion Matrix",
    xaxis=dict(title="Predicted Label"),
    yaxis=dict(title="Actual Label")
)

fig.show()


#### Interpretation of the Confusion Matrix
#### 1.	True Negatives (TN): `98`
#### •	The model correctly predicted `No Incident` for 98 cases.
#### 2.	False Positives (FP): `5`
#### •	The model incorrectly predicted `Incident` for 5 cases where there was actually `No Incident`.
#### 3.	False Negatives (FN): `0`
#### •	The model did not miss any actual incidents (`Incident`), which is excellent.
#### 4.	True Positives (TP): `97`
#### •	The model correctly predicted `Incident` for 97 cases.
#### Key Observations
#### •	High Recall (Perfect Detection of Incidents):
#### •	Since there are no false negatives (`FN=0`), the model identified all actual incidents correctly.
#### •	A Few False Positives:
#### •	There are 5 cases where the model predicted an incident but it wasn’t true. This slightly reduces precision but ensures high recall, which is often more critical in imbalanced datasets.