In [None]:
import pandas as pd
import os
import boto3
import sagemaker
from sagemaker.serializers import CSVSerializer
from sagemaker.serverless import ServerlessInferenceConfig
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import joblib
import tarfile

In [None]:
# Single SageMaker session shared by the rest of the notebook. It is created
# once and reused below instead of constructing a second Session just to read
# the region.
sagemaker_session = sagemaker.Session()

# AWS region the session is bound to
region = sagemaker_session.boto_region_name
print("AWS Region: {}".format(region))

# Execution role used later when deploying the model
role = sagemaker.get_execution_role()
print("RoleArn: {}".format(role))

In [None]:
# S3 location of the labelled credit dataset. The URI is derived from `bucket`
# and `key` so the three values cannot drift apart.
bucket = "frauddatajuan"
key = "credit_data_with_targets.csv"
s3_input_uri = f"s3://{bucket}/{key}"

In [None]:
def load():
    """Read the dataset at ``s3_input_uri`` and split it into features and target.

    Returns:
        tuple: ``(X, y)`` where ``X`` is the DataFrame without the
        "Risk target" and "Description" columns, and ``y`` is the
        "Risk target" column with "Moderate risk" relabelled as "Bad risk".
    """
    frame = pd.read_csv(s3_input_uri)

    # Replace certain model answers that did not come out well, assuming the
    # worst-case scenario: "Moderate risk" is treated as "Bad risk".
    target = frame["Risk target"].replace("Moderate risk", "Bad risk")

    # Neither the label nor the free-text description is used as a feature.
    features = frame.drop(["Risk target", "Description"], axis=1)

    return features, target

In [None]:
def plotFeature(model,colN):
  """Print and plot the feature importances of a fitted model.

  Args:
    model: object exposing a ``feature_importances_`` array.
    colN: feature names, in the same order as the importances.

  Prints each ``(name, importance)`` pair, draws a horizontal bar chart
  ordered by ascending importance, shows it, and finally prints the sorted
  importances.
  """
  importances = model.feature_importances_
  for pair in zip(colN, importances):
    print(pair)

  # Indices that order the features from least to most important.
  order = importances.argsort()
  labels = np.array(colN)

  plt.barh(labels[order], importances[order])
  plt.show()
  print(importances[order])

In [None]:
def randomForest_best_HP(X, y, colN):
  """Sweep ``n_estimators`` for a random forest and plot the out-of-bag error.

  The data is split 80/20 (``random_state=42``). For every candidate number
  of trees in ``range(1, 200, 3)`` a ``RandomForestClassifier`` is fitted on
  the training part, its hold-out accuracy is printed and its OOB error
  (``1 - oob_score_``) is recorded.

  Args:
    X: feature matrix.
    y: target labels.
    colN: feature names, forwarded to ``plotFeature``.

  Side effects:
    Prints the accuracy per setting and the best setting found, plots the
    OOB error curve (figure 1) and the feature importances of the last
    fitted forest (figure 2).
  """
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  nEst = list(range(1, 200, 3))  # candidate numbers of estimators (inner trees)
  acc = []        # hold-out accuracy per candidate
  oob_error = []  # out-of-bag error, an estimate of the generalization error

  print()
  for i in nEst:
    clf = RandomForestClassifier(n_estimators=i, oob_score=True, random_state=0)
    clf.fit(X_train, y_train)
    score = accuracy_score(y_test, clf.predict(X_test))
    print(f"For {i} nEstimators -> Accuracy: {score:.2f}")
    acc.append(score)
    oob_error.append(1 - clf.oob_score_)

  # Report the best setting; previously the index was computed and discarded.
  index_max = max(range(len(acc)), key=acc.__getitem__)
  print(f"Best hold-out accuracy: {acc[index_max]:.2f} with {nEst[index_max]} nEstimators")

  plt.figure(1)
  # The label is required: legend() with no labelled artist draws nothing
  # and emits a warning.
  plt.plot(nEst, oob_error, label="OOB error")
  plt.xlabel("number of trees")
  plt.ylabel("oob")
  plt.legend()

  plt.figure(2)
  # NOTE: `clf` is the last forest of the sweep (largest n_estimators), not
  # necessarily the most accurate one.
  plotFeature(clf, colN)

In [None]:
def randomForest_CV_forHPs(X, y):
  """Grid-search random-forest hyper-parameters and report cross-validated metrics.

  A ``GridSearchCV`` (5 folds, ``scoring='f1'``) selects the best parameter
  combination. The selected configuration is then evaluated with a shuffled
  5-fold ``StratifiedKFold``; predictions of all folds are pooled into one
  classification report and one confusion matrix.

  Args:
    X: feature DataFrame (indexed positionally with ``.iloc``).
    y: target Series (indexed positionally with ``.iloc``); the report
      assumes the encoded classes are 0 = "Bad risk" and 1 = "Good risk".

  Returns:
    The ``best_estimator_`` of the grid search, i.e. the model refitted by
    ``GridSearchCV`` on the full ``X``/``y``.
  """
  # Local import: `clone` is not part of the notebook's import cell.
  from sklearn.base import clone

  param_grid = {
      'n_estimators': [13,103,37,50,52,100,130,133,200],
      'max_depth': [None, 10, 20],
      'min_samples_split': [2, 5],
      'max_features': ['sqrt', 'log2'],
      'class_weight': [None, 'balanced']
  }

  rf = RandomForestClassifier(oob_score=True, random_state=0)
  best_model = GridSearchCV(rf, param_grid, cv=5, scoring='f1')
  best_model.fit(X, y)
  print("Best params:", best_model.best_params_)

  skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

  all_y_test = []
  all_y_pred = []

  for train_index, test_index in skf.split(X, y):
      X_train, X_test = X.iloc[train_index], X.iloc[test_index]
      y_train, y_test = y.iloc[train_index], y.iloc[test_index]

      # Fit a fresh copy per fold. Refitting best_estimator_ itself would
      # overwrite the model returned below with one trained on only the
      # last fold's training subset.
      model = clone(best_model.best_estimator_)
      model.fit(X_train, y_train)
      y_pred = model.predict(X_test)

      all_y_test.extend(y_test)
      all_y_pred.extend(y_pred)

  class_names = ["Bad risk (0)", "Good risk (1)"]
  print(classification_report(all_y_test, all_y_pred, target_names=class_names))

  cm = confusion_matrix(all_y_test, all_y_pred, labels=[0, 1])

  print("Confusion Matrix:")
  print(cm)

  # Labels and title now describe this problem and model; the previous
  # "Benign/Malignant" and "SVM" texts did not match the credit-risk forest.
  disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
  disp.plot(cmap=plt.cm.Blues)
  plt.title("Confusion Matrix - Random Forest")
  plt.show()

  return best_model.best_estimator_

In [None]:
def save_and_package_model(model):
    """Serialize ``model`` with joblib and pack it into ``model.tar.gz``.

    Writes ``model/model.joblib`` (creating the ``model`` directory if
    needed) and then archives that directory, placing its contents at the
    root of a gzip-compressed tarball in the current working directory.

    Args:
        model: the object to persist.
    """
    # Create the folder and store the model in it
    os.makedirs("model", exist_ok=True)
    joblib.dump(model, "model/model.joblib")

    # Package the model folder as a .tar.gz
    archive = tarfile.open("model.tar.gz", "w:gz")
    with archive:
        archive.add("model", arcname=".")

def upload_model_to_s3():
    """Upload the local ``model.tar.gz`` to the session's default S3 bucket.

    Returns:
        tuple: ``(s3_path, session)`` where ``s3_path`` is the ``s3://`` URI
        of the uploaded archive and ``session`` is the ``sagemaker.Session``
        whose default bucket was used.
    """
    session = sagemaker.Session()
    bucket = session.default_bucket()
    s3_key = "model.tar.gz"
    # Build the URI from the bucket actually uploaded to. The previous
    # hard-coded bucket name could point at a location where the archive
    # was never written.
    s3_path = f"s3://{bucket}/{s3_key}"

    s3 = boto3.client("s3")
    s3.upload_file("model.tar.gz", bucket, s3_key)

    return s3_path, session

def deploy_model(s3_model_path, session, role,
                 endpoint_name='mi-primer-endpoint-serverless',
                 framework_version="1.2-1"):
    """Deploy a packaged scikit-learn model as a serverless SageMaker endpoint.

    Args:
        s3_model_path: ``s3://`` URI of the ``model.tar.gz`` archive.
        session: SageMaker session used for the deployment.
        role: execution role ARN assumed by the endpoint.
        endpoint_name: name of the endpoint to create.
        framework_version: scikit-learn container version. It should match
            the scikit-learn version used for training; this notebook uses
            ``OneHotEncoder(sparse_output=...)``, which needs >= 1.2, so a
            0.23 container is not expected to load the pickled model.
            TODO confirm against the kernel's installed scikit-learn.

    Returns:
        The predictor returned by ``SKLearnModel.deploy``, configured with a
        CSV serializer.
    """
    # Imported here because the notebook's import cell does not provide
    # SKLearnModel; without it this function fails with NameError.
    from sagemaker.sklearn.model import SKLearnModel

    sklearn_model = SKLearnModel(
        model_data=s3_model_path,
        role=role,
        entry_point="inference.py",
        framework_version=framework_version,
        sagemaker_session=session
    )

    # Serverless sizing: 1 GB of memory, at most 5 concurrent invocations.
    serverless_config = ServerlessInferenceConfig(
        memory_size_in_mb=1024,
        max_concurrency=5
    )

    predictor = sklearn_model.deploy(
        endpoint_name=endpoint_name,
        serverless_inference_config=serverless_config,
        serializer=CSVSerializer()
    )

    return predictor

In [None]:
def main():
  """Run the whole pipeline: load, encode, train, package, upload, deploy, test.

  Steps:
    1. Load features/target with ``load()``.
    2. One-hot encode the object-typed columns and label-encode the target.
    3. Sweep the number of trees (``randomForest_best_HP``) and grid-search
       the remaining hyper-parameters (``randomForest_CV_forHPs``).
    4. Package the selected model, upload it to S3 and deploy it.
    5. Send one sample record to the endpoint and print the response.

  Uses the module-level ``role`` for the deployment.
  """
  le = LabelEncoder()
  ohe = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')

  X, y = load()

  # One-hot encode categorical (object) columns; keep the rest untouched.
  categorical_cols = X.select_dtypes(include='object').columns.tolist()
  X_cat = ohe.fit_transform(X[categorical_cols])
  X_numeric = X.drop(columns=categorical_cols)
  # The index is reset so both frames align row by row when concatenated.
  X_encoded = pd.concat([
    X_numeric.reset_index(drop=True),
    pd.DataFrame(X_cat, columns=ohe.get_feature_names_out(categorical_cols))
  ], axis=1)

  y = le.fit_transform(y)
  y = pd.Series(y, index=X_encoded.index)
  # info() prints its summary itself and returns None, so it is not wrapped
  # in print() (which would add a stray "None" line).
  X_encoded.info()

  # Trying an RF model, expecting an increase in f1
  print()
  print("-"*100)
  print("RANDOM FOREST (RF) USING SCC AND OOB ERROR TO SELECT BEST N_TREES:")
  print()
  randomForest_best_HP(X_encoded, y, X_encoded.columns)

  # Selecting the best n_trees and using CV to select the best HPs for the model
  print()
  print("-"*100)
  print("RANDOM FOREST (RF) USING CV TO SELECT BEST HPS:")
  print()
  rf_model = randomForest_CV_forHPs(X_encoded, y)

  save_and_package_model(rf_model)

  # Upload to S3 and deploy
  s3_model_path, session = upload_model_to_s3()
  predictor = deploy_model(s3_model_path, session, role)

  # Test our endpoint (model)
  # "own" and "moderate" are separate fields; they were previously fused
  # into a single string by a literal tab character.
  # NOTE(review): the record is sent raw (not one-hot encoded) and the fitted
  # encoders are not packaged with the model; presumably inference.py
  # reproduces the encoding - confirm.
  input_data = [[67,"male",2,"own","moderate","little",1169,6,"radio/TV"]]
  response = predictor.predict(input_data)
  print(response)

In [None]:
# Run the full pipeline: train, package, upload, deploy and query the endpoint.
main()