In [None]:
import sklearn # Check Sklearn version
sklearn.__version__
import numpy as np
from sagemaker import get_execution_role
import sagemaker
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
import datetime
import time
import tarfile
import boto3
import pandas as pd

sm_boto3 = boto3.client("sagemaker")
sess = sagemaker.Session()
region = sess.boto_session.region_name
bucket = 'sagemaker-tutorials-mlhub' # Mention the created S3 bucket name here
print("Using bucket " + bucket)

# send data to S3. SageMaker will take training data from s3
sk_prefix = "sagemaker/mobile_price_classification/sklearncontainer"
trainpath = sess.upload_data(
    path="train-V-1.csv", bucket=bucket, key_prefix=sk_prefix
)

testpath = sess.upload_data(
    path="test-V-1.csv", bucket=bucket, key_prefix=sk_prefix
)

%%writefile script.py

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_curve, auc
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO 
import argparse
import joblib
import os
import numpy as np
import pandas as pd
    
def model_fn(model_dir):
    clf = joblib.load(os.path.join(model_dir, "model.joblib"))
    return clf
    
if __name__ == "__main__":

    logger.info(f"Starting training")

    #data prepare
    self.parameter_lr = self.param['parameter_lr']
    self.penal,self.size= self.parameter_lr["penalty"],self.parameter_lr["size"]
    self.parameter_lr.pop("penalty")
    self.parameter_lr.pop("size")


    def fit_ml_models(algo, algo_param, algo_name,x_train,y_train,x_test,y_test):
    
        algo = Pipeline([("algo", algo)])
        
        
        model = GridSearchCV(algo, param_grid=algo_param, cv=10, verbose=1)
        
        
        logger.info(f"Fitting {algo_name}")
        fit_model = model.fit(x_train, y_train)
        
        best_params = model.best_params_
        logger.info("Best Parameters: "+f"{best_params}")
        
        best_model = model.best_estimator_
        best_estimator = model.best_estimator_._final_estimator
        best_score = round(model.best_score_, 4)
        logger.info(f"Best Score: "+"{:.3f}".format(best_score))
        
        y_pred_train = model.predict(x_train)
        y_pred_test = model.predict(x_test)
        
        
        acc_score_train = round(accuracy_score(y_pred_train, y_train)*100, 3)
        acc_score_test = round(accuracy_score(y_pred_test, y_test)*100, 3)
        logger.info(f"Train and Test Accuracy Score for train {acc_score_train} and test {acc_score_test}")
        
        logger.info(f"Finishing training")

    return acc_score_train, acc_score_test, best_score, best_params

    def run_train(self):
        model_lr = LogisticRegression(penalty=self.penal, random_state=42)

        acc_score_train, acc_score_test, best_score , best_params= fit_ml_models(model_lr, self.parameter_lr, "Logistic Regression",
                                                                             self.x_train,self.y_train,self.x_test,self.y_test)
    
        model_lr = LogisticRegression(solver=best_params["algo__solver"], C=best_params["algo__C"],penalty=self.penal, random_state=42)
        model_lr.fit(self.x_train, self.y_train)


    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model,model_path)
    print("Model persisted at " + model_path)


    #self.df = self.df.drop("Id", axis=1)
    x = self.df.drop(["species"], axis=1)
    y = self.df["species"]

    self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(x, y, test_size=self.size, random_state=42)


    print("[INFO] Extracting arguments")
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--random_state", type=int, default=0)

    # Data, model, and output directories
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="train-V-1.csv")
    parser.add_argument("--test-file", type=str, default="test-V-1.csv")

    args, _ = parser.parse_known_args()
    
    print("SKLearn Version: ", sklearn.__version__)
    print("Joblib Version: ", joblib.__version__)

    print("[INFO] Reading data")
    print()
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))
    
    features = list(train_df.columns)
    label = features.pop(-1)
    
    print("Building training and testing datasets")
    print()
    X_train = train_df[features]
    X_test = test_df[features]
    y_train = train_df[label]
    y_test = test_df[label]

    print('Column order: ')
    print(features)
    print()
    
    print("Label column is: ",label)
    print()
    
    print("Data Shape: ")
    print()
    print("---- SHAPE OF TRAINING DATA (85%) ----")
    print(X_train.shape)
    print(y_train.shape)
    print()
    print("---- SHAPE OF TESTING DATA (15%) ----")
    print(X_test.shape)
    print(y_test.shape)
    print()
    
  
    print("Training RandomForest Model.....")
    print()
    model =  RandomForestClassifier(n_estimators=args.n_estimators, random_state=args.random_state, verbose = 3,n_jobs=-1)
    model.fit(X_train, y_train)
    print()
    

    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model,model_path)
    print("Model persisted at " + model_path)
    print()

    
    y_pred_test = model.predict(X_test)
    test_acc = accuracy_score(y_test,y_pred_test)
    test_rep = classification_report(y_test,y_pred_test)

    print()
    print("---- METRICS RESULTS FOR TESTING DATA ----")
    print()
    print("Total Rows are: ", X_test.shape[0])
    print('[TESTING] Model Accuracy is: ', test_acc)
    print('[TESTING] Testing Report: ')
    print(test_rep)


from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "0.23-1"

sklearn_estimator = SKLearn(
    entry_point="script.py",
    role=get_execution_role(),
    instance_count=1,
    instance_type="ml.m5.large",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="RF-custom-sklearn",
    hyperparameters={
        "n_estimators": 100,
        "random_state": 0,
    },
    use_spot_instances = True,
    max_wait = 7200,
    max_run = 3600
)


# launch training job, with asynchronous call
sklearn_estimator.fit({"train": trainpath, "test": testpath}, wait=True)
# sklearn_estimator.fit({"train": datapath}, wait=True)


sklearn_estimator.latest_training_job.wait(logs="None")
artifact = sm_boto3.describe_training_job(
    TrainingJobName=sklearn_estimator.latest_training_job.name
)["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifact persisted at " + artifact)


from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

model_name = "Custom-sklearn-model-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model = SKLearnModel(
    name =  model_name,
    model_data=artifact,
    role=get_execution_role(),
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
)
endpoint_name = "Custom-sklearn-model-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print("EndpointName={}".format(endpoint_name))

predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    endpoint_name=endpoint_name,
)

In [2]:
query = """CREATE DATABASE if not exists iris_db;
USE iris_db;

        CREATE TABLE iris_dataset (
            sepal_length FLOAT,
            sepal_width FLOAT,
            petal_length FLOAT,
            petal_width FLOAT,
            species FLOAT
        );                                                        

        CREATE TABLE predictions (
            id INT PRIMARY KEY AUTO_INCREMENT,
            sepal_length FLOAT,
            sepal_width FLOAT,
            petal_length FLOAT,
            petal_width FLOAT,
            pred_species FLOAT,
            time_pred DOUBLE
        );
        """
query

'\n        CREATE DATABASE if not exists iris_db;\n\n        USE iris_db;\n\n        CREATE TABLE iris_dataset (\n            sepal_length FLOAT,\n            sepal_width FLOAT,\n            petal_length FLOAT,\n            petal_width FLOAT,\n            species FLOAT\n        );                                                        \n\n        CREATE TABLE predictions (\n            id INT PRIMARY KEY AUTO_INCREMENT,\n            sepal_length FLOAT,\n            sepal_width FLOAT,\n            petal_length FLOAT,\n            petal_width FLOAT,\n            pred_species FLOAT,\n            time_pred DOUBLE\n        );\n        '

In [None]:
print(predictor.predict(testX[features][0:2].values.tolist()))

sm_boto3.delete_endpoint(EndpointName=endpoint_name)

In [3]:
# os.chdir("PRUEBA1.0\ABICHALLENGE_LUIS-ROMERO")
import os
os.getcwd()
os.chdir("../")
# ! pip install -r requirements.txt

from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "0.23-1"

sklearn_estimator = SKLearn(
    entry_point="script.py",
    role=get_execution_role(),
    instance_count=1,
    instance_type="ml.m5.large",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="RF-custom-sklearn",
    hyperparameters={
        "n_estimators": 100,
        "random_state": 0,
    },
    use_spot_instances = True,
    max_wait = 7200,
    max_run = 3600
)


# launch training job, with asynchronous call
sklearn_estimator.fit({"train": trainpath, "test": testpath}, wait=True)
# sklearn_estimator.fit({"train": datapath}, wait=True)


sklearn_estimator.latest_training_job.wait(logs="None")
artifact = sm_boto3.describe_training_job(
    TrainingJobName=sklearn_estimator.latest_training_job.name
)["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifact persisted at " + artifact)


from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

model_name = "Custom-sklearn-model-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model = SKLearnModel(
    name =  model_name,
    model_data=artifact,
    role=get_execution_role(),
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
)

In [4]:
# import os
# os.chdir("../")

from Classifier.constants import *
from Classifier.utils.common import read_yaml, create_directories

config_filepath = CONFIG_FILE_PATH
params_filepath = PARAMS_FILE_PATH

config = read_yaml(config_filepath)
params = read_yaml(params_filepath)

from Classifier.config.configuration import ConfigurationManager as CM
config = CM().get_data_training_config()

param = config["params"]
conf = config["training"]

[2024-08-19 07:36:18,091: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-08-19 07:36:18,093: INFO: common: yaml file: params.yaml loaded successfully]
[2024-08-19 07:36:18,098: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-08-19 07:36:18,100: INFO: common: yaml file: params.yaml loaded successfully]
[2024-08-19 07:36:18,101: INFO: common: created directory at: artifacts]


In [6]:
conf

{'root_dir': WindowsPath('model'),
 'trained_model_path': WindowsPath('model/model.pkl')}

In [None]:
import _mysql_connector
param={'parameter_lr': {'algo__solver': ['liblinear', 'newton-cg'],
  'algo__C': [0.001, 0.01, 0.1, 0.5, 1],
  'penalty': 'l2',
  'size': 0.2}}

# conf={paths...}

config={"user":"iris1",
            "password":"Xi25_PS6iww9os?z3",
            "host":"iris-db-instance.c3kq6wgkc2hl.us-east-1.rds.amazonaws.com"}


access_point="iris-acces-siwgf7f9fy1wu965gq533ba3nfkscuse1a-s3alias"



connection = mysql.connector.connect(**config)
cursor = connection.cursor()

#developer // production


#subnet privada
# subnet-0481d2f1cb01b8236
#db subnet
#subnet-064f2feaf0c3d4c0a
# Configuración de VPC



In [None]:
import sklearn # Check Sklearn version
sklearn.__version__
import numpy as np
from sagemaker import get_execution_role
import sagemaker
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
import datetime
import time
import tarfile
import boto3
import pandas as pd

#s3://iris-bucket1926/iris-sagemaker/
sm_boto3 = boto3.client("sagemaker")
sess = sagemaker.Session()
region = sess.boto_session.region_name
bucket = 'iris-bucket1926' # Mention the created S3 bucket name here
print("Using bucket " + bucket)

# send data to S3. SageMaker will take training data from s3
# sk_prefix = "sagemaker/mobile_price_classification/sklearncontainer"

#(Load teh data )

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_curve, auc
import sklearn
import joblib
import boto3
import pathlib
from io import StringIO 
import argparse
import joblib
import os
import numpy as np
import pandas as pd
    
def model_fn(model_dir):
    clf = joblib.load(os.path.join(model_dir, "model.joblib"))
    return clf
    
if __name__ == "__main__":

    print("[INFO] Extracting arguments")
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument("--n_estimators", type=int, default=100)
    parser.add_argument("--random_state", type=int, default=0)

    # Data, model, and output directories
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="train-V-1.csv")
    parser.add_argument("--test-file", type=str, default="test-V-1.csv")

    args, _ = parser.parse_known_args()
    
    print("SKLearn Version: ", sklearn.__version__)
    print("Joblib Version: ", joblib.__version__)

    print("[INFO] Reading data")
    print()
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))
    
    features = list(train_df.columns)
    label = features.pop(-1)
    
    print("Building training and testing datasets")
    print()
    X_train = train_df[features]
    X_test = test_df[features]
    y_train = train_df[label]
    y_test = test_df[label]

    print('Column order: ')
    print(features)
    print()
    
    print("Label column is: ",label)
    print()
    
    print("Data Shape: ")
    print()
    print("---- SHAPE OF TRAINING DATA (85%) ----")
    print(X_train.shape)
    print(y_train.shape)
    print()
    print("---- SHAPE OF TESTING DATA (15%) ----")
    print(X_test.shape)
    print(y_test.shape)
    print()
    
  
    print("Training RandomForest Model.....")
    print()
    model =  RandomForestClassifier(n_estimators=args.n_estimators, random_state=args.random_state, verbose = 3,n_jobs=-1)
    model.fit(X_train, y_train)
    print()
    

    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model,model_path)
    print("Model persisted at " + model_path)
    print()

    
    y_pred_test = model.predict(X_test)
    test_acc = accuracy_score(y_test,y_pred_test)
    test_rep = classification_report(y_test,y_pred_test)

    print()
    print("---- METRICS RESULTS FOR TESTING DATA ----")
    print()
    print("Total Rows are: ", X_test.shape[0])
    print('[TESTING] Model Accuracy is: ', test_acc)
    print('[TESTING] Testing Report: ')
    print(test_rep)


In [None]:

from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "0.23-1"


endpoint_name="Iris_endpoint"
# Especifica tu rol de IAM
rolef = 'arn:aws:iam::123456789012:role/SageMakerRole'

# Configuración de VPC
vpc_configf = {
    'Subnets': ['subnet-abc123', 'subnet-def456'],  # Tus subnets
    'SecurityGroupIds': ['sg-0123456789abcdef0']  # Tus security groups
}


sklearn_estimator = SKLearn(
    entry_point="main.py",
    role=rolef,
    instance_count=1,
    instance_type="ml.m5.large",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="Custom-iris-sklearn",
    use_spot_instances = True,
    vpc_config=vpc_configf,
    max_wait = 7200,
    max_run = 3600
)
predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    endpoint_name=endpoint_name,
)

# launch training job, with asynchronous call
sklearn_estimator.fit( wait=True)
# sklearn_estimator.fit({"train": datapath}, wait=True)


# sklearn_estimator.latest_training_job.wait(logs="None")
# artifact = sm_boto3.describe_training_job(
#     TrainingJobName=sklearn_estimator.latest_training_job.name
# )["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifact persisted at " + artifact)


from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

model_name = "Custom-sklearn-model-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model = SKLearnModel(
    name =  model_name,
    model_data=artifact,
    role=get_execution_role(),
    entry_point="script.py",
    framework_version=FRAMEWORK_VERSION,
)

In [None]:
from sagemaker.sklearn.model import SKLearnModel

FRAMEWORK_VERSION = "0.23-1"


endpoint_name="Iris_endpoint"
# Especifica tu rol de IAM
rolef = 'arn:aws:iam::123456789012:role/SageMakerRole'

#subnet privada
# subnet-0481d2f1cb01b8236
#db subnet
#subnet-064f2feaf0c3d4c0a
# Configuración de VPC
vpc_configf = {
    'Subnets': ['subnet-abc123', 'subnet-def456'],  # Tus subnets
    'SecurityGroupIds': ['sg-0123456789abcdef0']  # Tus security groups
}


# Definir el estimador y entrenar
sklearn_estimator = SKLearn(
    entry_point="main.py",
    role=rolef,
    instance_count=1,
    instance_type="ml.m5.large",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="Custom-iris-sklearn",
    use_spot_instances=True,
    vpc_config=vpc_configf,
    max_wait=7200,
    max_run=3600
)

# Entrenamiento del modelo
sklearn_estimator.fit(wait=True)

# Recuperar la ubicación de los artefactos del modelo en S3
model_data = sklearn_estimator.model_data

# Crear un objeto SKLearnModel usando los artefactos del modelo en S3
model = SKLearnModel(
    model_data=model_data,
    role=rolef,
    entry_point="main.py",  # Mismo script que usaste para entrenar
    framework_version=FRAMEWORK_VERSION,
    vpc_config=vpc_configf
)

# Desplegar el modelo como un endpoint
predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.m4.xlarge",
    endpoint_name=endpoint_name,
)
