In [8]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import boto3
import pandas as pd
from dotenv import load_dotenv
import os

In [9]:
# Pull settings from a local .env file (if any) into the process environment.
load_dotenv()


def _require_env(name):
    """Return the value of environment variable `name`.

    Raises:
        RuntimeError: if the variable is not set. Without this check the
            assignments to os.environ below would fail with an opaque
            "str expected, not NoneType" TypeError.
    """
    value = os.getenv(name)
    if value is None:
        raise RuntimeError(
            f"Required environment variable {name!r} is not set; "
            "define it in the environment or in the .env file."
        )
    return value


# Object-store credentials and endpoint, passed to the boto3 client below.
# These stay optional (None when unset), exactly as before.
access_key = os.getenv("AWS_ACCESS_KEY_ID")
secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
s3_url = os.getenv("MLFLOW_S3_ENDPOINT_URL")
# NOTE(review): tracker_url is read here but is not referenced by the cells
# visible in this review; the tracking URI is set from a literal instead.
tracker_url = os.getenv("MLFLOW_URL")
bucket_name = "dataset"
object_name = "Iris.csv"

# Re-export the admin credentials under the names the MLflow client reads
# for authenticating against the tracking server.
os.environ["MLFLOW_TRACKING_USERNAME"] = _require_env("MLFLOW_ADMIN_USERNAME")
os.environ["MLFLOW_TRACKING_PASSWORD"] = _require_env("MLFLOW_ADMIN_PASSWORD")
# NOTE(review): this asks MLflow to skip TLS verification for S3 artifact
# traffic -- acceptable for a local self-signed endpoint, not for production.
os.environ["MLFLOW_S3_IGNORE_TLS"] = "true"

In [10]:
# Build the client for the S3-compatible object store from the endpoint and
# credentials loaded from the environment.
# NOTE(review): verify=False disables TLS certificate verification for every
# request made through this client -- confirm this is only used against a
# local/self-signed endpoint.
s3_client_options = dict(
    endpoint_url=s3_url,
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key,
    verify=False,
)
s3 = boto3.client("s3", **s3_client_options)

In [11]:
# Echo the S3 endpoint URL read from MLFLOW_S3_ENDPOINT_URL as a quick check
# that the environment was loaded (prints "None" if the variable is unset).
print(s3_url)

https://localhost:443


In [12]:
# Download the CSV object from the S3-compatible store and parse it into `df`.
try:
    response = s3.get_object(Bucket=bucket_name, Key=object_name)
    dataset_content = response.get('Body')
    print(dataset_content)
    # The streaming body is file-like, so pandas can parse it directly.
    df = pd.read_csv(dataset_content)
except Exception as e:
    print("Error fetching dataset from MinIO:", e)
    # Re-raise instead of swallowing: every later cell needs `df`, and
    # continuing would only surface as a confusing NameError further down.
    raise
else:
    print("Dataset loaded successfully:")
    print(df.head())


<botocore.response.StreamingBody object at 0x159789ab0>
Dataset loaded successfully:
   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
2   3            4.7           3.2            1.3           0.2  Iris-setosa
3   4            4.6           3.1            1.5           0.2  Iris-setosa
4   5            5.0           3.6            1.4           0.2  Iris-setosa




In [13]:
# Point the MLflow client at the tracking server and select the experiment
# that subsequent runs are recorded under.
mlflow.set_tracking_uri("http://localhost:5001")
experiment_name = "oke"
mlflow.set_experiment(experiment_name)

# Features are every column except the label AND the "Id" column.
# "Id" is a row counter (1, 2, 3, ... in the df.head() preview), not a
# measurement. If the file is ordered by species -- which the preview suggests
# -- the counter is a proxy for the label and lets the model score perfectly
# without learning anything from the flower measurements.
# errors="ignore" keeps the cell working for a CSV that has no "Id" column.
data = df.drop(columns=["Species"]).drop(columns=["Id"], errors="ignore")
target = df["Species"]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=42
)

In [14]:
# Start an MLflow run
with mlflow.start_run():
    # Define model parameters
    n_estimators = 100
    max_depth = 5
    random_state = 42

    # Log parameters to MLflow
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("max_depth", max_depth)
    mlflow.log_param("random_state", random_state)

    # Train the model
    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state)
    model.fit(X_train, y_train)

    # Make predictions and calculate accuracy
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)

    # Log metrics to MLflow
    mlflow.log_metric("accuracy", accuracy)

    # Log the model artifact to MinIO via MLflow
    mlflow.sklearn.log_model(model, "model")

    # Print Run ID for reference
    run_id = mlflow.active_run().info.run_id
    print(f"Run ID: {run_id}")
    print(f"Model accuracy: {accuracy}")


2024/11/11 12:09:23 INFO mlflow.tracking._tracking_service.client: 🏃 View run worried-ant-318 at: http://localhost:5001/#/experiments/2/runs/be4b4549c9bb42afae9bfbcc312374ec.
2024/11/11 12:09:23 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5001/#/experiments/2.


Run ID: be4b4549c9bb42afae9bfbcc312374ec
Model accuracy: 1.0
