In [1]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import boto3
from dotenv import load_dotenv
import os
from xgboost import XGBClassifier
import pandas as pd
from mlflow.models.signature import infer_signature
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [2]:
load_dotenv(dotenv_path="../.env")

access_key = os.getenv("AWS_ACCESS_KEY_ID_CLIENT")
secret_key = os.getenv("AWS_SECRET_ACCESS_KEY_CLIENT")
s3_url = os.getenv("MLFLOW_S3_ENDPOINT_URL_CLIENT")
tracker_url = os.getenv("MLFLOW_URL_CLIENT")
os.environ["MLFLOW_S3_ENDPOINT_URL"] = os.getenv("MLFLOW_S3_ENDPOINT_URL_CLIENT")
os.environ["MLFLOW_TRACKING_USERNAME"] = os.getenv("MLFLOW_ADMIN_USERNAME_CLIENT")
os.environ["MLFLOW_TRACKING_PASSWORD"] = os.getenv("MLFLOW_ADMIN_PASSWORD_CLIENT")
os.environ["MLFLOW_S3_IGNORE_TLS"] = "true"
os.environ["MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING"] = "true"
bucket_name = "dataset"
object_name = "water_potability.csv"

In [3]:
s3 = boto3.client(
    's3',
    endpoint_url=s3_url,
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key,
    verify=False
)

In [4]:
try:
    # Attempt to list buckets
    response = s3.list_buckets()
    
    # If successful, print bucket names
    print("Connected successfully! Buckets available:")
    for bucket in response['Buckets']:
        print(f"- {bucket['Name']}")
except Exception as e:
    # Print any connection errors
    print("Connection error:", e)


Connected successfully! Buckets available:
- dataset
- mlflow
- webserver


In [5]:
try:
    response = s3.get_object(Bucket=bucket_name, Key=object_name)
    print(response)
    dataset_content = response.get('Body')
    print(dataset_content)
    df = pd.read_csv(dataset_content)
    print("Dataset loaded successfully:")
    print(df.head())
except Exception as e:
    print("Error fetching dataset from MinIO:", e)


{'ResponseMetadata': {'RequestId': '18222C0063F8683F', 'HostId': 'b675f5c02385af78c69266a96b22038bea3ecd7423814977a7267fd68ac2bbeb', 'HTTPStatusCode': 200, 'HTTPHeaders': {'accept-ranges': 'bytes', 'content-length': '525187', 'content-type': 'text/csv', 'etag': '"a5eeae83d792e40da6c2f9e1eab945fc"', 'last-modified': 'Thu, 02 Jan 2025 13:27:29 GMT', 'server': 'MinIO', 'strict-transport-security': 'max-age=31536000; includeSubDomains', 'vary': 'Origin, Accept-Encoding', 'x-amz-id-2': 'b675f5c02385af78c69266a96b22038bea3ecd7423814977a7267fd68ac2bbeb', 'x-amz-request-id': '18222C0063F8683F', 'x-content-type-options': 'nosniff', 'x-ratelimit-limit': '905', 'x-ratelimit-remaining': '905', 'x-xss-protection': '1; mode=block', 'x-amz-version-id': 'ec0e7df1-0564-498d-b08c-738edbf93673', 'date': 'Sat, 08 Feb 2025 07:46:26 GMT'}, 'RetryAttempts': 0}, 'AcceptRanges': 'bytes', 'LastModified': datetime.datetime(2025, 1, 2, 13, 27, 29, tzinfo=tzutc()), 'ContentLength': 525187, 'ETag': '"a5eeae83d792e4

In [6]:
feature = ['ph','Solids','Turbidity','Potability']
# feature = ['ph','Solids','Potability']
# feature = ['ph','Potability']
df = df[feature]
df.dropna(inplace=True)

In [7]:
df.describe()

Unnamed: 0,ph,Solids,Turbidity,Potability
count,2785.0,2785.0,2785.0,2785.0
mean,7.080795,21971.018946,3.97023,0.395332
std,1.59432,8721.738165,0.784964,0.48901
min,0.0,320.942611,1.45,0.0
25%,6.093092,15661.940335,3.440859,0.0
50%,7.036752,20868.627979,3.958543,0.0
75%,8.062066,27317.099444,4.51515,1.0
max,14.0,56867.859236,6.494749,1.0


In [8]:
df[df['Potability']==1].describe()

Unnamed: 0,ph,Solids,Turbidity,Potability
count,1101.0,1101.0,1101.0,1101.0
mean,7.073783,22389.962003,3.975108,1.0
std,1.448048,9082.357442,0.787685,0.0
min,0.227499,728.75083,1.492207,1.0
25%,6.179312,15671.119287,3.426266,1.0
50%,7.036752,21162.726735,3.961731,1.0
75%,7.933068,28099.038729,4.521836,1.0
max,13.175402,56488.672413,6.494249,1.0


In [9]:
df[df['Potability']==0].describe()

Unnamed: 0,ph,Solids,Turbidity,Potability
count,1684.0,1684.0,1684.0,1684.0
mean,7.085378,21697.113776,3.967041,0.0
std,1.683499,8469.216148,0.783398,0.0
min,0.0,320.942611,1.45,0.0
25%,6.037723,15596.974568,3.449742,0.0
50%,7.035456,20566.77022,3.949117,0.0
75%,8.15551,26899.500571,4.50913,0.0
max,14.0,56867.859236,6.494749,0.0


In [10]:
df[df['Potability']==0].sample(5)

Unnamed: 0,ph,Solids,Turbidity,Potability
2203,6.246118,27001.335319,4.755158,0
124,8.550875,28918.419953,3.772887,0
523,8.437876,12856.928695,3.815469,0
1032,8.697122,27752.00024,4.180568,0
1679,6.63299,12829.092778,4.360851,0


In [11]:
df[df['Potability']==1].sample(5)

Unnamed: 0,ph,Solids,Turbidity,Potability
3128,4.959853,9887.830755,3.455045,1
2414,9.15966,16679.335164,5.452362,1
364,4.993531,13900.779271,3.153809,1
2335,4.761105,19197.105515,4.581395,1
2829,7.342231,35273.976229,4.261504,1


In [12]:
df['Potability'].value_counts()

Potability
0    1684
1    1101
Name: count, dtype: int64

In [13]:
df.isna().sum()

ph            0
Solids        0
Turbidity     0
Potability    0
dtype: int64

In [14]:
data = df.drop('Potability', axis=1)
target = df['Potability']

In [15]:
from imblearn.combine import SMOTETomek

smt = SMOTETomek(random_state=42)
data, target = smt.fit_resample(data, target)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)

In [17]:
mlflow.set_tracking_uri(tracker_url)
experiment_name = "water_potability"
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='s3://mlflow/1', creation_time=1735824531785, experiment_id='1', last_update_time=1735824531785, lifecycle_stage='active', name='water_potability', tags={}>

In [18]:
tags = {
    "ph": "True",
    "Hardness": "False",
    "Solids": "True",
    "Chloramines": "False",
    "Sulfate": "False",
    "Conductivity": "False",
    "Organic_carbon": "False",
    "Trihalomethanes": "False",
    "Turbidity": "True",
    "Potability": "Binary",
}


In [19]:
# Model parameters
n_estimators = 100
max_depth = 5
random_state = 42


with mlflow.start_run():
    # Log parameters
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("max_depth", max_depth)
    mlflow.log_param("random_state", random_state)

    # Initialize and train the model
    model = XGBClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state)
    model.fit(X_train, y_train)

    # Evaluate the model
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    mlflow.log_metric("accuracy", accuracy)

    # Define input example and signature
    input_example = X_test.iloc[:1].fillna(X_test.mean())
    signature = infer_signature(X_test, model.predict(X_test))

    # Log the model with MLflow
    mlflow.sklearn.log_model(
        model,
        artifact_path="model",
        input_example=input_example,
        signature=signature
    )

    # Print run information
    run_id = mlflow.active_run().info.run_id
    mlflow.set_tags(tags)
    mlflow.set_tag("Model", "XGBoost")
    print(f"Run ID: {run_id}")
    print(f"Model accuracy: {accuracy}")


2025/02/08 14:46:39 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


Run ID: db6c854e7f774723bf892e59e9d3ae1b
Model accuracy: 0.5557851239669421
🏃 View run marvelous-grub-465 at: http://10.34.4.242/#/experiments/1/runs/db6c854e7f774723bf892e59e9d3ae1b
🧪 View experiment at: http://10.34.4.242/#/experiments/1


2025/02/08 14:46:45 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2025/02/08 14:46:45 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# Model parameters
max_iter = 1000
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(max_iter = max_iter))
])


with mlflow.start_run():
    # Log parameters
    mlflow.log_param("max_iter", max_iter)

    # Initialize and train the model
    pipeline.fit(X_train, y_train)

    # Evaluate the model
    predictions = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    mlflow.log_metric("accuracy", accuracy)

    # Define input example and signature
    input_example = X_test.iloc[:1].fillna(X_test.mean())
    signature = infer_signature(X_test, pipeline.predict(X_test))

    # Log the model with MLflow
    mlflow.sklearn.log_model(
        pipeline,
        artifact_path="model",
        input_example=input_example,
        signature=signature
    )

    # Print run information
    run_id = mlflow.active_run().info.run_id
    mlflow.set_tags(tags)
    mlflow.set_tag("Model", "Logistic Regression")
    print(f"Run ID: {run_id}")
    print(f"Model accuracy: {accuracy}")

2025/02/08 14:49:35 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


Run ID: aa15fc456b644b8b92dd7e6cdc78139a
Model accuracy: 0.5041322314049587
🏃 View run kindly-rook-357 at: http://10.34.4.242/#/experiments/1/runs/aa15fc456b644b8b92dd7e6cdc78139a
🧪 View experiment at: http://10.34.4.242/#/experiments/1


2025/02/08 14:49:40 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2025/02/08 14:49:40 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


In [24]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# Model parameters
n_estimators = 100
max_depth = 5
random_state = 42

pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("rf", RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=random_state))
])

with mlflow.start_run():
    # Log parameters
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("max_depth", max_depth)
    mlflow.log_param("random_state", random_state)
    # Initialize and train the model
    pipeline.fit(X_train, y_train)

    # Evaluate the model
    predictions = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    mlflow.log_metric("accuracy", accuracy)

    # Define input example and signature
    input_example = X_test.iloc[:1].fillna(X_test.mean())
    signature = infer_signature(X_test, pipeline.predict(X_test))

    # Log the model with MLflow
    mlflow.sklearn.log_model(
        pipeline,
        artifact_path="model",
        input_example=input_example,
        signature=signature
    )

    # Print run information
    run_id = mlflow.active_run().info.run_id
    mlflow.set_tags(tags)
    mlflow.set_tag("Model", "Random Forest")
    print(f"Run ID: {run_id}")
    print(f"Model accuracy: {accuracy}")

2025/02/08 14:52:08 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.


Run ID: 23a158c4b041448290ee16f3e71b4cd0
Model accuracy: 0.5805785123966942
🏃 View run angry-whale-828 at: http://10.34.4.242/#/experiments/1/runs/23a158c4b041448290ee16f3e71b4cd0
🧪 View experiment at: http://10.34.4.242/#/experiments/1


2025/02/08 14:52:15 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...
2025/02/08 14:52:15 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!


In [25]:
import mlflow.pyfunc

model_uri = f"runs:/{run_id}/model"
model = mlflow.pyfunc.load_model(model_uri)

# Check if metadata and signature are available
if model.metadata:
    print("Model signature:", model.metadata.get_input_schema())
else:
    print("Model metadata is missing.")

Model signature: ['ph': double (required), 'Solids': double (required), 'Turbidity': double (required)]
