In [8]:
# !pip install mlflow

In [9]:
# !pip install xgboost

In [10]:
import pandas as pd
import mlflow
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, classification_report,ConfusionMatrixDisplay, \
                            precision_score, recall_score, f1_score, roc_auc_score, \
                            roc_curve,confusion_matrix

In [11]:
data = pd.read_csv('process.csv', index_col=0)
data.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,1,2,1,6,142,4,3,3,3,2,...,3,48,1,1,1,1,0,1,1,1
1,0,29,3,6,770,2,0,1,0,2,...,3,3,1,1,0,1,0,0,1,0
2,3,8,1,2,390,2,1,1,3,2,...,3,30,1,1,0,3,1,0,1,1
3,1,26,3,3,848,2,1,1,3,1,...,1,26,1,0,0,1,1,0,1,1
4,1,17,2,4,734,2,0,2,3,2,...,2,34,1,0,1,1,1,0,1,0


In [12]:
# Independent and dependent data
x = data[data.columns.difference(['class'])]
y = data['class']

In [13]:
# Scaler data
scaler =  StandardScaler()
Xscale  = scaler.fit_transform(x)
Xscale

array([[ 2.78850999, -0.4645937 , -1.15669474, ...,  0.62648137,
         1.04698668,  1.66990103],
       [-1.19773321, -1.26286467,  1.21764107, ...,  0.62648137,
        -0.76597727, -0.13053135],
       [ 1.19401271,  1.13194823, -0.21905894, ..., -1.02650377,
         0.14050471, -0.13053135],
       ...,
       [ 0.21959771,  1.13194823, -1.43269237, ...,  0.62648137,
         1.04698668, -0.13053135],
       [-1.10915003, -0.4645937 , -0.43834473, ...,  0.62648137,
         1.04698668, -0.13053135],
       [-0.7548173 , -1.26286467,  0.99457449, ...,  1.86622023,
         1.04698668, -1.93096373]])

In [14]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(Xscale, y, test_size = 0.20, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((800, 20), (200, 20), (800,), (200,))

In [41]:
# local notebook mlflow 
# mlflow.set_tracking_uri("sqlite:///mlflow.db")
# mlflow.set_experiment("xgboost-ml")

# mlflow.client
# from mlflow.tracking import MlflowClient
# client = MlflowClient("sqlite:///mlflow.db")

In [43]:
# mlflow ui setup
import mlflow

mlflow.set_tracking_uri("http://127.0.0.1:5000")
print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

tracking URI: 'http://127.0.0.1:5000'


In [45]:
# first run
mlflow.set_experiment("xgboost-ml")
xgboost_params = {
    'max_depth':range(3,10,2),
    'min_child_weight':range(1,6,2)
}
kf = KFold(n_splits=30)
           
with mlflow.start_run():
    mlflow.set_tag("model","XGBClassifier")
    mlflow.log_param("xgboost_params",xgboost_params)

    randomcv_models = XGBClassifier()

    random = RandomizedSearchCV(estimator=randomcv_models,
                                        param_distributions=xgboost_params,
                                       n_iter=100,
                                       cv=kf,
                                       verbose=2, 
                                       n_jobs=-1)
    random.fit(X_train, y_train)
    print(random.best_params_)
    
    y_pred = random.predict(X_test)

    mlflow.sklearn.log_model(random, artifact_path="models")

2024/07/22 14:52:37 INFO mlflow.tracking.fluent: Experiment with name 'xgboost-ml' does not exist. Creating a new experiment.


Fitting 30 folds for each of 12 candidates, totalling 360 fits
{'min_child_weight': 5, 'max_depth': 7}


In [47]:
# first search
mlflow.search_experiments()

[<Experiment: artifact_location='mlflow-artifacts:/3', creation_time=1721685157391, experiment_id='3', last_update_time=1721685157391, lifecycle_stage='active', name='xgboost-ml', tags={}>,
 <Experiment: artifact_location='mlflow-artifacts:/2', creation_time=1721174520861, experiment_id='2', last_update_time=1721174520861, lifecycle_stage='active', name='my-experiment-3', tags={}>,
 <Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1721174454461, experiment_id='1', last_update_time=1721174454461, lifecycle_stage='active', name='my-experiment-2', tags={}>,
 <Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1721174343796, experiment_id='0', last_update_time=1721174343796, lifecycle_stage='active', name='Default', tags={}>]

In [50]:
#second run
mlflow.set_experiment("xgboost-ml-2")

xgboost_params = {
    'max_depth':range(3,10,2),
    'min_child_weight':range(1,6,2)
}
kf = KFold(n_splits=30)
           
with mlflow.start_run():
    mlflow.set_tag("model","XGBClassifier")
    mlflow.log_param("xgboost_params",xgboost_params)

    randomcv_models = XGBClassifier()

    random = RandomizedSearchCV(estimator=randomcv_models,
                                        param_distributions=xgboost_params,
                                       n_iter=100,
                                       cv=kf,
                                       verbose=2, 
                                       n_jobs=-1)
    random.fit(X_train, y_train)
    print(random.best_params_)
    
    y_pred = random.predict(X_test)

    mlflow.sklearn.log_model(random, artifact_path="models")
    print(f"default artifacts URI: '{mlflow.get_artifact_uri()}'")

2024/07/22 14:56:20 INFO mlflow.tracking.fluent: Experiment with name 'xgboost-ml-2' does not exist. Creating a new experiment.


Fitting 30 folds for each of 12 candidates, totalling 360 fits
{'min_child_weight': 5, 'max_depth': 7}
default artifacts URI: 'mlflow-artifacts:/4/2500e761510d443088845f7cb633db35/artifacts'


In [52]:
# Second search
mlflow.search_experiments()

[<Experiment: artifact_location='mlflow-artifacts:/4', creation_time=1721685380881, experiment_id='4', last_update_time=1721685380881, lifecycle_stage='active', name='xgboost-ml-2', tags={}>,
 <Experiment: artifact_location='mlflow-artifacts:/3', creation_time=1721685157391, experiment_id='3', last_update_time=1721685157391, lifecycle_stage='active', name='xgboost-ml', tags={}>,
 <Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1721174343796, experiment_id='0', last_update_time=1721174343796, lifecycle_stage='active', name='Default', tags={}>]

In [54]:
#third run
mlflow.set_experiment("xgboost-ml-3")
xgboost_params = {
    'max_depth':range(3,10,2),
    'min_child_weight':range(1,6,2)
}
kf = KFold(n_splits=30)

with mlflow.start_run():
    mlflow.set_tag("model","XGBClassifier")
    mlflow.log_param("xgboost_params",xgboost_params)

    randomcv_models = XGBClassifier()

    random = RandomizedSearchCV(estimator=randomcv_models,
                                        param_distributions=xgboost_params,
                                       n_iter=100,
                                       cv=kf,
                                       verbose=2, 
                                       n_jobs=-1)
    random.fit(X_train, y_train)
    print(random.best_params_)
    
    y_pred = random.predict(X_test)

    mlflow.sklearn.log_model(random, artifact_path="models")
    print(f"default artifacts URI: '{mlflow.get_artifact_uri()}'")

2024/07/22 14:59:12 INFO mlflow.tracking.fluent: Experiment with name 'xgboost-ml-3' does not exist. Creating a new experiment.


Fitting 30 folds for each of 12 candidates, totalling 360 fits
{'min_child_weight': 5, 'max_depth': 7}
default artifacts URI: 'mlflow-artifacts:/5/55ec22ab2e02409aac217278a8db3d3a/artifacts'


In [57]:
# Third search
mlflow.search_experiments()

[<Experiment: artifact_location='mlflow-artifacts:/5', creation_time=1721685552484, experiment_id='5', last_update_time=1721685552484, lifecycle_stage='active', name='xgboost-ml-3', tags={}>,
 <Experiment: artifact_location='mlflow-artifacts:/4', creation_time=1721685380881, experiment_id='4', last_update_time=1721685380881, lifecycle_stage='active', name='xgboost-ml-2', tags={}>,
 <Experiment: artifact_location='mlflow-artifacts:/3', creation_time=1721685157391, experiment_id='3', last_update_time=1721685157391, lifecycle_stage='active', name='xgboost-ml', tags={}>,
 <Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1721174343796, experiment_id='0', last_update_time=1721174343796, lifecycle_stage='active', name='Default', tags={}>]

In [113]:
from mlflow.tracking import MlflowClient

client = MlflowClient()
model_name = "xgboost-ml"

# latest_versions = client.get_latest_versions(model_name)

latest_versions = client.get_experiment_by_name(name=model_name)
for version in latest_versions:
    print(version)
    

('artifact_location', 'mlflow-artifacts:/3')
('creation_time', 1721685157391)
('experiment_id', '3')
('last_update_time', 1721685157391)
('lifecycle_stage', 'active')
('name', 'xgboost-ml')
('tags', {})


In [125]:
# model_name = "xgboost-ml"
# model_version = 3
# new_stage = "Staging"
# client.transition_model_version_stage(
#     name=model_name,
#     version=model_version,
#     stage=new_stage,
#     archive_existing_versions=False
# )

In [127]:
def test_model(name, stage, X_test, y_test):
    model = mlflow.pyfunc.load_model(f"models:/{name}/{stage}")
    y_pred = model.predict(X_test)
    return {"rmse": mean_squared_error(y_test, y_pred, squared=False)}

In [59]:
# interacting model registry

In [67]:
from mlflow.tracking import MlflowClient

client = MlflowClient("http://127.0.0.1:5000")

run_id=client.get_experiment(experiment_id='3')

print(run_id)

<Experiment: artifact_location='mlflow-artifacts:/3', creation_time=1721685157391, experiment_id='3', last_update_time=1721685157391, lifecycle_stage='active', name='xgboost-ml', tags={}>


In [69]:
id = client.create_registered_model(name='xgboost-ml-3')
print(id)

<RegisteredModel: aliases={}, creation_timestamp=1721686254663, description='', last_updated_timestamp=1721686254663, latest_versions=[], name='xgboost-ml-3', tags={}>


In [None]:
# model-version
client = MlflowClient()
result = client.create_model_version(
    name='xgboost-ml-3')

In [None]:
# final aws ec2 main script file

In [30]:
# Hypertunning Random_search 
xgboost_params = {
    'max_depth':range(3,10,2),
    'min_child_weight':range(1,6,2)
}

kf = KFold(n_splits=30)

model_param = {}

randomcv_models = XGBClassifier()

random = RandomizedSearchCV(estimator=randomcv_models,
                                    param_distributions=xgboost_params,
                                   n_iter=100,
                                   cv=kf,
                                   verbose=2, 
                                   n_jobs=-1)
random.fit(Xscale, y)
model_param = random.best_params_
print(model_param)


Fitting 30 folds for each of 12 candidates, totalling 360 fits
{'min_child_weight': 3, 'max_depth': 5}


In [None]:
model_param = {
                'max_depth':9,
                'min_child_weight':8
                }

In [23]:
params = {'min_child_weight': 3, 'max_depth': 5}

best_model = XGBClassifier(**params)
best_model = best_model.fit(X_train,y_train)
y_pred = best_model.predict(X_test)
score = accuracy_score(y_test,y_pred)
cr = classification_report(y_test,y_pred)

print("FINAL MODEL 'XGBoost'")
print ("Accuracy Score value: {:.4f}".format(score))
print (cr)

FINAL MODEL 'XGBoost'
Accuracy Score value: 0.8200
              precision    recall  f1-score   support

           0       0.73      0.61      0.67        59
           1       0.85      0.91      0.88       141

    accuracy                           0.82       200
   macro avg       0.79      0.76      0.77       200
weighted avg       0.81      0.82      0.81       200



In [21]:
import mlflow
from mlflow.tracking import MlflowClient
from mlflow.entities import ViewType

mlflow.set_tracking_uri("http://127.0.0.1:5000")
print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

tracking URI: 'http://127.0.0.1:5000'


In [23]:
mlflow.set_experiment("xgboost-ml-day-1")

xgboost_params = {
    'max_depth':range(3,10,2),
    'min_child_weight':range(1,6,2)
}
kf = KFold(n_splits=30)

with mlflow.start_run():
    mlflow.set_tag("model","XGBClassifier")
#     mlflow.log_param("xgboost_params",xgboost_params)

    randomcv_models = XGBClassifier()

    random = RandomizedSearchCV(estimator=randomcv_models,
                                        param_distributions=xgboost_params,
                                       n_iter=10,
                                       cv=kf,
                                       verbose=1, 
                                       n_jobs=-1)
    random.fit(X_train, y_train)
    
    print(random.best_params_)
    
    mlflow.log_param("params",random.best_params_)
    
    y_pred = random.predict(X_test)
    score = accuracy_score(y_test,y_pred)
    
    mlflow.log_metric("score",score)

    mlflow.sklearn.log_model(random, artifact_path="models")
    print(f"default artifacts URI: '{mlflow.get_artifact_uri()}'")

2024/07/24 14:28:46 INFO mlflow.tracking.fluent: Experiment with name 'xgboost-ml-day-1' does not exist. Creating a new experiment.


Fitting 30 folds for each of 10 candidates, totalling 300 fits
{'min_child_weight': 5, 'max_depth': 7}
default artifacts URI: 'file:///C:/Users/sudwa/Desktop/mlops-credit-card-risk/notebook/artifacts/5/4c1ee4c2854f492d84bcf1bf7ad16801/artifacts'


In [24]:
EXPERIMENT_NAME = "xgboost-ml-day-1"
MODEL_NAME = "random"

client = MlflowClient()
experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
best_run = client.search_runs(
    experiment_ids=experiment.experiment_id,
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=1,
    order_by=["metrics.score asc"],
)[0]

In [26]:
# register the best model
run_id = best_run.info.run_id
model_uri = f"runs:/{run_id}/model"
model_accuracy = round(best_run.data.metrics["score"] * 100)
model_details = mlflow.register_model(model_uri=model_uri, name=MODEL_NAME)
client.update_registered_model(
    name=model_details.name, description=f"Current accuracy: {model_accuracy}%"
    )

Successfully registered model 'random'.
2024/07/24 14:31:21 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: random, version 1
Created version '1' of model 'random'.


<RegisteredModel: aliases={}, creation_timestamp=1721856681431, description='Current accuracy: 78%', last_updated_timestamp=1721856681620, latest_versions=[<ModelVersion: aliases=[], creation_timestamp=1721856681540, current_stage='None', description='', last_updated_timestamp=1721856681540, name='random', run_id='4c1ee4c2854f492d84bcf1bf7ad16801', run_link='', source='file:///C:/Users/sudwa/Desktop/mlops-credit-card-risk/notebook/artifacts/5/4c1ee4c2854f492d84bcf1bf7ad16801/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='1'>], name='random', tags={}>

In [32]:
MODEL_NAME = "random"
client = MlflowClient()
latest_versions = client.get_latest_versions(MODEL_NAME, stages=["None"])
latest_versions

for version in latest_versions:
        model_version = version.version
# Move the registered model to stage
new_stage = "Production"
client.transition_model_version_stage(
    name=MODEL_NAME,
    version=model_version,
    stage=new_stage,
    archive_existing_versions=False,
)
client.update_model_version(
    name=MODEL_NAME,
    version=model_version,
    description=f"The model version {model_version} was transitioned to {new_stage}.",
)

  latest_versions = client.get_latest_versions(MODEL_NAME, stages=["None"])
  client.transition_model_version_stage(


<ModelVersion: aliases=[], creation_timestamp=1721856681540, current_stage='Production', description='The model version 1 was transitioned to Production.', last_updated_timestamp=1721856994322, name='random', run_id='4c1ee4c2854f492d84bcf1bf7ad16801', run_link='', source='file:///C:/Users/sudwa/Desktop/mlops-credit-card-risk/notebook/artifacts/5/4c1ee4c2854f492d84bcf1bf7ad16801/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='1'>