In [1]:
# !pip install mlflow

In [2]:
# !pip install xgboost

In [3]:
import pandas as pd
import mlflow
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, classification_report,ConfusionMatrixDisplay, \
                            precision_score, recall_score, f1_score, roc_auc_score, \
                            roc_curve,confusion_matrix

In [4]:
data = pd.read_csv('process.csv', index_col=0)
data.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,1,2,1,6,142,4,3,3,3,2,...,3,48,1,1,1,1,0,1,1,1
1,0,29,3,6,770,2,0,1,0,2,...,3,3,1,1,0,1,0,0,1,0
2,3,8,1,2,390,2,1,1,3,2,...,3,30,1,1,0,3,1,0,1,1
3,1,26,3,3,848,2,1,1,3,1,...,1,26,1,0,0,1,1,0,1,1
4,1,17,2,4,734,2,0,2,3,2,...,2,34,1,0,1,1,1,0,1,0


In [5]:
# Independent and dependent data
x = data[data.columns.difference(['class'])]
y = data['class']

In [6]:
# Scaler data
scaler =  StandardScaler()
Xscale  = scaler.fit_transform(x)
Xscale

array([[ 2.78850999, -0.4645937 , -1.15669474, ...,  0.62648137,
         1.04698668,  1.66990103],
       [-1.19773321, -1.26286467,  1.21764107, ...,  0.62648137,
        -0.76597727, -0.13053135],
       [ 1.19401271,  1.13194823, -0.21905894, ..., -1.02650377,
         0.14050471, -0.13053135],
       ...,
       [ 0.21959771,  1.13194823, -1.43269237, ...,  0.62648137,
         1.04698668, -0.13053135],
       [-1.10915003, -0.4645937 , -0.43834473, ...,  0.62648137,
         1.04698668, -0.13053135],
       [-0.7548173 , -1.26286467,  0.99457449, ...,  1.86622023,
         1.04698668, -1.93096373]])

In [7]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(Xscale, y, test_size = 0.20, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((800, 20), (200, 20), (800,), (200,))

In [8]:
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("xgboost-ml")

<Experiment: artifact_location='file:///C:/Users/sudwa/Desktop/mlops-credit-card-risk/notebook/mlruns/1', creation_time=1721066323794, experiment_id='1', last_update_time=1721066323794, lifecycle_stage='active', name='xgboost-ml', tags={}>

In [9]:
# first run

xgboost_params = {
    'max_depth':range(3,10,2),
    'min_child_weight':range(1,6,2)
}
kf = KFold(n_splits=30)
           
with mlflow.start_run():
    mlflow.set_tag("model","XGBClassifier")
    mlflow.log_param("xgboost_params",xgboost_params)

    randomcv_models = XGBClassifier()

    random = RandomizedSearchCV(estimator=randomcv_models,
                                        param_distributions=xgboost_params,
                                       n_iter=100,
                                       cv=kf,
                                       verbose=2, 
                                       n_jobs=-1)
    random.fit(X_train, y_train)
    print(random.best_params_)
    
    y_pred = random.predict(X_test)

    mlflow.sklearn.log_model(random, artifact_path="models")

2024/07/15 12:54:19 INFO mlflow.tracking.fluent: Experiment with name 'my-experiment-1' does not exist. Creating a new experiment.


Fitting 30 folds for each of 12 candidates, totalling 360 fits
{'min_child_weight': 5, 'max_depth': 7}




In [10]:
# first search
mlflow.search_experiments()

[<Experiment: artifact_location='file:///C:/Users/sudwa/Desktop/mlops-credit-card-risk/notebook/mlruns/2', creation_time=1721073259866, experiment_id='2', last_update_time=1721073259866, lifecycle_stage='active', name='my-experiment-1', tags={}>,
 <Experiment: artifact_location='file:///C:/Users/sudwa/Desktop/mlops-credit-card-risk/notebook/mlruns/1', creation_time=1721066323794, experiment_id='1', last_update_time=1721066323794, lifecycle_stage='active', name='xgboost-ml', tags={}>,
 <Experiment: artifact_location='file:///C:/Users/sudwa/Desktop/mlops-credit-card-risk/notebook/mlruns/0', creation_time=1721066323762, experiment_id='0', last_update_time=1721066323762, lifecycle_stage='active', name='Default', tags={}>]

In [11]:
import mlflow

mlflow.set_tracking_uri("http://127.0.0.1:5000")
print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

tracking URI: 'http://127.0.0.1:5000'


In [12]:
#second run
xgboost_params = {
    'max_depth':range(3,10,2),
    'min_child_weight':range(1,6,2)
}
kf = KFold(n_splits=30)
           
mlflow.set_experiment("my-experiment-2")
with mlflow.start_run():
    mlflow.set_tag("model","XGBClassifier")
    mlflow.log_param("xgboost_params",xgboost_params)

    randomcv_models = XGBClassifier()

    random = RandomizedSearchCV(estimator=randomcv_models,
                                        param_distributions=xgboost_params,
                                       n_iter=100,
                                       cv=kf,
                                       verbose=2, 
                                       n_jobs=-1)
    random.fit(X_train, y_train)
    print(random.best_params_)
    
    y_pred = random.predict(X_test)

    mlflow.sklearn.log_model(random, artifact_path="models")
    print(f"default artifacts URI: '{mlflow.get_artifact_uri()}'")

Fitting 30 folds for each of 12 candidates, totalling 360 fits
{'min_child_weight': 5, 'max_depth': 7}
default artifacts URI: 'file:///C:/Users/sudwa/Desktop/mlops-credit-card-risk/mlruns/3/c9fe7910bd85438b8cb8688214e6c456/artifacts'


In [13]:
#third run
xgboost_params = {
    'max_depth':range(3,10,2),
    'min_child_weight':range(1,6,2)
}
kf = KFold(n_splits=30)
           
mlflow.set_experiment("my-experiment-3")
with mlflow.start_run():
    mlflow.set_tag("model","XGBClassifier")
    mlflow.log_param("xgboost_params",xgboost_params)

    randomcv_models = XGBClassifier()

    random = RandomizedSearchCV(estimator=randomcv_models,
                                        param_distributions=xgboost_params,
                                       n_iter=100,
                                       cv=kf,
                                       verbose=2, 
                                       n_jobs=-1)
    random.fit(X_train, y_train)
    print(random.best_params_)
    
    y_pred = random.predict(X_test)

    mlflow.sklearn.log_model(random, artifact_path="models")
    print(f"default artifacts URI: '{mlflow.get_artifact_uri()}'")

Fitting 30 folds for each of 12 candidates, totalling 360 fits
{'min_child_weight': 5, 'max_depth': 7}
default artifacts URI: 'file:///C:/Users/sudwa/Desktop/mlops-credit-card-risk/mlruns/4/b30a3d95a2d24bbf8f1e63000fd39cd8/artifacts'


In [14]:
# second and third search
mlflow.search_experiments()

[<Experiment: artifact_location='file:///C:/Users/sudwa/Desktop/mlops-credit-card-risk/mlruns/4', creation_time=1721072751873, experiment_id='4', last_update_time=1721072751873, lifecycle_stage='active', name='my-experiment-v3', tags={}>,
 <Experiment: artifact_location='file:///C:/Users/sudwa/Desktop/mlops-credit-card-risk/mlruns/3', creation_time=1721072687219, experiment_id='3', last_update_time=1721072687219, lifecycle_stage='active', name='my-experiment-2', tags={}>,
 <Experiment: artifact_location='file:///C:/Users/sudwa/Desktop/mlops-credit-card-risk/mlruns/2', creation_time=1721072626132, experiment_id='2', last_update_time=1721072626132, lifecycle_stage='active', name='my-experiment-1', tags={}>,
 <Experiment: artifact_location='file:///C:/Users/sudwa/Desktop/mlops-credit-card-risk/mlruns/1', creation_time=1721069151515, experiment_id='1', last_update_time=1721069151515, lifecycle_stage='active', name='my-experiment-v1', tags={}>,
 <Experiment: artifact_location='file:///C:/Us

In [15]:
# interacting model registry

In [16]:
# way 1
from mlflow.tracking import MlflowClient

client = MlflowClient("http://127.0.0.1:5000")

id = client.create_registered_model(name='my-experiment-v3')
print(id)

RestException: RESOURCE_ALREADY_EXISTS: Registered Model (name=my-experiment-v3) already exists. Error: (sqlite3.IntegrityError) UNIQUE constraint failed: registered_models.name
[SQL: INSERT INTO registered_models (name, creation_time, last_updated_time, description) VALUES (?, ?, ?, ?)]
[parameters: ('my-experiment-v3', 1721073453766, 1721073453766, '')]
(Background on this error at: https://sqlalche.me/e/14/gkpj)

In [17]:
# way 2
run_id=client.get_experiment(experiment_id='0')

print(run_id)
mlflow.register_model(
    model_uri=f"runs:/{run_id}/models",
    name='xgboost-classifiers'
)


<Experiment: artifact_location='file:///C:/Users/sudwa/Desktop/mlops-credit-card-risk/mlruns/0', creation_time=1721066118524, experiment_id='0', last_update_time=1721066118524, lifecycle_stage='active', name='Default', tags={}>


Registered model 'xgboost-classifiers' already exists. Creating a new version of this model...


RestException: RESOURCE_DOES_NOT_EXIST: Run with id=<Experiment: artifact_location='file: not found

In [None]:
# final aws ec2 main script file

In [None]:
# Hypertunning Random_search 
xgboost_params = {
    'max_depth':range(3,10,2),
    'min_child_weight':range(1,6,2)
}

kf = KFold(n_splits=30)

model_param = {}

randomcv_models = XGBClassifier()

random = RandomizedSearchCV(estimator=randomcv_models,
                                    param_distributions=xgboost_params,
                                   n_iter=100,
                                   cv=kf,
                                   verbose=2, 
                                   n_jobs=-1)
random.fit(Xscale, y)
model_param = random.best_params_
print(model_param)


In [None]:
# Evaluate model
best_models = {
    "XGBClassifier": XGBClassifier(**model_param,n_jobs=-1),

}


In [None]:
best_model = XGBClassifier(**model_param)
best_model = best_model.fit(X_train,y_train)
y_pred = best_model.predict(X_test)
score = accuracy_score(y_test,y_pred)
cr = classification_report(y_test,y_pred)

print("FINAL MODEL 'XGBoost'")
print ("Accuracy Score value: {:.4f}".format(score))
print (cr)