Use this module if the module's .py file doesn't exist.

In [1]:
# MLflow tracking & logging (split out into a separate module)
from mlflow_utils import log_mlflow_run

# None of the inputs below (best_logreg, y_train, logreg_pred, the test_*_log
# metrics, X_test, y_test) are defined in this cell; they must already exist
# in the notebook session before it runs (presumably from earlier training
# cells -- verify), otherwise the call fails with a NameError.
run_id = log_mlflow_run(
    best_logreg=best_logreg,
    y_train=y_train,
    logreg_decision_scores=logreg_pred,
    # Each metric is passed through float() before being handed over.
    test_metrics={
        "test_accuracy": float(test_accuracy_log),
        "test_precision": float(test_precision_log),
        "test_recall": float(test_recall_log),
        "test_f1": float(test_f1_log),
        "test_roc_auc": float(test_roc_auc_log),
    },
    X_test=X_test,
    y_test=y_test,
)
# Message text is Indonesian for "MLflow logging finished".
print(f"MLflow logging selesai. run_id={run_id}")


NameError: name 'best_logreg' is not defined

In [None]:
# Prefect Cloud orchestration
from prefect import flow, task, get_run_logger
from prefect.artifacts import create_markdown_artifact
from datetime import datetime
import os

@task
def load_and_prepare_data(file_path=None):
    """Load the trending-videos JSON dump and build the modelling table.

    Args:
        file_path: Path to the JSON file. The file must hold an object whose
            ``'collector'`` key contains the list of video records. When
            omitted, the original hard-coded location is used, so existing
            zero-argument calls keep working.

    Returns:
        A ``(df, df_final)`` tuple:

        * ``df`` -- the flattened frame obtained after exploding the
          ``hashtags`` column, before any column or row is removed.
        * ``df_final`` -- the reduced frame restricted to rows whose hashtag
          name is one of ``fyp`` / ``foryou`` / ``foryoupage``, with the
          hashtag dummy-encoded (first level dropped) and ``id`` stored as
          a string.

    Raises:
        OSError: if ``file_path`` cannot be opened.
        KeyError: if the ``'collector'`` key or any of the columns dropped
            below is missing from the data.
    """
    import json
    import pandas as pd

    if file_path is None:
        # Historical default. NOTE(review): this is a raw string that also
        # doubles every backslash, so the path really contains "\\" pairs;
        # presumably Windows tolerates the repeated separators -- confirm.
        file_path = r"C:\\Users\\User\\AppData\\Roaming\\Python\\Python313\\site-packages\\ML-Production\\Tiktok-ML\\trending.json"

    with open(file_path, encoding="utf8") as f:
        raw_data = json.load(f)
    trending_videos_list = raw_data['collector']

    df = pd.json_normalize(trending_videos_list)
    # Explode the hashtags column into one row per entry, then round-trip
    # through JSON records so json_normalize can flatten the nested hashtag
    # objects into 'hashtags.*' columns.
    exploded_records = json.loads(df.explode('hashtags').to_json(orient='records'))
    df = pd.json_normalize(exploded_records)

    # Columns removed before the NaN filter below.
    unused_columns = [
        'text', 'webVideoUrl', 'videoUrl', 'videoUrlNoWaterMark',
        'downloaded', 'authorMeta.id',
        'hashtags', 'authorMeta.secUid', 'authorMeta.name', 'authorMeta.nickName',
        'authorMeta.verified', 'authorMeta.signature', 'authorMeta.avatar', 'musicMeta.musicId',
        'musicMeta.musicName', 'musicMeta.musicAuthor', 'musicMeta.musicOriginal', 'musicMeta.playUrl',
        'musicMeta.coverThumb', 'musicMeta.coverMedium', 'musicMeta.coverLarge', 'covers.default',
        'covers.origin', 'covers.dynamic', 'videoMeta.height', 'videoMeta.width', 'videoMeta.duration',
    ]
    df_new = df.drop(unused_columns, axis=1)

    # Order matters: rows are filtered on NaN while 'hashtags.title',
    # 'hashtags.cover', 'hashtags.id' and 'mentions' are still present, and
    # only afterwards are those columns removed.
    df_new = df_new.dropna()
    df_new = df_new.drop(['hashtags.title', 'hashtags.cover', 'hashtags.id', 'mentions'], axis=1)
    df_new['id'] = pd.to_numeric(df_new['id'])

    # Keep only rows tagged with one of the selected hashtags.
    chosen_hashtags = ['fyp', 'foryou', 'foryoupage']
    df_new = df_new.rename(columns={'hashtags.name': 'hashtags'})
    df_new = df_new[df_new['hashtags'].isin(chosen_hashtags)]

    df_encoded = pd.get_dummies(df_new['hashtags'], prefix='hashtag', drop_first=True)
    df_final = pd.concat([df_new.drop('hashtags', axis=1), df_encoded], axis=1)
    df_final['id'] = df_final['id'].astype(str)
    return df, df_final

@task
def train_and_log_model(df, df_final, viral_threshold=800000):
    """Train a grid-searched logistic regression and log it to MLflow.

    Args:
        df: Frame providing the ``playCount`` column used to derive the
            binary ``viral`` label. The label is assigned onto a copy of
            ``df_final``, so this relies on ``df`` and ``df_final`` sharing
            index labels.
        df_final: Feature table. It must contain ``id`` and ``playCount``
            (both are excluded from the features); every remaining column is
            cast to ``int`` and used as a feature.
        viral_threshold: A row is labelled viral (1) when its ``playCount``
            is strictly greater than this value. Defaults to 800000.

    Returns:
        Accuracy of the best estimator on the 20% hold-out split, as a float.
    """
    from datetime import timezone
    from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_predict
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import roc_auc_score
    import mlflow
    import mlflow.sklearn

    # Work on a copy so the caller's frame does not gain a 'viral' column.
    df_final = df_final.copy()
    df_final['viral'] = (df['playCount'] > viral_threshold).astype(int)
    X = df_final.drop(['id', 'playCount', 'viral'], axis=1).astype(int)
    y = df_final['viral']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    log_reg_params = {'penalty': ['l1', 'l2'], 'C': [0.01, 0.1, 1, 10], 'solver': ['liblinear'], 'max_iter': [5000]}
    grid_log_reg = GridSearchCV(LogisticRegression(random_state=42), log_reg_params, verbose=0)
    grid_log_reg.fit(X_train, y_train)
    best_logreg = grid_log_reg.best_estimator_

    # Cross-validated decision scores on the training split, used for the
    # train_roc_auc_cv metric logged below.
    logreg_pred = cross_val_predict(best_logreg, X_train, y_train, cv=5, method='decision_function')

    # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
    # timestamp instead and render its "+00:00" offset as the "Z" suffix so
    # the run name keeps its original shape.
    run_timestamp = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

    mlflow.set_experiment("tiktok-ml-best-model")
    with mlflow.start_run(run_name=f"prefect_logreg_{run_timestamp}"):
        mlflow.log_param("model", "LogisticRegression")
        mlflow.log_params(best_logreg.get_params())
        mlflow.log_metric("train_roc_auc_cv", float(roc_auc_score(y_train, logreg_pred)))
        test_accuracy = float((best_logreg.predict(X_test) == y_test).mean())
        mlflow.log_metric("test_accuracy", test_accuracy)
        mlflow.sklearn.log_model(best_logreg, artifact_path="model", registered_model_name="tiktok_logreg")
    return test_accuracy

@task
def report_metrics(test_accuracy: float):
    """Log the test accuracy and publish it as a Prefect markdown artifact.

    Args:
        test_accuracy: Accuracy value to report; rendered with four decimals.
    """
    accuracy_line = f"Test accuracy: {test_accuracy:.4f}"
    get_run_logger().info(accuracy_line)

    # The same formatted line is reused as the body of the artifact.
    report_body = "## Tiktok Logistic Regression\n\n" + accuracy_line
    create_markdown_artifact(
        markdown=report_body,
        key="model_report",
        # Runtime text, Indonesian for "Summary of training results".
        description="Ringkasan hasil training",
    )

@flow(name="tiktok-ml-best-model-flow")
def tiktok_flow():
    """Run the pipeline end to end: prepare data, train and log the model, report accuracy."""
    raw_frame, model_frame = load_and_prepare_data()
    report_metrics(train_and_log_model(raw_frame, model_frame))

# To run locally:
# tiktok_flow()

# Notice printed when this cell is executed. The message is Indonesian for
# "Prefect flow defined. For Prefect Cloud, register the project & schedule
# this flow."
print("Prefect flow didefinisikan. Untuk Prefect Cloud, daftarkan proyek & jadwalkan flow ini.")
