In [None]:
import os
import shutil
import mlflow
from glob import glob
from tqdm import tqdm
import polars as pl

from dotenv import load_dotenv
load_dotenv("../.env")

## A. Delete local model parameters of runs that were deleted in MLflow

In [None]:
# Collect the run IDs that have a checkpoint directory on local disk.
# The name of each directory directly under ../checkpoint/ is used as the run ID.

localPaths = glob('../checkpoint/*')

# os.path.basename copes with whichever separator glob returns, whereas splitting
# on '/' alone yields "checkpoint\<run_id>" on Windows-style paths.
# Plain files are skipped because only directories can hold a run's checkpoints.
localRunIds = {os.path.basename(path) for path in localPaths if os.path.isdir(path)}

print( len(localRunIds) )

In [None]:
# Collect the IDs of every run that still exists (is not deleted) in MLflow.

# Use the documented ViewType enum instead of the string "ACTIVE_ONLY" / the bare
# integer 1, so the filter does not depend on how a tracking backend coerces it.
activeOnly = mlflow.entities.ViewType.ACTIVE_ONLY

activeExperimentIds = [experiment.experiment_id for experiment in mlflow.search_experiments(view_type=activeOnly)]
print(activeExperimentIds)

# search_runs() falls back to the currently active experiment when it is given an
# empty ID list. That would make almost every local checkpoint look orphaned, so
# stop here instead of handing a misleading result to the delete step.
if not activeExperimentIds:
    raise RuntimeError("No active MLflow experiments found; check the tracking URI before deleting checkpoints.")

activeRuns = mlflow.search_runs(experiment_ids=activeExperimentIds, run_view_type=activeOnly)
activeRunIds = set(activeRuns["run_id"])
print( len(activeRunIds) )

In [None]:
# Delete the local checkpoint directory of every run that no longer exists in MLflow.
removedRunIds = localRunIds - activeRunIds
print( len(removedRunIds) )

for runId in tqdm(removedRunIds):

    path = f'../checkpoint/{runId}'

    # shutil.rmtree raises on anything that is not a real directory (plain file,
    # symlink, already-removed path), which would abort the clean-up half-way.
    if not os.path.isdir(path) or os.path.islink(path):
        continue

    shutil.rmtree(path)

## B. Delete unnecessary model parameters in Train

In [None]:
# Obtain the run_id of every FINISHED run in the active "-TRAIN" experiments of MLflow
experiment_ids = [
    experiment.experiment_id
    for experiment in mlflow.search_experiments( view_type=mlflow.entities.ViewType.ACTIVE_ONLY )
    if experiment.name.endswith("-TRAIN")
]

if experiment_ids:
    df_mlflow = pl.from_pandas(mlflow.search_runs( experiment_ids=experiment_ids ))
    # A set gives constant-time membership tests in the loop below.
    trainRunIds = set(df_mlflow.filter( pl.col("status") == "FINISHED" ).get_column("run_id").to_list())
else:
    # search_runs() falls back to the currently active experiment when given an
    # empty ID list, so skip the query rather than prune unrelated runs.
    trainRunIds = set()
print(len(trainRunIds))

# Get the local checkpoint directory of every run
runDirPathList = list(glob('../checkpoint/*'))
print(len(runDirPathList))

for runDirPath in tqdm(runDirPathList):

    # Parse Run ID (the directory name); basename handles any path separator
    runId = os.path.basename(runDirPath)

    # Continue if the run_id is not a finished train run
    if runId not in trainRunIds:
        continue

    # Only sub-directories with an integer name are treated as checkpoints.
    # Other entries are left untouched instead of crashing the int() sort key.
    checkpointDirPathList = sorted(
        (
            path
            for path in glob(f'../checkpoint/{runId}/*')
            if os.path.isdir(path) and os.path.basename(path).isdecimal()
        ),
        key=lambda path: int(os.path.basename(path)),
    )

    # Delete Unused Model Parameters: keep the lowest-numbered checkpoint only.
    # NOTE(review): the test section keeps the highest-numbered one instead;
    # confirm this asymmetry is intended.
    for path in checkpointDirPathList[1:]:
        shutil.rmtree(path)

## C. Delete unnecessary model parameters in Test

In [None]:
# Obtain the run_id of every FINISHED run in the active "-TEST" experiments of MLflow
experiment_ids = [
    experiment.experiment_id
    for experiment in mlflow.search_experiments( view_type=mlflow.entities.ViewType.ACTIVE_ONLY )
    if experiment.name.endswith("-TEST")
]

if experiment_ids:
    df_mlflow = pl.from_pandas(mlflow.search_runs( experiment_ids=experiment_ids ))
    # A set gives constant-time membership tests in the loop below.
    testRunIds = set(df_mlflow.filter( pl.col("status") == "FINISHED" ).get_column("run_id").to_list())
else:
    # search_runs() falls back to the currently active experiment when given an
    # empty ID list, so skip the query rather than prune unrelated runs.
    testRunIds = set()
print(len(testRunIds))

# Get the local checkpoint directory of every run
runDirPathList = list(glob('../checkpoint/*'))
print(len(runDirPathList))

for runDirPath in tqdm(runDirPathList):

    # Parse Run ID (the directory name); basename handles any path separator
    runId = os.path.basename(runDirPath)

    # Continue if the run_id is not a finished test run
    if runId not in testRunIds:
        continue

    # Only sub-directories with an integer name are treated as checkpoints.
    # Other entries are left untouched instead of crashing the int() sort key.
    checkpointDirPathList = sorted(
        (
            path
            for path in glob(f'../checkpoint/{runId}/*')
            if os.path.isdir(path) and os.path.basename(path).isdecimal()
        ),
        key=lambda path: int(os.path.basename(path)),
    )

    # Delete Unused Model Parameters: keep the highest-numbered checkpoint only.
    # NOTE(review): the train section keeps the lowest-numbered one instead;
    # confirm this asymmetry is intended.
    for path in checkpointDirPathList[:-1]:
        shutil.rmtree(path)