In [None]:
from sklearn.datasets import fetch_rcv1
import mlflow, datetime, os, pickle, random, sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from joblib import dump

# Load or create dataset
#
# The RCV1 feature matrix and label matrix are cached as pickles under
# ../data, so fetch_rcv1() is only called when either cache file is missing.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load cache files that this script wrote itself.
data_path = '../data/data.pickle'
target_path = '../data/target.pickle'
os.makedirs('../data', exist_ok=True)

if os.path.exists(data_path) and os.path.exists(target_path):
    # Context managers close the cache files deterministically instead of
    # leaving the handles open until garbage collection.
    with open(data_path, 'rb') as data_file:
        X = pickle.load(data_file)
    with open(target_path, 'rb') as target_file:
        targets = pickle.load(target_file)
else:
    rcv1 = fetch_rcv1()
    X, targets = rcv1.data, rcv1.target
    # Closing the files on exit from the `with` blocks guarantees the pickles
    # are fully flushed to disk before the script continues.
    with open(data_path, 'wb') as data_file:
        pickle.dump(X, data_file)
    with open(target_path, 'wb') as target_file:
        pickle.dump(targets, target_file)

# Use a single label column as the target; random.randint is inclusive on both
# ends, so one of columns 0..3 is chosen.
# NOTE(review): `random` is not seeded anywhere in this script as shown, so the
# chosen label column can differ between runs - confirm this is intended.
y = targets.toarray()[:, random.randint(0, 3)]

# MLflow bookkeeping: set the tracking URI to ./mlruns and create a fresh
# experiment for this execution, named "<dataset name>_<yymmdd_HHMMSS>" using
# the current local time.
mlflow.set_tracking_uri("./mlruns")

dataset_name = "Reuters Corpus Volume"
current_time = f"{datetime.datetime.now():%y%m%d_%H%M%S}"
experiment_name = "_".join((dataset_name, current_time))
experiment_id = mlflow.create_experiment(experiment_name)

# Train and evaluate a logistic-regression model inside one MLflow run, then
# persist the fitted model with joblib.
with mlflow.start_run(experiment_id=experiment_id, run_name=dataset_name):
    # Record which dataset was used and its dimensions as run parameters.
    params = dict(
        dataset_name=dataset_name,
        num_datapoints=X.shape[0],
        num_dimensions=X.shape[1],
    )
    mlflow.log_params(params)

    # NOTE(review): test_size=0.9 holds out 90% of the rows, so the model is
    # fitted on only 10% of the data - presumably to keep this test run fast;
    # confirm before using the metrics for anything else.
    split = train_test_split(X, y, test_size=0.9, random_state=42)
    X_train, X_test, y_train, y_test = split

    model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

    # Score the held-out rows and log both metrics in one call.
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    mlflow.log_metrics({'Accuracy': accuracy, 'F1_Score': f1})

    # Save the fitted model next to the project, keyed by the experiment id.
    os.makedirs('../model', exist_ok=True)
    model_path = f'../model/{experiment_id}_lr_model.joblib'
    dump(model, model_path)

# Final status message, printed after the MLflow run block has exited.
# The check mark is U+2705; it had been stored as mojibake (its UTF-8 bytes
# decoded as cp1252).
print("✅ Test run complete. Model trained and saved.")
