In [1]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-2.12.1-py3-none-any.whl.metadata (29 kB)
Collecting Flask<4 (from mlflow)
  Downloading flask-3.0.3-py3-none-any.whl.metadata (3.2 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.0.0-py3-none-any.whl.metadata (3.5 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.3-py2.py3-none-any.whl.metadata (7.7 kB)
Collecting markdown<4,>=3.3 (from mlflow)
  Downloading Markdown-3.6-py3-none-any.whl.metadata (7.0 kB)
Collecting querystring-parser<2 (from mlflow)
  Downloading querystring_parser-1.2.4-py2.py3-none-any.whl.metadata (559 bytes)
Collecting gunicorn<22 (from mlflow)
  Downloading gunicorn-21.2.0-py3-none-any.whl.metadata (4.1 kB)
Collecting Werkzeug>=3.0.0 (from Flask<4->mlflow)
  Downloading werkzeug-3.0.3-py3-none-any.whl.metadata (3.7 kB)
Collecting itsdangerous>=2.1.2 (from Flask<4->mlflow)
  Downloading itsdangerous-2.2.0-py3-none-any.whl.metadata (1.9 kB)
Collecting graphql-core<3.3,>=3.1 (from graphene<4-

In [2]:
import os
from urllib.parse import quote_plus

import mlflow
import mlflow.sklearn
import pandas as pd
import psycopg2
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine

In [3]:
# Address of the MLflow tracking server used by this notebook.
TRACKING_SERVER_URI = "http://mlflow:5000"

# Route all subsequent MLflow tracking calls to that server.
mlflow.set_tracking_uri(TRACKING_SERVER_URI)

In [4]:
# PostgreSQL database connection information.
# Every setting can be overridden with the matching libpq-style environment
# variable (PGHOST, PGPORT, PGDATABASE, PGUSER, PGPASSWORD), so real
# credentials do not have to be written into the notebook. The fallbacks are
# the notebook's original local-development values.
host = os.environ.get('PGHOST', '172.25.16.1')
port = os.environ.get('PGPORT', '5432')
dbname = os.environ.get('PGDATABASE', 'dsi324_db')
user = os.environ.get('PGUSER', 'postgres')
# FIXME(security): development-only fallback password. Set PGPASSWORD instead
# of relying on this value anywhere other than a throwaway local database.
pwd = os.environ.get('PGPASSWORD', '1234')

In [5]:
# Create a SQLAlchemy engine.
# The user name and password are percent-encoded so that characters such as
# '@', ':' or '/' in a credential cannot corrupt the connection URL.
engine = create_engine(
    f'postgresql://{quote_plus(user)}:{quote_plus(pwd)}@{host}:{port}/{dbname}'
)

# Query to fetch data from the database
query = "SELECT * FROM student_education_history"

try:
    # Load the whole table into a DataFrame
    student_education_history = pd.read_sql(query, engine)
finally:
    # Release the engine's connections even when the query fails
    engine.dispose()

In [6]:
# Train a random forest on the student education history and record the
# parameters and accuracy of the run in MLflow.
TEST_SIZE = 0.2     # fraction of rows held out for evaluation
RANDOM_STATE = 42   # seeds both the split and the forest so the accuracy is reproducible

# Start an MLflow run. The context manager ends the run when the block exits
# (also on an exception), so no explicit mlflow.end_run() is needed afterwards.
with mlflow.start_run(run_name="my_experiment") as run:

    # Split data into features (X) and target (y)
    X = student_education_history.drop(columns=['student_id', 'bachelor_university_id'])
    y = student_education_history['bachelor_university_id']

    # Split data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )

    # Train a model. The forest is seeded as well: without it the logged
    # "random_state" parameter would not be enough to reproduce the accuracy.
    model = RandomForestClassifier(random_state=RANDOM_STATE)
    model.fit(X_train, y_train)

    # Predict and calculate accuracy
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)

    # Log parameters and metrics. The logged values come from the same
    # constants that were used above, so they cannot drift apart.
    # NOTE(review): the fitted model itself is not logged; add
    # mlflow.sklearn.log_model(model, "model") here if the artifact store is
    # reachable from this kernel.
    mlflow.log_param("test_size", TEST_SIZE)
    mlflow.log_param("random_state", RANDOM_STATE)
    mlflow.log_metric("accuracy", accuracy)

    # Print run information
    print("MLflow Run completed.")
    print("MLflow Run ID:", run.info.run_id)

MLflow Run completed.
MLflow Run ID: e49587795ed146d8be1a942720894d3a


In [7]:
# NOTE(review): this whole cell is commented out and does nothing when run.
# It is a disabled variant of the training run that uses scikit-learn's
# bundled Iris dataset (datasets.load_iris) instead of the PostgreSQL table.
# # Start an MLflow experiment
# with mlflow.start_run(run_name="my_experiment"):

#     # Load dataset
#     iris = datasets.load_iris()
#     X = iris.data
#     y = iris.target

#     # Split data into training and test sets
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#     # Train a model
#     model = RandomForestClassifier()
#     model.fit(X_train, y_train)

#     # Predict and calculate accuracy
#     predictions = model.predict(X_test)
#     accuracy = accuracy_score(y_test, predictions)

#     # Log parameters, metrics, and model
#     mlflow.log_param("test_size", 0.2)
#     mlflow.log_param("random_state", 42)
#     mlflow.log_metric("accuracy", accuracy)

#     # Print run information
#     print("MLflow Run completed.")
#     print("MLflow Run ID:", mlflow.active_run().info.run_id)

# mlflow.end_run()