In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score

import mlflow

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Select "Baseline-Models" as the active MLflow experiment; the runs started
# in the cells below are recorded under it.
mlflow.set_experiment("Baseline-Models") 

<Experiment: artifact_location='file:///c:/Users/jneed/Documents/projets/vacances/mlruns/788223573085369621', creation_time=1700001940422, experiment_id='788223573085369621', last_update_time=1700001940422, lifecycle_stage='active', name='Baseline-Models', tags={}>

In [3]:
# Load the embeddings table from a CSV located in the notebook's working directory.
# NOTE(review): judging by the preview output, columns 0..511 look like embedding
# dimensions and `target` like a 0/1 label — confirm against the data source.
df = pd.read_csv('celeb_embeddings.csv')
# Preview three randomly chosen rows (no random_state is given, so the rows
# shown differ between runs).
df.sample(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,503,504,505,506,507,508,509,510,511,target
69,-0.0323,0.04332,-0.006428,-0.04318,0.014835,-0.014165,0.0107,-0.028815,-0.021448,0.015811,...,0.001917,0.011886,-0.052931,-0.047543,0.048936,-0.010441,-0.003664,0.040583,-0.033006,1
605,0.053987,0.044439,0.05172,0.022096,0.103793,0.001721,-0.056659,-0.041845,-0.004872,-0.094555,...,-0.046188,-0.033324,-0.038872,0.053364,-0.050251,0.004074,0.002286,0.065672,-0.012583,0
606,0.112884,-0.027598,-0.054691,0.026067,-0.02563,-0.0245,0.020429,0.079027,-0.067067,-0.020187,...,0.03356,0.012251,0.048048,-0.018794,0.026594,-0.024508,-0.035637,-0.081886,-0.073478,0


In [4]:
# Labels come from the 'target' column; every remaining column is a feature
y = df['target']
X = df.drop(columns='target')

# Hold out 10% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [5]:
def gererate_model_metrics(y_true, y_pred):
    """Score predicted labels against ground-truth labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels, aligned with ``y_true``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1, cm)`` where ``cm`` is the result
        of ``confusion_matrix(y_true, y_pred)``. Precision, recall and F1 are
        computed with ``average='binary'``, i.e. a two-class target is assumed.
    """
    # Precision, recall and F1 all share the same binary averaging mode.
    precision, recall, f1 = (
        scorer(y_true, y_pred, average='binary')
        for scorer in (precision_score, recall_score, f1_score)
    )

    return (
        accuracy_score(y_true, y_pred),
        precision,
        recall,
        f1,
        confusion_matrix(y_true, y_pred),
    )

In [6]:
# Fit a support-vector classifier and record its test-set scores in MLflow
mlflow.sklearn.autolog()

with mlflow.start_run(run_name='svc_clf_default'):
    clf = SVC(C=1.0, kernel='rbf', gamma='scale', random_state=42)
    clf.fit(X_train, y_train)

    # Evaluate on the held-out split
    y_pred = clf.predict(X_test)
    accuracy, precision, recall, f1, cm = gererate_model_metrics(y_test, y_pred)

    # One log_metric call per score, each under its own metric key
    for metric_name, metric_value in (
        ("accuracy", accuracy),
        ("precision", precision),
        ("recall", recall),
        ("f1", f1),
    ):
        mlflow.log_metric(metric_name, metric_value)

In [7]:
# Train a DecisionTree classifier and track its performance in MLflow
mlflow.sklearn.autolog()

with mlflow.start_run(run_name='dtc_clf_default'):
    # Seed the tree with the same value used for the train/test split so that
    # re-running the notebook reproduces the logged scores.
    clf = DecisionTreeClassifier(
        criterion='gini',
        splitter='best',
        max_depth=None,
        random_state=42,
    )
    clf.fit(X_train, y_train)

    # Evaluate on the held-out split
    y_pred = clf.predict(X_test)
    accuracy, precision, recall, f1, cm = gererate_model_metrics(y_test, y_pred)

    # Log the test-set scores in a single batched call
    mlflow.log_metrics({
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    })

In [8]:
# Train a RandomForest classifier and track its performance in MLflow
mlflow.sklearn.autolog()

with mlflow.start_run(run_name='rf_clf_default'):
    # Bootstrap sampling and feature sub-sampling are random; seed the forest
    # with the same value used for the train/test split so that re-running the
    # notebook reproduces the logged scores.
    clf = RandomForestClassifier(
        n_estimators=100,
        criterion='gini',
        max_depth=None,
        random_state=42,
    )
    clf.fit(X_train, y_train)

    # Evaluate on the held-out split
    y_pred = clf.predict(X_test)
    accuracy, precision, recall, f1, cm = gererate_model_metrics(y_test, y_pred)

    # Log the test-set scores in a single batched call
    mlflow.log_metrics({
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    })

In [9]:
# Train a MultiLayer Perceptron classifier and track its performance in MLflow
mlflow.sklearn.autolog()

with mlflow.start_run(run_name='mlp_clf_default'):
    # Weight initialisation and batch shuffling are random; seed the network
    # with the same value used for the train/test split so that re-running the
    # notebook reproduces the logged scores.
    clf = MLPClassifier(
        hidden_layer_sizes=(100,),
        activation='relu',
        solver='adam',
        random_state=42,
    )
    clf.fit(X_train, y_train)

    # Evaluate on the held-out split
    y_pred = clf.predict(X_test)
    accuracy, precision, recall, f1, cm = gererate_model_metrics(y_test, y_pred)

    # Log the test-set scores in a single batched call
    mlflow.log_metrics({
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    })

In [10]:
# Train an AdaBoost classifier and track its performance in MLflow
mlflow.sklearn.autolog()

with mlflow.start_run(run_name='ada_clf_default'):
    # Seed the ensemble with the same value used for the train/test split so
    # that re-running the notebook reproduces the logged scores.
    clf = AdaBoostClassifier(
        estimator=DecisionTreeClassifier(max_depth=1),  # depth-1 trees as weak learners
        n_estimators=50,
        learning_rate=1.0,
        random_state=42,
    )
    clf.fit(X_train, y_train)

    # Evaluate on the held-out split
    y_pred = clf.predict(X_test)
    accuracy, precision, recall, f1, cm = gererate_model_metrics(y_test, y_pred)

    # Log the test-set scores in a single batched call
    mlflow.log_metrics({
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    })

In [11]:
# Expose the local MLflow UI through an ngrok tunnel.
# Requires the `pyngrok` package (pip install pyngrok).
# NOTE(review): the printed URL is an external ngrok address; presumably the
# MLflow UI behind it has no authentication — confirm before sharing the link.

from pyngrok import ngrok
# Terminate any tunnels left open by a previous run
ngrok.kill()

# Open an HTTPS tunnel to http://localhost:5000
# (assumes `mlflow ui` is already serving on port 5000 — TODO confirm)
ngrok_tunnel = ngrok.connect(addr="5000", proto="http", bind_tls=True)
print("MLflow Tracking UI:", ngrok_tunnel)

t=2023-11-14T23:57:09+0100 lvl=warn msg="ngrok config file found at legacy location, move to XDG location" xdg_path=C:\\Users\\jneed\\AppData\\Local/ngrok/ngrok.yml legacy_path=C:\\Users\\jneed\\.ngrok2\\ngrok.yml


MLflow Tracking UI: NgrokTunnel: "https://d385-2a01-cb08-995a-8200-2185-f7ae-8e48-914a.ngrok.io" -> "http://localhost:5000"


In [12]:
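# Uncomment to start the MLflow tracking UI from the notebook
# (or run `mlflow ui` in a terminal instead):
# !mlflow ui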