In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
from pycaret.classification import *
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
# mlflow.set_experiment("Baseline-Models") 

In [3]:
df = pd.read_csv('celeb_embeddings.csv')
df.sample(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,503,504,505,506,507,508,509,510,511,target
450,0.038142,0.005211,-0.005983,-0.05867,0.089513,-0.014522,0.0528,-0.014017,-0.033721,-0.011794,...,-0.03582,-0.016927,0.037001,0.014554,-0.001237,-0.027081,-0.068729,0.023298,-0.018545,1
498,0.051143,0.065513,0.029455,-0.067144,0.04784,-0.036553,-0.001459,0.045836,0.009363,0.029734,...,0.022716,-0.051894,-0.092621,0.0074,-0.000421,0.010811,-0.018366,-0.021066,0.012098,1
298,0.013168,-0.001504,-0.094584,0.041153,0.066051,-0.056818,0.049747,0.130332,-0.011523,0.029514,...,-0.013877,0.00281,0.008651,0.039777,0.042756,-0.009049,0.022813,0.012335,-0.001681,1


In [4]:
# Separate the features and the target
X = df.drop('target', axis=1)
y = df['target']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [5]:
def generate_model_metrics(y_true, y_pred):

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='binary')  # Assuming binary classification
    recall = recall_score(y_true, y_pred, average='binary')  # Assuming binary classification
    f1 = f1_score(y_true, y_pred, average='binary')  # Assuming binary classification
    cm = confusion_matrix(y_true, y_pred)

    return accuracy, precision, recall, f1, cm

In [6]:
# Here we test our dataset with 
# Initialiser l'environnement PyCaret
classifier_test = setup(data = df, target = 'target') # session_id=123 allow to reproduce the same results

# Comparer tous les modèles disponibles dans la bibliothèque de modèles 
models = compare_models()

# Sélectionner le meilleur modèle
best_model = automl(optimize = 'Accuracy')

# Affiner le meilleur modèle
tuned_best_model = tune_model(best_model)

# Prédire les étiquettes de classe pour l'ensemble de données de test
predictions = predict_model(tuned_best_model, data = df)


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.9918,0.9998,0.991,0.9928,0.9919,0.9835,0.9835


In [7]:
from sklearn.svm import SVC

# Create a SVM Classifier
svm_model = SVC(kernel='linear') # Linear Kernel

# Train the model using the training sets
svm_model.fit(X_train, y_train)

# Predict the response for test dataset
svm_predictions = svm_model.predict(X_test)


In [8]:
svm_accuracy, svm_precision, svm_recall, svm_f1, svm_cm = generate_model_metrics(y_test, svm_predictions)

In [13]:
print("Model\t\t\tAccuracy\t\tPrecision\t\tRecall\t\t\tF1 Score")
print(f"SVM Model\t\t{svm_accuracy}\t{svm_precision}\t{svm_recall}\t{svm_f1}")


Model			Accuracy		Precision		Recall			F1 Score
SVM Model		0.9272727272727272	0.9245283018867925	0.9245283018867925	0.9245283018867925
