In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
from pycaret.classification import *
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [4]:
df = pd.read_csv('celeb_embeddings.csv')
df.sample(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,503,504,505,506,507,508,509,510,511,target
624,0.003692,0.028498,-0.074741,0.038545,-0.026305,-0.077807,0.115657,-0.011797,0.029502,0.020282,...,0.031919,0.010142,0.06496,0.052511,0.033579,-0.04259,-0.010204,-0.002154,0.031679,0
429,-0.003164,0.035089,0.011096,0.058037,0.084806,-0.02868,-0.017977,-0.019721,0.115619,0.002532,...,-0.052788,0.068432,0.026856,0.035919,-0.009704,0.067549,-0.042087,0.002269,0.022753,1
516,0.025299,0.01229,-0.01936,-0.047073,0.040185,0.015663,0.078022,0.040239,-0.020616,-0.01001,...,-0.033176,0.02933,0.029749,0.01237,0.042833,-0.01699,-0.04167,-0.005787,0.008753,1


In [5]:
# Separate the features and the target
X = df.drop('target', axis=1)
y = df['target']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [6]:
def generate_model_metrics(y_true, y_pred):

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='binary')  # Assuming binary classification
    recall = recall_score(y_true, y_pred, average='binary')  # Assuming binary classification
    f1 = f1_score(y_true, y_pred, average='binary')  # Assuming binary classification
    cm = confusion_matrix(y_true, y_pred)

    return accuracy, precision, recall, f1, cm

In [7]:
file_path = 'celeb_embeddings.csv'
data = pd.read_csv(file_path)

# # load dataset
# from pycaret.datasets import get_data
# classifier = get_data(data)

# init setup
from pycaret.classification import *
clf1 = setup(data = data, target = 'target')

# compare models
best = compare_models(sort = 'F1')

Unnamed: 0,Description,Value
0,Session id,2336
1,Target,target
2,Target type,Binary
3,Original data shape,"(1093, 513)"
4,Transformed data shape,"(1093, 513)"
5,Transformed train set shape,"(765, 513)"
6,Transformed test set shape,"(328, 513)"
7,Numeric features,512
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9856,0.999,0.9794,0.9924,0.9856,0.9713,0.9718,0.258
rf,Random Forest Classifier,0.9778,0.9982,0.9794,0.9777,0.978,0.9556,0.9566,0.856
gbc,Gradient Boosting Classifier,0.9739,0.9974,0.9767,0.972,0.9742,0.9477,0.948,7.913
nb,Naive Bayes,0.9661,0.9897,0.964,0.9693,0.9662,0.9321,0.933,0.054
lightgbm,Light Gradient Boosting Machine,0.9634,0.9968,0.9666,0.9619,0.9639,0.9269,0.9276,10.45
xgboost,Extreme Gradient Boosting,0.9517,0.9944,0.9562,0.9496,0.9524,0.9033,0.9043,1.716
ada,Ada Boost Classifier,0.9425,0.9848,0.9563,0.9338,0.9442,0.8849,0.8865,1.63
lr,Logistic Regression,0.9321,0.9798,0.9613,0.9104,0.9349,0.864,0.866,0.565
ridge,Ridge Classifier,0.9294,0.0,0.9613,0.9061,0.9326,0.8587,0.8611,0.058
knn,K Neighbors Classifier,0.9216,0.9802,1.0,0.8674,0.9286,0.8428,0.8541,0.067


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

## Testing all classifiers models from PyCaret

In [8]:
# Here we test our dataset with all classifiers models from PyCaret
# Initialize PyCaret environment
classifier_test = setup(data = df, target = 'target') # session_id=123 allow to reproduce the same results

# Compare all (classifiers) models in PyCaret
models = compare_models()

# Select best model
best_model = automl(optimize = 'Accuracy')

# Fine tuning the best model
tuned_best_model = tune_model(best_model)

# Predict class labels
predictions = predict_model(tuned_best_model, data = df)


Unnamed: 0,Description,Value
0,Session id,4830
1,Target,target
2,Target type,Binary
3,Original data shape,"(1093, 513)"
4,Transformed data shape,"(1093, 513)"
5,Transformed train set shape,"(765, 513)"
6,Transformed test set shape,"(328, 513)"
7,Numeric features,512
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9869,0.9993,0.9897,0.9849,0.9872,0.9738,0.9741,0.354
rf,Random Forest Classifier,0.9805,0.9983,0.9768,0.9847,0.9806,0.9609,0.9611,0.677
gbc,Gradient Boosting Classifier,0.9687,0.9952,0.9768,0.962,0.9693,0.9374,0.9376,7.799
xgboost,Extreme Gradient Boosting,0.9687,0.9931,0.9665,0.972,0.9691,0.9374,0.9377,1.76
lightgbm,Light Gradient Boosting Machine,0.9674,0.9958,0.969,0.9671,0.9678,0.9348,0.9353,11.56
nb,Naive Bayes,0.9582,0.988,0.9534,0.9642,0.958,0.9164,0.9178,0.082
ada,Ada Boost Classifier,0.9529,0.9871,0.9664,0.9429,0.9542,0.9058,0.9068,1.792
lr,Logistic Regression,0.919,0.9737,0.9535,0.8948,0.9229,0.8378,0.8404,0.105
knn,K Neighbors Classifier,0.9189,0.976,1.0,0.8635,0.9264,0.8374,0.8494,0.085
svm,SVM - Linear Kernel,0.9164,0.0,0.9201,0.9174,0.918,0.8328,0.8343,0.087


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.987,1.0,1.0,0.975,0.9873,0.974,0.9743
1,0.961,0.9946,0.9487,0.9737,0.961,0.9221,0.9224
2,0.987,0.998,1.0,0.975,0.9873,0.974,0.9743
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,0.961,0.9966,1.0,0.9286,0.963,0.922,0.9248
5,0.9868,0.9979,1.0,0.975,0.9873,0.9736,0.974
6,0.9737,0.9993,0.9744,0.9744,0.9744,0.9473,0.9473
7,0.9868,1.0,1.0,0.975,0.9873,0.9736,0.974
8,0.9605,0.9986,0.9737,0.9487,0.961,0.9211,0.9214
9,0.9868,1.0,0.9737,1.0,0.9867,0.9737,0.974


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.9945,0.9999,0.991,0.9982,0.9946,0.989,0.989


In [None]:
# Save the model
save_model(tuned_best_model, 'tuned_best_model')

# Load the model
loaded_model = load_model('tuned_best_model')

# Predict using the loaded model
predictions_loaded_model = predict_model(loaded_model, data = df)

Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Loaded


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.9918,0.9998,0.991,0.9928,0.9919,0.9835,0.9835


In [9]:
# Initialize PyCaret environment
classifier_test = setup(data = df, target = 'target') # session_id=123 allow to reproduce the same results

# Compare all (classifiers) models in PyCaret
models = compare_models(include=['svm', 'et'])

# Select best model
best_model = automl(optimize = 'Accuracy')

# Fine tuning the best model
tuned_best_model = tune_model(best_model)

# Predict class labels
predictions = predict_model(tuned_best_model, data = df)


Unnamed: 0,Description,Value
0,Session id,7283
1,Target,target
2,Target type,Binary
3,Original data shape,"(1093, 513)"
4,Transformed data shape,"(1093, 513)"
5,Transformed train set shape,"(765, 513)"
6,Transformed test set shape,"(328, 513)"
7,Numeric features,512
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9857,0.9981,0.9845,0.9873,0.9857,0.9713,0.9717,0.257
svm,SVM - Linear Kernel,0.9072,0.0,0.9226,0.9026,0.9095,0.8141,0.8204,0.654


Processing:   0%|          | 0/13 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.987,0.998,0.9744,1.0,0.987,0.974,0.9744
1,0.974,0.998,0.9487,1.0,0.9737,0.9481,0.9494
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,0.987,1.0,1.0,0.975,0.9873,0.974,0.9743
4,0.987,1.0,1.0,0.975,0.9873,0.974,0.9743
5,0.9868,0.9952,0.9737,1.0,0.9867,0.9737,0.974
6,0.9737,0.9972,0.9737,0.9737,0.9737,0.9474,0.9474
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,0.9868,0.9993,1.0,0.975,0.9873,0.9736,0.974


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.9945,0.9999,0.9946,0.9946,0.9946,0.989,0.989
