In [29]:
## classification results for a couple of classifiers
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

In [None]:
### locations of the pickled profiles of each dataset (relative paths)
# Every dataset directory stores its profiles under the same relative file
# name, so the shared part is defined once and can be switched in one place.
PROFILES_RELATIVE_PATH = "csv/profiles_harmonize_False.pkl"

# IF datasets
IF_deep_profiles_path = f"IF_deep_GMM/{PROFILES_RELATIVE_PATH}"
IF_HC_profiles_path = f"IF_HC_GMM/{PROFILES_RELATIVE_PATH}"

# CP datasets
CP_deep_profiles_path = f"CP_deep_GMM/{PROFILES_RELATIVE_PATH}"
CP_Cellprofiler_profiles_path = f"CP_Cellprofiler_GMM/{PROFILES_RELATIVE_PATH}"


In [31]:
### load profiles of datasets
def load_profiles(path):
    """Return the object unpickled from ``path``.

    The file is opened in binary mode inside a ``with`` block so the handle
    is closed as soon as loading finishes (or fails).
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, "rb") as profiles_file:
        return pickle.load(profiles_file)


IF_deep_profiles = load_profiles(IF_deep_profiles_path)
IF_HC_profiles = load_profiles(IF_HC_profiles_path)

CP_deep_profiles = load_profiles(CP_deep_profiles_path)
CP_Cellprofiler_profiles = load_profiles(CP_Cellprofiler_profiles_path)

# Parallel lists: list_profiles_names[i] is the display name of list_profiles[i].
list_profiles = [IF_deep_profiles, IF_HC_profiles, CP_deep_profiles, CP_Cellprofiler_profiles]
list_profiles_names = ["IF_deep", "IF_HC", "CP_deep", "CP_Cellprofiler"]

In [32]:
## check results on classifier for each dataset
## Each `profiles` object is indexed as profiles[test_plate][profile_name]["train" | "test"],
## and both splits carry a "Treatment" column that is used as the class label.

classifiers = {
    "linear SVM": LinearSVC(random_state=42),
    "Nearest Neighbors": KNeighborsClassifier(3),
    "RBF SVM": SVC(random_state=42),
    "LDA": LDA(),
    "Naive Bayes": GaussianNB(),
}

dataset_summaries = []
for dataset_name, profiles in zip(list_profiles_names, list_profiles):
    test_plates = list(profiles.keys())
    # The profile names of the first plate are used for every plate.
    profile_names = list(profiles[test_plates[0]].keys())

    # One row per (classifier, profile) pair, one column per test plate.
    # A float dtype keeps the later mean() a plain numeric reduction.
    score_index = pd.MultiIndex.from_product(
        [list(classifiers), profile_names], names=["Classifier", "Profile"]
    )
    df_classification_f1_score = pd.DataFrame(
        index=score_index, columns=test_plates, dtype=float
    )

    for test_plate in test_plates:
        for profile_name in profile_names:
            split = profiles[test_plate][profile_name]
            train_frame = split["train"]
            test_frame = split["test"]

            # NOTE(review): every numeric column is used as a feature; confirm
            # that no numeric metadata/label column is present in the profiles.
            x_train = train_frame.select_dtypes(include="number")
            y_train = train_frame["Treatment"]
            x_test = test_frame.select_dtypes(include="number")
            y_test = test_frame["Treatment"]

            # The same estimator instances are refit for every split.
            for classifier_name, classifier in classifiers.items():
                classifier.fit(x_train, y_train)
                y_pred = classifier.predict(x_test)
                df_classification_f1_score.loc[
                    (classifier_name, profile_name), test_plate
                ] = f1_score(y_test, y_pred, average="macro")

    # Average the macro F1 over all test plates: one score per (classifier, profile).
    data = df_classification_f1_score.mean(axis=1).rename("F1 Score").reset_index()

    # Summary over all non-"pca" profiles, per classifier.
    cluster_summary = (
        data[data["Profile"] != "pca"]
        .groupby("Classifier")["F1 Score"]
        .agg(["max", "mean", "std"])
        .rename(
            columns={
                "max": "F1 Score clusters (max)",
                "mean": "F1 Score clusters (mean)",
                "std": "F1 Score clusters (std)",
            }
        )
        # Keep the result: "Classifier" must be a real column for the merge below.
        .reset_index()
    )

    # Score of the "pca" profile, per classifier.
    pca_summary = (
        data[data["Profile"] == "pca"]
        .groupby("Classifier")
        .agg({"F1 Score": "first"})
        .rename(columns={"F1 Score": "F1 Score (pca)"})
        .reset_index()
    )

    dataset_summary = pd.merge(cluster_summary, pca_summary, on="Classifier")
    dataset_summary["dataset"] = dataset_name
    dataset_summaries.append(dataset_summary)

# One table with a row per (dataset, classifier); a fresh index avoids duplicate labels.
results = pd.concat(dataset_summaries, ignore_index=True)

In [33]:
## save results
# Create the output directory first so the export also works when it does not exist yet.
results_csv_path = Path("dataset_comparison_results/compare_datasets_F1score.csv")
results_csv_path.parent.mkdir(parents=True, exist_ok=True)
results.to_csv(results_csv_path, index=False)


In [34]:
# Grouped bars per dataset: mean cluster F1 of each classifier, std as error bars.
px.bar(
    results,
    x="dataset",
    y="F1 Score clusters (mean)",
    error_y="F1 Score clusters (std)",
    color="Classifier",
    barmode="group",
    title="F1 Score clusters (mean)",
)

In [35]:
# Grouped bars per dataset: best cluster F1 of each classifier.
px.bar(
    results,
    x="dataset",
    y="F1 Score clusters (max)",
    color="Classifier",
    barmode="group",
    title="F1 Score clusters (max)",
)

In [36]:
# Grouped bars per dataset: F1 of each classifier on the "pca" profile.
px.bar(
    results,
    x="dataset",
    y="F1 Score (pca)",
    color="Classifier",
    barmode="group",
    title="F1 Score (pca)",
)