In [1]:
import mlflow
import time

# Make "white_color_classifier" the active MLflow experiment for the run started further down.
# Left as the cell's last expression so the returned Experiment object is displayed.
mlflow.set_experiment("white_color_classifier")

<Experiment: artifact_location='file:///home/padre/rojects/korrekturVonScans/notebooks/mlruns/388251539619712003', creation_time=1683209882173, experiment_id='388251539619712003', last_update_time=1683209882173, lifecycle_stage='active', name='white_color_classifier', tags={}>

In [2]:

import os
import pickle

from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

from color_corrector.image_features import image2features
import json
import numpy as np
from PIL import Image
from tqdm.notebook import tqdm

In [3]:
# Absolute location of the serialized classifier: <parent of working directory>/resources/.
model_path = os.path.abspath(
    os.path.join(os.pardir, "resources", "white_classifier.pkl.gz")
)
model_path

'/home/padre/rojects/korrekturVonScans/resources/white_classifier.pkl.gz'

In [4]:
# Resolve the dataset directory relative to the working directory (the same convention
# used for `model_path`) instead of hard-coding a user-specific absolute path.
data_path = os.path.abspath(os.path.join(os.pardir, "datasets", "color_corrections"))

# Read the labeled tasks. The encoding is given explicitly so that parsing does not
# depend on the platform's locale default.
with open(os.path.join(data_path, "labeled_tasks.json"), encoding="utf-8") as inp:
    data: dict = json.load(inp)

In [5]:


# Build the feature matrix `X` (one row per labeled task) and the matching `labels` array.
labels = []
img_path = None
img = None

# Allocated lazily: the number of features per point is only known once the first
# image has been converted by `image2features`.
X = None
# Tasks are sorted by image path so that all points of one image are consecutive and
# each image is opened and converted only once.
for i, value in enumerate(tqdm(sorted(data.values(), key=lambda v: v["original_image"]))):
    new_image_path = value["original_image"]
    if new_image_path != img_path:
        img_path = new_image_path
        # The context manager closes the underlying file as soon as the features are
        # extracted; otherwise one file handle per image stays open.
        with Image.open(f"{data_path}/{img_path}") as image:
            img = image2features(image)
    point_x = value["x"]
    point_y = value["y"]
    # NOTE(review): `x` is used as the first index and `y` as the second. If
    # `image2features` returns a (row, column, feature) array this would be swapped —
    # confirm against the labeling tool and `image2features`.
    point = img[point_x, point_y, :]
    if X is None:
        X = np.empty((len(data), point.shape[0]))
    X[i, :] = point
    labels.append(value["labeled"][0])

if X is None:
    # No task was processed, so there is nothing to train on; fail here with a clear
    # message instead of passing a placeholder on to the training cells.
    raise ValueError("labeled_tasks.json contains no labeled tasks")

labels = np.array(labels, dtype=object)

  0%|          | 0/8190 [00:00<?, ?it/s]

In [6]:
# Two-step pipeline: feature standardization followed by logistic regression,
# both constructed with their default arguments.
model = make_pipeline(
    StandardScaler(),
    LogisticRegression(),
)

In [11]:
import gzip

with mlflow.start_run():
    # 10-fold cross-validation of `model`. Each fold's score is multiplied by the fold
    # size and the sums are divided by the total sample count, i.e. the reported
    # metrics are size-weighted averages over the folds.
    y = labels
    k_fold = KFold(n_splits=10)
    n_samples = len(y)
    weighted_acc = 0
    weighted_auc = 0

    folds = tqdm(k_fold.split(X, y), total=k_fold.n_splits)
    for train_idx, test_idx in folds:
        # Refit the pipeline on this fold's training part only.
        model.fit(X[train_idx], y[train_idx])

        X_fold, y_fold = X[test_idx], y[test_idx]
        fold_size = len(y_fold)

        fold_proba = model.predict_proba(X_fold)
        fold_pred = model.predict(X_fold)

        fold_acc = accuracy_score(y_true=y_fold, y_pred=fold_pred)
        # Column 1 of the probability matrix is used as the score for the AUC.
        fold_auc = roc_auc_score(y_true=y_fold, y_score=fold_proba[:, 1])

        weighted_acc += fold_acc * fold_size
        weighted_auc += fold_auc * fold_size

    mean_acc = weighted_acc / n_samples
    mean_auc = weighted_auc / n_samples

    # Record the metrics and the pipeline parameters in the MLflow run and echo the
    # metrics to the cell output.
    mlflow.log_metric("accuracy", mean_acc)
    print("acc", mean_acc)
    mlflow.log_metric("auc", mean_auc)
    mlflow.log_params(model.get_params())
    print("auc", mean_auc)
# model = LogisticRegression(class_weight={1: 10, 0: 1})


  0%|          | 0/10 [00:00<?, ?it/s]

acc 0.9719169719169719
auc 0.9794595873905705


In [None]:
# Fit the pipeline on the complete data set (`y` is bound in the cross-validation cell),
# then write it to `model_path` as a pickle compressed with gzip at level 9.
model.fit(X, y)
with gzip.open(model_path, mode="wb", compresslevel=9) as out:
    pickle.dump(model, out)