In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# helper modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [5]:
import open_clip
import torch
from sklearn.svm import LinearSVC
import numpy as np
from datasets import load_from_disk
from tqdm import tqdm
import sys
sys.path.append("../tools")
from utils import load_data_split
from sklearn.model_selection import cross_val_score
import warnings
import pandas as pd
from IPython.display import clear_output

In [6]:
# Choose the device before creating the model so this cell also works on a
# CPU-only machine; a hard-coded "cuda" would presumably fail there.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Build the CLIP model together with its preprocessing transform; the middle
# element of the returned triple is not used in this notebook.
model, _, preprocess = open_clip.create_model_and_transforms(
    'hf-hub:laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K',
    device=device,
)

# Put the model in evaluation mode: in the cells shown it is only handed to
# load_data_split and is never trained.
model.eval()
# Clear whatever this cell has printed so far.
clear_output()

In [7]:
# Use the GPU when PyTorch reports CUDA as available; otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [8]:
# Arguments shared by both loads of the medium QF-40 training split.
# load_data_split comes from ../tools/utils.py (not shown here).
_medium_train_kwargs = dict(
    dataset_path="../../data/medium_QF_40",
    split="train",
    model=model,
    preprocess=preprocess,
    device=device,
    show_progress_bar=True,
)

# Same split loaded twice; the two calls differ only in the `normalize` flag.
X_train, y_train = load_data_split(normalize=False, **_medium_train_kwargs)
X_train_norm, y_train_norm = load_data_split(normalize=True, **_medium_train_kwargs)

100%|██████████| 1000/1000 [00:17<00:00, 56.18it/s]
100%|██████████| 1000/1000 [00:18<00:00, 55.51it/s]


In [9]:
# Arguments shared by both loads of the medium QF-40 test split.
# load_data_split comes from ../tools/utils.py (not shown here).
_medium_test_kwargs = dict(
    dataset_path="../../data/medium_QF_40",
    split="test",
    model=model,
    preprocess=preprocess,
    device=device,
    show_progress_bar=True,
)

# Same split loaded twice; the two calls differ only in the `normalize` flag.
X_test, y_test = load_data_split(normalize=False, **_medium_test_kwargs)
X_test_norm, y_test_norm = load_data_split(normalize=True, **_medium_test_kwargs)

100%|██████████| 1000/1000 [00:18<00:00, 52.84it/s]
100%|██████████| 1000/1000 [00:19<00:00, 51.62it/s]


In [22]:
def get_score(X_train, y_train, X_test, y_test):
    """Fit a fresh linear SVM on the training data and score it on the test data.

    Parameters
    ----------
    X_train, y_train
        Features and labels the classifier is fitted on.
    X_test, y_test
        Features and labels the fitted classifier is scored on.

    Returns
    -------
    The value of ``LinearSVC.score(X_test, y_test)`` (mean accuracy, per the
    scikit-learn documentation).
    """
    # dual=False asks scikit-learn to solve the primal optimisation problem.
    classifier = LinearSVC(dual=False)
    classifier.fit(X_train, y_train)
    return classifier.score(X_test, y_test)

## No normalization (neither on train nor on test)

In [23]:
# Train and evaluate using the arrays loaded with normalize=False.
get_score(X_train, y_train, X_test, y_test)

0.965

## Normalization on train and test

In [26]:
# Train and evaluate using the arrays loaded with normalize=True.
get_score(X_train_norm, y_train_norm, X_test_norm, y_test_norm)

0.975

In [32]:
# NOTE(review): this rebinds X_test / y_test, which an earlier cell assigned
# from ../../data/medium_QF_40 with normalize=False. Re-running the earlier
# scoring cell after this one will silently use these arrays instead; consider
# distinct names once all dependent cells can be updated together.
_big_test_kwargs = dict(
    dataset_path="../../data/big_QF_40",
    split="test",
    model=model,
    preprocess=preprocess,
    device=device,
    normalize=True,
    show_progress_bar=True,
)
X_test, y_test = load_data_split(**_big_test_kwargs)

100%|██████████| 10000/10000 [03:04<00:00, 54.07it/s]


In [34]:
# Arguments common to the three training-split loads below; only the dataset
# path differs between the calls.
_big_train_kwargs = dict(
    split="train",
    model=model,
    preprocess=preprocess,
    device=device,
    normalize=True,
    show_progress_bar=True,
)

X_train_40, y_train_40 = load_data_split(dataset_path="../../data/big_QF_40",
                                         **_big_train_kwargs)
X_train_65, y_train_65 = load_data_split(dataset_path="../../data/big_QF_65",
                                         **_big_train_kwargs)
X_train_90, y_train_90 = load_data_split(dataset_path="../../data/big_QF_90",
                                         **_big_train_kwargs)

100%|██████████| 10000/10000 [03:03<00:00, 54.54it/s]
100%|██████████| 10000/10000 [03:11<00:00, 52.25it/s]
100%|██████████| 10000/10000 [03:09<00:00, 52.82it/s]


In [35]:
# Fit one classifier per training set (QF 40 / 65 / 90) and score each against
# whatever X_test / y_test are currently bound to in the kernel (the last
# visible assignment loads the ../../data/big_QF_40 test split).
labels = (40, 65, 90)
train = (
    (X_train_40, y_train_40),
    (X_train_65, y_train_65),
    (X_train_90, y_train_90),
)
for l, (X_tr, y_tr) in zip(labels, train):
    score = get_score(X_tr, y_tr, X_test, y_test)
    print(f"score for training on QF {l}", score)

score for training on QF 40 0.99
score for training on QF 65 0.9678
score for training on QF 90 0.7496
