# Import

In [5]:
import pandas as pd
import os
from os.path import join
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import seaborn as sns
from torchvision.io import read_image
from matplotlib import pyplot as plt
import torchvision
import torch
import random
import numpy as np
from sklearn.neighbors import NearestNeighbors

# Let pandas display up to 700 columns before truncating wide frames.
pd.set_option('display.max_columns', 700)

# Dataset root plus the sub-directories read from (ORIGINAL) and written to (FEATURE).
BASE = '/media/HDD1/yucj/OL3I/'
ORIGINAL, FEATURE = (join(BASE, sub) for sub in ('original_file/', 'data_file_423/'))

from typing import List
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import k_means, SpectralClustering
import multiprocessing as mp

# 'analysis' sub-directory of BASE (presumably for analysis outputs; not used in the visible code).
ANALYSIS = join(BASE, 'analysis')

def conf_matrix_from_matrices(mat_gt, mat_pred):
    """Count confusion-matrix entries between two boolean matrices.

    Args:
        mat_gt: Ground-truth boolean matrix.
        mat_pred: Predicted boolean matrix with the same shape as ``mat_gt``.

    Returns:
        Tuple ``(tp, fp, fn, tn)`` of cell counts: set in both, set only in
        the prediction, set only in the ground truth, and set in neither.
    """
    overlap = mat_pred & mat_gt
    tp = overlap.sum()
    fp = mat_pred.sum() - tp
    fn = mat_gt.sum() - tp
    # Use the real number of cells; ``shape[0] ** 2`` is only right for
    # square matrices and would miscount rectangular input.
    n_cells = 1
    for dim in mat_gt.shape:
        n_cells *= dim
    tn = n_cells - (tp + fp + fn)
    return tp, fp, fn, tn

In [2]:
def check_or_save(obj, path, index=None, header=None):
    """Save ``obj`` to ``path``, or verify it against what is already stored there.

    DataFrames are written as CSV. If the file already exists, it is read back
    and compared cell by cell (values rounded to 6 decimals, row and column
    labels ignored). Any other object is stored with ``torch.save`` and, if
    the file already exists, compared with the result of ``torch.load``.

    Args:
        obj: A ``pandas.DataFrame`` or an object that ``torch.save`` can store.
            Lists are compared element by element.
        path: File to write to or to compare against.
        index: Whether the DataFrame index is part of the CSV. Required for
            DataFrames.
        header: Whether the CSV has a header row. Required for DataFrames.

    Raises:
        ValueError: If ``obj`` is a DataFrame and ``index`` or ``header`` is
            ``None``.
        AssertionError: If ``path`` exists and its content differs from
            ``obj``.
    """
    if isinstance(obj, pd.DataFrame):
        if index is None or header is None:
            raise ValueError('Index and header must be specified for saving a dataframe')
        if not os.path.exists(path):
            obj.to_csv(path, index=index, header=header)
            return

        saved_df = pd.read_csv(path) if header else pd.read_csv(path, header=None)
        # Replace row and column labels by plain positions so that only the
        # cell values take part in the comparison.
        naked_df = saved_df.reset_index(drop=True)
        naked_df.columns = range(naked_df.shape[1])
        naked_obj = obj.reset_index(drop=not index)
        naked_obj.columns = range(naked_obj.shape[1])

        mismatch = "Dataframe is not the same as saved dataframe"
        # Differently shaped frames cannot be compared cell-wise (pandas would
        # raise an unrelated ValueError), so report them as a mismatch.
        if naked_df.shape != naked_obj.shape:
            raise AssertionError(mismatch)

        rounded_saved = naked_df.round(6)
        rounded_obj = naked_obj.round(6)
        if rounded_saved.equals(rounded_obj):
            return
        # ``equals`` also requires identical dtypes; fall back to a cell-wise
        # comparison in which a pair of missing values counts as equal.
        same = (rounded_saved == rounded_obj) | (naked_df.isnull() & naked_obj.isnull())
        if not same.all().all():
            raise AssertionError(mismatch)
        return

    if not os.path.exists(path):
        print(f'Saving to {path}')
        torch.save(obj, path)
        return

    # NOTE: torch.load unpickles the file; only use it on files written by this pipeline.
    saved_obj = torch.load(path)
    if isinstance(obj, list):
        # Without this check a saved list with extra trailing items would pass.
        if len(obj) != len(saved_obj):
            raise AssertionError("List length is not the same as saved list length")
        for current, saved in zip(obj, saved_obj):
            check_array_equality(current, saved)
    else:
        check_array_equality(obj, saved_obj)


def check_array_equality(ob1, ob2):
    """Raise ``AssertionError`` unless ``ob2`` equals ``ob1``.

    If ``ob1`` is a torch tensor or a NumPy array, both objects must have the
    same shape and be equal element-wise. Any other object is compared with
    ``==``.

    Args:
        ob1: Reference object; its type selects the comparison mode.
        ob2: Object compared against ``ob1``.

    Raises:
        AssertionError: If the shapes or the values differ.
    """
    if torch.is_tensor(ob1) or isinstance(ob1, np.ndarray):
        # ``==`` broadcasts, so e.g. shapes (1,) and (3,) could compare as
        # "all equal"; require matching shapes before comparing values.
        if tuple(np.shape(ob1)) != tuple(np.shape(ob2)):
            raise AssertionError("Array shapes differ")
        if not bool((ob2 == ob1).all()):
            raise AssertionError("Array values differ")
    elif not (ob2 == ob1):
        raise AssertionError("Objects differ")

# Create Tabular Dataset

In [6]:
from scipy.stats import zscore
from sklearn.preprocessing import MinMaxScaler

info = pd.read_csv(join(ORIGINAL, 'OL3I_tabular_info_all_423_missing_age.csv'))

# Identifier, split and target columns; every remaining column is a feature.
non_feature_columns = ['img_id', 'anon_id', 'set_1y', 'label_1y']
feature_columns = info.columns.difference(non_feature_columns)
print(len(feature_columns))

# Normalize the feature columns with min-max scaling.
# NOTE(review): the scaler is fitted on every row of `info` (train, val and
# test together) — confirm this is intended.
scaler = MinMaxScaler()
info[feature_columns] = scaler.fit_transform(info[feature_columns])

# Select the rows of each split through the 'set_1y' column.
by_split = info.set_index('set_1y')
train_df, val_df, test_df = (by_split.loc[name] for name in ('train', 'val', 'test'))

train_labels_all, val_labels_all, test_labels_all = (
    list(frame['label_1y']) for frame in (train_df, val_df, test_df)
)

# One entry per feature column: 1 everywhere except position 25 and the
# last position, which hold 2.
lengths = [1] * (len(feature_columns) - 2)
lengths.insert(25, 2)
# lengths.insert(15, 2)
lengths.append(2)

print(len(lengths))
print(lengths)

# check_or_save(lengths, join(FEATURE, 'OL3I_tabular_lengths.pt'))
#
# check_or_save(train_labels_all, join(FEATURE, 'OL3I_labels_train.pt'))
# check_or_save(val_labels_all, join(FEATURE, 'OL3I_labels_val.pt'))
# check_or_save(test_labels_all, join(FEATURE, 'OL3I_labels_test.pt'))

named_splits = (('train', train_df), ('val', val_df), ('test', test_df))

# Feature-only tables, written without index and header.
for split_name, split_df in named_splits:
    feature_part = split_df.loc[:, ~split_df.columns.isin(non_feature_columns)]
    check_or_save(feature_part, join(FEATURE, f'OL3I_features_{split_name}_missing_age.csv'), index=False, header=False)

# Full tables (all columns), written with index and header.
for split_name, split_df in named_splits:
    check_or_save(split_df, join(FEATURE, f'OL3I_full_features_{split_name}_missing_age.csv'), index=True, header=True)

423
423
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

# Save Normalized Images

In [4]:
import h5py
from matplotlib import pyplot as plt

# Normalize every image to [0, 1] and store one stacked tensor per split.
# Opening the HDF5 file in a `with` block guarantees it is closed afterwards.
with h5py.File(join(ORIGINAL, 'l3_slices.h5'), 'r') as h5_file:
    for df, t_split in zip([train_df, val_df, test_df], ['train', 'val', 'test']):
        images = []
        print(t_split)
        for key_i in df['anon_id']:
            img_arr = np.array(h5_file[key_i])
            # Per-image min-max scaling.
            # NOTE(review): an image whose max equals its min would divide by
            # zero here and produce NaNs.
            img_min, img_max = np.min(img_arr), np.max(img_arr)
            img_normed = (img_arr - img_min) / (img_max - img_min)
            # Add a leading axis of size 1 in front of the image.
            img_saved = np.expand_dims(img_normed, axis=0)
            if np.shape(img_saved) != (1, 512, 512):
                # Report which entry has the unexpected shape.
                print(f"{key_i}:", np.shape(img_saved))
            images.append(torch.from_numpy(img_saved).float())
        print(t_split, len(images))
        images_t = torch.stack(images)
        print(images_t.is_contiguous())
        check_or_save(images_t, join(FEATURE, f'{t_split}_images.pt'))

train
train 5227
True
Saving to /media/HDD1/yucj/OL3I/data_file_423/train_images.pt
val
val 1303
True
Saving to /media/HDD1/yucj/OL3I/data_file_423/val_images.pt
test
test 1609
True
Saving to /media/HDD1/yucj/OL3I/data_file_423/test_images.pt


# Create Low Data Splits & Images

In [3]:
from scipy.stats import zscore
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedShuffleSplit
import h5py
from matplotlib import pyplot as plt

info = pd.read_csv(join(ORIGINAL, 'OL3I_tabular_info_all_423.csv'))

# Identifier, split and target columns; every remaining column is a feature.
non_feature_columns = ['img_id', 'anon_id', 'set_1y', 'label_1y']
feature_columns = info.columns.difference(non_feature_columns)
print(len(feature_columns))

# Normalize the feature columns with min-max scaling.
# NOTE(review): the scaler is fitted on every row of `info`, not only on the
# training rows — confirm this is intended.
scaler = MinMaxScaler()
info[feature_columns] = scaler.fit_transform(info[feature_columns])
train_df = info.set_index('set_1y').loc['train']
train_labels_all = list(train_df['label_1y'])

# Create the low-data training sets: one stratified sample of the training
# rows per fraction k. The HDF5 file is opened once for all fractions and is
# closed by the context manager.
with h5py.File(join(ORIGINAL, 'l3_slices.h5'), 'r') as h5_file:
    for k in [0.5, 0.1, 0.01]:
        n_samples = int(len(train_df) * k)
        sss = StratifiedShuffleSplit(n_splits=1, test_size=n_samples, random_state=2022)

        # Sampling: n_splits=1 yields a single split, whose "test" part
        # (n_samples rows) is used as the low-data sample.
        _, sample_index = next(sss.split(train_df, train_labels_all))
        sampled_train_df = train_df.iloc[sample_index]
        sampled_train_labels = [train_labels_all[i] for i in sample_index]
        print('low-data df: ', len(sampled_train_df))
        print('low-data labels: ', len(sampled_train_labels))

        # Images: per-image min-max scaling with a leading axis of size 1.
        # NOTE(review): an image whose max equals its min would divide by
        # zero here and produce NaNs.
        images = []
        for key_i in sampled_train_df['anon_id']:
            img_arr = np.array(h5_file[key_i])
            img_min, img_max = np.min(img_arr), np.max(img_arr)
            img_normed = (img_arr - img_min) / (img_max - img_min)
            img_saved = np.expand_dims(img_normed, axis=0)
            if np.shape(img_saved) != (1, 512, 512):
                # Report which entry has the unexpected shape.
                print(f"{key_i}:", np.shape(img_saved))
            images.append(torch.from_numpy(img_saved).float())
        print('low-data images: ', len(images))
        images_t = torch.stack(images)

        # Save images, labels, the feature-only table and the full table.
        check_or_save(images_t, join(FEATURE, f'train_low_{k}_images.pt'))
        check_or_save(sampled_train_labels, join(FEATURE, f'OL3I_labels_train_low_{k}_.pt'))
        feature_part = sampled_train_df.loc[:, ~sampled_train_df.columns.isin(non_feature_columns)]
        check_or_save(feature_part, join(FEATURE, f'OL3I_features_train_low_{k}_.csv'), index=False, header=False)
        check_or_save(sampled_train_df, join(FEATURE, f'OL3I_full_features_train_low_{k}_.csv'), index=True, header=True)



423
low-data df:  2613
low-data labels:  2613
low-data images:  2613
Saving to /media/HDD1/yucj/OL3I/data_file_423/train_low_0.5_images.pt
Saving to /media/HDD1/yucj/OL3I/data_file_423/OL3I_labels_train_low_0.5_.pt
low-data df:  522
low-data labels:  522
low-data images:  522
Saving to /media/HDD1/yucj/OL3I/data_file_423/train_low_0.1_images.pt
Saving to /media/HDD1/yucj/OL3I/data_file_423/OL3I_labels_train_low_0.1_.pt
low-data df:  52
low-data labels:  52
low-data images:  52
Saving to /media/HDD1/yucj/OL3I/data_file_423/train_low_0.01_images.pt
Saving to /media/HDD1/yucj/OL3I/data_file_423/OL3I_labels_train_low_0.01_.pt


In [None]:
# Inspect the low-data subsets: rows per 'Genmodel_ID' and the number of ids.
# NOTE(review): `addendum` is not defined anywhere in the visible code and
# FEATURES is only assigned in a later cell — confirm where they come from.
split = 'train'
for k in (0.1, 0.01):
    ids_path = join(FEATURES, f'{split}_ids{addendum}_{k}.pt')
    table_path = join(FEATURES, f'dvm_full_features_{split}_noOH{addendum}_{k}.csv')
    low_data_ids = torch.load(ids_path)
    low_data_df = pd.read_csv(table_path)
    print(low_data_df.value_counts('Genmodel_ID'))
    print(len(low_data_ids))

# Check Transforms

In [None]:
from torchvision import transforms
import torch
from os.path import join

# Re-point the data roots; with an empty BASE both directories are relative paths.
BASE = ''
TABLES, FEATURES = (join(BASE, folder) for folder in ('tables_V2.0', 'features'))

# NOTE(review): despite its name, `train_images` is loaded from 'val_images.pt' — confirm this is intended.
train_images = torch.load(join(FEATURES, 'val_images.pt'))

In [None]:
img_size = 128
target_size = (img_size, img_size)

# Augmentation steps under inspection, applied in the order listed.
augmentation_steps = [
    transforms.RandomApply([transforms.ColorJitter(brightness=0.8, contrast=0.8, saturation=0.8)], p=0.8),
    transforms.RandomGrayscale(p=0.2),
    transforms.RandomApply([transforms.GaussianBlur(kernel_size=29, sigma=(0.1, 2.0))], p=0.5),
    transforms.RandomResizedCrop(size=target_size, scale=(0.2, 1.0), ratio=(0.75, 1.3333333333333333), antialias=True),
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.Resize(size=target_size, antialias=True),
    transforms.Lambda(lambda x: x.float()),
]
transform = transforms.Compose(augmentation_steps)

# Transform the image at index 1 and display it; permute(1, 2, 0) moves the
# first dimension last (presumably channels-first to channels-last for imshow).
im = train_images[1]
im_t = transform(im)
_ = plt.imshow(im_t.permute(1, 2, 0))

# New Physical Features

## Adding missing values to physical table

In [None]:
# Load the physical-attribute table; two of its column names carry a leading
# space, which is stripped here.
physical_df_orig = pd.read_csv(join(FEATURES, 'Ad_table (extra).csv'))
physical_df_orig = physical_df_orig.rename(
    columns={' Genmodel_ID': 'Genmodel_ID', ' Genmodel': 'Genmodel'}
)

# Manual touches: wheelbase values set by hand, keyed by Genmodel_ID.
manual_wheelbase = {
    '69_36': 2612,  # Peugeot RCZ
    '29_20': 2788,  # Ford Grand C-Max
}
for genmodel_id, wheelbase in manual_wheelbase.items():
    physical_df_orig.loc[physical_df_orig['Genmodel_ID'] == genmodel_id, 'Wheelbase'] = wheelbase

def fill_from_other_entry(row):
    """Fill missing or zero dimensions of ``row`` from rows of the same model.

    For each of 'Wheelbase', 'Length', 'Width' and 'Height' that is NaN or 0
    in ``row``, the first positive value found in ``physical_df_orig`` among
    rows with the same 'Genmodel_ID' is used. Attributes without such a value
    are left unchanged.

    Args:
        row: One row of ``physical_df_orig`` as a Series, as passed by
            ``DataFrame.apply(..., axis=1)``.

    Returns:
        A copy of ``row`` with the fillable attributes replaced. The input
        itself is not modified, because ``DataFrame.apply`` does not support
        functions that mutate the object they receive.
    """
    filled = row.copy()
    for attr in ['Wheelbase', 'Length', 'Width', 'Height']:
        if pd.isna(row[attr]) or row[attr] == 0:
            same_model = physical_df_orig.loc[
                physical_df_orig['Genmodel_ID'] == row['Genmodel_ID'], attr
            ]
            # ``> 0`` is False for NaN, so missing values are dropped as well.
            candidates = same_model[same_model > 0]
            if len(candidates) > 0:
                filled[attr] = candidates.values[0]
    return filled

# Fill the gaps row by row, then write the completed table without its index.
physical_df_orig = physical_df_orig.apply(fill_from_other_entry, axis='columns')
filled_table_path = join(FEATURES, 'Ad_table_physical_filled.csv')
physical_df_orig.to_csv(filled_table_path, index=False)

## Add physical attributes to features

In [None]:
# Jitter the physical dimensions so that they do not act as plain labels.
def add_jitter(x, jitter=50):
    """Return ``x`` plus a random integer drawn from ``[-jitter, jitter]``."""
    offset = random.randint(-jitter, jitter)
    return x + offset

# Fixed seed so that the jittered table is reproducible.
random.seed(2022)
physical_df = pd.read_csv(join(FEATURES, 'Ad_table_physical_filled.csv'))
dimension_columns = ('Wheelbase', 'Length', 'Width', 'Height')
# Columns are processed in this order, one value at a time, which fixes the
# order in which random numbers are drawn.
for attr in dimension_columns:
    jittered = physical_df[attr].apply(add_jitter)
    physical_df[attr] = jittered
physical_df.to_csv(join(FEATURES, 'Ad_table_physical_filled_jittered_50.csv'), index=False)

In [None]:
# Known data errors in the source table (already FIXED):
# - Ford Ranger (29_30): wrong height, leading 1 missing (805.0 instead of 1805.0)
# - Mercedes Benz (59_29): wrong wheelbase, 5246.0 instead of 3106
# - Kia Rio (43_9): wrong wheelbase, 4065.0 instead of 2580

physical_attrs = ['Wheelbase', 'Height', 'Width', 'Length']
physical_df = pd.read_csv(join(FEATURES, 'Ad_table_physical_filled_jittered_50.csv'))[['Adv_ID'] + physical_attrs]
for v in ['_all_views']:
    for split in ['train', 'val', 'test']:
        features_df = pd.read_csv(join(FEATURES, f'dvm_full_features_{split}_noOH{v}.csv'))
        # Attach the physical attributes to every feature row via 'Adv_ID'.
        merged_df = features_df.merge(physical_df, on='Adv_ID')
        # Explicit copy: this frame is modified below and must not be a
        # slice of merged_df (avoids chained assignment).
        physical_only_df = merged_df[physical_attrs + ['Bodytype']].copy()

        # Every merged row must have a non-missing, non-zero value per attribute.
        for attr in physical_attrs:
            assert merged_df[attr].isna().sum()==0
            assert (merged_df[attr]==0).sum()==0

        # Normalize physical attributes (z-score).
        # NOTE(review): mean and std are computed per split, so val and test
        # are normalized with their own statistics — confirm this is intended.
        for attr in physical_attrs:
            merged_df[attr] = (merged_df[attr]-merged_df[attr].mean())/merged_df[attr].std()
            physical_only_df[attr] = (physical_only_df[attr]-physical_only_df[attr].mean())/physical_only_df[attr].std()

        # Drop unwanted cols
        non_feature_columns = ['Adv_ID', 'Image_name', 'Genmodel_ID']
        if v == '_all_views':
            non_feature_columns.append('Predicted_viewpoint')
        merged_df = merged_df.drop(non_feature_columns, axis=1)

        # Move the last four columns (the merged physical attributes) to the front.
        merged_df_cols = merged_df.columns.tolist()
        rearranged_cols = merged_df_cols[-4:]+merged_df_cols[:-4]
        merged_df = merged_df[rearranged_cols]
        check_or_save(merged_df, join(FEATURES,f'dvm_features_{split}_noOH{v}_physical_jittered_50.csv'), index=False, header=False)
        check_or_save(physical_only_df, join(FEATURES,f'dvm_features_{split}_noOH{v}_physical_only_jittered_50.csv'), index=False, header=False)
    # Prepend one length-1 entry per physical attribute to the stored lengths.
    lengths = torch.load(join(FEATURES,f'tabular_lengths{v}.pt'))
    new_lengths = [1,1,1,1]
    lengths = new_lengths + lengths
    check_or_save(lengths, join(FEATURES,f'tabular_lengths{v}_physical.pt'))
    # Lengths for the physical-only table: four attributes plus 'Bodytype'
    # (13 is presumably the number of body types — confirm).
    lengths = [1,1,1,1,13]
    check_or_save(lengths, join(FEATURES,f'tabular_lengths{v}_physical_only.pt'))

# Add Labels to Features

In [None]:
# Append the label as an extra last column of the train and val feature
# tables, and add its cardinality to the stored lengths.
for v in ['_all_views']:
    # Largest label + 1 over all processed splits. Using only the labels of
    # the last split would undercount if that split lacks the highest label.
    num_label_values = 0
    for split in ['train', 'val']:
        labels = torch.load(join(FEATURES,f'labels_model_all_{split}{v}.pt'))
        features = pd.read_csv(join(FEATURES,f'dvm_features_{split}_noOH{v}_physical_jittered_50.csv'), header=None)
        features['label'] = labels
        check_or_save(features, join(FEATURES,f'dvm_features_{split}_noOH{v}_physical_jittered_50_labeled.csv'), index=False, header=False)
        num_label_values = max(num_label_values, max(labels)+1)
    lengths = torch.load(join(FEATURES,f'tabular_lengths{v}_physical.pt'))
    lengths.append(num_label_values)
    check_or_save(lengths, join(FEATURES,f'tabular_lengths{v}_physical_labeled.pt'))