In [8]:
# Script to preprocess the mscoco dataset 
# We are using the Gluoncv library for interfacing with the MSCOCO dataset
# Follow the instructions here on how to download the dataset: 
# https://gluon-cv.mxnet.io/build/examples_datasets/mscoco.html

"""
Requires pycocotools (the COCO Python API): run 'git clone https://github.com/pdollar/coco.git'
and then 'make' inside its PythonAPI directory.
"""

import operator
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sn
from gluoncv import data, utils
from skimage.transform import resize
from sklearn.preprocessing import MultiLabelBinarizer

# Seed Python's RNG so that the label hiding done with `random.sample` is reproducible.
# `random.seed` must be *called*: assigning to it (`random.seed = 11`) only replaces the
# function with an integer and leaves the generator unseeded.
random.seed(11)


# Root directory under which the preprocessed images and the label matrix are written.
SAVING_PATH = "/srv/workspace/research/mlml/datasets/mscoco/"

In [3]:
# Sanity check: load both COCO detection splits and report how many images each holds
train_dataset = data.COCODetection(splits=['instances_train2017'])
val_dataset = data.COCODetection(splits=['instances_val2017'])
num_train_images = len(train_dataset)
num_val_images = len(val_dataset)
print('Num of training images:', num_train_images)
print('Num of validation images:', num_val_images)

loading annotations into memory...
Done (t=16.66s)
creating index...
index created!
loading annotations into memory...
Done (t=0.44s)
creating index...
index created!
Num of training images: 117266
Num of validation images: 4952


In [4]:
# Extract only the classification labels: column 4 of each image's annotation array
# (presumably the class id; the remaining columns are discarded here).
# Note that iterating the dataset yields (image, annotations) items, so this pass
# walks every training sample.
labels_list = []
for sample in train_dataset:
    labels_list.append(sample[1][:, 4:5].ravel())

# Convert to a binary indicator matrix: one row per image, one column per class id
mlb = MultiLabelBinarizer()
binarized_labels = mlb.fit_transform(labels_list)

# Wrap in a pandas dataframe whose columns are the class names, ordered by the
# integer each name maps to in `index_map`
labels_names = train_dataset.index_map
sorted_labels_names = sorted(labels_names.items(), key=lambda item: item[1])
sorted_names = [name for name, _ in sorted_labels_names]
labels_df = pd.DataFrame(binarized_labels, columns=sorted_names)


In [20]:
# Preprocess the training images and save each one as an .npz file.
# Pipeline (in the order the code applies it):
#   scale to [0, 1] -> subtract per-channel mean / divide by per-channel std
#   -> resize to INPUT_SHAPE -> cast to float32 -> save under the key "image"
# [TODO] edit the saving path as needed

INPUT_SHAPE = (224, 224, 3)
INPUT_IMAGE_MEAN = [0.485, 0.456, 0.406]
INPUT_IMAGE_STD = [0.229, 0.224, 0.225]

# np.savez does not create missing directories, so make sure the target exists first.
train_npz_dir = os.path.join(SAVING_PATH, "train_formatted_normalized_npz")
os.makedirs(train_npz_dir, exist_ok=True)

counter = 0  # number of images processed so far; stays 0 if the dataset is empty
for counter, (image, _label) in enumerate(train_dataset, start=1):
    image = image.asnumpy().astype(np.float32)
    image /= 255.0
    image = (image - INPUT_IMAGE_MEAN) / INPUT_IMAGE_STD
    image = resize(image, INPUT_SHAPE).astype(np.float32)
    # Files are named after the 0-based position of the image in the dataset
    # (0.npz, 1.npz, ...); np.savez appends the ".npz" extension itself.
    np.savez(os.path.join(train_npz_dir, str(counter - 1)), image=image)
    # Lightweight progress report every 500 images
    if counter % 500 == 0:
        print(counter)

500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
12000
12500
13000
13500
14000
14500
15000
15500
16000
16500
17000
17500
18000
18500
19000
19500
20000
20500
21000
21500
22000
22500
23000
23500
24000
24500
25000
25500
26000
26500
27000
27500
28000
28500
29000
29500
30000
30500
31000
31500
32000
32500
33000
33500
34000
34500
35000
35500
36000
36500
37000
37500
38000
38500
39000
39500
40000
40500
41000
41500
42000
42500
43000
43500
44000
44500
45000
45500
46000
46500
47000
47500
48000
48500
49000
49500
50000
50500
51000
51500
52000
52500
53000
53500
54000
54500
55000
55500
56000
56500
57000
57500
58000
58500
59000
59500
60000
60500
61000
61500
62000
62500
63000
63500
64000
64500
65000
65500
66000
66500
67000
67500
68000
68500
69000
69500
70000
70500
71000
71500
72000
72500
73000
73500
74000
74500
75000
75500
76000
76500
77000
77500
78000
78500
79000
79500
80000
80500
81000
81500
82000
82500
83000
83500
84000
84500
85000


In [37]:
# Save the ground-truth matrix.
# `reset_index` turns the row index into an explicit "index" column (the loader of this
# CSV drops "Unnamed: 0" and "index"). The guard keeps the cell idempotent: without it a
# second run would insert an extra "level_0" column that ends up in the CSV as if it
# were a label.
if "index" not in labels_df.columns:
    labels_df.reset_index(level=0, inplace=True)
labels_df.to_csv(SAVING_PATH + '/binarized_labels_normalized.csv')

## Creating ratios and splits at the same time

In [73]:
# Create the artificial missing labels 
def hide_labels_per_sample(train_labels,ratio_of_hidden_samples = 0.4):
    """Randomly hide a fraction of the positive labels of every sample.

    Parameters
    ----------
    train_labels : array-like, 2-D
        Binary label matrix, one row per sample and one column per label.
        The input is not modified.
    ratio_of_hidden_samples : float, default 0.4
        Fraction of each row's positive labels to hide, expected in [0, 1].
        The per-row count is ``np.round(n_positives * ratio)``; ``np.round``
        rounds halves to the nearest even integer (e.g. 2 positives at ratio
        0.25 hide 0 labels).

    Returns
    -------
    labels_with_missing_positives : np.ndarray
        Copy of ``train_labels`` with the selected positives set to 0.
    train_negative_weights : np.ndarray
        Array of ones with the same shape/dtype, set to 0 at the hidden positions.

    Raises
    ------
    ValueError
        From ``random.sample`` if more labels are requested than a row has
        positives (i.e. for a ratio above 1).

    Notes
    -----
    Labels are picked with the ``random`` module, so results depend on its state.
    """
    train_labels = np.asarray(train_labels)
    number_of_labels_to_hide_per_sample = np.round(np.sum(train_labels,axis=1)*ratio_of_hidden_samples)
    train_negative_weights = np.zeros_like(train_labels) + 1
    labels_with_missing_positives = np.copy(train_labels)
    # Iterate over the argument itself, not over a module-level array, so the
    # function works on whatever matrix the caller passes in.
    for idx, sample in enumerate(train_labels):
        positive_indices = list(np.nonzero(sample)[0])
        indices_to_hide = random.sample(positive_indices, int(number_of_labels_to_hide_per_sample[idx]))
        labels_with_missing_positives[idx][indices_to_hide] = 0
        train_negative_weights[idx][indices_to_hide] = 0
    return labels_with_missing_positives, train_negative_weights

def get_positive_weights(train_labels, pos_weights = 1):
    """Build a weight matrix shaped like `train_labels` with every entry equal to `pos_weights`.

    The label values themselves are ignored; only the shape (and starting dtype)
    of `train_labels` is used. Pass a larger `pos_weights` to up-weight positives.
    """
    weights_template = np.zeros_like(train_labels)
    return weights_template + pos_weights

### Work only on images with 4 or more labels

In [59]:
# For each split, create missing labels of ratios: 0.0, 0.25, 0.5, 0.75
# NOTE(review): `iterative_train_test_split` is not imported in the visible cells
# (presumably scikit-multilearn's `skmultilearn.model_selection`) - confirm and add
# the import to the import cell.
SAVE_DIRECTORY = '/srv/workspace/research/mlml/Sample-level-weighted-loss/labels_balanced_4labels/'
global_labels = pd.read_csv('/srv/workspace/research/mlml/datasets/mscoco/binarized_labels_normalized.csv')
global_labels.drop(["Unnamed: 0", "index"], axis=1, inplace=True)

# keep only images with 4 or more labels
label_counts = global_labels.sum(axis=1)
global_labels = global_labels[label_counts >= 4]
labels_array = global_labels.values

# making splits
# the image ids are duplicated into two columns because iterative splits expects >1 dim
image_ids = global_labels.index.values
train_indices = np.asarray([image_ids, image_ids]).T
# first cut the data into two halves ...
X_half1, y_half1, X_half2, y_half2 = iterative_train_test_split(train_indices, labels_array, test_size = 0.5)
# ... then cut each half again, giving four quarters
X_quarter1, y_quarter1, X_quarter2, y_quarter2 = iterative_train_test_split(X_half1, y_half1, test_size = 0.5)
X_quarter3, y_quarter3, X_quarter4, y_quarter4 = iterative_train_test_split(X_half2, y_half2, test_size = 0.5)


def _quarter_frame(quarter_ids, quarter_labels):
    """Label dataframe for one quarter, indexed by the first column of `quarter_ids`."""
    return pd.DataFrame(quarter_labels, columns=global_labels.columns, index=quarter_ids[:, 0])


split1 = _quarter_frame(X_quarter1, y_quarter1)
split2 = _quarter_frame(X_quarter2, y_quarter2)
split3 = _quarter_frame(X_quarter3, y_quarter3)
split4 = _quarter_frame(X_quarter4, y_quarter4)

# Four folds: every quarter is held out as the test set once, the other three are the train set
train1, test1 = pd.concat([split1, split2, split3]), split4
train2, test2 = pd.concat([split1, split2, split4]), split3
train3, test3 = pd.concat([split1, split3, split4]), split2
train4, test4 = pd.concat([split2, split3, split4]), split1

In [74]:
# Create the artificial labels.
# For every hiding ratio and every train/test fold this writes, into
# <SAVE_DIRECTORY>/missing_labels<ratio>/ :
#   train<fold>_<ratio>.csv             train labels with part of the positives hidden
#   negative_weights<fold>_<ratio>.csv  1 everywhere, 0 where a positive was hidden
#   positive_weights<fold>_<ratio>.csv  positive weights of that fold (written without index)
#   positive_weights<ratio>.csv         legacy name, overwritten by every fold (see below)
#   test<fold>_<ratio>.csv              untouched test labels of that fold
# The <ratio> tag is str(round(ratio, 1)); for the ratios 0.25 and 0.75 this rounds to
# "0.2" and "0.8", so the tags are 0.0, 0.2, 0.5, 0.8. Kept as is so existing output
# paths do not change.


def _frame_like(template_df, values):
    """Return a copy of `template_df` (same index and columns) with its cells replaced by `values`."""
    frame = template_df.copy()
    frame.iloc[:, :] = values
    return frame


ratios_to_hide = np.arange(0,1,0.25)
cv_folds = [(1, train1, test1), (2, train2, test2), (3, train3, test3), (4, train4, test4)]
for ratio in ratios_to_hide:
    ratio_tag = str(round(ratio, 1))
    ratio_save_dir = os.path.join(SAVE_DIRECTORY, 'missing_labels' + ratio_tag)
    os.makedirs(ratio_save_dir, exist_ok=True)

    for fold_number, train_split, test_split in cv_folds:
        fold_tag = str(fold_number)

        # Kept as a module-level rebinding (as in the unrolled version of this cell)
        # because code outside this cell may read `labels_array`.
        labels_array = train_split.values
        labels_with_missing_positives, train_negative_weights = hide_labels_per_sample(labels_array, ratio)

        labels_with_missing_positives_df = _frame_like(train_split, labels_with_missing_positives)
        labels_with_missing_positives_df.to_csv(
            os.path.join(ratio_save_dir, 'train' + fold_tag + '_' + ratio_tag + '.csv'))

        train_negative_weights_df = _frame_like(train_split, train_negative_weights)
        train_negative_weights_df.to_csv(
            os.path.join(ratio_save_dir, 'negative_weights' + fold_tag + '_' + ratio_tag + '.csv'))

        train_positive_weights = get_positive_weights(labels_with_missing_positives)
        train_positive_weights_df = _frame_like(train_split, train_positive_weights)
        # The file name used so far carries no fold number, so each fold overwrote the
        # previous one and only the last fold's weights survived. Write a per-fold file
        # as well, and keep the legacy name so existing consumers still find it.
        train_positive_weights_df.to_csv(
            os.path.join(ratio_save_dir, 'positive_weights' + fold_tag + '_' + ratio_tag + '.csv'),
            index = False)
        train_positive_weights_df.to_csv(
            os.path.join(ratio_save_dir, 'positive_weights' + ratio_tag + '.csv'),
            index = False)

        test_split.to_csv(os.path.join(ratio_save_dir, 'test' + fold_tag + '_' + ratio_tag + '.csv'))

## Creating correlation weights


In [150]:
# NOTE(review): `hot_encoded` and `LABELS_LIST` are not defined in the visible cells.
# From the indexing below, `hot_encoded` is a dataframe whose first column is an id
# and whose remaining columns are the binary labels - confirm where they are created.


def _pattern_negative_weights(label_matrix, n_labels):
    """Co-occurrence based negative weights for every (sample, label) pair.

    For each sample, the "pattern" is the set of columns equal to 1 in its row.
    With ``total`` the number of rows that have all pattern columns equal to 1 and
    ``with_label`` the number of those rows that also have a given label equal to 1,
    the weight of that label is ``(total - with_label) / total``.

    A label that is already positive for the sample belongs to the pattern, so
    ``with_label == total`` and its weight is 0. ``total`` is at least 1 because
    the sample always matches its own pattern.

    Parameters
    ----------
    label_matrix : array-like, 2-D
        One row per sample; cells equal to 1 mark positive labels.
    n_labels : int
        Number of leading columns of `label_matrix` to produce weights for.

    Returns
    -------
    np.ndarray of shape (n_samples, n_labels), dtype float.
    """
    label_matrix = np.asarray(label_matrix)
    is_positive = label_matrix == 1
    weights = np.zeros([len(label_matrix), n_labels])
    for sample_idx in range(len(label_matrix)):
        pattern_columns = np.where(is_positive[sample_idx])[0]
        # Rows whose labels include every positive label of the current sample.
        # This does not depend on the candidate label, so it is computed once per sample.
        matches_pattern = is_positive[:, pattern_columns].all(axis=1)
        total_occurances_of_pattern = matches_pattern.sum()
        # For each label: how many of the matching rows also carry that label
        positive_samples = is_positive[matches_pattern][:, :n_labels].sum(axis=0)
        weights[sample_idx] = (total_occurances_of_pattern - positive_samples) / total_occurances_of_pattern
    return weights


# Skip the first column of `hot_encoded` (the id column) and work on the raw label
# values; plain arrays avoid integer-key assignment on a label-indexed Series.
negative_weights = _pattern_negative_weights(hot_encoded.iloc[:, 1:].values, len(LABELS_LIST))
negative_weights_df = pd.DataFrame(negative_weights, columns=LABELS_LIST)

In [156]:
# Persist the global negative weights, re-indexed with the row index of `global_labels`
negative_weights_df.index = global_labels.index
negative_weights_global_path = os.path.join('/srv/workspace/research/mlml/Sample-level-weighted-loss/labels_balanced_4labels/','negative_weights_global.csv')
negative_weights_df.to_csv(negative_weights_global_path)