In [1]:
# import relevant packages
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [2]:
# set random state so as to get reproducible results
np.random.seed(2021)

In [3]:
# read label info from file and convert to list
label_info = pd.read_csv("label_info.txt", delimiter="\t", index_col="number")
label_ls = label_info['name'].tolist()

In [4]:
# read labels of images from gtlabels file
with open("gtlabels.txt") as ex_labels_file:
    ex_labels_str = ex_labels_file.read().strip()
ex_labels = ex_labels_str.split("\n")
ex_labels = [line.split() for line in ex_labels]

In [5]:
# initialise dataframe from labels info and image info
df = pd.DataFrame (ex_labels,columns=['filename']+label_ls)
df

Unnamed: 0,filename,Partylife,Family_Friends,Beach_Holidays,Building_Sights,Snow,Citylife,Landscape_Nature,Sports,Desert,...,old_person,happy,funny,euphoric,active,scary,unpleasant,melancholic,inactive,calm
0,0039b5a7-c1ad-423a-92a0-3f38558043a2.jpg,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
1,003eaf28-898d-404f-abe5-e8e86d422fa2.jpg,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,1,0
2,005f4848-780a-4d31-8c09-4abdfd46804c.jpg,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,1,1,1
3,00a53424-5a0d-4c9d-ac90-242a6099fc35.jpg,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
4,00b5c2a5-2b8f-492b-ad5d-ed3310e6da56.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,{FFA81171-399C-432E-8373-753B7FC5F85B}.jpg,0,0,0,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,1
7996,{FFB02EA2-96FB-4C60-ADEF-C57C0CC2266E}.jpg,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
7997,{FFB61D93-7A87-487B-8478-58B78E878823}.jpg,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7998,{FFC6F015-ABF9-4AD9-ACC9-0941FD14A251}.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [6]:
# take out irrelevant classes from dataframe
chosen_classes = ["Spring", "Summer", "Autumn"]
df = df.filter(["filename"]+chosen_classes)
df

Unnamed: 0,filename,Spring,Summer,Autumn
0,0039b5a7-c1ad-423a-92a0-3f38558043a2.jpg,0,0,0
1,003eaf28-898d-404f-abe5-e8e86d422fa2.jpg,0,0,0
2,005f4848-780a-4d31-8c09-4abdfd46804c.jpg,0,0,0
3,00a53424-5a0d-4c9d-ac90-242a6099fc35.jpg,0,0,0
4,00b5c2a5-2b8f-492b-ad5d-ed3310e6da56.jpg,0,0,0
...,...,...,...,...
7995,{FFA81171-399C-432E-8373-753B7FC5F85B}.jpg,0,1,0
7996,{FFB02EA2-96FB-4C60-ADEF-C57C0CC2266E}.jpg,0,0,0
7997,{FFB61D93-7A87-487B-8478-58B78E878823}.jpg,0,0,0
7998,{FFC6F015-ABF9-4AD9-ACC9-0941FD14A251}.jpg,0,0,0


In [7]:
# change labels from str to int
df = df.astype({'Spring': 'int32', 'Summer': 'int32', 'Autumn': 'int32'})
df

Unnamed: 0,filename,Spring,Summer,Autumn
0,0039b5a7-c1ad-423a-92a0-3f38558043a2.jpg,0,0,0
1,003eaf28-898d-404f-abe5-e8e86d422fa2.jpg,0,0,0
2,005f4848-780a-4d31-8c09-4abdfd46804c.jpg,0,0,0
3,00a53424-5a0d-4c9d-ac90-242a6099fc35.jpg,0,0,0
4,00b5c2a5-2b8f-492b-ad5d-ed3310e6da56.jpg,0,0,0
...,...,...,...,...
7995,{FFA81171-399C-432E-8373-753B7FC5F85B}.jpg,0,1,0
7996,{FFB02EA2-96FB-4C60-ADEF-C57C0CC2266E}.jpg,0,0,0
7997,{FFB61D93-7A87-487B-8478-58B78E878823}.jpg,0,0,0
7998,{FFC6F015-ABF9-4AD9-ACC9-0941FD14A251}.jpg,0,0,0


In [8]:
# filter out rows with only chosen labels
df = df[df["Spring"] + df["Summer"] + df["Autumn"] == 1]
df

Unnamed: 0,filename,Spring,Summer,Autumn
16,01d88c87-f2e3-4a7f-be26-b38c549b5e7d.jpg,1,0,0
33,03b0bf90-f66b-41ad-b7ae-0e9f818cb9fd.jpg,0,1,0
37,03e489e7-7c0f-48f3-971d-b44b7e026f25.jpg,0,1,0
43,040b9239-da83-462b-991e-c4f2a1898a62.jpg,0,1,0
48,04c01e0c-6ab4-416d-9417-8796283521f4.jpg,0,0,1
...,...,...,...,...
7969,{FE042A4E-B85C-49EF-82E8-3D5C922A7E8D}.jpg,0,1,0
7978,{FECAD3F9-8EE7-43C1-932E-DD804E4B796B}.jpg,0,1,0
7984,{FF4B4F7C-BBEB-4224-B15E-E514D3CB1949}.jpg,0,1,0
7988,{FF781858-65E9-446F-A089-BB26716FFE2E}.jpg,0,1,0


In [9]:
# add image features from respective .npy files
image_path = "./imagefeatures"
features_col = []
for _, row in df.iterrows():
    features_col.append(np.load(f"{image_path}/{row['filename']}_ft.npy"))
df["features"] = features_col
df

Unnamed: 0,filename,Spring,Summer,Autumn,features
16,01d88c87-f2e3-4a7f-be26-b38c549b5e7d.jpg,1,0,0,"[0.00048693782, 0.00509194, 0.0028135052, 0.00..."
33,03b0bf90-f66b-41ad-b7ae-0e9f818cb9fd.jpg,0,1,0,"[0.0002985283, 0.0064952117, 0.0053253854, 0.0..."
37,03e489e7-7c0f-48f3-971d-b44b7e026f25.jpg,0,1,0,"[0.00049501075, 0.004573596, 0.003921549, 0.00..."
43,040b9239-da83-462b-991e-c4f2a1898a62.jpg,0,1,0,"[0.00014986246, 0.0066579483, 0.0009286591, 0...."
48,04c01e0c-6ab4-416d-9417-8796283521f4.jpg,0,0,1,"[0.00050134765, 0.0032292355, 0.0039382987, 0...."
...,...,...,...,...,...
7969,{FE042A4E-B85C-49EF-82E8-3D5C922A7E8D}.jpg,0,1,0,"[0.00035466423, 0.004972555, 0.00472961, 0.001..."
7978,{FECAD3F9-8EE7-43C1-932E-DD804E4B796B}.jpg,0,1,0,"[0.0004272918, 0.0034273197, 0.0026839902, 0.0..."
7984,{FF4B4F7C-BBEB-4224-B15E-E514D3CB1949}.jpg,0,1,0,"[0.00033894225, 0.0045505385, 0.0032907669, 0...."
7988,{FF781858-65E9-446F-A089-BB26716FFE2E}.jpg,0,1,0,"[0.00031476427, 0.0069268006, 0.0034461773, 0...."


In [10]:
def train_val_test_split_y(df_full, class_label):
    """ To perform train-val-test split for positive output of a certain class.

        Args: 
            df_full - (pandas.core.frame.DataFrame) full dataset
            class_label - (str) class (i.e. "Spring", "Summer" or "Autumn")
        
        Returns:
            class_df_test, class_df_val, class_df_train - (pandas.core.frame.DataFrame) dataframes after split is performed
    """
    class_df = df_full.loc[df_full[class_label] == 1]
    class_df = class_df.sample(frac=1)
    no_of_rows = len(class_df)
    test_end = no_of_rows // 5
    val_end = no_of_rows // 20 * 3 + test_end
    class_df_test, class_df_val, class_df_train = class_df[:test_end], \
                                                  class_df[test_end:val_end], \
                                                  class_df[val_end:]
    return class_df_test, class_df_val, class_df_train

In [11]:
def train_val_test_split_all(df_full, class_labels):
    """ To perform train-val-test split for a certain class.

        Args: 
            df_full - (pandas.core.frame.DataFrame) full dataset
            class_labels - (list) chosen classes (i.e. ["Spring", "Summer", "Autumn"])
        
        Returns:
            df_test, df_val, df_train - (pandas.core.frame.DataFrame) dataframes after split is performed
    """
    df_train_ls, df_val_ls, df_test_ls = [], [], []
    for class_label in class_labels:
        df_test_pos, df_val_pos, df_train_pos = train_val_test_split_y(df_full, class_label)
        df_train_ls.append(df_train_pos)
        df_val_ls.append(df_val_pos)
        df_test_ls.append(df_test_pos)
    df_train = pd.concat(df_train_ls).sample(frac=1)
    df_val = pd.concat(df_val_ls).sample(frac=1)
    df_test = pd.concat(df_test_ls).sample(frac=1)
    return df_train, df_val, df_test

In [12]:
# split dataset into train, val and test sets
df_train, df_val, df_test = train_val_test_split_all(df, chosen_classes)
print("--- train set ---")
print(df_train.describe())
print("--- val set ---")
print(df_val.describe())
print("--- test set ---")
print(df_test.describe())

--- train set ---
           Spring      Summer      Autumn
count  749.000000  749.000000  749.000000
mean     0.092123    0.771696    0.136182
std      0.289393    0.420020    0.343210
min      0.000000    0.000000    0.000000
25%      0.000000    1.000000    0.000000
50%      0.000000    1.000000    0.000000
75%      0.000000    1.000000    0.000000
max      1.000000    1.000000    1.000000
--- val set ---
           Spring      Summer      Autumn
count  168.000000  168.000000  168.000000
mean     0.089286    0.785714    0.125000
std      0.286008    0.411553    0.331708
min      0.000000    0.000000    0.000000
25%      0.000000    1.000000    0.000000
50%      0.000000    1.000000    0.000000
75%      0.000000    1.000000    0.000000
max      1.000000    1.000000    1.000000
--- test set ---
           Spring      Summer      Autumn
count  228.000000  228.000000  228.000000
mean     0.092105    0.776316    0.131579
std      0.289811    0.417630    0.338776
min      0.000000    0.00

In [13]:
def fit_model(X, y, reg_constant, kernel):
    """ To fit the SVM model.

        Args: 
            X - (pandas.core.series.Series) features
            y - (pandas.core.series.Series) labels
            reg_constant - (int) regularisation constant to use with SVM
            kernel - (str) kernel mode to use
        
        Returns:
            (sklearn.pipeline.Pipeline object) model which can be used to make predictions
    """
    clf = make_pipeline(StandardScaler(), \
                        SVC(kernel=kernel, C=reg_constant, probability=True))
    X = np.stack(X.to_list(), axis=0)
    return clf.fit(X, y)

In [14]:
def get_vanilla_accuracy(y_pred, y_true):
    """ To get vanilla accuracy.

        Args: 
            y_pred - (pandas.core.series.Series) predicted labels
            y_true - (pandas.core.series.Series) true labels
        
        Returns:
            (float) vanilla accuracy
    """
    y_pred, y_true = np.array(y_pred), np.array(y_true)
    check_label = np.max(y_pred & y_true, axis=1)
    return np.sum(check_label) / len(y_true)

In [15]:
def get_a_c(y_pred, y_true):
    """ To calculate a_c as written in handout.

        Args:
            y_pred - (pandas.core.series.Series) predicted labels
            y_true - (pandas.core.series.Series) true labels
        
        Returns:
            a_c as given by equation in handout
    """
    y_pred, y_true = np.array(y_pred), np.array(y_true)
    true_positive = y_pred & y_true == 1
    return np.sum(true_positive) / np.sum(y_true)

In [16]:
def get_class_wise_avg_acc(y_pred, y_true):
    """ To calculate class-wise averaged accuracy.

        Args:
            y_pred - (pandas.core.frame.DataFrame) predicted labels of multiple classes
            y_true - (pandas.core.frame.DataFrame) true labels of multiple classes
        
        Returns:
            (float) class-wise averaged accuracy
    """
    sum_a_c = 0
    for class_label in y_true.columns:
        sum_a_c += get_a_c(y_pred[class_label], y_true[class_label])
    return sum_a_c / len(y_true.columns)

In [17]:
def get_best_reg_constant(train_set, val_set, reg_const_ls, class_labels, kernel):
    """ To get the best regularisation constant.

        Args: 
            train_set - (pandas.core.frame.DataFrame) train set with labels and features
            val_set - (pandas.core.frame.DataFrame) val set with labels and features
            reg_c_ls - (list) regularisation constants
            class_labels - (list) chosen classes (i.e. ["Spring", "Summer", "Autumn"])

        
        Returns:
            (float) best regularisation constant
    """
    reg_const_acc_dict = {}
    for constant in reg_const_ls:
        print(f"Using regularisation constant {constant}...")
        y_store = val_set.copy().replace(1.0, np.NaN)
        y_store = y_store.replace(0.0, np.NaN)
        for class_label in class_labels:
            model = fit_model(train_set["features"], train_set[class_label], constant, kernel)
            X_val = np.stack(val_set["features"].to_list(), axis=0)
            pred_probs = model.predict_proba(X_val)
            y_store[class_label] = pred_probs[:, 1]
        y_store[class_labels] = (y_store[class_labels] == y_store[class_labels].max(axis=1)[:, None]).astype(int)
        vanilla_acc = get_vanilla_accuracy(y_store[class_labels], val_set[class_labels])
        print(f"Vanilla accuracy is {vanilla_acc}.")
        class_wise_avg_acc = get_class_wise_avg_acc(y_store[class_labels], val_set[class_labels])
        print(f"Class-wise averaged accuracy is {class_wise_avg_acc}.")
        reg_const_acc_dict[constant] = class_wise_avg_acc
    return max(reg_const_acc_dict, key=reg_const_acc_dict.get)

In [18]:
# initialise reg_constants to store all possible constants
reg_constants = [0.01, 0.1, 0.1**0.5, 1, 10**0.5, 10, 100]
reg_constants

[0.01, 0.1, 0.31622776601683794, 1, 3.1622776601683795, 10, 100]

In [19]:
# initialise list of kernels to try out
kernels = ["linear", "rbf"]
kernels

['linear', 'rbf']

In [20]:
# iterate through all classes to get the best regularisation constant
best_reg_consts = {}
for kernel in kernels:
    print(f"--- For kernel mode '{kernel}' ---")
    best_reg_consts[kernel] = get_best_reg_constant(df_train, df_val, reg_constants, chosen_classes, kernel)
    print(f"Best regularisation constant found is {best_reg_consts[kernel]}.")

--- For kernel mode 'linear' ---
Using regularisation constant 0.01...
Vanilla accuracy is 0.8392857142857143.
Class-wise averaged accuracy is 0.4958874458874459.
Using regularisation constant 0.1...
Vanilla accuracy is 0.8333333333333334.
Class-wise averaged accuracy is 0.4933621933621934.
Using regularisation constant 0.31622776601683794...
Vanilla accuracy is 0.8333333333333334.
Class-wise averaged accuracy is 0.4933621933621934.
Using regularisation constant 1...
Vanilla accuracy is 0.8273809523809523.
Class-wise averaged accuracy is 0.49083694083694085.
Using regularisation constant 3.1622776601683795...
Vanilla accuracy is 0.8333333333333334.
Class-wise averaged accuracy is 0.4933621933621934.
Using regularisation constant 10...
Vanilla accuracy is 0.8273809523809523.
Class-wise averaged accuracy is 0.49083694083694085.
Using regularisation constant 100...
Vanilla accuracy is 0.8333333333333334.
Class-wise averaged accuracy is 0.4933621933621934.
Best regularisation constant foun

In [21]:
def get_final_model(df_train, df_val, class_label, reg_constant, kernel):
    """ To get final model with best reg constant.

        Args: 
            df_train - (pandas.core.frame.DataFrame) training set
            df_val - (pandas.core.frame.DataFrame) validation set 
            class_label - (str) class (i.e. "Spring", "Summer" or "Autumn")
            reg_constant - (int) regularisation constant to use with SVM
            kernel - (str) kernel mode to use
        
        Returns:
            (sklearn.pipeline.Pipeline object) model which can be used to make predictions
    """
    df_train_val = df_train.append(df_val).sample(frac=1)
    return fit_model(df_train_val["features"], df_train_val[class_label], reg_constant, kernel)

In [22]:
# get test vanilla accuracy values and class-wise averaged accuracy
for kernel in kernels:
    print(f"--- For kernel mode '{kernel}' ---")
    y_store = df_test.copy().replace(1.0, np.NaN)
    y_store = y_store.replace(0.0, np.NaN)
    for class_label in chosen_classes:
        model = get_final_model(df_train, df_val, class_label, best_reg_consts[kernel], kernel)
        X_test = np.stack(df_test["features"].to_list(), axis=0)
        pred_probs = model.predict_proba(X_test)
        y_store[class_label] = pred_probs[:, 1]
    y_store[chosen_classes] = (y_store[chosen_classes] == y_store[chosen_classes].max(axis=1)[:, None]).astype(int)
    vanilla_acc = get_vanilla_accuracy(y_store[chosen_classes], df_test[chosen_classes])
    print(f"Vanilla accuracy is {vanilla_acc}.")
    class_wise_avg_acc = get_class_wise_avg_acc(y_store[chosen_classes], df_test[chosen_classes])
    print(f"Class-wise averaged accuracy is {class_wise_avg_acc}.")

--- For kernel mode 'linear' ---
Vanilla accuracy is 0.8070175438596491.
Class-wise averaged accuracy is 0.4480225988700565.
--- For kernel mode 'rbf' ---
Vanilla accuracy is 0.8377192982456141.
Class-wise averaged accuracy is 0.5448479956954534.


In [23]:
# save train-val-test split as .npy files
save_folder = "./dataset_splits"
dataset_types = [("train", df_train), ("val", df_val), ("test", df_test)]
for ds_type, ds in dataset_types:
    np.save(f"{save_folder}/{ds_type}", ds.to_numpy())