In [1]:
import pandas as pd

from global_constants import *

from os.path import join

import csv

from scipy.spatial.distance import cdist

from sklearn.neural_network import MLPClassifier

import matplotlib.image as mpimg

from sklearn import metrics
from sklearn.model_selection import train_test_split

import pickle

from sklearn.decomposition import PCA

from matplotlib import pyplot as plt
%pylab inline

# constants
# NOTE(review): presumably the number of top-scoring labels kept per row when
# passed to normalize_rows (same parameter name); no call site is visible in
# this notebook -- confirm before relying on it.
minimum_number_of_labels = 51

Populating the interactive namespace from numpy and matplotlib


In [2]:
def get_normalised_prob(prediction_weights):
    """Rescale a sequence of weights so that its entries add up to one.

    Returns a NumPy array in which every weight has been divided by the
    total of all weights.
    """
    total_weight = sum(prediction_weights)
    weights = np.array(prediction_weights)
    return weights / total_weight

def normalize_rows(ini_matrix, minimum_number_of_labels, binary_predictions=None):
    """Keep the largest entries of every row and rescale each row to sum to one.

    Parameters
    ----------
    ini_matrix : 2-D numpy array
        Scores, one row per sample. The input is never modified.
    minimum_number_of_labels : int
        How many of the highest-scoring columns to keep in each row; all other
        columns are zeroed. Values larger than the number of columns keep every
        column. Note that 0 also keeps every column, because the selection is
        the slice ``[-minimum_number_of_labels:]``.
    binary_predictions : unused
        Accepted only so existing call signatures keep working.

    Returns
    -------
    numpy array of the same shape, with floating-point dtype. A row whose kept
    entries sum to zero comes back as NaN (division by zero), exactly as when
    get_normalised_prob is applied to such a row.
    """
    matrix = ini_matrix.copy()
    # Rank on the untouched copy so the selected columns do not depend on the cast below.
    order = matrix.argsort()

    # Integer input would truncate the normalised values back to 0 on assignment.
    if not np.issubdtype(matrix.dtype, np.floating):
        matrix = matrix.astype(float)

    # Boolean mask of the top columns per row; replaces the per-element
    # `k not in array` membership test with one vectorised assignment.
    top_columns = order[:, -minimum_number_of_labels:]
    keep = np.zeros(matrix.shape, dtype=bool)
    keep[np.arange(len(matrix))[:, None], top_columns] = True
    matrix[~keep] = 0

    # Row-wise normalisation (same arithmetic as get_normalised_prob per row).
    return matrix / matrix.sum(axis=1, keepdims=True)

def get_ordered_data_frame(csv_path, mode='train', number_of_train_features=1000, tag="fc"):
    """Load a feature CSV and return it sorted by the number in each file name.

    Parameters
    ----------
    csv_path : str
        Path of the CSV, relative to ``data_path``.
    mode : str
        Text spliced into the file-name pattern (see the note below).
    number_of_train_features : int
        Number of feature columns in the CSV (after the leading name column).
    tag : str
        Prefix for the generated feature column names (``tag0``, ``tag1``, ...).

    Returns
    -------
    DataFrame with the feature columns plus a numeric ``file_name_index``
    column, sorted ascending by ``file_name_index``; the ``name`` column is
    dropped.
    """
    column_names = ["name"] + [tag + str(k) for k in range(number_of_train_features)]
    data_frame = pd.read_csv(join(data_path, csv_path), names=column_names)

    # Raw strings: the pattern text is unchanged, but `\/`, `\d` and `\.` are no
    # longer invalid escape sequences in a normal string literal.
    # NOTE(review): `[images_<mode>\/]` is a character class, so the lookbehind
    # accepts ANY single one of those characters before the digits rather than
    # the literal prefix "images_<mode>/" -- confirm this is what is wanted.
    index_pattern = r"((?<=[images_" + mode + r"\/])\d*(?=\.))"
    data_frame["file_name_index"] = pd.to_numeric(
        data_frame.name.str.extract(index_pattern, expand=False)
    )

    return data_frame.sort_values('file_name_index').drop(["name"], axis=1)

# Data Loading

In [3]:
# Ground Truth
# Tag tables for the train and test splits, read straight from the working directory.
better_tag_data, better_tag_test_data = (
    pd.read_csv(tag_csv)
    for tag_csv in (
        'better_train_10_tags_adj&noun&verb.csv',
        'better_test_10_tags_adj&noun&verb.csv',
    )
)

better_tag_test_data.shape

(2000, 7771)

In [30]:
# Train

# 50-component PCA fitted on the training tag table; the same fitted
# projection is reused (transform only) for the test tags further down.
pca = PCA(n_components=50)

train_tags = pd.read_csv("processed_tags.csv")
pca_train_tags = pca.fit_transform(train_tags.drop(["Name"], axis=1))

# Feature tables from the resnet1000 CSVs, sorted by the numeric index parsed
# from each file name: 1000 "fc*" columns and 2048 "pl*" columns.
train_fc_data = get_ordered_data_frame('features_train/features_resnet1000_train.csv', number_of_train_features=1000, tag="fc")
train_intermediate_data = get_ordered_data_frame('features_train/features_resnet1000intermediate_train.csv', number_of_train_features=2048, tag="pl")

# Put both feature tables side by side, matched on file_name_index.
combined_train_net_data = train_intermediate_data.join(train_fc_data.set_index('file_name_index'), on='file_name_index')
# Alternative kept for reference: join the raw tag columns instead of their PCA projection.
# tags_fc_pool_train_net_data = combined_train_net_data.join(train_tags.drop(["Name"], axis=1), on="file_name_index")
# pd.DataFrame(pca_train_tags) carries a default 0..n-1 index, so this join
# matches file_name_index against ROW POSITION in processed_tags.csv.
# NOTE(review): assumes row i of processed_tags.csv describes image i -- confirm.
pca_fc_pool_train_net_data = combined_train_net_data.join(pd.DataFrame(pca_train_tags), on="file_name_index")

# Test

# Same pipeline as above, using the PCA already fitted on the training tags.
test_tags = pd.read_csv("processed_tags_test.csv")
pca_test_tags = pca.transform(test_tags.drop(["Name"], axis=1))

test_fc_data = get_ordered_data_frame('features_test/features_resnet1000_test.csv', "test", number_of_train_features=1000, tag="fc")     
test_intermediate_data = get_ordered_data_frame('features_test/features_resnet1000intermediate_test.csv', "test", number_of_train_features=2048, tag="pl")     

combined_test_net_data = test_intermediate_data.join(test_fc_data.set_index('file_name_index'), on='file_name_index')
# tags_fc_pool_test_net_data = combined_test_net_data.join(test_tags.drop(["Name"], axis=1), on="file_name_index")
# Same positional-index caveat as for the training join above.
pca_fc_pool_test_net_data = combined_test_net_data.join(pd.DataFrame(pca_test_tags), on="file_name_index")

# Alternative feature sets tried earlier (fc only, or fc + intermediate without tags):
# X = train_fc_data.drop(["file_name_index"], axis=1).values
# X_Test = test_fc_data.drop(["file_name_index"], axis=1).values

# X = combined_train_net_data.drop(["file_name_index"], axis=1).values
# X_Test = combined_test_net_data.drop(["file_name_index"], axis=1).values

# Raw test matrix (every column except file_name_index). A later cell
# overwrites X_Test with its projection through overall_pca.
X_Test = pca_fc_pool_test_net_data.drop(["file_name_index"], axis=1).values

In [31]:
train_size = 0.8      # fraction of labelled rows used for fitting; 1.0 disables the validation split
n_components = 2000   # dimensionality kept by the overall PCA
random_seed = 42      # fixes the split and the PCA solver so a re-run reproduces the same arrays

has_validation_split = train_size != 1.0

# Rebuild the raw test matrix from its source frame instead of reading X_Test:
# this cell overwrites X_Test with the PCA projection, so re-running it would
# otherwise feed already-projected features back into the PCA.
raw_test_features = pca_fc_pool_test_net_data.drop(["file_name_index"], axis=1).values

if has_validation_split:
    # The train_size variable (not a repeated literal) drives the split.
    train_frame, val_frame, Y_train, Y_Val = train_test_split(
        pca_fc_pool_train_net_data,
        better_tag_data.values,
        train_size=train_size,
        random_state=random_seed,
    )

    # File indices and tag weights of the held-out rows, used for scoring.
    # Y_Val keeps its original weights; only Y_train is binarised below.
    gt_files = val_frame.file_name_index.values
    better_tag_val_data = Y_Val.copy()

    X_Train = train_frame.drop(["file_name_index"], axis=1).values
    X_Val = val_frame.drop(["file_name_index"], axis=1).values
else:
    # NOTE: without a split, X_Val / gt_files / better_tag_val_data are not
    # defined here, so the validation-scoring cells cannot run.
    X_Train = pca_fc_pool_train_net_data.drop(["file_name_index"], axis=1).values
    # Copy so the binarisation below cannot write through into better_tag_data.
    Y_train = better_tag_data.values.copy()

# Turn tag weights into 0/1 multi-label targets.
Y_train[Y_train != 0] = 1

# Fit the projection on the training rows only, then apply it to the other sets.
overall_pca = PCA(n_components=n_components, random_state=random_seed)
X_Train = overall_pca.fit_transform(X_Train)
if has_validation_split:
    X_Val = overall_pca.transform(X_Val)
X_Test = overall_pca.transform(raw_test_features)

In [6]:
def get_accuracy_on_set(X_Val, better_tag_val_data, gt_files, classifier, top_k=20):
    """Score how highly each sample's own file ranks among its nearest predictions.

    For every row ``i`` of ``better_tag_val_data`` the predicted probability
    rows are ranked by cosine distance. If ``gt_files[i]`` appears at rank ``r``
    (0-based) within the ``top_k`` closest predictions it earns
    ``(top_k - r) / top_k`` points, otherwise nothing.

    Parameters
    ----------
    X_Val : array-like
        Feature rows passed to ``classifier.predict_proba``.
    better_tag_val_data : array-like
        Reference tag vectors, one row per sample.
    gt_files : sequence
        File identifier of each sample, aligned with the rows of ``X_Val``.
    classifier : object with ``predict_proba``
    top_k : int, default 20
        Number of closest predictions considered per sample.

    Returns
    -------
    (distance_matrix, mean_score) where ``mean_score`` is averaged over the
    number of rows in ``X_Val`` (previously divided by a fixed 2000, which was
    only right for a 2000-row set).
    """
    Y_val_proba = classifier.predict_proba(X_Val)
    temp_simi = cdist(better_tag_val_data, Y_val_proba, metric='cosine')

    n_samples = len(X_Val)
    score = 0.0
    for i_check in range(n_samples):
        ranked_files = [gt_files[i] for i in temp_simi[i_check].argsort()[:top_k]]
        if gt_files[i_check] in ranked_files:
            score += float(top_k - ranked_files.index(gt_files[i_check])) / top_k

    mean_score = score / n_samples if n_samples else 0.0
    return temp_simi, mean_score

In [33]:
# First-stage network: a single hidden layer of 700 units, written as an
# explicit one-element tuple (`(700)` is just the int 700).
# max_iter=3 with warm_start=True means every later .fit() call resumes from
# the current weights and runs 3 more iterations.
first_classifier = MLPClassifier(hidden_layer_sizes=(700,), verbose=True, max_iter=3, warm_start=True)

In [None]:
print(X_Train.shape, Y_train.shape, X_Val.shape, better_tag_val_data.shape, X_Test.shape, better_tag_test_data.shape)
for i in range(9):
    # The classifier was built with warm_start=True and max_iter=3, so each
    # pass through this loop trains for 3 further iterations.
    first_classifier.fit(X_Train, Y_train)

    # Checkpoint after every round; `with` closes the file handle deterministically.
    with open("3000_classifier_" + str(i) + ".pkl", "wb") as checkpoint_file:
        pickle.dump(first_classifier, checkpoint_file)

    _, accuracy = get_accuracy_on_set(X_Val, better_tag_val_data, gt_files, first_classifier)
    print("Accuracy on val : " + str(accuracy))

(8000, 2000) (8000, 7771) (2000, 2000) (2000, 7771) (2000, 2000) (2000, 7771)
Iteration 1, loss = 515.41834554
Iteration 2, loss = 81.04675279
Iteration 3, loss = 72.73065632




Accuracy on val : 0.42369999999999997
Iteration 4, loss = 71.15827050
Iteration 5, loss = 58.52234475
Iteration 6, loss = 52.04865810
Accuracy on val : 0.603225000000003
Iteration 7, loss = 51.93734045
Iteration 8, loss = 44.78489297


In [21]:
# Second-stage inputs: each feature matrix with the first classifier's
# predicted probabilities appended as extra columns.
next_training_input, next_val_input, next_test_input = (
    np.hstack((features, first_classifier.predict_proba(features)))
    for features in (X_Train, X_Val, X_Test)
)

next_training_input.shape, next_val_input.shape, next_test_input.shape

In [23]:
# Second-stage network: a single hidden layer of 2000 units, written as an
# explicit one-element tuple (`(2000)` is just the int 2000).
# max_iter=3 with warm_start=True means every later .fit() call resumes from
# the current weights and runs 3 more iterations.
second_classifier = MLPClassifier(hidden_layer_sizes=(2000,), verbose=True, max_iter=3, warm_start=True)

In [24]:
print(next_training_input.shape, Y_train.shape, next_val_input.shape, better_tag_val_data.shape, next_test_input.shape, better_tag_test_data.shape)
for i in range(9):
    # The classifier was built with warm_start=True and max_iter=3, so each
    # pass through this loop trains for 3 further iterations.
    second_classifier.fit(next_training_input, Y_train)

    # Checkpoint after every round; `with` closes the file handle deterministically.
    with open("500_second_classifier_" + str(i) + ".pkl", "wb") as checkpoint_file:
        pickle.dump(second_classifier, checkpoint_file)

    _, accuracy = get_accuracy_on_set(next_val_input, better_tag_val_data, gt_files, second_classifier)
    print("Accuracy on val : " + str(accuracy))

(8000, 9771) (8000, 7771) (2000, 9771) (2000, 7771) (2000, 9771) (2000, 7771)
Iteration 1, loss = 702.20040679
Iteration 2, loss = 90.88421398
Iteration 3, loss = 76.43485135




Accuracy on val : 0.4518500000000003
Iteration 4, loss = 70.54190289
Iteration 5, loss = 57.12677862
Iteration 6, loss = 50.05369079
Accuracy on val : 0.6511500000000013
Iteration 7, loss = 48.69385183
Iteration 8, loss = 40.85943049
Iteration 9, loss = 34.75315035
Accuracy on val : 0.6672750000000027
Iteration 10, loss = 33.17021366
Iteration 11, loss = 26.22911522
Iteration 12, loss = 20.89738006
Accuracy on val : 0.6511500000000034
Iteration 13, loss = 19.31910056




KeyboardInterrupt: 

In [112]:
# Show the test feature matrix shape, then predict per-label probabilities for it.
# NOTE(review): this scores with first_classifier on X_Test; the second-stage
# classifier trained on the stacked inputs is not used here -- confirm intended.
print(X_Test.shape)
Y_test_proba = first_classifier.predict_proba(X_Test)

(2000, 2000)


In [113]:
# Pairwise cosine distances: entry [i, j] compares row i of better_tag_test_data
# with row j of Y_test_proba; smaller values mean a closer match.
simi = cdist(better_tag_test_data, Y_test_proba, metric='cosine')

In [None]:
# k = pd.DataFrame(cdist(better_tag_test_data, Y_test_proba, metric='cosine')).to_csv("simi.csv")

In [None]:
# Build the submission table: one row per description with its 20 closest images.
# NOTE(review): the header spelling 'Descritpion_ID' is reproduced exactly as it
# was; presumably it has to match the expected submission format -- verify
# before "fixing" it.
rows = [['Descritpion_ID', 'Top_20_Image_IDs']]
for i in range(len(better_tag_test_data)):
    # Column positions of the 20 smallest distances, used directly as image numbers.
    # NOTE(review): assumes column j of simi corresponds to image "j.jpg" -- confirm.
    top_image_ids = simi[i].argsort()[:20]
    rows.append([str(i) + ".txt", " ".join([str(value) + ".jpg" for value in top_image_ids])])

# `with` guarantees the file is flushed and closed; newline="" is what the csv
# module expects so it can control line endings itself.
with open("submission_try.csv", "w", newline="") as submission_file:
    csv.writer(submission_file).writerows(rows)

In [None]:
# Index of the description whose retrieved images are shown.
d_check = 1

# rows[0] is the CSV header, so description d_check lives at rows[d_check + 1];
# column 1 holds its space-separated image file names.
output_statement = rows[d_check + 1][1]

row, col = 5, 5

f, ax = plt.subplots(nrows=row, ncols=col, figsize=(30, 30))
for index, image_1 in enumerate(output_statement.split()):
    # Stop once the grid is full.
    if index == row * col:
        break
    img = mpimg.imread('data/images_test/' + image_1)

    # Fill the grid row by row. The row number must be derived from the number
    # of columns; int(index/row) only gave the right cell while row == col.
    grid_row, grid_col = divmod(index, col)
    ax[grid_row, grid_col].imshow(img)
    ax[grid_row, grid_col].set_title(image_1, fontsize=25)