https://www.kaggle.com/c/cs5785-fall19-final/

# Create an Ensemble Model

#### Get the Descriptions and Tags from File

In [1]:
import os
import numpy as np
import gensim
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import pandas as pd

In [2]:
def parse_descriptions(data_dir, num_doc):
    docs = []
    for i in range(num_doc):
        path = os.path.join(data_dir, "%d.txt" % i)
        with open(path) as f:
            docs.append(f.read())
    return docs

In [3]:
train_tags = parse_descriptions("cs5785-fall19-final/tags_train", num_doc=10000)
test_tags = parse_descriptions("cs5785-fall19-final/tags_test", num_doc=2000)

In [4]:
def parse_tags(tags):
    result = []
    for doc in tags:
        doc = doc.strip('\n').split('\n')
        cat_it = []
        if doc[0] == '':
            result.append('no tag')
        else:
            for tag in doc:
                split_tag = tag.split(':')
                cat_it.append(split_tag[0])
                cat_it.append(split_tag[1])
            parsed = (' ').join(list(cat_it))
            result.append(parsed)
    return np.array(result)

In [5]:
train_tags_parsed = parse_tags(train_tags)
test_tags_parsed = parse_tags(test_tags)

In [6]:
# function to preprocess data
def preprocessing(data):
    stop_words = set(stopwords.words('english')) # find stop words in English language
    lemmatizer = WordNetLemmatizer() # declare nltk lemmatizer

    # iterate through every sentence and replace it by itself lemmatized, without punctuation and without stop words
    for i in range(len(data)):
        sentence_no_punct = ''
        # remove punctuation
        
        for char in data[i]:
            if char not in string.punctuation:
                sentence_no_punct = sentence_no_punct + char
        data[i] = sentence_no_punct

        word_tokens = word_tokenize(data[i])
    
        # remove stop words and lemmatize
        word_tokens = [lemmatizer.lemmatize(word) for word in word_tokens if word not in stop_words and len(word) > 1]
        word_tokens = [lemmatizer.lemmatize(word, 'v') for word in word_tokens]
        word_tokens = [lemmatizer.lemmatize(word, 'a') for word in word_tokens]
        
        # remove conjunction words
        word_tokens = [word for word in word_tokens if word[-2:] != 'nt']
        (data[i]) = ' '.join(word_tokens)
        
    return data

In [7]:
train_descs = parse_descriptions('cs5785-fall19-final/descriptions_train', 10000)
test_descs  = parse_descriptions('cs5785-fall19-final/descriptions_test', 2000)
train_descs = preprocessing(train_descs)
test_descs  = preprocessing(test_descs)

#### Create the Bag of Words

In [8]:
word2vec = gensim.models.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)

In [9]:
def doc_to_vec(sentence, word2vec):
    # get list of word vectors in sentence
    word_vecs = [word2vec.get_vector(w) for w in sentence.split() if w in word2vec.vocab]
    # return average
    return np.stack(word_vecs).mean(0)

In [10]:
x_train = np.array([doc_to_vec(train_descs[i], word2vec) for i in range(len(train_descs))])
x_test = np.array([doc_to_vec(d, word2vec) for d in test_descs])

#### Get Image Feature Vectors

In [8]:
train_feat = pd.read_csv("cs5785-fall19-final/features_train/features_resnet1000_train.csv", header = None, index_col=None)
test_feat = pd.read_csv("cs5785-fall19-final/features_test/features_resnet1000_test.csv", header = None, index_col=None)

#train_inter_feat = pd.read_csv("cs5785-fall19-final/features_train/features_resnet1000intermediate_train.csv", header = None, index_col=None)
#test_inter_feat = pd.read_csv("cs5785-fall19-final/features_test/features_resnet1000intermediate_test.csv", header = None, index_col=None)

In [9]:
for i in range(len(train_feat[0])):
    train_feat[0][i] = int(train_feat[0][i].replace("images_train/", "").replace(".jpg", ""))
    #train_inter_feat[0][i] = int(train_inter_feat[0][i].replace("images_train/", "").replace(".jpg", ""))
    
train_feat_sort = train_feat.sort_values(by=0)
#train_inter_feat_sort = train_inter_feat.sort_values(by=0)

for i in range(len(test_feat[0])):
    test_feat[0][i] = int(test_feat[0][i].replace("images_test/", "").replace(".jpg", ""))
    #test_inter_feat[0][i]  = int(test_inter_feat[0][i].replace("images_test/", "").replace(".jpg", ""))

test_feat_sort = test_feat.sort_values(by=0)
#test_inter_feat_sort = test_inter_feat.sort_values(by=0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


In [13]:
y_train = train_feat_sort.drop(columns=0).to_numpy()
y_test = test_feat_sort.drop(columns=0).to_numpy()
y_train_inter = train_inter_feat_sort.drop(columns=0).to_numpy()
y_test_inter = test_inter_feat_sort.drop(columns=0).to_numpy()

#### Create A Train/Test Split

In [14]:
# import random
# num_train = 8000
# num_dev = 2000
# num_test = 2000
# split = list(range(num_train + num_dev))
# random.shuffle(split)

In [15]:
# x_train_dev = x_train[split[:num_train]]
# y_train_dev = y_train[split[:num_train]]
# y_train_inter_dev = y_train_inter[split[:num_train]]
# x_test_dev = x_train[split[num_train:]]
# y_test_dev = y_train[split[num_train:]]
# y_test_inter_dev = y_train_inter[split[num_train:]]

# train_tags_dev = train_tags_parsed[split[:num_train]]
# test_tags_dev = train_tags_parsed[split[num_train:]]

#### For Later when we need the tags of the whole train and test set

In [16]:
tag_train = np.array([doc_to_vec(d, word2vec) for d in train_tags_parsed])
tag_test = np.array([doc_to_vec(d, word2vec) for d in test_tags_parsed])

#### For Development Right Now when we need the smaller train and test set to check accuracy

In [17]:
# train_tags_dev = np.array([doc_to_vec(d, word2vec) for d in train_tags_dev])
# test_tags_dev = np.array([doc_to_vec(d, word2vec) for d in test_tags_dev])

### Relate Descriptions and Tags with a Model --- SVM

In [18]:
# from sklearn.svm import SVR
# from sklearn.multioutput import MultiOutputRegressor
# supvm = MultiOutputRegressor(SVR(gamma = 'scale')).fit(x_train_dev, train_tags_dev)
# tag_preds = supvm.predict(x_test_dev)

In [19]:
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor
supvm = MultiOutputRegressor(SVR(gamma = 'scale')).fit(x_train, tag_train)
tag_preds = supvm.predict(tag_test)

### Relate Image Features and Descriptions Model

In [20]:
# from sklearn.neural_network import MLPRegressor

# clf = MLPRegressor(solver='sgd', alpha=1e-5, hidden_layer_sizes=(300,), random_state=1, max_iter = 1000)
# clf.fit(x_train_dev, y_train_dev)
# #clf.fit(x_train_dev, y_train_inter_dev) # for intermediate features
# image_preds = clf.predict(x_test_dev)

In [21]:
from sklearn.neural_network import MLPRegressor

clf = MLPRegressor(solver='sgd', alpha=1e-5, hidden_layer_sizes=(300,), random_state=1, max_iter = 1000)
clf.fit(x_train, y_train)
#clf.fit(x_train_dev, y_train_inter_dev) # for intermediate features
image_preds = clf.predict(x_test)

### We could try this KNN regressor or the random forest with or instead of SVM

In [22]:
# function to train KNN regressor model, predict with it, and calculate the accuracy and confusion matrix
def KNN_Regressor(x_train, y_train, x_test, y_test):
    
    # set up regressor  
    knn_regressor = KNeighborsRegressor(n_neighbors = 1) 
    
    # fit regressor 
    knn_regressor.fit(x_train, y_train)
    
    # predict
    knn_preds = knn_regressor.predict(x_test)
    
    # return accuracy and confusion matrix
    return knn_preds

In [23]:
# function to train random forest regressor model, predict with it, and calculate the accuracy and confusion matrix
def Random_Forest(x_train, y_train, x_test, y_test):
    # set up regressor  
    rf_regressor = RandomForestRegressor(max_depth=20) 
    
    # fit regressor 
    rf_regressor.fit(x_train, y_train)
    
    # predict
    rf_preds = rf_regressor.predict(x_test)
    
    # return accuracy and confusion matrix
    return rf_preds

### Can try concatenating intermediate features or doing it only with them

### Combine the Models

#### Try Evaluating the Nearest Neighbors and Take A New Ranking Based on Both

In [24]:
from sklearn.neighbors import NearestNeighbors
# function to return the number of nearest neighbors specified
def knn_function(data, point, neighbors_number):
    knn = NearestNeighbors(n_neighbors=neighbors_number)
    knn.fit(data)
    return knn.kneighbors(point, neighbors_number, return_distance=False)

In [25]:
# tag_neighbors = knn_function(test_tags_dev, tag_preds, 50)
# image_neighbors = knn_function(y_test_dev, image_preds, 50)

In [26]:
tag_neighbors2 = knn_function(tag_test, tag_preds, 50)
image_neighbors2 = knn_function(y_test, image_preds, 50)

In [27]:
# neighbors1 = []
# for i in range(len(tag_neighbors)):
#     neighbors1.append(list(set(tag_neighbors[i]) & set(image_neighbors[i]))[:20])
#     j = 0
#     while len(neighbors1[i]) < 20:
#         if image_neighbors[i][j] not in neighbors1[i]:
#             neighbors1[i].append(image_neighbors[i][j])
#         j += 1
    
#     neighbors1[i] = np.array(neighbors1[i])

In [39]:
neighbors2 = [[] for _ in range(len(tag_neighbors2))]
for i in range(len(tag_neighbors2)):
#     for im in image_neighbors2[i]:
#         if im in tag_neighbors2[i]:
#             neighbors2[i] += [im]
    
    neighbors2[i] += [im for im in image_neighbors2[i] if im in tag_neighbors2[i]]
    
    if len(neighbors2[i]) < 20:
        for im in image_neighbors2[i]:
            if not im in neighbors2[i]:
                neighbors2[i] += [im]
            if len(neighbors2[i]) == 20:
                break
    
#     neighbors2.append(list(set(tag_neighbors2[i]) & set(image_neighbors2[i]))[:20])
#     j = 0
#     while len(neighbors2[i]) < 20:
#         if image_neighbors2[i][j] not in neighbors2[i]:
#             neighbors2[i].append(image_neighbors2[i][j])
#         j += 1
    
    neighbors2[i] = np.array(neighbors2[i][:20])

In [None]:
def combine_tags_images(ims, tags):
    neighbors = [[] for _ in range(len(ims))]
    
    for i in range(len(ims)):
        neighbors[i] += [im for im in ims[i] if im in tags]
        
        for im in neighbors[i]:
            if len(neighbors[i]) == 20:
                break
            if not im in neighbors[i]:
                neighbors[i] += [im]
        neighbors[i] = np.array(neighbors[i])

    return np.array(neighbors)

In [40]:
# neighbors1 = np.array(neighbors1)

In [41]:
neighbors2 = np.array(neighbors2)

In [None]:
# neighbors_adjusted = []
# file_list = []
# for i, row in enumerate(neighbors1):
#     newlst = []
#     for val in row:
#         newlst.append(str(val)+'.jpg')
#     neighbors_adjusted.append((' ').join(newlst))
#     file_list.append(str(i)+'.txt')

In [42]:
neighbors_adjusted2 = []
file_list = []
for i, row in enumerate(neighbors2):
    newlst = []
    for val in row:
        newlst.append(str(val)+'.jpg')
    neighbors_adjusted2.append((' ').join(newlst))
    file_list.append(str(i)+'.txt')

In [43]:
neighbors_adjusted2[0]

'563.jpg 1992.jpg 1967.jpg 345.jpg 1980.jpg 338.jpg 760.jpg 1380.jpg 1040.jpg 124.jpg 1296.jpg 669.jpg 683.jpg 105.jpg 1077.jpg 1499.jpg 46.jpg 714.jpg 1883.jpg 1011.jpg'

### Evaluate the Model

In [44]:
def scores(i):
    return (20+1-(i+1))/20

In [45]:
def eval_accuracy(ytest, neighbors):
    # EVALUATE THE MODEL USING THE MEAN AVERAGE PRECISION AT 20
    scs = []
    for i in range(len(neighbors)):
        good = False
        for j, n in enumerate(neighbors[i]):
            if i == n:
                scs.append(scores(j))
                good = True
                break
        if good == False:
            scs.append(0)
    return sum(scs)/len(neighbors)

In [46]:
def dist(x1, x2):
    return sum((x1 - x2) ** 2) ** 0.5

def evaluate_model_with_error(ytest, ypred):
    # EVALUATE THE MODEL USING SUM SQUARED ERROR
    scores1 = []
    for i in range(len(ypred)):
        distances = []
        for j in range(len(ytest)):
            distances.append(dist(ypred[i], ytest[j]))
        pred_dist_idx = list(np.argsort(distances))
        dev_pos = pred_dist_idx.index(i)
        if dev_pos < 20:
            scores1.append(scores(dev_pos))
        else:
            scores1.append(0.0)
    return np.mean(scores1)

In [49]:
print('accuracy of svm tags', eval_accuracy(y_test_dev, tag_neighbors[:20]))

NameError: name 'y_test_dev' is not defined

In [50]:
print('accuracy of svm tags', evaluate_model_with_error(y_test_dev, tag_neighbors[:20]))

NameError: name 'y_test_dev' is not defined

In [51]:
print('accuracy of image features', eval_accuracy(y_test_dev, image_neighbors[:20]))

NameError: name 'y_test_dev' is not defined

In [52]:
print('accuracy of image features', evaluate_model_with_error(y_test_dev, image_neighbors[:20]))

NameError: name 'y_test_dev' is not defined

In [53]:
print('accuracy of svm tags combined with image features', eval_accuracy(y_test_dev, neighbors1))

NameError: name 'y_test_dev' is not defined

In [54]:
print('accuracy of svm tags combined with image features', evaluate_model_with_error(y_test_dev, neighbors1))

NameError: name 'y_test_dev' is not defined

In [None]:
# for r in neighbors1:
#     print(len(r))

In [None]:
# tag_neighbors.shape

In [None]:
# y_test_dev.shape

In [57]:
import csv
def outputCSV(predictions):
    with open("image_prediction_mlp2.csv", "w") as outputFile:
        headers = ["Descritpion_ID", "Top_20_Image_IDs"]
        fileWriter = csv.DictWriter(outputFile, fieldnames=headers)
        fileWriter.writeheader()
        for index, pred in enumerate(predictions):
            fileWriter.writerow({headers[0]: "{}.txt".format(index), headers[1]: ''.join(predictions[index])})

In [58]:
outputCSV(neighbors_adjusted2)