https://www.kaggle.com/c/cs5785-fall19-final/

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import csv
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import sklearn
from os import listdir
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors

%pylab inline

Populating the interactive namespace from numpy and matplotlib


#### Import Files

In [2]:
def get_flat_descriptions_from_folder(folder, num):
    descriptions = []
    for i in range(num):
        with open((folder).format(i), newline='') as f:
            desc1 = []
            reader = csv.reader(f)
            for row in reader:
                desc1.append(row)
            descriptions.append(desc1)
    
    # FIRST FLATTENING
    descriptions2 = []
    for description in descriptions:
        descriptions2.append([desc for sublist in description for desc in sublist])
        
    # MADE THE SENTENCES ALL ONE FOR EACH DESCRIPTION FILE
    flat_descriptions = []
    for description in descriptions2:
        desc1 = []
        for sentence in description:
            desc1 += sentence.split(' ')
        flat_descriptions.append(desc1)
    
    for i in range(len(flat_descriptions)):
        flat_descriptions[i] = (' ').join(flat_descriptions[i]).lower()
    
    return flat_descriptions

In [3]:
# function to preprocess data
def preprocessing(data):
    stop_words = set(stopwords.words('english')) # find stop words in English language
    lemmatizer = WordNetLemmatizer() # declare nltk lemmatizer

    # iterate through every sentence and replace it by itself lemmatized, without punctuation and without stop words
    for i in range(len(data)):
        sentence_no_punct = ''
        # remove punctuation
        
        for char in data[i]:
            if char not in string.punctuation:
                sentence_no_punct = sentence_no_punct + char
        data[i] = sentence_no_punct

        word_tokens = word_tokenize(data[i])
    
        # remove stop words and lemmatize
        word_tokens = [lemmatizer.lemmatize(word) for word in word_tokens if word not in stop_words and len(word) > 1]
        word_tokens = [lemmatizer.lemmatize(word, 'v') for word in word_tokens]
        word_tokens = [lemmatizer.lemmatize(word, 'a') for word in word_tokens]
        
        # remove conjunction words
        word_tokens = [word for word in word_tokens if word[-2:] != 'nt']
        (data[i]) = ' '.join(word_tokens)
        
    return data

In [4]:
# [  [ unprocessed description of image 1  ] , [ unprocessed description of image 2  ]   , ...  ]
train_flat_descs = get_flat_descriptions_from_folder('cs5785-fall19-final/descriptions_train/{}.txt', 10000)
test_flat_descs  = get_flat_descriptions_from_folder('cs5785-fall19-final/descriptions_test/{}.txt', 2000)

In [5]:
# [  [ processed description of image 1  ] , [ processed description of image 2  ]   , ...  ]
train_descs = preprocessing(train_flat_descs)
test_descs  = preprocessing(test_flat_descs)

#### Bag Of Words Features

In [6]:
# CREATE THE BAG OF WORDS DICTIONARY
def create_bow(train_descs):
    BOW = {}
    for description in train_descs:
        sentence_lst = description.split(' ')
        for word in sentence_lst:
            BOW[word] = 0
    BOW['null'] = 0
    return BOW

In [7]:
def create_bow_vectors(data, BOW):
    feature_vectors = []
    for description in data:
        feat_vec = BOW.copy()
        sentence_lst = description.split(' ')
        for word in sentence_lst:
            if word in feat_vec:
                feat_vec[word] += 1
            else:
                feat_vec['null'] += 1       
        feature_vectors.append(feat_vec)

    # TURN DICTIONARIES INTO A MATRIX with each row as one description
    feature_vector_matrix = []
    for feature_vec in feature_vectors:
        feature_vector_matrix.append(list(feature_vec.values()))

    # NORMALIZE THE FEATURES
    feature_vector_matrix = sklearn.preprocessing.normalize(feature_vector_matrix) # default is L2 norm
    return feature_vector_matrix

#### Get Image Features

In [8]:
train_feat = pd.read_csv("cs5785-fall19-final/features_train/features_resnet1000_train.csv", header = None, index_col=None)
test_feat = pd.read_csv("cs5785-fall19-final/features_test/features_resnet1000_test.csv", header = None, index_col=None)

In [9]:
for i in range(len(train_feat[0])):
    train_feat[0][i] = int(train_feat[0][i].replace("images_train/", "").replace(".jpg", ""))

train_feat_sort = train_feat.sort_values(by=0)

for i in range(len(test_feat[0])):
    test_feat[0][i] = int(test_feat[0][i].replace("images_test/", "").replace(".jpg", ""))

test_feat_sort = test_feat.sort_values(by=0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


### Try Multilayer Perceptron

In [19]:
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import accuracy_score

cv = KFold(3)

X = np.array(train_descs)
y = train_feat_sort.drop(columns=0).to_numpy()
accuracies = []

for train, test in cv.split(X, y):
    # MAKE THE BAG OF WORDS
    BOW = create_bow(X[train])
    X_train = create_bow_vectors(X[train], BOW)
    X_test = create_bow_vectors(X[test], BOW)
    # MAKE THE MODEL
    clf = MLPRegressor(solver='sgd', alpha=1e-5, hidden_layer_sizes=(10,), random_state=1, max_iter = 1000)
    # FIT THE MODEL
    clf.fit(X_train, y[train])
    # FIND r^2
    accuracies.append(clf.score(X_test, y[test]))



This gave about 0.39-0.4 R^2

## Do MLP with Proper KNN Scoring

In [24]:
# function to return the number of nearest neighbors specified
def knn_function(data, point, neighbors_number):
    knn = NearestNeighbors(n_neighbors=neighbors_number, algorithm='ball_tree')
    knn.fit(data)
    return knn.kneighbors(point, neighbors_number)

In [10]:
def scores(i):
    return (20+1-i)/20

In [23]:
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPRegressor

cv = KFold(3)

X = np.array(train_descs)
y = train_feat_sort.drop(columns=0).to_numpy()
accuracies = []

for train, test in cv.split(X, y):
    # FIND THE BAG OF WORDS
    BOW = create_bow(X[train])
    X_train = create_bow_vectors(X[train], BOW)
    X_test = create_bow_vectors(X[test], BOW)
    # FIND THE MODEL
    clf = MLPRegressor(solver='sgd', alpha=1e-5, hidden_layer_sizes=(10,), random_state=1, max_iter = 1000)
    # FIT THE MODEL
    clf.fit(X_train, y[train])
    # PREDICT
    predictions = clf.predict(X_test)
    # FIND NEAREST NEIGHBORS
    neighbors = knn_function(y[test], predictions, 20)
    # EVALUATE THE MODEL USING THE MEAN AVERAGE PRECISION AT 20
    scs = []
    for i in range(len(predictions)):
        good = False
        for j, n in enumerate(neighbors[i]):
            if i == n:
                scs.append(scores(j))
                good = True
                break
        if good == False:
            scs.append(0)
    accuracies.append(sum(scs)/len(predictions))

In [25]:
np.mean(accuracies)

0.09035502230486997

## Try Using Sum Squared Distance Instead of KNN

In [11]:
def dist(x1, x2):
    return sum((x1 - x2) ** 2) ** 0.5

In [12]:
from sklearn.neural_network import MLPRegressor
# train test split
from sklearn.model_selection import train_test_split
Xtrain, Xtest, Ytrain, Ytest = train_test_split(np.array(train_descs), train_feat_sort.drop(columns=0).to_numpy())

accuracies = []

BOW = create_bow(Xtrain)
X_train = create_bow_vectors(Xtrain, BOW)
X_test = create_bow_vectors(Xtest, BOW)
clf = MLPRegressor(solver='sgd', alpha=1e-5, hidden_layer_sizes=(10,), random_state=1, max_iter = 1000)
clf.fit(X_train, Ytrain)
predictions = clf.predict(X_test)

In [19]:
# EVALUATE THE MODEL USING SUM SQUARED ERROR
scores1 = []
for i in range(len(predictions)):
    distances = []
    for j in range(len(predictions)):
        distances.append(dist(predictions[i], Ytest[j]))
    if i % 100 == 0:
        print(i)
    pred_dist_idx = list(np.argsort(distances))
    dev_pos = pred_dist_idx.index(i)
    if dev_pos < 20:
        scores1.append(scores(dev_pos))
    else:
        scores1.append(0.0)
np.mean(scores1)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400


0.14106000000000002

Not too bad accuracy as compared to with KNN

## Train On Whole Set and Run On Test Data for the First MLP Model with BOW and KNN

In [12]:
BOW = create_bow(train_descs)

In [13]:
feature_vector_matrix = create_bow_vectors(train_descs, BOW)

In [14]:
feature_vector_matrix_test = create_bow_vectors(test_descs, BOW)

In [15]:
from sklearn.neural_network import MLPRegressor

X = feature_vector_matrix
X_test = feature_vector_matrix_test
y = train_feat_sort.drop(columns=0).to_numpy()
y_test = test_feat_sort.drop(columns=0).to_numpy()

In [16]:
clf = MLPRegressor(solver='sgd', alpha=1e-5, hidden_layer_sizes=(10,), random_state=1, max_iter = 1000)
clf.fit(X, y)
predictions = clf.predict(X_test)
neighbors = knn_function(y_test, predictions, 20)

In [17]:
neighbors_adjusted = []
file_list = []
for i, row in enumerate(neighbors):
    newlst = []
    for val in row:
        newlst.append(str(val)+'.jpg')
    neighbors_adjusted.append((' ').join(newlst))
    file_list.append(str(i)+'.txt')

In [18]:
df = pd.DataFrame(zip(file_list, neighbors_adjusted), columns = ['Descritpion_ID', 'Top_20_Image_IDs'])

In [19]:
df.to_csv('MLP_neuralnet.csv', index = False)

This only gave 5.3% accuracy on kaggle. We need a better method.

## Word To Vec

In [24]:
import os
import csv
import random
import gensim
import numpy as np

In [25]:
word2vec = gensim.models.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)

In [26]:
def doc_to_vec(sentence, word2vec):
    # get list of word vectors in sentence
    word_vecs = [word2vec.get_vector(w) for w in sentence.split() if w in word2vec.vocab]
    # return average
    return np.stack(word_vecs).mean(0)

In [27]:
x_train = np.array([doc_to_vec(train_flat_descs[i], word2vec) for i in range(len(train_flat_descs))])
x_test = np.array([doc_to_vec(d, word2vec) for d in test_flat_descs])

In [28]:
y_train = train_feat_sort.drop(columns=0).to_numpy()
y_test = test_feat_sort.drop(columns=0).to_numpy()

In [30]:
from sklearn.neural_network import MLPRegressor
# train test split
from sklearn.model_selection import train_test_split
Xtrain, Xtest, Ytrain, Ytest = train_test_split(x_train, y_train)

accuracies = []

clf = MLPRegressor(solver='sgd', alpha=1e-5, hidden_layer_sizes=(10,), random_state=1, max_iter = 1000)
clf.fit(Xtrain, Ytrain)
predictions = clf.predict(Xtest)

In [31]:
scores1 = []
for i in range(len(predictions)):
    # find distances to every point
    distances = []
    for j in range(len(predictions)):
        distances.append(dist(predictions[i], Ytest[j]))
    if i % 500 == 0: # make sure its working
        print(i)
    pred_dist_idx = list(np.argsort(distances))
    dev_pos = pred_dist_idx.index(i)
    if dev_pos < 20:
        scores1.append(scores(dev_pos))
    else:
        scores1.append(0.0)
np.mean(scores1)

0
500
1000
1500
2000


0.1242

### Increase the Complexity?

In [34]:
Xtrain, Xtest, Ytrain, Ytest = train_test_split(x_train, y_train)

accuracies = []

clf = MLPRegressor(solver='sgd', alpha=1e-5, hidden_layer_sizes=(20,), random_state=1, max_iter = 1000)
clf.fit(Xtrain, Ytrain)
predictions = clf.predict(Xtest)

In [35]:
scores1 = []
for i in range(len(predictions)):
    # find distances to every point
    distances = []
    for j in range(len(predictions)):
        distances.append(dist(predictions[i], Ytest[j]))
    if i % 500 == 0: # make sure its working
        print(i)
    pred_dist_idx = list(np.argsort(distances))
    dev_pos = pred_dist_idx.index(i)
    if dev_pos < 20:
        scores1.append(scores(dev_pos))
    else:
        scores1.append(0.0)
np.mean(scores1)

0
500
1000
1500
2000


0.21912000000000004

### Increase the number of neurons more???

In [36]:
Xtrain, Xtest, Ytrain, Ytest = train_test_split(x_train, y_train)

accuracies = []

clf = MLPRegressor(solver='sgd', alpha=1e-5, hidden_layer_sizes=(100,), random_state=1, max_iter = 1000)
clf.fit(Xtrain, Ytrain)
predictions = clf.predict(Xtest)

In [37]:
scores1 = []
for i in range(len(predictions)):
    # find distances to every point
    distances = []
    for j in range(len(predictions)):
        distances.append(dist(predictions[i], Ytest[j]))
    if i % 500 == 0: # make sure its working
        print(i)
    pred_dist_idx = list(np.argsort(distances))
    dev_pos = pred_dist_idx.index(i)
    if dev_pos < 20:
        scores1.append(scores(dev_pos))
    else:
        scores1.append(0.0)
np.mean(scores1)

0
500
1000
1500
2000


0.32356

## Add another layer

In [42]:
Xtrain, Xtest, Ytrain, Ytest = train_test_split(x_train, y_train)

accuracies = []

clf = MLPRegressor(solver='sgd', alpha=1e-5, hidden_layer_sizes=(100,200), random_state=1, max_iter = 1000)
clf.fit(Xtrain, Ytrain)
predictions = clf.predict(Xtest)

In [43]:
scores1 = []
for i in range(len(predictions)):
    # find distances to every point
    distances = []
    for j in range(len(predictions)):
        distances.append(dist(predictions[i], Ytest[j]))
    if i % 500 == 0: # make sure its working
        print(i)
    pred_dist_idx = list(np.argsort(distances))
    dev_pos = pred_dist_idx.index(i)
    if dev_pos < 20:
        scores1.append(scores(dev_pos))
    else:
        scores1.append(0.0)
np.mean(scores1)

0
500
1000
1500
2000


0.34342

## And another layer

In [44]:
Xtrain, Xtest, Ytrain, Ytest = train_test_split(x_train, y_train)

accuracies = []

clf = MLPRegressor(solver='sgd', alpha=1e-5, hidden_layer_sizes=(100,200,400), random_state=1, max_iter = 1000)
clf.fit(Xtrain, Ytrain)
predictions = clf.predict(Xtest)

In [45]:
scores1 = []
for i in range(len(predictions)):
    # find distances to every point
    distances = []
    for j in range(len(predictions)):
        distances.append(dist(predictions[i], Ytest[j]))
    if i % 500 == 0: # make sure its working
        print(i)
    pred_dist_idx = list(np.argsort(distances))
    dev_pos = pred_dist_idx.index(i)
    if dev_pos < 20:
        scores1.append(scores(dev_pos))
    else:
        scores1.append(0.0)
np.mean(scores1)

0
500
1000
1500
2000


0.30582

This was not as good as the two layer model.

## Get the Top 20 images for the test set using the best model

In [None]:
# NOT BEEN RUN YET 
clf = MLPRegressor(solver='sgd', alpha=1e-5, hidden_layer_sizes=(100,200), random_state=1, max_iter = 1000)
clf.fit(x_train, y_train)
predictions = clf.predict(x_test)

preds = []
for i in range(len(predictions)):
    distances = []
    for j in range(len(y_test)):
        distances.append(dist(predictions[i], y_test[j]))
    test_idx = list(np.argsort(distances))
    row = [str(i)+'.jpg' for i in test_idx[:20]]
    preds.append(" ".join(row))

In [None]:
file_list = []
for i, row in enumerate(neighbors):
    file_list.append(str(i)+'.txt')

In [None]:
df = pd.DataFrame(zip(file_list, preds), columns = ['Descritpion_ID', 'Top_20_Image_IDs'])

In [None]:
df.to_csv('MLP_multilayerNeuralnet_wordnet.csv', index = False)