# Imports

In [1]:
import csv
import string
from os import listdir

import gensim
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from scipy.spatial import distance_matrix
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor

# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)
simplefilter(action='ignore', category=DeprecationWarning)

# Definitions and Data Load

In [2]:
# Root folder of the dataset; every path below is built relative to it.
dataFolder = "cs5785-fall19-final"

# Description text files (train / test).
descTrainFolder = "{}/descriptions_train".format(dataFolder)
descTestFolder = "{}/descriptions_test".format(dataFolder)

# Feature CSV files (train / test).
featTrainFolder = "{}/features_train".format(dataFolder)
featTestFolder = "{}/features_test".format(dataFolder)

# Image folders (train / test).
imagesTrainFolder = "{}/images_train".format(dataFolder)
imagesTestFolder = "{}/images_test".format(dataFolder)

# Tag folders (train / test).
tagsTrainFolder = "{}/tags_train".format(dataFolder)
tagsTestFolder = "{}/tags_test".format(dataFolder)

# All dataset folders, grouped as (train, test) pairs in the order declared above.
folders = [
    descTrainFolder, descTestFolder,
    featTrainFolder, featTestFolder,
    imagesTrainFolder, imagesTestFolder,
    tagsTrainFolder, tagsTestFolder,
]


In [3]:
def getFilesFromFolder(folder):
    """Return the names of the entries found directly inside ``folder``.

    Thin wrapper around ``os.listdir``; the names come back in whatever
    order ``listdir`` reports them.
    """
    entries = listdir(folder)
    return entries

# Preprocess data

In [4]:
# function to preprocess data
def preprocessing(data):
    """Normalise one description and return its set of unique tokens.

    Steps, in order: strip ASCII punctuation, tokenize, lower-case, drop
    English stop words and single-character tokens, lemmatize (noun, then
    verb, then adjective) and finally drop tokens ending in "nt".

    Parameters
    ----------
    data : sequence of str
        Must contain exactly one string. It is only read, never modified,
        so the caller keeps the unprocessed text.

    Returns
    -------
    set of str
        The unique processed tokens of ``data[0]``.

    Raises
    ------
    AssertionError
        If ``data`` does not contain exactly one element.
    """
    # Validate before doing any work (the check used to run only after processing).
    assert(len(data) == 1)

    stop_words = set(stopwords.words('english')) # find stop words in English language
    lemmatizer = WordNetLemmatizer() # declare nltk lemmatizer

    # remove punctuation in a single pass instead of rebuilding the string char by char
    text = data[0].translate(str.maketrans('', '', string.punctuation))

    # Analyzing if words are upper/lower case is more for analyzing the intensity
    # of the sentiment rather than classifying it, so case is discarded.
    word_tokens = [token.lower() for token in word_tokenize(text)]

    # remove stop words and lemmatize
    word_tokens = [lemmatizer.lemmatize(word) for word in word_tokens if word not in stop_words and len(word) > 1]
    word_tokens = [lemmatizer.lemmatize(word, 'v') for word in word_tokens]
    word_tokens = [lemmatizer.lemmatize(word, 'a') for word in word_tokens]

    # remove contractions whose apostrophe was stripped above ("dont", "isnt", ...)
    # NOTE(review): this also drops ordinary words ending in "nt" (e.g. "elephant",
    # "plant"); behavior kept as is - confirm whether that is intended.
    word_tokens = [word for word in word_tokens if word[-2:] != 'nt']

    return set(' '.join(word_tokens).split())

In [5]:
def get_flat_descriptions_from_folder(folder):
    """Read every ``<index>.txt`` file of ``folder`` as one single-line string.

    The files are expected to be named ``0.txt`` ... ``N-1.txt``, where N is
    the number of entries in ``folder``; they are read in numeric order.

    Returns a list with one single-element list per file:
    ``[[text of 0.txt], [text of 1.txt], ...]``.
    """
    file_count = len(getFilesFromFolder(folder))
    collected = []

    for number in range(file_count):
        path = folder + "/{}.txt".format(number)
        with open(path, 'r') as handle:
            # Drop the newline of each line, then glue the lines with single spaces.
            text = ' '.join(line.replace("\n", "") for line in handle)
        collected.append([text])

    return collected

In [6]:
# Raw text, one single-element list per image:
# [ [unprocessed description of image 0], [unprocessed description of image 1], ... ]
train_flat_descs, test_flat_descs = (
    get_flat_descriptions_from_folder(source_folder)
    for source_folder in (descTrainFolder, descTestFolder)
)

In [7]:
def unique_words_in_desc(desc):
    """Preprocess every description and return its unique words.

    Parameters
    ----------
    desc : iterable
        Each element is passed unchanged to ``preprocessing``.

    Returns
    -------
    numpy.ndarray
        1-D object array; element ``i`` is the list of unique processed words
        of ``desc[i]``, sorted so the word order is stable between runs
        (iteration order of a set of strings is not).
    """
    token_lists = [sorted(preprocessing(d)) for d in desc]

    # The lists have different lengths. np.array(token_lists) on such ragged input
    # raises ValueError on NumPy >= 1.24, so fill an object array element by element.
    result = np.empty(len(token_lists), dtype=object)
    for position, tokens in enumerate(token_lists):
        result[position] = tokens
    return result

def word_to_num(descs):
    """Replace every word of every description by a single number.

    A Word2Vec model with one-dimensional vectors is trained on ``descs``;
    each word is then mapped to the only component of its vector.

    Returns a NumPy array holding one NumPy array of numbers per description.
    NOTE(review): descriptions of different lengths make the outer array
    ragged - confirm the installed NumPy version accepts that.
    """
    model = Word2Vec(descs, min_count = 1, size = 1, window = 5)
    encoded = []
    for desc in descs:
        encoded.append(np.array([model[token][0] for token in desc]))
    return np.array(encoded)

def index_words(descs):
    """Assign a consecutive integer index to every distinct word in ``descs``.

    Words are numbered from 0 in order of first appearance.

    Returns a tuple ``(word -> index dict, number of distinct words)``.
    """
    vocabulary = {}
    for description in descs:
        for token in description:
            # setdefault leaves already-seen words untouched, so each word
            # keeps the index it received on first sight.
            vocabulary.setdefault(token, len(vocabulary))
    return vocabulary, len(vocabulary)

def make_bow(descs, dictionary, amt_words):
    """Build a weighted bag-of-words matrix for ``descs``.

    A Word2Vec model with one-dimensional vectors is trained on ``descs``.
    Row ``i`` of the result has, at column ``dictionary[word]``, the embedding
    value of each word of ``descs[i]`` that appears in ``dictionary``; every
    other column is 0. Words missing from ``dictionary`` are skipped.

    Parameters
    ----------
    descs : sequence of sequences of str
        Tokenised descriptions.
    dictionary : dict
        Maps a word to its column index.
    amt_words : int
        Number of columns of the result.

    Returns
    -------
    list of list
        ``len(descs)`` rows of ``amt_words`` entries each.

    NOTE(review): the model is fitted on whatever ``descs`` is passed in and no
    seed is set, so two calls (e.g. train and test) use unrelated embeddings -
    confirm this is intended.
    """
    model = Word2Vec(descs, min_count = 1, size = 1, window = 5)
    bow = [[0] * amt_words for _ in range(len(descs))]

    for row, desc in zip(bow, descs):
        for word in desc:
            column = dictionary.get(word)
            if column is not None:
                # Look up this word's own embedding. The previous code indexed the
                # model with the whole description, so every word received the first
                # word's vector (as a 1-element array rather than a scalar).
                row[column] = model.wv[word][0]

    return bow

In [8]:
# Unique processed words per image:
# [ [processed words of image 0], [processed words of image 1], ... ]
train_descs, test_descs = (
    unique_words_in_desc(flat_descs)
    for flat_descs in (train_flat_descs, test_flat_descs)
)

# The vocabulary (word -> column index) is built from the training descriptions only.
print("Creating dictionary...")
word_for_index, words_in_dict = index_words(train_descs)

Creating dictionary...


# Bag of Words

In [9]:
# One row per training description, one column per vocabulary word:
# [ [0, 0, float, 0, ...], [0, float, 0, 0, ...], ... ]
train_bow = make_bow(descs=train_descs, dictionary=word_for_index, amt_words=words_in_dict)

In [10]:
# One row per test description, using the column layout of the training vocabulary:
# [ [0, 0, float, 0, ...], [0, float, 0, 0, ...], ... ]
test_bow = make_bow(descs=test_descs, dictionary=word_for_index, amt_words=words_in_dict)

# Feature Extraction

In [11]:
def get_file_num(filename):
    """Return the integer left after stripping the image prefix and extension.

    Removes every occurrence of ``".jpg"``, ``"images_train/"`` and
    ``"images_test/"`` from ``filename`` and converts the rest with ``int``.
    """
    stem = filename
    for fragment in (".jpg", "images_train/", "images_test/"):
        stem = stem.replace(fragment, "")
    return int(stem)

def get_feat_from_file(filename):
    """Load a feature CSV as a float array ordered by image number.

    The first column of every row is an image file name (parsed with
    ``get_file_num``); the remaining columns are the feature values.
    Rows are sorted by image number before the name column is dropped.
    """
    with open(filename, 'r') as handle:
        rows = [row for row in csv.reader(handle)]

    # list.sort is stable, exactly like sorted().
    rows.sort(key=lambda row: get_file_num(row[0]))

    return np.array([row[1:] for row in rows], dtype='float')

In [13]:
# Feature matrices loaded from the train / test feature CSVs; rows are ordered by image number.
resTrainFile = "{}/features_resnet1000_train.csv".format(featTrainFolder)
resTestFile = "{}/features_resnet1000_test.csv".format(featTestFolder)

train_feat = get_feat_from_file(resTrainFile)
test_feat = get_feat_from_file(resTestFile)

# Predictions

In [14]:
def predict_feats_forest(test_bow):
    """Predict image features for ``test_bow`` with a random forest.

    A ``RandomForestRegressor`` (100 trees, maximum depth 40) is fitted on the
    notebook-level ``train_bow`` / ``train_feat`` and then applied to
    ``test_bow``.

    NOTE(review): no ``random_state`` is set, so results may differ between runs.
    """
    forest = RandomForestRegressor(n_estimators = 100, max_depth = 40)
    forest.fit(train_bow, train_feat)
    return forest.predict(test_bow)

In [15]:
# Predicted feature vector for every test description.
preds = predict_feats_forest(test_bow=test_bow)

In [16]:
# Pairwise Euclidean distances: dm[i, j] is the distance between the features
# predicted for description i and the features of test image j.
# (distance_matrix is imported in the imports cell at the top of the notebook.)
dm = distance_matrix(preds, test_feat)

In [17]:
# For every description keep the 20 closest test images as (image index, distance)
# pairs, nearest first. sorted() is stable, so ties keep ascending image index.
id_dists = [
    sorted(enumerate(distances), key=lambda pair: abs(pair[1]))[:20]
    for distances in dm
]

In [18]:
# Turn each ranked (image index, distance) pair into its image file name.
top_images = [
    ["{}.jpg".format(image_index) for image_index, _ in ranked]
    for ranked in id_dists
]

# CSV Output

In [19]:
def outputCSV(predictions, filename="image_prediction.csv"):
    """Write the ranked image names of every description to a CSV file.

    Parameters
    ----------
    predictions : sequence of sequences of str
        ``predictions[i]`` holds the image file names chosen for description
        ``i``; they are written space-separated in a single cell.
    filename : str, optional
        Path of the CSV file to (over)write. Defaults to
        ``"image_prediction.csv"`` in the current working directory.

    The file has the header ``Description_ID,Top_20_Image_IDs`` followed by one
    row per description, whose id is ``"<i>.txt"``.
    """
    headers = ["Description_ID", "Top_20_Image_IDs"]
    # newline='' is required by the csv module; without it every row ends in
    # "\r\r\n" on Windows, which shows up as blank lines between rows.
    with open(filename, "w", newline='') as outputFile:
        fileWriter = csv.DictWriter(outputFile, fieldnames=headers)
        fileWriter.writeheader()
        for index, pred in enumerate(predictions):
            fileWriter.writerow({headers[0]: "{}.txt".format(index), headers[1]: ' '.join(pred)})

In [20]:
# Write the ranked image names of every test description to the CSV file.
outputCSV(predictions=top_images)