# Word2Vec + Models

In this document, we are trying to train a lot of different models using word2vec as an encriptor for the X (the descriptions).

The document is structred as follows:

- **Load data**
    - Descriptions (x)
    - Features (y)
    
- **Word2Vec (for descriptions)**
- **PCA (for features)**

- **Model(s)** training *make sure to comment the best results of each method with its hyperparameters*
    1. MLP
    2. XX

- **Accuracy in the validation set**


___

Ideas of things to try:

- 

In [1]:
import os
import numpy as np
import gensim
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
import string
import pandas as pd
import sklearn

In [28]:
def parse_descriptions(data_dir, num_doc):
    docs = []
    for i in range(num_doc):
        path = os.path.join(data_dir, "%d.txt" % i)
        with open(path) as f:
            docs.append(f.read())
    return docs

# function to preprocess data
def preprocessing(data):
    stop_words = set(stopwords.words('english')) # find stop words in English language
    lemmatizer = WordNetLemmatizer() # declare nltk lemmatizer

    # iterate through every sentence and replace it by itself lemmatized, without punctuation and without stop words
    for i in range(len(data)):
        sentence_no_punct = ''
        # remove punctuation
        
        for char in data[i]:
            if char not in string.punctuation:
                sentence_no_punct = sentence_no_punct + char
        data[i] = sentence_no_punct

        word_tokens = word_tokenize(data[i])
    
        # remove stop words and lemmatize
        word_tokens = [lemmatizer.lemmatize(word) for word in word_tokens if word not in stop_words and len(word) > 1]
        word_tokens = [lemmatizer.lemmatize(word, 'v') for word in word_tokens]
        word_tokens = [lemmatizer.lemmatize(word, 'a') for word in word_tokens]
        
        # remove conjunction words
        word_tokens = [word for word in word_tokens if word[-2:] != 'nt']
        (data[i]) = ' '.join(word_tokens)
        
    return data

def StandardScaler (data):
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    scaler.fit(data)
    return scaler.transform(data)

## Load data

#### Descriptions

In [6]:
train_descs = parse_descriptions('cs5785-fall19-final/descriptions_train', 10000)
test_descs  = parse_descriptions('cs5785-fall19-final/descriptions_test', 2000)

train_descs = preprocessing(train_descs)
test_descs  = preprocessing(test_descs)

#### Features

In [72]:
train_feat = pd.read_csv("cs5785-fall19-final/features_train/features_resnet1000_train.csv", header = None, index_col=None)
test_feat = pd.read_csv("cs5785-fall19-final/features_test/features_resnet1000_test.csv", header = None, index_col=None)

In [73]:
for i in range(len(train_feat[0])):
    train_feat[0][i] = int(train_feat[0][i].replace("images_train/", "").replace(".jpg", ""))    
train_feat_sort = train_feat.sort_values(by=0)

for i in range(len(test_feat[0])):
    test_feat[0][i] = int(test_feat[0][i].replace("images_test/", "").replace(".jpg", ""))
test_feat_sort = test_feat.sort_values(by=0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [74]:
train_feat_sort = train_feat_sort.drop(columns=0).to_numpy()
test_feat_sort = test_feat_sort.drop(columns=0).to_numpy()

In [77]:
## DATA NORMALIZATION
train_feat_sort = StandardScaler(train_feat_sort)
test_feat_sort = StandardScaler(test_feat_sort)

## Word2vec - Descriptions

In [10]:
word2vec = gensim.models.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)

In [39]:
def doc_to_vec(sentence, word2vec):
    word_vecs = [word2vec.get_vector(w) for w in sentence.split() if w in word2vec.vocab]
    return np.stack(word_vecs).mean(0)

In [40]:
x_train = np.array([doc_to_vec(train_descs[i], word2vec) for i in range(len(train_descs))])
x_test = np.array([doc_to_vec(d, word2vec) for d in test_descs])

In [41]:
## DATA NORMALIZATION
x_train = StandardScaler(x_train)
x_test = StandardScaler(x_test)

## PCA - Features

In [156]:
from sklearn.decomposition import PCA

In [157]:
pca = PCA(n_components = 100)
pca.fit(train_feat_sort)

PCA(copy=True, iterated_power='auto', n_components=100, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [158]:
y_train = pca.transform(train_feat_sort)
y_test = pca.transform(test_feat_sort)

In [159]:
len(y_train[0])

100

# Model(s) Training

## 1. MLP

In [160]:
from sklearn.model_selection import train_test_split
xtrain, xval, ytrain, yval = train_test_split(x_train, y_train)

In [161]:
from sklearn.neural_network import MLPRegressor

In [182]:
clf = MLPRegressor(solver='adam', alpha=1e-5, hidden_layer_sizes=(100,), random_state=1, max_iter = 1000)

clf.fit(xtrain, ytrain)

yval_pred = clf.predict(xval)

In [None]:
'''
RESULTS: 
Development MAP@20: 0.15715323095892755 [adam, hidden_layer_sizes = (100,)]

'''

## 2. Ridge

## Accuracy in the validation set

In [183]:
from scipy.spatial.distance import cdist

def dist_matrix(x1, x2):
    return cdist(x1,x2,'cosine')

In [184]:
val_distances = dist_matrix(yval_pred, yval)

In [185]:
val_scores = []
val_pos_list = []

for i in range(len(yval)):
    pred_dist_idx = list(np.argsort(val_distances[i]))
    val_pos = pred_dist_idx.index(i)
    val_pos_list.append(val_pos)
    if val_pos < 20:
        val_scores.append(1 / (val_pos + 1))
    else:
        val_scores.append(0.0)

print("Development MAP@20:", np.mean(val_scores))
print("Mean index of true image", np.mean(val_pos_list))
print("Median index of true image", np.median(val_pos_list))

Development MAP@20: 0.15715323095892755
Mean index of true image 78.9964
Median index of true image 22.0
