In [1]:
import pandas as pd
import gensim
import tensorflow as tf
from keras_preprocessing import image as im
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import os
import tensorflow_text as text
from tqdm import tqdm
import tensorflow_hub as hub
import tensorflow.keras.applications as apps
sns.set()

Slow version of gensim.models.doc2vec is being used


# Loading data and processing

In [13]:
def load_data(ABS_PATH):
    annotations = [i for i in os.listdir(ABS_PATH) if i.endswith('csv')]
    folders = [i for i in os.listdir(ABS_PATH) if not i.endswith('csv')]
    df = pd.DataFrame()
    for path in annotations:
        df = pd.concat([df, pd.read_csv(os.path.join(ABS_PATH,path))], axis=0)
    df = df.drop_duplicates('image_name').drop(columns='Unnamed: 0').dropna(how='any')
    for folder in folders:
        path = os.path.join(ABS_PATH, folder)
        in_folder = os.listdir(path)
        df.loc[df['image_name'].isin(in_folder), 'image_name'] = \
        df.loc[df['image_name'].isin(in_folder), 'image_name'].apply(lambda x: os.path.join(path,x))
    df = df[df['image_name'].apply(lambda x: 'data' in x.split('/'))]
    images = []
    for image_name in df['image_name'].values:
        images.append(np.array(im.load_img(image_name, target_size=(224,224))))
    images = np.array(images)
    images = images/255
    annotations = df['annotation'].str.lower().values
    return images, annotations

In [14]:
images, annotations = load_data("../../data/imdb/")

# Saving features

In [4]:
def check_folder(path):
    if not os.path.exists(path):
        os.mkdir(path)
    

In [5]:
check_folder('../image_features')
check_folder('../text_features')

In [6]:
from sklearn.decomposition import IncrementalPCA
import csv

In [7]:
def create_representation_tensorflow(data, model, path, use_pca=False, n_components=128, batch_size=16):
    if use_pca:
        pca = IncrementalPCA(n_components=n_components, batch_size=batch_size)
        data = np.split(data,int(np.ceil(len(data) / batch_size)))
    with open(path,'w') as fw:
        csv_writer = csv.writer(fw, delimiter='\t')
        for sample in tqdm(data):
                if use_pca:
                    sample = tf.convert_to_tensor(sample)
                    features = np.hstack(model(sample).numpy())
                    pca.partial_fit(features)
                    features = pca.transform(features)
                    for feature in features:
                        csv_writer.writerows(feature)
                else:
                    sample = np.expand_dims(sample,axis=0)
                    feature = np.hstack(model(sample))
                    csv_writer.writerow(feature)            
    print('Saved representations to : {}'.format(path))

In [8]:
def write(data, path):
    with open(path,'w') as fw:
        csv_writer = csv.writer(fw, delimiter='\t')
        csv_writer.writerows(data)            
    print('Saved representations to : {}'.format(path))

In [9]:
def load_resnet(block='conv4_block5_out', pooling=True):
    resnet = apps.ResNet152V2(include_top=False, weights='imagenet')
    outputs = [i for i in resnet.layers if i.name==block][0]
    inputs = resnet.layers[0]
    if pooling:
        x = tf.keras.layers.GlobalAveragePooling2D()(outputs.output)
    else:
        x = tf.keras.layers.Flatten()(outputs.output)
    resnet = tf.keras.Model(inputs.input,x)
    return resnet

# resnet

In [10]:
resnet = load_resnet()

In [11]:
create_representation_tensorflow(images, resnet, '../image_features/resnet_conv4_block5.tsv')

100%|██████████| 1615/1615 [02:17<00:00, 11.77it/s]

Saved representations to : ../image_features/resnet_conv4_block5.tsv





In [12]:
import gc
del resnet
gc.collect();

# stbs bert

In [13]:
from sentence_transformers import SentenceTransformer

In [14]:
stbs_bert = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
features = stbs_bert.encode(annotations)
write(features, '../text_features/stbs_bert.tsv')

Saved representations to : ../text_features/stbs_bert.tsv


In [15]:
del features, stbs_bert
gc.collect();

In [16]:
def load_emb_from_disk(path):
    model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)

    return model

In [15]:
from nltk.corpus import stopwords 
def prepare_annotations_embeddings(annotations):
    prepared_annotations = []
    for sentence in annotations:
        sentence = ''.join([i for i in sentence if not (i in [',','.','!','?'] )])
        prepared_annotations.append([i for i in sentence.split(' ') if not (i in stopwords.words('english'))])
    unique_words_annotations = np.unique(np.hstack(prepared_annotations))
    return prepared_annotations, unique_words_annotations

In [16]:
def get_emb_rep(tokens, embeddings):
    dict_tokens = {}
    missing = []
    for w in tqdm(tokens):
        try:
            try:
                dict_tokens.update({w: embeddings.word_vec(w.lower())})
            except:
                dict_tokens.update({w: embeddings.word_vec(w.captialize())})
        except:
            missing.append(w)

    print('{} words where absent in embedding'.format(len(missing)))
    return dict_tokens


In [17]:
def create_representation_embeddings(embeddings, sentences, path, dim=300):
    with open(path, "w") as fw:
        csv_writer = csv.writer(fw, delimiter='\t')
        for n, sentence in enumerate(sentences):
            vector = np.zeros(shape=(dim,))
            counter = 0 
            for word in sentence:
                representation = embeddings.get(word)
                if not(representation is None):
                    vector+=representation
                    counter+=1
            if counter!=0:
                vector/=counter
            csv_writer.writerow(vector)
    print('Saved representation to : {}'.format(path))

# w2v embeddings

In [18]:
prepared_annotations, unique_words_annotations = prepare_annotations_embeddings(annotations)

In [23]:
w2v_embeddings = get_emb_rep(unique_words_annotations, load_emb_from_disk('../embeddings/GoogleNews-vectors-negative300.bin'))

  import sys
100%|██████████| 1446/1446 [00:00<00:00, 311599.03it/s]

8 words where absent in embedding





In [31]:
create_representation_embeddings(w2v_embeddings, prepared_annotations, '../text_features/w2v.tsv')

Saved representation to : ../text_features/w2v.tsv


In [32]:
del w2v_embeddings
gc.collect();

# glove embeddings

In [38]:
def get_emb_rep_glove(tokens, embeddings):
    dict_tokens = {}
    missing = []
    for w in tqdm(tokens):
        if w in embeddings.keys():
            dict_tokens.update({w: embeddings[w]})
        elif w.lower() in embeddings.keys():
            dict_tokens.update({w: embeddings[w.lower()]})
        elif w.capitalize() in embeddings.keys():
            dict_tokens.update({w: embeddings[w.capitalize()]})
        else:
            missing.append(w)

    print('{} words where absent in embedding'.format(len(missing)))
    return dict_tokens

In [34]:
def load_glove(file):
    print("Loading Glove Model")
    f = open(file, 'r')
    glove_embeddings = {}
    for line in f:
        splitLines = line.split()
        word = splitLines[0]
        wordEmbedding = np.array([float(value) for value in splitLines[1:]])
        glove_embeddings[word] = wordEmbedding
    print(len(glove_embeddings), " words loaded!")
    return glove_embeddings


In [35]:
glove = load_glove('../embeddings/glove.6B.300d.txt')

Loading Glove Model
400000  words loaded!


In [39]:
glove = get_emb_rep_glove(unique_words_annotations, glove)

100%|██████████| 1446/1446 [00:00<00:00, 897980.99it/s]

25 words where absent in embedding





In [41]:
create_representation_embeddings(glove,prepared_annotations, '../text_features/glove.tsv')

Saved representation to : ../text_features/glove.tsv


In [45]:
del glove
gc.collect();

# doc2vec

In [2]:
import gensim

In [6]:
def load_doc2vec(path_model, path_syn0, path_syn1):
    model = gensim.models.Doc2Vec.load(path_model)
    syn0 = np.array(np.load(path_syn0, mmap_mode='r'))
    syn1 = np.array(np.load(path_syn1, mmap_mode='r'))
    model.syn1 = syn1
    model.syn0 = syn0
    return model

In [7]:
doc2vec = load_doc2vec('../embeddings/enwiki_dbow/doc2vec.bin','../embeddings/enwiki_dbow/doc2vec.bin.syn0.npy',
            '../embeddings/enwiki_dbow/doc2vec.bin.syn1neg.npy')