In [1]:
import numpy as np
from keras.preprocessing import sequence
from tqdm import tqdm
import preprocessing as preproc
from utils import *
from collections import defaultdict
import pandas as pd
import matplotlib.pyplot as plt

Using Theano backend.
Using gpu device 0: GeForce GTX 1080 (CNMeM is disabled, cuDNN 5110)


In [2]:
base_path = app_10_length_15_data_path

train_path = base_path + train_folder
val_path = base_path + val_folder

general_datastruct_path = base_path + general_datastruct_folder


# Average number of appearances for predicted words

## Create word -> no.of.occurrences map

In [3]:
captions = preproc.get_captions_from_batch(train_path+"captions/",0)
len(captions)

52848

In [4]:
nr_unique_words = len(preproc.get_unique_words(captions))
nr_unique_words

3225

In [5]:
real_captions_2_app = preproc.most_common_words(captions,nr_unique_words)
len(real_captions_2_app)

3092

In [6]:
real_captions_2_app[-10:]

[(u'wall,', 1),
 (u'caption', 1),
 (u'early', 1),
 (u'indicating', 1),
 (u'STANDING', 1),
 (u'theme', 1),
 (u'need', 1),
 (u'TENNIS', 1),
 (u'WALKING', 1),
 (u'toast,', 1)]

In [7]:
real_captions_2_app[:10]

[(u'man', 7026),
 (u'sitting', 6330),
 (u'standing', 5318),
 (u'next', 4603),
 (u'white', 3636),
 (u'people', 3533),
 (u'top', 3260),
 (u'woman', 3220),
 (u'holding', 3064),
 (u'table', 2611)]

## Predictions

In [8]:
predictions = load_array(val_path+"predictions/"+"app_10_length_15_past_word_30_epoch_300d_gru_2x1024_captions")
predictions.shape

(1000,)

In [9]:
predicted_captions_2_app = preproc.most_common_words(predictions,len(preproc.get_unique_words(predictions)))

In [10]:
predicted_captions_2_app[:10]

[(u'sitting', 196),
 (u'man', 195),
 (u'standing', 156),
 (u'top', 137),
 (u'next', 125),
 (u'riding', 107),
 (u'street', 106),
 (u'group', 106),
 (u'white', 103),
 (u'people', 91)]

## Nr app weighted sum average

In [14]:
real_captions_2_app_dict = dict(real_captions_2_app)
predicted_captions_2_app_dict = dict(predicted_captions_2_app)

nr_real_unique_words = len(real_captions_2_app_dict)
nr_pred_unique_words = len(predicted_captions_2_app_dict)

In [15]:
print(nr_real_unique_words)
print(nr_pred_unique_words)

3092
622


In [21]:
weighted_avg_sum = 0.0

In [22]:
for word,pred_nr_app in predicted_captions_2_app_dict.iteritems():
    real_nr_app = real_captions_2_app_dict[word]
    weighted_avg_sum += real_nr_app * pred_nr_app

In [23]:
weighted_avg = weighted_avg_sum / nr_pred_unique_words
weighted_avg

15209.503215434084

## Stats

In [59]:
real_captions_2_app_df = pd.DataFrame(real_captions_2_app_dict.items(), columns=['word', 'real_count'])
pred_captions_2_app_df = pd.DataFrame(predicted_captions_2_app_dict.items(), columns=['word', 'pred_count'])
joined_df = pd.merge(real_captions_2_app_df, pred_captions_2_app_df, on='word', how='outer').fillna(0)

In [60]:
nr_real_words = len(real_captions_2_app_df)
nr_pred_words = len(pred_captions_2_app_df)

In [61]:
#joined_df.sort_values('real_count')[:10]
joined_df[:10]

Unnamed: 0,word,real_count,pred_count
0,hats,26,0.0
1,yellow,801,11.0
2,four,137,2.0
3,woods,57,0.0
4,sleep,9,0.0
5,asian,9,0.0
6,ocean.,17,0.0
7,hanging,302,9.0
8,Skateboarder,11,0.0
9,trolley,28,0.0


## word count per pred limit bucket

In [65]:
def window(fseq, window_size=5):
    for i in xrange(len(fseq) - window_size + 1):
        yield fseq[i:i+window_size]


In [69]:
word_limits = [0,10,20,50,100,1000,10000]

In [70]:
for min_2_max in list(window(word_limits,2)):
    min_limit = min_2_max[0]
    max_limit = min_2_max[1]
    
    nr_items  = len(joined_df[(joined_df['real_count'] > min_limit) & (joined_df['real_count'] < max_limit)])
    
    print("%d %d => %d"%(min_limit,max_limit,nr_items))

0 10 => 945
10 20 => 681
20 50 => 554
50 100 => 279
100 1000 => 400
1000 10000 => 54


# For each limit_bucket, what percentage of real_words have  been used

In [71]:
limits = [0,10,20,50,100,1000,10000]

In [72]:
nr_words = len(joined_df)

nr_pred_words

622

In [75]:
for min_2_max in list(window(limits,2)):
    
    min_limit = min_2_max[0]
    max_limit = min_2_max[1]
    
    limited_df = joined_df[(joined_df['real_count'] > min_limit) & (joined_df['real_count'] < max_limit)]
    nr_total_words = len(limited_df)
    nr_words_not_used = len(limited_df[limited_df['pred_count'] != 0])
    
    perc = float(nr_words_not_used) / nr_total_words
    print("%d -> %d --> %f"%(min_limit,max_limit,perc))

0 -> 10 --> 0.014815
10 -> 20 --> 0.070485
20 -> 50 --> 0.175090
50 -> 100 --> 0.308244
100 -> 1000 --> 0.767500
1000 -> 10000 --> 0.981481


## Another approch