In [1]:
import cv2
import sys
import random
import os
import numpy as np
from matplotlib import pyplot as plt
from glob import glob
import tensorflow as tf
from tensorflow.python.ops import nn_ops
from tensorflow.python.ops import math_ops
import time
from nltk.tokenize import RegexpTokenizer
import operator


In [2]:
DATA_PATH = "/home/will/sdb1/cvdata/"
OUTPUT_PATH = "/home/will/sdb1/cvdata/imgs-halfhalf/"
# CROPPED_PATH = "/home/will/sdb1/cvdata/imgs-halfhalf10/"
CROPPED_PATH = "/home/will/sdb1/cvdata/wikipages500_3/"
STORE_PATH = "/home/will/sdb1/"

In [3]:
def png2jpg(inf):
    '''convert pngs in path to jpg'''
    if os.path.isfile(inf) and inf[-3:] is "png":
        # if img is file
        img = cv2.imread(inf)
        cv2.imwrite(OUTPUT_PATH + inf.split('/')[-1][:-3] + 'jpg', img)
    elif os.path.isdir(inf):
        pngs = glob(inf+'/*.png')
        for j in pngs:
            print("processing " + j)
            img = cv2.imread(j)
            cv2.imwrite(OUTPUT_PATH + j.split('/')[-1][:-3] + 'jpg', img)

In [4]:
def crop10(img, h, w):
    '''crop 10 random sub images from original with (h, w) size'''
    sub_imgs = []
    num_imgs = 10
    oh, ow, _ = img.shape
    if w > ow or h > oh:
        return sub_imgs
    for i in range(num_imgs):
        # get the size of the sub-image
        sh = random.randint(1, oh-h)
        sw = random.randint(1, ow-w)
        temp = img[sh:sh+h, sw:sw+w]
        sub_imgs.append(temp)
        
    return sub_imgs

In [5]:
def print_progress(iteration, total, prefix='PROG', suffix='',
                   decimals=1, length=25, fill='>'):
    percent = ("{0:." +
               str(decimals) + "f}").format(100 * (iteration / float(total)))
    filledLength = int(length * iteration // total)
    bar = fill * filledLength + '-' * (length - filledLength)
    sys.stdout.write('\r[%s] %s: |%s| %s%% %s' % (str(datetime.now().strftime("%y%m%d-%H%M%S")),
                                                prefix, bar, percent, suffix))
    # Print New Line on Complete
    if iteration == total:
        print("")

In [6]:
net_data = np.load(open(DATA_PATH + "bvlc_alexnet.npy", "rb"), encoding="latin1").item()

In [7]:
train_x = np.zeros((1, 227,227,3)).astype(np.float32)
train_y = np.zeros((1, 1000))
xdim = train_x.shape[1:]
ydim = train_y.shape[1]

In [8]:
def conv(input, kernel, biases, k_h, k_w, c_o, s_h, s_w,  padding="VALID", group=1):
    '''From https://github.com/ethereon/caffe-tensorflow
    '''
    c_i = input.get_shape()[-1]
    assert c_i%group==0
    assert c_o%group==0
    convolve = lambda i, k: tf.nn.conv2d(i, k, [1, s_h, s_w, 1], padding=padding)
    
    
    if group==1:
        conv = convolve(input, kernel)
    else:
        #tf.split(value, num_or_size_splits, axis) -- new version
        ## tf.split(axis, num_or_size_splits, value) -- old version
        
        #input_groups = tf.split(3, group, input)
        #kernel_groups = tf.split(3, group, kernel)
        
        input_groups = tf.split(input, group, 3)
        kernel_groups = tf.split(kernel, group, 3)
        
        output_groups = [convolve(i, k) for i,k in zip(input_groups, kernel_groups)]
        conv = tf.concat( output_groups, 3)
    return  tf.reshape(tf.nn.bias_add(conv, biases), [-1]+conv.get_shape().as_list()[1:])



x = tf.placeholder(tf.float32, (None,) + xdim)

In [9]:
k_h = 11; k_w = 11; c_o = 96; s_h = 4; s_w = 4
conv1W = tf.Variable(net_data["conv1"][0])
conv1b = tf.Variable(net_data["conv1"][1])
conv1_in = conv(x, conv1W, conv1b, k_h, k_w, c_o, s_h, s_w, padding="SAME", group=1)
conv1 = tf.nn.relu(conv1_in)

#lrn1
#lrn(2, 2e-05, 0.75, name='norm1')
radius = 2; alpha = 2e-05; beta = 0.75; bias = 1.0
lrn1 = tf.nn.local_response_normalization(conv1,
                                                  depth_radius=radius,
                                                  alpha=alpha,
                                                  beta=beta,
                                                  bias=bias)

#maxpool1
#max_pool(3, 3, 2, 2, padding='VALID', name='pool1')
k_h = 3; k_w = 3; s_h = 2; s_w = 2; padding = 'VALID'
maxpool1 = tf.nn.max_pool(lrn1, ksize=[1, k_h, k_w, 1], strides=[1, s_h, s_w, 1], padding=padding)


#conv2
#conv(5, 5, 256, 1, 1, group=2, name='conv2')
k_h = 5; k_w = 5; c_o = 256; s_h = 1; s_w = 1; group = 2
conv2W = tf.Variable(net_data["conv2"][0])
conv2b = tf.Variable(net_data["conv2"][1])
conv2_in = conv(maxpool1, conv2W, conv2b, k_h, k_w, c_o, s_h, s_w, padding="SAME", group=group)
conv2 = tf.nn.relu(conv2_in)


#lrn2
#lrn(2, 2e-05, 0.75, name='norm2')
radius = 2; alpha = 2e-05; beta = 0.75; bias = 1.0
lrn2 = tf.nn.local_response_normalization(conv2,
                                                  depth_radius=radius,
                                                  alpha=alpha,
                                                  beta=beta,
                                                  bias=bias)

#maxpool2
#max_pool(3, 3, 2, 2, padding='VALID', name='pool2')                                                  
k_h = 3; k_w = 3; s_h = 2; s_w = 2; padding = 'VALID'
maxpool2 = tf.nn.max_pool(lrn2, ksize=[1, k_h, k_w, 1], strides=[1, s_h, s_w, 1], padding=padding)

#conv3
#conv(3, 3, 384, 1, 1, name='conv3')
k_h = 3; k_w = 3; c_o = 384; s_h = 1; s_w = 1; group = 1
conv3W = tf.Variable(net_data["conv3"][0])
conv3b = tf.Variable(net_data["conv3"][1])
conv3_in = conv(maxpool2, conv3W, conv3b, k_h, k_w, c_o, s_h, s_w, padding="SAME", group=group)
conv3 = tf.nn.relu(conv3_in)

#conv4
#conv(3, 3, 384, 1, 1, group=2, name='conv4')
k_h = 3; k_w = 3; c_o = 384; s_h = 1; s_w = 1; group = 2
conv4W = tf.Variable(net_data["conv4"][0])
conv4b = tf.Variable(net_data["conv4"][1])
conv4_in = conv(conv3, conv4W, conv4b, k_h, k_w, c_o, s_h, s_w, padding="SAME", group=group)
conv4 = tf.nn.relu(conv4_in)


#conv5
#conv(3, 3, 256, 1, 1, group=2, name='conv5')
k_h = 3; k_w = 3; c_o = 256; s_h = 1; s_w = 1; group = 2
conv5W = tf.Variable(net_data["conv5"][0])
conv5b = tf.Variable(net_data["conv5"][1])
conv5_in = conv(conv4, conv5W, conv5b, k_h, k_w, c_o, s_h, s_w, padding="SAME", group=group)

conv5 = tf.nn.relu(conv5_in)

#maxpool5
#max_pool(3, 3, 2, 2, padding='VALID', name='pool5')
k_h = 3; k_w = 3; s_h = 2; s_w = 2; padding = 'VALID'
maxpool5 = tf.nn.max_pool(conv5, ksize=[1, k_h, k_w, 1], strides=[1, s_h, s_w, 1], padding=padding)

#fc6
#fc(4096, name='fc6')
fc6W = tf.Variable(net_data["fc6"][0])
fc6b = tf.Variable(net_data["fc6"][1])

#fc6 before relu
fc6_before_relu = nn_ops.bias_add( math_ops.matmul(tf.reshape(maxpool5, [-1, int(np.prod(maxpool5.get_shape()[1:]))]), fc6W)   ,fc6b)

fc6 = tf.nn.relu_layer(tf.reshape(maxpool5, [-1, int(np.prod(maxpool5.get_shape()[1:]))]), fc6W, fc6b)

#fc7
#fc(4096, name='fc7')
fc7W = tf.Variable(net_data["fc7"][0])
fc7b = tf.Variable(net_data["fc7"][1])
fc7 = tf.nn.relu_layer(fc6, fc7W, fc7b)

fc7_xw_plus_b = nn_ops.bias_add(math_ops.matmul(fc6, fc7W), fc7b)


fc7_after_relu = nn_ops.relu(fc7_xw_plus_b)


#fc8
#fc(1000, relu=False, name='fc8')
fc8W = tf.Variable(net_data["fc8"][0])
fc8b = tf.Variable(net_data["fc8"][1])
fc8 = tf.nn.xw_plus_b(fc7, fc8W, fc8b)


#prob
#softmax(name='prob'))
prob = tf.nn.softmax(fc8)

#init = tf.initialize_all_variables()
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

t = time.time()

In [10]:
import PIL
from PIL import Image
import skimage

In [11]:
# Crop images
# h = 227
# w = 227
# count = 0
# total = len(os.listdir(OUTPUT_PATH))
# for item in os.listdir(OUTPUT_PATH):
#     count += 1
#     img_path = OUTPUT_PATH + item
#     img = cv2.imread(img_path)
#     imgs10 = crop10(img, h, w)
#     print(count)
#     if len(imgs10) is not 10:
#         continue
#     for i in range(10):
#         cv2.imwrite(CROPPED_PATH + item[:-4]+ "-part" + str(i) + ".jpg", imgs10[i])

In [12]:
datasize =500 * 3
image_list = []
raw_image_list = []
file_list = []

count = 0
for item in sorted(os.listdir(CROPPED_PATH)):
    count += 1
    if datasize < count:
        break
    print "reading " + CROPPED_PATH + item
    im = cv2.imread(CROPPED_PATH + item)
    raw_image_list.append(im)
    im = (im).astype(np.float32)
    im = im - np.mean(im)
    im[:, :, 0], im[:, :, 2] = im[:, :, 2], im[:, :, 0]
    image_list.append(im)
    file_list.append(item)

reading /home/will/sdb1/cvdata/wikipages500_3/en-0-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/en-1-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/en-10-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/en-100-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/en-101-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/en-102-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/en-103-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/en-104-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/en-105-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/en-106-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/en-107-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/en-108-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/en-109-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/en-11-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/en-110-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/en-111-wiki.jpg
reading /home/

reading /home/will/sdb1/cvdata/wikipages500_3/en-339-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/en-34-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/en-340-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/en-341-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/en-342-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/en-343-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/en-344-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/en-345-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/en-346-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/en-347-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/en-348-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/en-349-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/en-35-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/en-350-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/en-351-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/en-352-wiki.jpg
reading /h

reading /home/will/sdb1/cvdata/wikipages500_3/ru-114-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/ru-115-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/ru-116-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/ru-117-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/ru-118-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/ru-119-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/ru-12-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/ru-120-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/ru-121-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/ru-122-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/ru-123-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/ru-124-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/ru-125-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/ru-126-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/ru-127-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/ru-128-wiki.jpg
reading /

reading /home/will/sdb1/cvdata/wikipages500_3/ru-354-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/ru-355-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/ru-356-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/ru-357-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/ru-358-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/ru-359-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/ru-36-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/ru-360-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/ru-361-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/ru-362-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/ru-363-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/ru-364-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/ru-365-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/ru-366-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/ru-367-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/ru-368-wiki.jpg
reading /

reading /home/will/sdb1/cvdata/wikipages500_3/zh-145-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/zh-146-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/zh-147-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/zh-148-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/zh-149-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/zh-15-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/zh-150-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/zh-151-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/zh-152-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/zh-153-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/zh-154-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/zh-155-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/zh-156-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/zh-157-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/zh-158-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/zh-159-wiki.jpg
reading /

reading /home/will/sdb1/cvdata/wikipages500_3/zh-384-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/zh-385-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/zh-386-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/zh-387-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/zh-388-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/zh-389-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/zh-39-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/zh-390-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/zh-391-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/zh-392-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/zh-393-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/zh-394-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/zh-395-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/zh-396-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/zh-397-wiki.jpg
reading /home/will/sdb1/cvdata/wikipages500_3/zh-398-wiki.jpg
reading /

In [13]:
print(len(image_list))

1500


In [14]:
im_fc7 = sess.run(fc7_xw_plus_b, feed_dict = {x:image_list})
im_fc7 = np.array(im_fc7)

im_fc6 = sess.run(fc6_before_relu, feed_dict = {x:image_list})
im_fc6 = np.array(im_fc6)

im_conv5 = sess.run(conv5_in, feed_dict = {x:image_list})
im_conv5 = np.array(im_conv5)

In [15]:
dataset = 'wiki-en-zh'

np.save(STORE_PATH + dataset +'_raw_image_list.npy', raw_image_list)
raw_image_list = np.array(raw_image_list)
print(raw_image_list.shape)

np.save(STORE_PATH + dataset + '_image_list.npy', image_list)
image_list = np.array(image_list)
print(image_list.shape)

(1500, 227, 227, 3)
(1500, 227, 227, 3)


In [16]:
np.save(STORE_PATH + dataset + '_fc7.npy', im_fc7)
print(im_fc7.shape)

np.save(STORE_PATH + dataset + '_fc6.npy', im_fc6)
print(im_fc6.shape)

np.save(STORE_PATH + dataset + '_conv5.npy', im_conv5)
print(im_conv5.shape)

(1500, 4096)
(1500, 4096)
(1500, 13, 13, 256)


In [17]:
'''get threshold using 90% percentile'''
percentile_fc7 = np.percentile(im_fc7, 90, axis = 0)
percentile_fc6 = np.percentile(im_fc6, 90, axis = 0)
percentile_conv5 = np.percentile(im_conv5, 90, axis = 0)

In [18]:
np.save(STORE_PATH +  dataset +'_percentile_fc7.npy', percentile_fc7)
np.save(STORE_PATH +  dataset +'_percentile_fc6.npy', percentile_fc6)
np.save(STORE_PATH +  dataset +'_percentile_conv5.npy', percentile_conv5)

percentile_fc7 = np.array(percentile_fc7)
print(percentile_fc7.shape)
percentile_fc6 = np.array(percentile_fc6)
print(percentile_fc6.shape)
percentile_conv5 = np.array(percentile_conv5)
print(percentile_conv5.shape)

(4096,)
(4096,)
(13, 13, 256)


In [19]:
# Topics count (7, 20, 50, 100)
NUM_TOPIC = 3
# Features for Topics
NUM_FEATURE = 4096
# Amount of images being classified
NUM_IMAGE = datasize

In [20]:
X = im_fc7

In [21]:
# Get data statistics, as a sanity check
print("Daxta shape: ", X.shape)
print("Number of 0s: ", np.sum(X == 1))
print("Number of 1s: ", np.sum(X == 0))
print("Anomailes: ",np.sum([X < 0]))


('Daxta shape: ', (1500, 4096))
('Number of 0s: ', 0)
('Number of 1s: ', 0)
('Anomailes: ', 5423476)


In [22]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from gensim import corpora, models

In [23]:
binary_vector_fc7 = np.greater(im_fc7, percentile_fc7).astype(int)

In [24]:
print("Data shape: ", binary_vector_fc7.shape)
print("Number of 1s: ", np.sum(binary_vector_fc7 == 1))
print("Number of 0s: ", np.sum(binary_vector_fc7 == 0))
print("Anomailes: ",np.sum([binary_vector_fc7 < 0]))


('Data shape: ', (1500, 4096))
('Number of 1s: ', 610796)
('Number of 0s: ', 5533204)
('Anomailes: ', 0)


In [25]:
X = binary_vector_fc7
DATA_PATH = "/home/will/cvdata/workspace/data2/"

In [26]:
# Prepare for corpus
corpus = [[(j, X[i, j]) for j in range(NUM_FEATURE) if X[i, j]==1] for i in range(NUM_IMAGE)]
# corpora.MmCorpus.serialize(DATA_PATH + 'corpus.mm', corpus)

In [27]:
curr_time = str(int(time.time()))
model_name = str(NUM_TOPIC) + '-topics.model'
topics_file_name = 'topic_model_features_' + str(NUM_TOPIC) +'_topics_'+ curr_time + '.npy'
topics_per_image_file_name = 'topics_per_image-' + str(NUM_TOPIC) +'_topics_'+ curr_time + '.npy'
print("Loading model: " + model_name)

Loading model: 3-topics.model


In [28]:
K = NUM_TOPIC
# Create the Topic Model
model_name = str(K) + '-topics.model'
lda = models.ldamodel.LdaModel(corpus, num_topics = K)
# lda.save('data/' + model_name)

# # Get topic for each image
# img_by_topic = [[] for _ in range(K)]
# for i in range(num_images):
#     ind, val = sorted(lda.get_document_topics(corpus[i]), key=lambda x:x[1])[-1]
#     img_by_topic[ind].append((i, val))

# for j in range(K):
#     img_by_topic[j].sort(key = lambda x: -x[1])

    # Save results
#     with open(DATA_PATH + str(K) + "-topic-res.txt", "wb") as fp:
#         pickle.dump(img_by_topic, fp)

# # load trained data
# lda = models.ldamodel.LdaModel.load(DATA_PATH + model_name)
# print(lda)

In [29]:
# Process the raw data
doc = []
bow = []
for i in range(NUM_IMAGE):
    doc_list = []
    bow_list = []
    for j in range(X.shape[1]): # 4096
        doc_list.append((j, X[i,j]))
        bow_list.append(X[i,j])
    doc.append(doc_list)
    bow.append(bow_list)
    

In [30]:
# Raw output for dist
topic_distribution = lda.print_topics(num_words=NUM_FEATURE)


In [31]:
'''Save distribution of features for every topic'''

# Extract feature ids per topic from raw output
topics = np.zeros([NUM_TOPIC, NUM_FEATURE])
tokenizer = RegexpTokenizer(r'\w+')

for topic_ids in topic_distribution:
    topic_id = topic_ids[0]
    all_features = topic_ids[1]

    feature_values = tokenizer.tokenize(all_features)     # Tokenize the string to keep all numbers
    feature_values = list(map(int, feature_values))       # Convert values of list to int
    feature_values = np.asarray(feature_values[2::3]) * np.asarray(feature_values[1::3])   # Remove all features with 0 weightage in topic
    
    topics[topic_id] = feature_values
        
# Save feature values in npy file
# np.save(DATA_PATH + topics_file_name, topics)

#topics is a numpy array with one row representing one topic. The columns contain the indices of the features belonging to that topic, in decreasing order of how strongly linked a feature is to that topic. The vectors have been padded with 0s in the end.

In [32]:
'''Save distribution of topics, for every image'''

# Save top k topics per image to file
k = min(5, NUM_TOPIC)

topics_per_image = lda[doc]
topics_per_image_matrix = np.zeros([len(topics_per_image),k,2])
print ("No of documents: ", len(topics_per_image))

i = 0
topic_dict = {}
for image_topics in topics_per_image:
    image_topics.sort(key=operator.itemgetter(1), reverse=True)
    print ("Image ",file_list[i], " Topics: ", image_topics[0][0])
    if image_topics[0][0] not in topic_dict:
        topic_dict[image_topics[0][0]] = []
    topic_dict[image_topics[0][0]].append(i)
    for j in range(min(k,len(image_topics))):
        topics_per_image_matrix[i][j][0] = image_topics[j][0]    # Store topic id
        topics_per_image_matrix[i][j][1] = image_topics[j][1]    # Store probability of document having that topic
    i = i+1
    
# Save feature values in npy file
# np.save(DATA_PATH + topics_per_image_file_name, topics_per_image_matrix)

('No of documents: ', 1500)
('Image ', 'en-0-wiki.jpg', ' Topics: ', 0)
('Image ', 'en-1-wiki.jpg', ' Topics: ', 0)
('Image ', 'en-10-wiki.jpg', ' Topics: ', 2)
('Image ', 'en-100-wiki.jpg', ' Topics: ', 1)
('Image ', 'en-101-wiki.jpg', ' Topics: ', 2)
('Image ', 'en-102-wiki.jpg', ' Topics: ', 2)
('Image ', 'en-103-wiki.jpg', ' Topics: ', 2)
('Image ', 'en-104-wiki.jpg', ' Topics: ', 2)
('Image ', 'en-105-wiki.jpg', ' Topics: ', 0)
('Image ', 'en-106-wiki.jpg', ' Topics: ', 0)
('Image ', 'en-107-wiki.jpg', ' Topics: ', 0)
('Image ', 'en-108-wiki.jpg', ' Topics: ', 0)
('Image ', 'en-109-wiki.jpg', ' Topics: ', 0)
('Image ', 'en-11-wiki.jpg', ' Topics: ', 0)
('Image ', 'en-110-wiki.jpg', ' Topics: ', 0)
('Image ', 'en-111-wiki.jpg', ' Topics: ', 2)
('Image ', 'en-112-wiki.jpg', ' Topics: ', 0)
('Image ', 'en-113-wiki.jpg', ' Topics: ', 0)
('Image ', 'en-114-wiki.jpg', ' Topics: ', 2)
('Image ', 'en-115-wiki.jpg', ' Topics: ', 2)
('Image ', 'en-116-wiki.jpg', ' Topics: ', 2)
('Image ', '

('Image ', 'en-28-wiki.jpg', ' Topics: ', 2)
('Image ', 'en-280-wiki.jpg', ' Topics: ', 2)
('Image ', 'en-281-wiki.jpg', ' Topics: ', 1)
('Image ', 'en-282-wiki.jpg', ' Topics: ', 0)
('Image ', 'en-283-wiki.jpg', ' Topics: ', 2)
('Image ', 'en-284-wiki.jpg', ' Topics: ', 2)
('Image ', 'en-285-wiki.jpg', ' Topics: ', 2)
('Image ', 'en-286-wiki.jpg', ' Topics: ', 0)
('Image ', 'en-287-wiki.jpg', ' Topics: ', 2)
('Image ', 'en-288-wiki.jpg', ' Topics: ', 1)
('Image ', 'en-289-wiki.jpg', ' Topics: ', 2)
('Image ', 'en-29-wiki.jpg', ' Topics: ', 0)
('Image ', 'en-290-wiki.jpg', ' Topics: ', 2)
('Image ', 'en-291-wiki.jpg', ' Topics: ', 0)
('Image ', 'en-292-wiki.jpg', ' Topics: ', 2)
('Image ', 'en-293-wiki.jpg', ' Topics: ', 0)
('Image ', 'en-294-wiki.jpg', ' Topics: ', 0)
('Image ', 'en-295-wiki.jpg', ' Topics: ', 2)
('Image ', 'en-296-wiki.jpg', ' Topics: ', 0)
('Image ', 'en-297-wiki.jpg', ' Topics: ', 1)
('Image ', 'en-298-wiki.jpg', ' Topics: ', 0)
('Image ', 'en-299-wiki.jpg', ' Topi

('Image ', 'en-45-wiki.jpg', ' Topics: ', 2)
('Image ', 'en-450-wiki.jpg', ' Topics: ', 0)
('Image ', 'en-451-wiki.jpg', ' Topics: ', 2)
('Image ', 'en-452-wiki.jpg', ' Topics: ', 2)
('Image ', 'en-453-wiki.jpg', ' Topics: ', 2)
('Image ', 'en-454-wiki.jpg', ' Topics: ', 0)
('Image ', 'en-455-wiki.jpg', ' Topics: ', 0)
('Image ', 'en-456-wiki.jpg', ' Topics: ', 0)
('Image ', 'en-457-wiki.jpg', ' Topics: ', 2)
('Image ', 'en-458-wiki.jpg', ' Topics: ', 2)
('Image ', 'en-459-wiki.jpg', ' Topics: ', 0)
('Image ', 'en-46-wiki.jpg', ' Topics: ', 0)
('Image ', 'en-460-wiki.jpg', ' Topics: ', 2)
('Image ', 'en-461-wiki.jpg', ' Topics: ', 2)
('Image ', 'en-462-wiki.jpg', ' Topics: ', 2)
('Image ', 'en-463-wiki.jpg', ' Topics: ', 0)
('Image ', 'en-464-wiki.jpg', ' Topics: ', 1)
('Image ', 'en-465-wiki.jpg', ' Topics: ', 0)
('Image ', 'en-466-wiki.jpg', ' Topics: ', 0)
('Image ', 'en-467-wiki.jpg', ' Topics: ', 2)
('Image ', 'en-468-wiki.jpg', ' Topics: ', 2)
('Image ', 'en-469-wiki.jpg', ' Topi

('Image ', 'ru-173-wiki.jpg', ' Topics: ', 0)
('Image ', 'ru-174-wiki.jpg', ' Topics: ', 2)
('Image ', 'ru-175-wiki.jpg', ' Topics: ', 2)
('Image ', 'ru-176-wiki.jpg', ' Topics: ', 1)
('Image ', 'ru-177-wiki.jpg', ' Topics: ', 1)
('Image ', 'ru-178-wiki.jpg', ' Topics: ', 2)
('Image ', 'ru-179-wiki.jpg', ' Topics: ', 2)
('Image ', 'ru-18-wiki.jpg', ' Topics: ', 2)
('Image ', 'ru-180-wiki.jpg', ' Topics: ', 2)
('Image ', 'ru-181-wiki.jpg', ' Topics: ', 1)
('Image ', 'ru-182-wiki.jpg', ' Topics: ', 2)
('Image ', 'ru-183-wiki.jpg', ' Topics: ', 1)
('Image ', 'ru-184-wiki.jpg', ' Topics: ', 2)
('Image ', 'ru-185-wiki.jpg', ' Topics: ', 2)
('Image ', 'ru-186-wiki.jpg', ' Topics: ', 2)
('Image ', 'ru-187-wiki.jpg', ' Topics: ', 2)
('Image ', 'ru-188-wiki.jpg', ' Topics: ', 2)
('Image ', 'ru-189-wiki.jpg', ' Topics: ', 2)
('Image ', 'ru-19-wiki.jpg', ' Topics: ', 1)
('Image ', 'ru-190-wiki.jpg', ' Topics: ', 1)
('Image ', 'ru-191-wiki.jpg', ' Topics: ', 2)
('Image ', 'ru-192-wiki.jpg', ' Topi

('Image ', 'ru-337-wiki.jpg', ' Topics: ', 2)
('Image ', 'ru-338-wiki.jpg', ' Topics: ', 2)
('Image ', 'ru-339-wiki.jpg', ' Topics: ', 2)
('Image ', 'ru-34-wiki.jpg', ' Topics: ', 2)
('Image ', 'ru-340-wiki.jpg', ' Topics: ', 2)
('Image ', 'ru-341-wiki.jpg', ' Topics: ', 2)
('Image ', 'ru-342-wiki.jpg', ' Topics: ', 2)
('Image ', 'ru-343-wiki.jpg', ' Topics: ', 2)
('Image ', 'ru-344-wiki.jpg', ' Topics: ', 2)
('Image ', 'ru-345-wiki.jpg', ' Topics: ', 1)
('Image ', 'ru-346-wiki.jpg', ' Topics: ', 1)
('Image ', 'ru-347-wiki.jpg', ' Topics: ', 1)
('Image ', 'ru-348-wiki.jpg', ' Topics: ', 2)
('Image ', 'ru-349-wiki.jpg', ' Topics: ', 2)
('Image ', 'ru-35-wiki.jpg', ' Topics: ', 0)
('Image ', 'ru-350-wiki.jpg', ' Topics: ', 1)
('Image ', 'ru-351-wiki.jpg', ' Topics: ', 2)
('Image ', 'ru-352-wiki.jpg', ' Topics: ', 2)
('Image ', 'ru-353-wiki.jpg', ' Topics: ', 1)
('Image ', 'ru-354-wiki.jpg', ' Topics: ', 1)
('Image ', 'ru-355-wiki.jpg', ' Topics: ', 1)
('Image ', 'ru-356-wiki.jpg', ' Topi

('Image ', 'ru-55-wiki.jpg', ' Topics: ', 0)
('Image ', 'ru-56-wiki.jpg', ' Topics: ', 2)
('Image ', 'ru-57-wiki.jpg', ' Topics: ', 2)
('Image ', 'ru-58-wiki.jpg', ' Topics: ', 1)
('Image ', 'ru-59-wiki.jpg', ' Topics: ', 2)
('Image ', 'ru-6-wiki.jpg', ' Topics: ', 2)
('Image ', 'ru-60-wiki.jpg', ' Topics: ', 1)
('Image ', 'ru-61-wiki.jpg', ' Topics: ', 1)
('Image ', 'ru-62-wiki.jpg', ' Topics: ', 1)
('Image ', 'ru-63-wiki.jpg', ' Topics: ', 2)
('Image ', 'ru-64-wiki.jpg', ' Topics: ', 2)
('Image ', 'ru-65-wiki.jpg', ' Topics: ', 2)
('Image ', 'ru-66-wiki.jpg', ' Topics: ', 2)
('Image ', 'ru-67-wiki.jpg', ' Topics: ', 1)
('Image ', 'ru-68-wiki.jpg', ' Topics: ', 2)
('Image ', 'ru-69-wiki.jpg', ' Topics: ', 1)
('Image ', 'ru-7-wiki.jpg', ' Topics: ', 1)
('Image ', 'ru-70-wiki.jpg', ' Topics: ', 2)
('Image ', 'ru-71-wiki.jpg', ' Topics: ', 2)
('Image ', 'ru-72-wiki.jpg', ' Topics: ', 1)
('Image ', 'ru-73-wiki.jpg', ' Topics: ', 1)
('Image ', 'ru-74-wiki.jpg', ' Topics: ', 1)
('Image ', '

('Image ', 'zh-220-wiki.jpg', ' Topics: ', 2)
('Image ', 'zh-221-wiki.jpg', ' Topics: ', 1)
('Image ', 'zh-222-wiki.jpg', ' Topics: ', 0)
('Image ', 'zh-223-wiki.jpg', ' Topics: ', 2)
('Image ', 'zh-224-wiki.jpg', ' Topics: ', 2)
('Image ', 'zh-225-wiki.jpg', ' Topics: ', 1)
('Image ', 'zh-226-wiki.jpg', ' Topics: ', 0)
('Image ', 'zh-227-wiki.jpg', ' Topics: ', 2)
('Image ', 'zh-228-wiki.jpg', ' Topics: ', 2)
('Image ', 'zh-229-wiki.jpg', ' Topics: ', 1)
('Image ', 'zh-23-wiki.jpg', ' Topics: ', 0)
('Image ', 'zh-230-wiki.jpg', ' Topics: ', 2)
('Image ', 'zh-231-wiki.jpg', ' Topics: ', 0)
('Image ', 'zh-232-wiki.jpg', ' Topics: ', 1)
('Image ', 'zh-233-wiki.jpg', ' Topics: ', 0)
('Image ', 'zh-234-wiki.jpg', ' Topics: ', 2)
('Image ', 'zh-235-wiki.jpg', ' Topics: ', 1)
('Image ', 'zh-236-wiki.jpg', ' Topics: ', 2)
('Image ', 'zh-237-wiki.jpg', ' Topics: ', 2)
('Image ', 'zh-238-wiki.jpg', ' Topics: ', 2)
('Image ', 'zh-239-wiki.jpg', ' Topics: ', 2)
('Image ', 'zh-24-wiki.jpg', ' Topi

('Image ', 'zh-389-wiki.jpg', ' Topics: ', 0)
('Image ', 'zh-39-wiki.jpg', ' Topics: ', 1)
('Image ', 'zh-390-wiki.jpg', ' Topics: ', 1)
('Image ', 'zh-391-wiki.jpg', ' Topics: ', 2)
('Image ', 'zh-392-wiki.jpg', ' Topics: ', 2)
('Image ', 'zh-393-wiki.jpg', ' Topics: ', 0)
('Image ', 'zh-394-wiki.jpg', ' Topics: ', 2)
('Image ', 'zh-395-wiki.jpg', ' Topics: ', 1)
('Image ', 'zh-396-wiki.jpg', ' Topics: ', 1)
('Image ', 'zh-397-wiki.jpg', ' Topics: ', 2)
('Image ', 'zh-398-wiki.jpg', ' Topics: ', 1)
('Image ', 'zh-399-wiki.jpg', ' Topics: ', 2)
('Image ', 'zh-4-wiki.jpg', ' Topics: ', 1)
('Image ', 'zh-40-wiki.jpg', ' Topics: ', 0)
('Image ', 'zh-400-wiki.jpg', ' Topics: ', 2)
('Image ', 'zh-401-wiki.jpg', ' Topics: ', 2)
('Image ', 'zh-402-wiki.jpg', ' Topics: ', 2)
('Image ', 'zh-403-wiki.jpg', ' Topics: ', 2)
('Image ', 'zh-404-wiki.jpg', ' Topics: ', 2)
('Image ', 'zh-405-wiki.jpg', ' Topics: ', 1)
('Image ', 'zh-406-wiki.jpg', ' Topics: ', 2)
('Image ', 'zh-407-wiki.jpg', ' Topics

In [33]:
print topics_per_image_matrix

[[[0.         0.90775186]
  [1.         0.09120916]
  [0.         0.        ]]

 [[0.         0.98831916]
  [0.         0.        ]
  [0.         0.        ]]

 [[2.         0.99088699]
  [0.         0.        ]
  [0.         0.        ]]

 ...

 [[2.         0.99622434]
  [0.         0.        ]
  [0.         0.        ]]

 [[0.         0.53524536]
  [1.         0.45893496]
  [0.         0.        ]]

 [[1.         0.70534462]
  [0.         0.29368287]
  [0.         0.        ]]]


In [34]:
# Statistics
t = [0, 0, 0]
for i in range(500):
    t[int(topics_per_image_matrix[i][0][0])] += 1
print "For ENG articles, " + str(t[0]) + " belongs to topic 0"
print ", " + str(t[1]) + " belongs to topic 1"
print ", " + str(t[2]) + " belongs to topic 2"

t = [0, 0, 0]
for i in range(500,1000):
    t[int(topics_per_image_matrix[i][0][0])] += 1
print "For RUS articles, " + str(t[0]) + " belongs to topic 0"
print ", " + str(t[1]) + " belongs to topic 1"
print ", " + str(t[2]) + " belongs to topic 2"

t = [0, 0, 0]
for i in range(1000, 1500):
    t[int(topics_per_image_matrix[i][0][0])] += 1
print "For CHN articles, " + str(t[0]) + " belongs to topic 0"
print ", " + str(t[1]) + " belongs to topic 1"
print ", " + str(t[2]) + " belongs to topic 2"

For ENG articles, 170 belongs to topic 0
, 47 belongs to topic 1
, 283 belongs to topic 2
For RUS articles, 44 belongs to topic 0
, 164 belongs to topic 1
, 292 belongs to topic 2
For CHN articles, 127 belongs to topic 0
, 112 belongs to topic 1
, 261 belongs to topic 2
