In [1]:
import numpy as np
import math
import pickle
import operator
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import urllib2
import time
from scipy.misc import imread
from scipy.misc import imresize
from scipy.optimize import linear_sum_assignment
from gensim import corpora, models
from nltk.tokenize import RegexpTokenizer

In [2]:
DATA_PATH = "/home/will/cvdata/workspace/data2/"

In [None]:
# Topics count (7, 20, 50, 100)
NUM_TOPIC = 7
# Features for Topics
NUM_FEATURE = 4096
# Amount of images being classified
NUM_IMAGE = 3000

In [4]:
# Load image data
image_data = np.load(DATA_PATH + 'image_list_yfcc100m_output_100k_crop.npy')
print(image_data.shape)

(99939,)


In [5]:
# Load Training set data
X = np.load(DATA_PATH + 'one_hot_yfcc100m.npy')

In [6]:
# Get data statistics, as a sanity check
print("Daxta shape: ", X.shape)
print("Number of 0s: ", np.sum(X == 1))
print("Number of 1s: ", np.sum(X == 0))
print("Anomailes: ",np.sum([X < 0]))

('Daxta shape: ', (99939, 4096))
('Number of 0s: ', 52660733)
('Number of 1s: ', 356689411)
('Anomailes: ', 0)


In [None]:
# Prepare for corpus
# corpus = [[(j, X[i, j]) for j in range(NUM_FEATURE) if X[i, j]==1] for i in range(NUM_IMAGE)]
# corpora.MmCorpus.serialize('data/corpus.mm', corpus)

# Load corpus
corpus = corpora.MmCorpus('data/corpus.mm')

In [7]:
curr_time = str(int(time.time()))
model_name = str(NUM_TOPIC) + '-topics.model'
topics_file_name = 'topic_model_features_' + str(NUM_TOPIC) +'_topics_'+ curr_time + '.npy'
topics_per_image_file_name = 'topics_per_image-' + str(NUM_TOPIC) +'_topics_'+ curr_time + '.npy'
print("Loading model: " + model_name)

Loading model: 20-topics.model


In [8]:
# for K in num_topics:
#     # Create the Topic Model
#     model_name = str(K) + '-topics.model'
#     lda = models.ldamodel.LdaModel(corpus, num_topics = K)
#     lda.save('data/' + model_name)

#     # Get topic for each image
#     img_by_topic = [[] for _ in range(K)]
#     for i in range(num_images):
#         ind, val = sorted(lda.get_document_topics(corpus[i]), key=lambda x:x[1])[-1]
#         img_by_topic[ind].append((i, val))

#     for j in range(K):
#         img_by_topic[j].sort(key = lambda x: -x[1])

#     # Save results
#     with open(DATA_PATH + str(K) + "-topic-res.txt", "wb") as fp:
#         pickle.dump(img_by_topic, fp)
        
# load trained data
lda = models.ldamodel.LdaModel.load(DATA_PATH + model_name)
print(lda)

LdaModel(num_terms=4096, num_topics=20, decay=0.5, chunksize=2000)


In [9]:
# Process the raw data
doc = []
bow = []
for i in range(NUM_IMAGE):
    doc_list = []
    bow_list = []
    for j in range(X.shape[1]): # 4096
        doc_list.append((j, X[i,j]))
        bow_list.append(X[i,j])
    doc.append(doc_list)
    bow.append(bow_list)

In [10]:
# Raw output for dist
topic_distribution = lda.print_topics(num_words=NUM_FEATURE)

In [11]:
'''Save distribution of features for every topic'''

# Extract feature ids per topic from raw output
topics = np.zeros([NUM_TOPIC, NUM_FEATURE])
tokenizer = RegexpTokenizer(r'\w+')

for topic_ids in topic_distribution:
    topic_id = topic_ids[0]
    all_features = topic_ids[1]

    feature_values = tokenizer.tokenize(all_features)     # Tokenize the string to keep all numbers
    feature_values = list(map(int, feature_values))       # Convert values of list to int
    feature_values = np.asarray(feature_values[2::3]) * np.asarray(feature_values[1::3])   # Remove all features with 0 weightage in topic
    
    topics[topic_id] = feature_values
        
# Save feature values in npy file
np.save(DATA_PATH + topics_file_name, topics)

#topics is a numpy array with one row representing one topic. The columns contain the indices of the features belonging to that topic, in decreasing order of how strongly linked a feature is to that topic. The vectors have been padded with 0s in the end.

In [12]:
'''Save distribution of topics, for every image'''

# Save top k topics per image to file
k = 5

topics_per_image = lda[doc]
topics_per_image_matrix = np.zeros([len(topics_per_image),k,2])
print ("No of documents: ", len(topics_per_image))

i = 0
topic_dict = {}
for image_topics in topics_per_image:
    image_topics.sort(key=operator.itemgetter(1), reverse=True)
    print ("Image ",i, " Topics: ", image_topics[0][0])
    if image_topics[0][0] not in topic_dict:
        topic_dict[image_topics[0][0]] = []
    topic_dict[image_topics[0][0]].append(i)
    for j in range(min(k,len(image_topics))):
        topics_per_image_matrix[i][j][0] = image_topics[j][0]    # Store topic id
        topics_per_image_matrix[i][j][1] = image_topics[j][1]    # Store probability of document having that topic
    i = i+1
    
# Save feature values in npy file
np.save(DATA_PATH + topics_per_image_file_name, topics_per_image_matrix)

('No of documents: ', 3000)
('Image ', 0, ' Topics: ', 9)
('Image ', 1, ' Topics: ', 8)
('Image ', 2, ' Topics: ', 10)
('Image ', 3, ' Topics: ', 10)
('Image ', 4, ' Topics: ', 2)
('Image ', 5, ' Topics: ', 19)
('Image ', 6, ' Topics: ', 7)
('Image ', 7, ' Topics: ', 3)
('Image ', 8, ' Topics: ', 1)
('Image ', 9, ' Topics: ', 15)
('Image ', 10, ' Topics: ', 9)
('Image ', 11, ' Topics: ', 11)
('Image ', 12, ' Topics: ', 4)
('Image ', 13, ' Topics: ', 10)
('Image ', 14, ' Topics: ', 18)
('Image ', 15, ' Topics: ', 19)
('Image ', 16, ' Topics: ', 2)
('Image ', 17, ' Topics: ', 7)
('Image ', 18, ' Topics: ', 7)
('Image ', 19, ' Topics: ', 2)
('Image ', 20, ' Topics: ', 4)
('Image ', 21, ' Topics: ', 8)
('Image ', 22, ' Topics: ', 6)
('Image ', 23, ' Topics: ', 17)
('Image ', 24, ' Topics: ', 13)
('Image ', 25, ' Topics: ', 6)
('Image ', 26, ' Topics: ', 3)
('Image ', 27, ' Topics: ', 6)
('Image ', 28, ' Topics: ', 2)
('Image ', 29, ' Topics: ', 18)
('Image ', 30, ' Topics: ', 18)
('Image '

('Image ', 280, ' Topics: ', 11)
('Image ', 281, ' Topics: ', 4)
('Image ', 282, ' Topics: ', 6)
('Image ', 283, ' Topics: ', 19)
('Image ', 284, ' Topics: ', 11)
('Image ', 285, ' Topics: ', 1)
('Image ', 286, ' Topics: ', 14)
('Image ', 287, ' Topics: ', 18)
('Image ', 288, ' Topics: ', 1)
('Image ', 289, ' Topics: ', 9)
('Image ', 290, ' Topics: ', 9)
('Image ', 291, ' Topics: ', 17)
('Image ', 292, ' Topics: ', 11)
('Image ', 293, ' Topics: ', 1)
('Image ', 294, ' Topics: ', 18)
('Image ', 295, ' Topics: ', 11)
('Image ', 296, ' Topics: ', 0)
('Image ', 297, ' Topics: ', 10)
('Image ', 298, ' Topics: ', 11)
('Image ', 299, ' Topics: ', 17)
('Image ', 300, ' Topics: ', 1)
('Image ', 301, ' Topics: ', 1)
('Image ', 302, ' Topics: ', 2)
('Image ', 303, ' Topics: ', 4)
('Image ', 304, ' Topics: ', 19)
('Image ', 305, ' Topics: ', 1)
('Image ', 306, ' Topics: ', 2)
('Image ', 307, ' Topics: ', 19)
('Image ', 308, ' Topics: ', 18)
('Image ', 309, ' Topics: ', 7)
('Image ', 310, ' Topics:

('Image ', 534, ' Topics: ', 15)
('Image ', 535, ' Topics: ', 17)
('Image ', 536, ' Topics: ', 18)
('Image ', 537, ' Topics: ', 2)
('Image ', 538, ' Topics: ', 15)
('Image ', 539, ' Topics: ', 18)
('Image ', 540, ' Topics: ', 0)
('Image ', 541, ' Topics: ', 17)
('Image ', 542, ' Topics: ', 19)
('Image ', 543, ' Topics: ', 16)
('Image ', 544, ' Topics: ', 1)
('Image ', 545, ' Topics: ', 9)
('Image ', 546, ' Topics: ', 3)
('Image ', 547, ' Topics: ', 6)
('Image ', 548, ' Topics: ', 12)
('Image ', 549, ' Topics: ', 3)
('Image ', 550, ' Topics: ', 15)
('Image ', 551, ' Topics: ', 3)
('Image ', 552, ' Topics: ', 8)
('Image ', 553, ' Topics: ', 19)
('Image ', 554, ' Topics: ', 18)
('Image ', 555, ' Topics: ', 18)
('Image ', 556, ' Topics: ', 2)
('Image ', 557, ' Topics: ', 18)
('Image ', 558, ' Topics: ', 13)
('Image ', 559, ' Topics: ', 17)
('Image ', 560, ' Topics: ', 8)
('Image ', 561, ' Topics: ', 18)
('Image ', 562, ' Topics: ', 3)
('Image ', 563, ' Topics: ', 11)
('Image ', 564, ' Topi

('Image ', 823, ' Topics: ', 11)
('Image ', 824, ' Topics: ', 19)
('Image ', 825, ' Topics: ', 16)
('Image ', 826, ' Topics: ', 5)
('Image ', 827, ' Topics: ', 3)
('Image ', 828, ' Topics: ', 8)
('Image ', 829, ' Topics: ', 8)
('Image ', 830, ' Topics: ', 4)
('Image ', 831, ' Topics: ', 13)
('Image ', 832, ' Topics: ', 13)
('Image ', 833, ' Topics: ', 7)
('Image ', 834, ' Topics: ', 7)
('Image ', 835, ' Topics: ', 11)
('Image ', 836, ' Topics: ', 18)
('Image ', 837, ' Topics: ', 9)
('Image ', 838, ' Topics: ', 18)
('Image ', 839, ' Topics: ', 10)
('Image ', 840, ' Topics: ', 2)
('Image ', 841, ' Topics: ', 1)
('Image ', 842, ' Topics: ', 9)
('Image ', 843, ' Topics: ', 7)
('Image ', 844, ' Topics: ', 2)
('Image ', 845, ' Topics: ', 2)
('Image ', 846, ' Topics: ', 9)
('Image ', 847, ' Topics: ', 2)
('Image ', 848, ' Topics: ', 7)
('Image ', 849, ' Topics: ', 6)
('Image ', 850, ' Topics: ', 2)
('Image ', 851, ' Topics: ', 12)
('Image ', 852, ' Topics: ', 4)
('Image ', 853, ' Topics: ', 8

('Image ', 1107, ' Topics: ', 8)
('Image ', 1108, ' Topics: ', 2)
('Image ', 1109, ' Topics: ', 10)
('Image ', 1110, ' Topics: ', 2)
('Image ', 1111, ' Topics: ', 7)
('Image ', 1112, ' Topics: ', 3)
('Image ', 1113, ' Topics: ', 5)
('Image ', 1114, ' Topics: ', 19)
('Image ', 1115, ' Topics: ', 6)
('Image ', 1116, ' Topics: ', 2)
('Image ', 1117, ' Topics: ', 5)
('Image ', 1118, ' Topics: ', 12)
('Image ', 1119, ' Topics: ', 13)
('Image ', 1120, ' Topics: ', 11)
('Image ', 1121, ' Topics: ', 18)
('Image ', 1122, ' Topics: ', 6)
('Image ', 1123, ' Topics: ', 0)
('Image ', 1124, ' Topics: ', 6)
('Image ', 1125, ' Topics: ', 10)
('Image ', 1126, ' Topics: ', 8)
('Image ', 1127, ' Topics: ', 7)
('Image ', 1128, ' Topics: ', 6)
('Image ', 1129, ' Topics: ', 11)
('Image ', 1130, ' Topics: ', 3)
('Image ', 1131, ' Topics: ', 7)
('Image ', 1132, ' Topics: ', 1)
('Image ', 1133, ' Topics: ', 18)
('Image ', 1134, ' Topics: ', 4)
('Image ', 1135, ' Topics: ', 18)
('Image ', 1136, ' Topics: ', 10)

('Image ', 1364, ' Topics: ', 13)
('Image ', 1365, ' Topics: ', 7)
('Image ', 1366, ' Topics: ', 0)
('Image ', 1367, ' Topics: ', 10)
('Image ', 1368, ' Topics: ', 2)
('Image ', 1369, ' Topics: ', 7)
('Image ', 1370, ' Topics: ', 9)
('Image ', 1371, ' Topics: ', 19)
('Image ', 1372, ' Topics: ', 5)
('Image ', 1373, ' Topics: ', 4)
('Image ', 1374, ' Topics: ', 6)
('Image ', 1375, ' Topics: ', 3)
('Image ', 1376, ' Topics: ', 5)
('Image ', 1377, ' Topics: ', 0)
('Image ', 1378, ' Topics: ', 4)
('Image ', 1379, ' Topics: ', 4)
('Image ', 1380, ' Topics: ', 13)
('Image ', 1381, ' Topics: ', 18)
('Image ', 1382, ' Topics: ', 6)
('Image ', 1383, ' Topics: ', 16)
('Image ', 1384, ' Topics: ', 11)
('Image ', 1385, ' Topics: ', 12)
('Image ', 1386, ' Topics: ', 13)
('Image ', 1387, ' Topics: ', 11)
('Image ', 1388, ' Topics: ', 6)
('Image ', 1389, ' Topics: ', 9)
('Image ', 1390, ' Topics: ', 3)
('Image ', 1391, ' Topics: ', 13)
('Image ', 1392, ' Topics: ', 13)
('Image ', 1393, ' Topics: ', 1

('Image ', 1627, ' Topics: ', 6)
('Image ', 1628, ' Topics: ', 2)
('Image ', 1629, ' Topics: ', 13)
('Image ', 1630, ' Topics: ', 7)
('Image ', 1631, ' Topics: ', 9)
('Image ', 1632, ' Topics: ', 14)
('Image ', 1633, ' Topics: ', 5)
('Image ', 1634, ' Topics: ', 3)
('Image ', 1635, ' Topics: ', 5)
('Image ', 1636, ' Topics: ', 13)
('Image ', 1637, ' Topics: ', 10)
('Image ', 1638, ' Topics: ', 2)
('Image ', 1639, ' Topics: ', 14)
('Image ', 1640, ' Topics: ', 13)
('Image ', 1641, ' Topics: ', 5)
('Image ', 1642, ' Topics: ', 2)
('Image ', 1643, ' Topics: ', 2)
('Image ', 1644, ' Topics: ', 7)
('Image ', 1645, ' Topics: ', 0)
('Image ', 1646, ' Topics: ', 2)
('Image ', 1647, ' Topics: ', 1)
('Image ', 1648, ' Topics: ', 14)
('Image ', 1649, ' Topics: ', 13)
('Image ', 1650, ' Topics: ', 14)
('Image ', 1651, ' Topics: ', 2)
('Image ', 1652, ' Topics: ', 2)
('Image ', 1653, ' Topics: ', 17)
('Image ', 1654, ' Topics: ', 12)
('Image ', 1655, ' Topics: ', 19)
('Image ', 1656, ' Topics: ', 4

('Image ', 1904, ' Topics: ', 12)
('Image ', 1905, ' Topics: ', 18)
('Image ', 1906, ' Topics: ', 12)
('Image ', 1907, ' Topics: ', 0)
('Image ', 1908, ' Topics: ', 18)
('Image ', 1909, ' Topics: ', 8)
('Image ', 1910, ' Topics: ', 12)
('Image ', 1911, ' Topics: ', 6)
('Image ', 1912, ' Topics: ', 3)
('Image ', 1913, ' Topics: ', 4)
('Image ', 1914, ' Topics: ', 5)
('Image ', 1915, ' Topics: ', 6)
('Image ', 1916, ' Topics: ', 9)
('Image ', 1917, ' Topics: ', 8)
('Image ', 1918, ' Topics: ', 16)
('Image ', 1919, ' Topics: ', 18)
('Image ', 1920, ' Topics: ', 13)
('Image ', 1921, ' Topics: ', 15)
('Image ', 1922, ' Topics: ', 18)
('Image ', 1923, ' Topics: ', 9)
('Image ', 1924, ' Topics: ', 8)
('Image ', 1925, ' Topics: ', 1)
('Image ', 1926, ' Topics: ', 2)
('Image ', 1927, ' Topics: ', 9)
('Image ', 1928, ' Topics: ', 8)
('Image ', 1929, ' Topics: ', 12)
('Image ', 1930, ' Topics: ', 7)
('Image ', 1931, ' Topics: ', 4)
('Image ', 1932, ' Topics: ', 2)
('Image ', 1933, ' Topics: ', 13

('Image ', 2186, ' Topics: ', 4)
('Image ', 2187, ' Topics: ', 10)
('Image ', 2188, ' Topics: ', 9)
('Image ', 2189, ' Topics: ', 2)
('Image ', 2190, ' Topics: ', 2)
('Image ', 2191, ' Topics: ', 10)
('Image ', 2192, ' Topics: ', 11)
('Image ', 2193, ' Topics: ', 9)
('Image ', 2194, ' Topics: ', 5)
('Image ', 2195, ' Topics: ', 7)
('Image ', 2196, ' Topics: ', 7)
('Image ', 2197, ' Topics: ', 9)
('Image ', 2198, ' Topics: ', 7)
('Image ', 2199, ' Topics: ', 17)
('Image ', 2200, ' Topics: ', 2)
('Image ', 2201, ' Topics: ', 10)
('Image ', 2202, ' Topics: ', 17)
('Image ', 2203, ' Topics: ', 7)
('Image ', 2204, ' Topics: ', 4)
('Image ', 2205, ' Topics: ', 5)
('Image ', 2206, ' Topics: ', 12)
('Image ', 2207, ' Topics: ', 0)
('Image ', 2208, ' Topics: ', 17)
('Image ', 2209, ' Topics: ', 6)
('Image ', 2210, ' Topics: ', 19)
('Image ', 2211, ' Topics: ', 5)
('Image ', 2212, ' Topics: ', 19)
('Image ', 2213, ' Topics: ', 7)
('Image ', 2214, ' Topics: ', 0)
('Image ', 2215, ' Topics: ', 5)


('Image ', 2495, ' Topics: ', 15)
('Image ', 2496, ' Topics: ', 10)
('Image ', 2497, ' Topics: ', 7)
('Image ', 2498, ' Topics: ', 10)
('Image ', 2499, ' Topics: ', 16)
('Image ', 2500, ' Topics: ', 5)
('Image ', 2501, ' Topics: ', 4)
('Image ', 2502, ' Topics: ', 0)
('Image ', 2503, ' Topics: ', 14)
('Image ', 2504, ' Topics: ', 16)
('Image ', 2505, ' Topics: ', 9)
('Image ', 2506, ' Topics: ', 15)
('Image ', 2507, ' Topics: ', 10)
('Image ', 2508, ' Topics: ', 1)
('Image ', 2509, ' Topics: ', 9)
('Image ', 2510, ' Topics: ', 18)
('Image ', 2511, ' Topics: ', 0)
('Image ', 2512, ' Topics: ', 15)
('Image ', 2513, ' Topics: ', 5)
('Image ', 2514, ' Topics: ', 16)
('Image ', 2515, ' Topics: ', 0)
('Image ', 2516, ' Topics: ', 7)
('Image ', 2517, ' Topics: ', 7)
('Image ', 2518, ' Topics: ', 17)
('Image ', 2519, ' Topics: ', 7)
('Image ', 2520, ' Topics: ', 18)
('Image ', 2521, ' Topics: ', 2)
('Image ', 2522, ' Topics: ', 16)
('Image ', 2523, ' Topics: ', 7)
('Image ', 2524, ' Topics: ',

('Image ', 2774, ' Topics: ', 17)
('Image ', 2775, ' Topics: ', 0)
('Image ', 2776, ' Topics: ', 5)
('Image ', 2777, ' Topics: ', 0)
('Image ', 2778, ' Topics: ', 10)
('Image ', 2779, ' Topics: ', 10)
('Image ', 2780, ' Topics: ', 18)
('Image ', 2781, ' Topics: ', 0)
('Image ', 2782, ' Topics: ', 2)
('Image ', 2783, ' Topics: ', 15)
('Image ', 2784, ' Topics: ', 9)
('Image ', 2785, ' Topics: ', 8)
('Image ', 2786, ' Topics: ', 15)
('Image ', 2787, ' Topics: ', 1)
('Image ', 2788, ' Topics: ', 4)
('Image ', 2789, ' Topics: ', 5)
('Image ', 2790, ' Topics: ', 4)
('Image ', 2791, ' Topics: ', 10)
('Image ', 2792, ' Topics: ', 3)
('Image ', 2793, ' Topics: ', 0)
('Image ', 2794, ' Topics: ', 0)
('Image ', 2795, ' Topics: ', 2)
('Image ', 2796, ' Topics: ', 13)
('Image ', 2797, ' Topics: ', 0)
('Image ', 2798, ' Topics: ', 2)
('Image ', 2799, ' Topics: ', 0)
('Image ', 2800, ' Topics: ', 10)
('Image ', 2801, ' Topics: ', 7)
('Image ', 2802, ' Topics: ', 17)
('Image ', 2803, ' Topics: ', 12)

In [13]:
for x in topic_dict:
    print (str(x) + ': ' + str(len(topic_dict[x])))
#     print (topic_dict[x])

0: 129
1: 126
2: 300
3: 136
4: 111
5: 137
6: 199
7: 210
8: 109
9: 131
10: 261
11: 106
12: 112
13: 143
14: 79
15: 127
16: 117
17: 107
18: 198
19: 162


In [None]:
# Visualize
with open(DATA_PATH + , "rb") as fp:
    img_by_topic = pickle.load(fp)
    
print img_by_topic[topic][i]