# Classificador de tags (II)

Entrenem un classificador Naive Bayes multinomial (multiclasse: una sola etiqueta per imatge) amb els tags

In [5]:
import math
import os
import pickle
import random
import re
import time
import urllib2
from random import shuffle
from urllib2 import URLError

import nltk
import numpy as np
import pandas as pd
import selenium.webdriver as webdriver
from bs4 import BeautifulSoup
from selenium.common.exceptions import NoSuchElementException
from sklearn.naive_bayes import MultinomialNB

In [6]:
# load and save lists and dictionaries with pickle
def save_obj(obj, name ):
    """Pickle ``obj`` into the file ``<name>.pkl``.

    The file is opened in binary write mode (an existing file is
    overwritten) and the highest available pickle protocol is used.
    """
    target_path = name + '.pkl'
    with open(target_path, 'wb') as handle:
        pickle.dump(obj, handle, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    """Return the object stored in the pickle file ``<name>.pkl``.

    Unpickling can execute arbitrary code, so this should only be used
    on files that come from a trusted source.
    """
    source_path = name + '.pkl'
    with open(source_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded
    
# normalize with L2 norm
def l2_normalizer(vec):
    """Return ``vec`` scaled to unit Euclidean (L2) length, as a list.

    The norm is computed once instead of once per element. A vector whose
    norm is zero (all zeros, or empty) cannot be scaled; in that case a
    list of zeros of the same length is returned instead of dividing by
    zero.
    """
    norm = math.sqrt(np.sum([el**2 for el in vec]))
    if norm == 0:
        # Nothing to scale: avoid ZeroDivisionError / NaN entries.
        return [0.0 for _ in vec]
    return [(el / norm) for el in vec]

# Euclidean (L2) length of a vector
def l2_norm(vec):
    """Return the square root of the sum of the squared entries of ``vec``."""
    squared_total = np.sum([component**2 for component in vec])
    length = math.sqrt(squared_total)
    return length

def img_counter_per_rest_types(img_list):
    """Count how many images carry each restaurant-type label.

    ``img_list`` is a sequence of items whose element ``[1]`` is a
    non-negative integer label. The result is a list in which position
    ``k`` holds the number of items labelled ``k``; its length is
    ``max(label) + 1`` so that every label present can be used as an
    index (labels that never occur get a count of 0). An empty input
    yields an empty list.
    """
    labels = [item[1] for item in img_list]
    if not labels:
        return []
    # Size by the largest label rather than by the number of distinct
    # labels: a missing class must not make a higher label fall outside
    # the list.
    counters = [0] * (max(labels) + 1)
    for label in labels:
        counters[label] += 1
    return counters

def listHashtags(string):
    """Return the hashtags found in ``string``, lower-cased, in order.

    A hashtag is a ``#`` immediately followed by one or more word
    characters; the leading ``#`` is not part of the returned tag. A
    ``#`` that is not followed by a word character is ignored, and tags
    that share a prefix (``#food`` / ``#foodie``) are each returned
    intact. Every occurrence is reported, so a repeated tag appears
    more than once.
    """
    return [tag.lower() for tag in re.findall(r'#(\w+)', string)]

In [7]:
# rest_list is a list of dicts, one per restaurant (name, loc, code, images, ...)
rest_list = load_obj('rest_list')

# Keep only what the classifier needs: the tags of every tagged image and
# the restaurant type ('ESPECIALIDAD') of the restaurant it belongs to.
img_list = []
for rest in rest_list:
    try:
        for img in rest['IMAGES']:
            if len(img['tags']) != 0:
                img_list.append([img['tags'], rest['ESPECIALIDAD']])
    except KeyError:
        # A missing key stops processing of this restaurant; move on.
        pass

# The position in this list is the numeric label of each restaurant type:
# cocina casera -> 0, tapas -> 1, paellas y arroces -> 2, rapida -> 3,
# italiana -> 4, asiatica -> 5, carnes -> 6, dietetica -> 7
rest_types = [
    'Cocina casera',
    'Tapas, pintxos y platillos',
    'Paellas y arroces',
    'Cocina rapida',
    'Cocina italiana',
    'Cocina asiatica',
    'Carnes a la brasa',
    'Cocina dietetica, naturista, vegetariana y biologica',
]

# Replace the type name by its numeric label and lower-case every tag.
img_list = [
    [[item.lower() for item in tags], rest_types.index(type_name)]
    for tags, type_name in img_list
]

In [8]:
# We don't want the entire data set, and since it is 'ordered' by restaurant,
# we shuffle it. The seed is fixed so that the subsample, the train/test
# split and every figure reported further down can be reproduced.
random.seed(42)
shuffle(img_list)

In [9]:
# Use pandas to keep at most max_n_of_images pictures per restaurant type.
max_n_of_images = 9000
img_list_df = pd.DataFrame(img_list)
red_img_list = []
# Iterate over the labels defined by rest_types instead of a literal 8, so the
# cell keeps working if the list of restaurant types changes.
for rest_type in range(len(rest_types)):
    img_list_df_label = img_list_df[img_list_df[1] == rest_type][:max_n_of_images]
    # extend() appends in place instead of rebuilding the whole list each time.
    red_img_list.extend(img_list_df_label.values.tolist())
shuffle(red_img_list)

In [10]:
# Split into train and test: 10% of the images of every restaurant type go to
# img_list_test, the rest stay in red_img_list (the train set).
img_count_per_type = img_counter_per_rest_types(red_img_list)
print('Size of the entire image set per rest_type (0-7): ' + str(img_count_per_type))
n_images_test = [int(type_count*0.1) for type_count in img_count_per_type]

# Build both sets in a single pass over the data. The list being iterated is
# never modified, so no image is skipped and no O(n) remove() is needed.
test_taken_per_type = [0]*len(img_count_per_type)
img_list_train = []
img_list_test = []
for img in red_img_list:
    label = img[1]
    if test_taken_per_type[label] < n_images_test[label]:
        img_list_test.append(img)
        test_taken_per_type[label] += 1
    else:
        img_list_train.append(img)
red_img_list = img_list_train

print('Size of the train set per rest_type (0-7): ' + str(img_counter_per_rest_types(red_img_list)))
print('Size of the test set per rest_type (0-7): ' + str(img_counter_per_rest_types(img_list_test)))

Size of the entire image set per rest_type (0-7): [2311, 8955, 684, 425, 1168, 1371, 1398, 585]
Size of the train set per rest_type (0-7): [2080, 8060, 616, 383, 1052, 1234, 1259, 527]
Size of the test set per rest_type (0-7): [231, 895, 68, 42, 116, 137, 139, 58]


In [11]:
# Build the dictionary: every distinct tag (lower-cased) that appears in the
# train image set.
dic = list(set(str(tag).lower() for img in red_img_list for tag in img[0]))
print('Length of the dictionary: ' + str(len(dic)) + ' different tags')

Length of the dictionary: 23159 different tags


In [12]:
# Construct the frequency sets: each image becomes [vector, label], where the
# vector has one 0/1 entry per dictionary tag (1 = the image carries that tag).

# Map tag -> position once; calling dic.index() for every tag is a linear scan
# of the whole dictionary.
tag_index = dict((tag, position) for position, tag in enumerate(dic))

def _tag_presence_vector(tags):
    """Return the 0/1 vector over ``dic`` for one image's ``tags``.

    Tags that are not in the dictionary are ignored. That is the expected
    case for test images; train tags are all in ``dic`` because the
    dictionary was built from them.
    """
    vector = [0]*len(dic)
    for tag in tags:
        position = tag_index.get(str(tag))
        if position is not None:
            vector[position] = 1
    return vector

freq_set = [[_tag_presence_vector(img[0]), img[1]] for img in red_img_list]
test_freq_set = [[_tag_presence_vector(img[0]), img[1]] for img in img_list_test]

In [13]:
# Inverse document frequency of every dictionary tag over the train set.
doclist = [sample[0] for sample in freq_set]
n_samples = len(doclist)

# Transpose so that each entry is the column of one tag; its sum is the number
# of train images that carry that tag.
doclist_traspose = list(zip(*doclist))
numContaining = list(map(sum, doclist_traspose))

# idf = log(n_samples / document frequency), stored both as a vector and as
# the diagonal of a square matrix.
my_idf_vector = [np.log(n_samples / (float(doc_freq))) for doc_freq in numContaining]
vocab_size = len(my_idf_vector)
my_idf_matrix = np.zeros((vocab_size, vocab_size))
np.fill_diagonal(my_idf_matrix, my_idf_vector)

In [14]:
# TF-IDF
# Weight every 0/1 tag vector by the idf of each tag. Multiplying by the
# diagonal idf matrix is the same as an element-wise product with the idf
# vector, which avoids a vocabulary x vocabulary matrix product per image.
idf_weights = np.asarray(my_idf_vector)
doc_term_matrix_tfidf = []

counter = 0
total = len(doclist)
for tf_vector in doclist:
    doc_term_matrix_tfidf.append(np.asarray(tf_vector) * idf_weights)
    counter = counter + 1
    if counter % 100 == 0:
        # integer percentage of processed images
        print(str(100*counter//total) + '%')

0%
1%
1%
2%
3%
3%
4%
5%
5%
6%
7%
7%
8%
9%
9%
10%
11%
11%
12%
13%
13%
14%
15%
15%
16%
17%
17%
18%
19%
19%
20%
21%
21%
22%
23%
23%
24%
24%
25%
26%
26%
27%
28%
28%
29%
30%
30%
31%
32%
32%
33%
34%
34%
35%
36%
36%
37%
38%
38%
39%
40%
40%
41%
42%
42%
43%
44%
44%
45%
46%
46%
47%
47%
48%
49%
49%
50%
51%
51%
52%
53%
53%
54%
55%
55%
56%
57%
57%
58%
59%
59%
60%
61%
61%
62%
63%
63%
64%
65%
65%
66%
67%
67%
68%
69%
69%
70%
71%
71%
72%
72%
73%
74%
74%
75%
76%
76%
77%
78%
78%
79%
80%
80%
81%
82%
82%
83%
84%
84%
85%
86%
86%
87%
88%
88%
89%
90%
90%
91%
92%
92%
93%
94%
94%
95%
95%
96%
97%
97%
98%
99%
99%


In [15]:
# Scale every TF-IDF vector to unit L2 length, printing the integer
# percentage done after every 100 vectors.
total = len(doc_term_matrix_tfidf)
doc_term_matrix_tfidf_l2 = []
counter = 0
for counter, tf_vector in enumerate(doc_term_matrix_tfidf, 1):
    doc_term_matrix_tfidf_l2.append(l2_normalizer(tf_vector))
    if counter % 100 == 0:
        print(str(100*counter//total) + '%')

0%
1%
1%
2%
3%
3%
4%
5%
5%
6%
7%
7%
8%
9%
9%
10%
11%
11%
12%
13%
13%
14%
15%
15%
16%
17%
17%
18%
19%
19%
20%
21%
21%
22%
23%
23%
24%
24%
25%
26%
26%
27%
28%
28%
29%
30%
30%
31%
32%
32%
33%
34%
34%
35%
36%
36%
37%
38%
38%
39%
40%
40%
41%
42%
42%
43%
44%
44%
45%
46%
46%
47%
47%
48%
49%
49%
50%
51%
51%
52%
53%
53%
54%
55%
55%
56%
57%
57%
58%
59%
59%
60%
61%
61%
62%
63%
63%
64%
65%
65%
66%
67%
67%
68%
69%
69%
70%
71%
71%
72%
72%
73%
74%
74%
75%
76%
76%
77%
78%
78%
79%
80%
80%
81%
82%
82%
83%
84%
84%
85%
86%
86%
87%
88%
88%
89%
90%
90%
91%
92%
92%
93%
94%
94%
95%
95%
96%
97%
97%
98%
99%
99%


In [16]:
# TRAIN THE MODEL
# X: one L2-normalised TF-IDF row per train image; y: its restaurant-type label.
# MultinomialNB is imported with the rest of the imports at the top of the
# notebook.
# NOTE(review): the model is fitted on normalised TF-IDF features, while the
# cells further down call predict / predict_proba on raw 0/1 tag vectors --
# confirm that this mismatch is intended.
X = np.array(doc_term_matrix_tfidf_l2)
y = np.array([item[1] for item in freq_set])
clf = MultinomialNB()
clf.fit(X, y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [17]:
# ACCURACY OVER THE TRAINING SET
# NOTE(review): predictions are made on doclist (raw 0/1 tag vectors), not on
# the normalised TF-IDF matrix X the model was fitted on -- confirm intended.
a = clf.predict(doclist)
count_tot = len(freq_set)
count_match = sum(1 for predicted, item in zip(a, freq_set) if predicted == item[1])
print('Accuracy over the training set: ' + str(100*float(count_match)/float(count_tot)) + '%')

Accuracy over the training set: 71.9873775557%


In [18]:
# ACCURACY OVER THE TEST SET
# Predict once for the whole test set and derive the per-type figures from
# those predictions (each row is classified independently, so the results are
# the same as predicting one restaurant type at a time).
test_vectors = [img[0] for img in test_freq_set]
test_labels = [img[1] for img in test_freq_set]
a = clf.predict(test_vectors)

for i in range(len(rest_types)):
    # hit/miss flag for every test image whose true label is i
    hits = [predicted == label for predicted, label in zip(a, test_labels) if label == i]
    if len(hits) == 0:
        # No test image of this type: there is nothing to divide by.
        print('No test images for ' + rest_types[i] + '.')
        continue
    type_match = sum(1 for hit in hits if hit)
    print(str(100*float(type_match)/float(len(hits))) + '% of the ' + rest_types[i] + ' images were well classified.')

count_tot = len(test_labels)
count_match = sum(1 for predicted, label in zip(a, test_labels) if predicted == label)
print('Accuracy over the entire test set: ' + str(100*float(count_match)/float(count_tot)) + '%.')

28.5714285714% of the Cocina casera images were well classified.
99.6648044693% of the Tapas, pintxos y platillos images were well classified.
7.35294117647% of the Paellas y arroces images were well classified.
9.52380952381% of the Cocina rapida images were well classified.
23.275862069% of the Cocina italiana images were well classified.
41.6058394161% of the Cocina asiatica images were well classified.
28.0575539568% of the Carnes a la brasa images were well classified.
25.8620689655% of the Cocina dietetica, naturista, vegetariana y biologica images were well classified.
Accuracy over the entire test set: 65.5397390273%.


In [15]:
# GIVE ME AN INSTAGRAM PICTURE URL (WITH ENOUGH HASHTAGS) AND I WILL TELL YOU WHICH IS ITS FOOD STYLE
# THIS IS A PICTURE TAKEN IN A KFC, SO IT SHOULD SAY COCINA RAPIDA
url = 'https://www.instagram.com/p/BHqpEI7DfaG/'
try:
    source = urllib2.urlopen(url)
    try:
        htmlcode = source.read()
    finally:
        # always release the connection, even if reading fails
        source.close()
except URLError:
    # Without the page there is nothing to classify; stop here instead of
    # predicting on an undefined (or stale) vector.
    print('URL Error')
else:
    # The caption text starts 12 characters after the '"caption":' marker and
    # runs up to the next '",'.
    caption_pos = htmlcode.find('"caption":')
    if caption_pos == -1:
        # no caption in the page -> no hashtags
        hash_list = []
    else:
        start = caption_pos + 12
        end = htmlcode.find('",',start)
        hash_list = listHashtags(htmlcode[start:end])

    # 0/1 vector over the dictionary; hashtags unknown to the model are ignored
    freq_samp = [0]*len(dic)
    for tag in hash_list:
        try:
            freq_samp[dic.index(str(tag))] = 1
        except ValueError:
            pass

    if sum(freq_samp) == 0:
        print('I cannot classify this image because I do not understand its tags.')
    else:
        freq_samp = [freq_samp]
        # class probabilities are computed once and reused for every line
        caca = clf.predict_proba(freq_samp)[0]
        for i in range(len(rest_types)):
            print(rest_types[i] + ': ' + str(caca[i]))
        print('----------------------------------------------------------------------')
        print('This picture should be ' + rest_types[clf.predict(freq_samp)[0]] + '.')

Cocina casera: 0.0755525454506
Tapas, pintxos y platillos: 0.0371862154177
Paellas y arroces: 0.010996304745
Cocina rapida: 0.71780172546
Cocina italiana: 0.056856858131
Cocina asiatica: 0.0681818685556
Carnes a la brasa: 0.0251471642141
Cocina dietetica, naturista, vegetariana y biologica: 0.00827731802632
----------------------------------------------------------------------
This picture should be Cocina rapida.


In [19]:
# For every restaurant, average the predicted class probabilities of its
# images. Each entry of img_list_per_rest ends up as
# [list of tag lists, restaurant-type label, name, mean probability per type].
# Restaurants with no tagged image, or whose images contain no tag known to
# the dictionary, are left out.

# tag -> position in dic, built once instead of scanning dic for every tag
tag_index = dict((tag, position) for position, tag in enumerate(dic))
n_rest_types = len(rest_types)

img_list_per_rest = []
for rest in rest_list:
    try:
        only_img_list = [[tag.lower() for tag in img['tags']]
                         for img in rest['IMAGES'] if len(img['tags']) != 0]
        rest_type = rest_types.index(rest['ESPECIALIDAD'])
        rest_name = rest['NOMBRE']
    except KeyError:
        # a required key is missing: skip this restaurant
        continue

    suma = np.zeros(n_rest_types)
    counter = 0  # images of this restaurant with at least one known tag
    for img_tags in only_img_list:
        hash_appearing = [0]*len(dic)
        has_known_tag = False
        for tag in img_tags:
            position = tag_index.get(str(tag))
            if position is not None:
                hash_appearing[position] = 1
                has_known_tag = True
        if has_known_tag:
            suma = suma + clf.predict_proba([hash_appearing])[0]
            counter = counter + 1

    if counter == 0:
        # nothing the model could score for this restaurant
        continue
    img_list_per_rest.append([only_img_list, rest_type, rest_name, (suma/counter).tolist()])

In [20]:
# ACCURACY OF THE RESTAURANT CLASSIFICATION
# A restaurant counts as well classified when the position of the highest
# value in its probability list (rest[3]) equals its type label (rest[1]).
counter = len(img_list_per_rest)
counter_pos = 0
for rest in img_list_per_rest:
    type_probs = rest[3]
    if type_probs.index(max(type_probs)) == rest[1]:
        counter_pos += 1
print(str(100*float(counter_pos)/counter) + '% of the restaurants were well classified.')
print('Note that many images of those restaurants belong to the training set.')

55.3097345133% of the restaurants were well classified.
Note that many images of those restaurants belong to the training set.


In [18]:
# Persist the label names, the fitted classifier, the dictionary and the
# per-restaurant probabilities. Every file name except 'rest_types' carries
# max_n_of_images as a suffix.
suffix = str(max_n_of_images)
artifacts = [
    (rest_types, 'rest_types'),
    (clf, 'clf_' + suffix),
    (dic, 'dic_' + suffix),
    (img_list_per_rest, 'known_rests_probs_mean_' + suffix),
]
for artifact, artifact_name in artifacts:
    save_obj(artifact, artifact_name)