In [1]:
import pandas as pd
from jellyfish import metaphone,soundex,nysiis,match_rating_codex
import json

In [2]:
title_and_class = pd.read_csv("data/title_description.csv")

In [3]:
title_and_class = title_and_class.drop(columns=['description','image_name'])

In [4]:
title_and_class['link']=title_and_class['link'].str.replace('/','').str.lower()
title_and_class['title'] = title_and_class['title'].str.replace('_',' ').str.lower()
title_and_class.rename(columns={'title': 'title', 'link': 'class'}, inplace=True)

In [5]:
title_and_class

Unnamed: 0,title,class
0,y u no,y-u-no
1,willy wonka,willy-wonka
2,the most interesting man in the world,the-most-interesting-man-in-the-world
3,futurama fry,futurama-fry
4,success kid,success-kid
5,one does not simply,one-does-not-simply
6,bad luck brian,bad-luck-brian
7,first world problems,first-world-problems
8,philosoraptor,philosoraptor
9,grumpy cat,grumpy-cat


In [6]:
def closure_finger_print_cluster(finger_print_algo,dict_):
    def make_finger_print_cluster(row):
#         print(row)
        title = row['title']
        label = row['class']
        print(title)
        fingerprint = finger_print_algo(title)
        if fingerprint in dict_.keys():
            dict_[fingerprint].append(label)
        else:
            dict_[fingerprint]=[label]
    return make_finger_print_cluster

In [7]:
metaphone_figerprint_cluster = {}
soundex_figerprint_cluster = {}
nysiis_figerprint_cluster = {}
match_rating_codex_figerprint_cluster = {}

In [8]:
%%capture
title_and_class.apply(closure_finger_print_cluster(metaphone,metaphone_figerprint_cluster),axis=1)
title_and_class.apply(closure_finger_print_cluster(soundex,soundex_figerprint_cluster),axis=1)
title_and_class.apply(closure_finger_print_cluster(nysiis,nysiis_figerprint_cluster),axis=1)
title_and_class.apply(closure_finger_print_cluster(match_rating_codex,match_rating_codex_figerprint_cluster),axis=1)

In [9]:
print(len(metaphone_figerprint_cluster.keys()))
print(len(soundex_figerprint_cluster.keys()))
print(len(nysiis_figerprint_cluster.keys()))
print(len(match_rating_codex_figerprint_cluster.keys()))

502
388
335
495


In [10]:
soundex_figerprint_cluster

{'Y500': ['y-u-no', 'yu-no', 'y-you-no'],
 'W452': ['willy-wonka'],
 'T523': ['the-most-interesting-man-in-the-world'],
 'F365': ['futurama-fry'],
 'S222': ['success-kid',
  'so-i-guess-you-could-say-things-are-getting-pretty-serious'],
 'O532': ['one-does-not-simply', 'one-does-not-simply-harry-s'],
 'B342': ['bad-luck-brian'],
 'F623': ['first-world-problems', 'first-day-on-the-internet-kid'],
 'P426': ['philosoraptor'],
 'G651': ['grumpy-cat',
  'grumpy-cat-good',
  'grumpy-cat-santa-hat',
  'grumpy-cat-2',
  'grumpy-cat-5'],
 'W536': ['winter-is-coming'],
 'F616': ['forever-alone', 'forever-alone-date-myself-fail-life'],
 'G322': ['good-guy-greg', 'good-guy-gregs-dog', 'good-guy-jesus'],
 'S512': ['scumbag-steve', 'scumbag-god', 'simpsons-homer', 'scumbag-nerd'],
 'W313': ['what-if-i-told-you',
  'what-if-i-told-you-matrix-morpheus',
  'what-if-i-told-you-matri'],
 'C521': ['conspiracy-keanu'],
 'K653': ['kermit-the-frog-drinking-tea', 'kermit-the-frog'],
 'Y320': ['yo-dawg'],
 'A4

## SIFT now

In [None]:
cluster_dicts={} 

In [147]:
import cv2
from os import listdir
import numpy as np

In [173]:
err = []


In [174]:
def sift_sim(path_a, path_b):    
    orb = cv2.ORB_create()
#     print(path_a,path_b)  
    # get the images
    img_a = cv2.imread(path_a)
    img_b = cv2.imread(path_b)

    # find the keypoints and descriptors with SIFT
    kp_a, desc_a = orb.detectAndCompute(img_a, None)
    kp_b, desc_b = orb.detectAndCompute(img_b, None)

    # initialize the bruteforce matcher
    bf = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)

    # match.distance is a float between {0:100} - lower means more similar
    try:
        matches = bf.match(desc_a, desc_b)
    except:
        err.append((path_a,path_b))
        matches=[]

    similar_regions = [i for i in matches if i.distance < 70]
    if len(matches) == 0:
        return 0
    return len(similar_regions) / len(matches)

In [175]:
image_sim_clusters = []

In [176]:
list_of_all_images = listdir('data/memes/')
list_of_all_images = ['data/memes/'+x for x in list_of_all_images]

In [177]:
image_sim_clusters.append([list_of_all_images[0]])

In [178]:
def get_sim_score_of_cluster(img,cluster):
    scores = [sift_sim(img,img1) for img1 in cluster]
    return sum(scores)/len(scores)

In [179]:
threshold = 0.825
l = len(list_of_all_images[1:])

In [180]:
for i,image in enumerate(list_of_all_images[1:]):
    print(i,l)
    sim_score_clusters = np.array([get_sim_score_of_cluster(image,cluster) for cluster in image_sim_clusters])
    index_above_threshold = np.argwhere(sim_score_clusters>=threshold)
    if len(index_above_threshold)==0:
        image_sim_clusters.append([image])
    else:
        image_sim_clusters[np.argmax(sim_score_clusters)].append(image)

0 508
1 508
2 508
3 508
4 508
5 508
6 508
7 508
8 508
9 508
10 508
11 508
12 508
13 508
14 508
15 508
16 508
17 508
18 508
19 508
20 508
21 508
22 508
23 508
24 508
25 508
26 508
27 508
28 508
29 508
30 508
31 508
32 508
33 508
34 508
35 508
36 508
37 508
38 508
39 508
40 508
41 508
42 508
43 508
44 508
45 508
46 508
47 508
48 508
49 508
50 508
51 508
52 508
53 508
54 508
55 508
56 508
57 508
58 508
59 508
60 508
61 508
62 508
63 508
64 508
65 508
66 508
67 508
68 508
69 508
70 508
71 508
72 508
73 508
74 508
75 508
76 508
77 508
78 508
79 508
80 508
81 508
82 508
83 508
84 508
85 508
86 508
87 508
88 508
89 508
90 508
91 508
92 508
93 508
94 508
95 508
96 508
97 508
98 508
99 508
100 508
101 508
102 508
103 508
104 508
105 508
106 508
107 508
108 508
109 508
110 508
111 508
112 508
113 508
114 508
115 508
116 508
117 508
118 508
119 508
120 508
121 508
122 508
123 508
124 508
125 508
126 508
127 508
128 508
129 508
130 508
131 508
132 508
133 508
134 508
135 508
136 508
137 508
138 50

In [169]:
len(image_sim_clusters)

389

In [170]:
cluster_dicts[threshold] = image_sim_clusters

In [171]:
cluster_dicts

{0.8: [['data/memes/0.jpeg',
   'data/memes/105.jpeg',
   'data/memes/402.jpeg',
   'data/memes/438.jpeg'],
  ['data/memes/1.jpeg'],
  ['data/memes/10.jpeg',
   'data/memes/27.jpeg',
   'data/memes/328.jpeg',
   'data/memes/335.jpeg'],
  ['data/memes/100.jpeg',
   'data/memes/59.jpeg',
   'data/memes/251.jpeg',
   'data/memes/267.jpeg'],
  ['data/memes/101.jpeg', 'data/memes/498.jpeg'],
  ['data/memes/102.jpeg', 'data/memes/55.jpeg', 'data/memes/206.jpeg'],
  ['data/memes/103.jpeg'],
  ['data/memes/104.jpeg'],
  ['data/memes/106.jpeg'],
  ['data/memes/107.jpeg',
   'data/memes/127.jpeg',
   'data/memes/257.jpeg',
   'data/memes/47.jpeg'],
  ['data/memes/108.jpeg'],
  ['data/memes/109.jpeg',
   'data/memes/285.jpeg',
   'data/memes/325.jpeg',
   'data/memes/403.jpeg'],
  ['data/memes/11.jpeg', 'data/memes/393.jpeg'],
  ['data/memes/110.jpeg'],
  ['data/memes/111.jpeg',
   'data/memes/112.jpeg',
   'data/memes/52.jpeg',
   'data/memes/125.jpeg',
   'data/memes/283.jpeg'],
  ['data/memes/

In [172]:
err

[]