In [1]:
import os
import re
import sys
import json
import requests
from pyspark import SparkContext


## Load metadata

In [2]:
# Load the CVPR paper metadata that every later cell works on.
# JSON is UTF-8 by specification, so the encoding is stated explicitly instead of
# relying on the platform default (which is not UTF-8 on every system).
with open('../metadata.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

## Define Word_Frequency and Inverted_ID

In [40]:
class Word_Frequency:
    '''Calculate word frequencies for the cvpr metadata using Spark.

    data is the metadata dictionary; the records are read from data['cvpr'],
    a list of dictionaries such as

        {'subject': '2013 IEEE Conference on Computer Vision and Pattern Recognition',
         'links': 'http://openaccess.thecvf.com/content_cvpr_2014/papers/Hartmann_Predicting_Matchability_2014_CVPR_paper.pdf',
         'pages': 8,
         'title': 'Predicting Matchability',
         'year': 2014,
         'author': 'Wilfried Hartmann, Michal Havlena, Konrad Schindler'}
    '''
    def __init__(self, data):
        self.data = data
        # getOrCreate() reuses a context that is already running (e.g. when the
        # cell is re-run) instead of failing the way a second SparkContext() does.
        self.sc = SparkContext.getOrCreate()
        self._sc_stopped = False

    def _get_category_list(self, category = 'title'):
        '''Return the value stored under `category` for every record that has it.'''
        return [item[category] for item in self.data['cvpr'] if category in item]

    def get_wc(self, category):
        '''
            Get the word count for a specific category in the metadata.

            Every value is split on whitespace; each token has the characters
            /,#$.?!:"<>();&- removed and is lower-cased; tokens that end up
            empty are dropped. Values that are not strings (e.g. 'year') are
            converted with str() first.

            argv:
                category : The record key whose values are counted, e.g. 'title'.

            return:
                A list of 'word:count' strings in no particular order, or None
                when an error occurred (the error message is printed).

            The Spark context is stopped after every call; a later call
            obtains a fresh one.

            e.g.
                wf = Word_Frequency(data)
                title_wc_list = wf.get_wc(category = 'title')
                # title_wc_list = ['fast:10', 'image:20', 'cnn:200', ...]
        '''
        try:
            # A previous call stopped the context, so get a usable one again.
            if getattr(self, '_sc_stopped', False):
                self.sc = SparkContext.getOrCreate()
                self._sc_stopped = False

            category_list = self._get_category_list(category)
            parallel_data = self.sc.parallelize(category_list, 5)
            result = parallel_data.flatMap(lambda s : str(s).split()).\
                                      map(lambda w : re.sub(r'[/,#$.?!:"<>();&-]', '', w)).\
                                      map(lambda w : w.lower()).\
                                      map(lambda w : (w, 1)).\
                                      reduceByKey(lambda a, b : a + b).filter(lambda U : len(U[0]) >= 1).\
                                      map(lambda U : str(U[0])+':'+str(U[1])).collect()
            return result
        except Exception as e:
            # Report the actual error; the caller receives None.
            print(str(e))
            return None
        finally:
            self.sc.stop()
            self._sc_stopped = True

In [38]:
class Inverted_ID:
    """
        A class that constructs the inverted index for cvpr metadata.

        The index maps each normalised word to the set of positions (in
        dic['cvpr']) of the records that contain it.

        input:
            dic : A dictionary that stores the cvpr research papers. e.g.
                {'cvpr':
                [{'subject': '2013 IEEE Conference on Computer Vision and Pattern Recognition',
                   'links': 'http://openaccess.thecvf.com/content_cvpr_2014/papers/Hartmann_Predicting_Matchability_2014_CVPR_paper.pdf',
                   'pages': 8,
                   'title': 'Predicting Matchability',
                   'year': 2014,
                   'author': 'Wilfried Hartmann, Michal Havlena, Konrad Schindler'}...
                ]
                }
            key_type :
                'all', 'subject', 'title', 'author', 'year'
                'all' indexes subject, title and author; 'year' is indexed
                only when it is requested explicitly.
    """

    # Characters removed from a token before it is indexed or looked up.
    _PUNCTUATION = re.compile(r'[/,#$.?!:"<>();&-]')

    # Record fields that can be indexed, in the order they are processed.
    _FIELDS = ('subject', 'title', 'year', 'author')

    def __init__(self, dic, key_type = 'all'):
        self.word_lst = dic['cvpr']
        self.inverted_idx = {}
        self.key_type = key_type
        self._construct_idx()

    @classmethod
    def _normalize(cls, word):
        """Strip the punctuation characters from `word` and lower-case it."""
        return cls._PUNCTUATION.sub('', word).lower()

    def _construct_idx(self):
        """Fill self.inverted_idx from the fields selected by self.key_type."""
        for i, cur_dict in enumerate(self.word_lst):
            for field in self._FIELDS:
                # 'all' deliberately leaves 'year' out; every other field is
                # taken either by name or through 'all'.
                selected = self.key_type == field or \
                           (self.key_type == 'all' and field != 'year')
                if selected and field in cur_dict:
                    self._add(cur_dict[field], i)

    def _add(self, words, index):
        """Register every word of `words` (converted with str()) under `index`."""
        for word in str(words).split():
            word = self._normalize(word)

            # Empty and single-character tokens are not indexed.
            if len(word) <= 1:
                continue

            self.inverted_idx.setdefault(word, set()).add(index)

    def search(self, words, vis = True):
        """
            Return the indexes of all records that contain at least one of the words.
            input:
                words : The words to search for, either as one whitespace
                        separated string or as a list of strings.
                vis : If True: print the matching records out.

            return:
                List of index (in no particular order).
        """
        if hasattr(words, 'split'):
            tokens = words.split()
        else:
            # A list of words (or phrases): split every entry on its own.
            tokens = [token for entry in words for token in str(entry).split()]

        result_lst = set()
        for token in tokens:
            result_lst.update(self.inverted_idx.get(self._normalize(token), ()))

        if vis:
            for ele in result_lst:
                print(self.word_lst[ele])

        return list(result_lst)

    def get_inverted_idx(self):
        """Return the index dictionary itself (not a copy)."""
        return self.inverted_idx


## Calculate word frequency and inverted index

In [43]:
# Count the words of the paper titles; the bare name on the last line makes
# the notebook display the result.
category = 'title'
wf = Word_Frequency(data)
word_freq = wf.get_wc(category=category)
word_freq

['distractors:1',
 'mad:1',
 'poseguided:2',
 'head:6',
 '250:1',
 'nisp:1',
 'adjacency:1',
 'separation:9',
 'crafting:1',
 'motions:1',
 'cost:7',
 'mesh:7',
 'shapes:18',
 'bottomtopbottom:1',
 'realworld:6',
 'annotating:1',
 'streams:2',
 'count:2',
 'decision:8',
 'shoe:1',
 'recipes:1',
 'interpretation:2',
 'multicontext:3',
 'dnns:1',
 'right:1',
 'investigate:1',
 'diagnosis:4',
 'simulate:1',
 'quasirandom:1',
 'euclideantoriemannian:1',
 'viewpoints:6',
 'sold:1',
 'role:3',
 'translucent:4',
 'improvements:1',
 'cocktail:1',
 'reconstruction:95',
 'interpretability:3',
 'imagenet:1',
 'score:3',
 'disturblabel:1',
 'interspecies:1',
 'eco:1',
 'egocentric:15',
 'conquer:1',
 'benchmark:19',
 'target:5',
 'lifestyle:1',
 'splitting:3',
 'appearance:17',
 '+:1',
 'observation:1',
 'range:7',
 'choose:1',
 'hypercolumns:1',
 'streak:2',
 'labeling:25',
 'cameras:24',
 'hierarchicallyconstrained:1',
 'deepshape:1',
 'induced:1',
 'correlational:1',
 'map:19',
 'me:9',
 'perso

In [19]:
# Build one inverted index per key type, in the same order as the names on the left.
(inverted_all,
 inverted_subject,
 inverted_author,
 inverted_title,
 inverted_year) = (
    Inverted_ID(data, key_type=key_type)
    for key_type in ('all', 'subject', 'author', 'title', 'year')
)

In [20]:
# Pull the raw word -> record-index dictionaries out of the Inverted_ID objects.
# Apart from inverted_all, each name is rebound from the object to its
# dictionary, so this cell cannot be run twice without rebuilding the objects.
(inverted_index_all,
 inverted_subject,
 inverted_author,
 inverted_title,
 inverted_year) = (
    inverted_all.get_inverted_idx(),
    inverted_subject.get_inverted_idx(),
    inverted_author.get_inverted_idx(),
    inverted_title.get_inverted_idx(),
    inverted_year.get_inverted_idx(),
)

In [21]:
def func(dic):
    """Return a new dictionary with every value of `dic` converted to a list.

    Used to turn the sets of an inverted index into lists so that the index
    can be serialised with json. `dic` itself is left untouched: it is the
    dictionary returned by Inverted_ID.get_inverted_idx(), i.e. the live index
    of that object, and replacing its sets in place would corrupt it.
    """
    return {key: list(values) for key, values in dic.items()}

In [22]:
# Run every index through func() so that its values are lists.
(inverted_index_all,
 inverted_subject,
 inverted_author,
 inverted_title,
 inverted_year) = [
    func(index)
    for index in (inverted_index_all,
                  inverted_subject,
                  inverted_author,
                  inverted_title,
                  inverted_year)
]

In [23]:
# Attach the indexes and the word counts to the metadata dictionary.
# Note that the subject index is stored under the key 'conference'.
data.update({
    'indexes': {
        'all': inverted_index_all,
        'conference': inverted_subject,
        'author': inverted_author,
        'title': inverted_title,
        'year': inverted_year,
    },
    'word_priority': word_freq,
})

In [25]:
# Save the enriched metadata, wrapped in a single top-level 'root' key.
root = {'root' : data}
with open('../metadata_word_priority_new.json', 'w+') as f:
    json.dump(root, f)

## Upload data to Firebase

In [26]:
# Base URL the data is uploaded to.
url = 'https://inf551-project-aa3f8.firebaseio.com/'
# The metadata wrapped under a 'root' key.
upload_data = dict(root=data)

In [33]:
def data_upload(url, data):
    """PUT `data` as JSON to `root.json` below `url` and print the JSON reply.

    argv:
        url : Base URL of the database, with or without a trailing '/'.
        data : A JSON-serialisable object.
    """
    payload = json.dumps(data)
    # URLs are always joined with '/'; os.path.join would use the separator of
    # the local file system instead.
    endpoint = url.rstrip('/') + '/root.json'
    res1 = requests.put(endpoint, payload)
    print(res1.json())

In [36]:
# Upload the metadata dictionary itself; data_upload() writes it to root.json.
data_upload(url=url, data=data)

{'indexes': {'all': {'mad': [1313], 'manjunath': [1619, 885], 'modality': [1225, 1774], 'katircioglu': [3434], 'jiankang': [3295], 'crossdomain': [1792, 2641, 3478, 1678, 3080], 'zehuan': [2164], 'xilin': [3109, 2717, 554, 2187, 1356, 237, 2799, 211, 3284, 757, 1653, 1049, 220, 2365, 1502], 'harvesting': [2511], 'descriptions': [1379, 873, 1138, 1142, 2392, 58, 2300], 'polarimetric': [2961, 1943], 'epameinondas': [1123, 228, 1582, 1753, 1785, 1497, 2491], 'shugao': [972, 1078, 1343], 'oleksandr': [2568], 'absolute': [692, 1494, 62, 783], 'deflection': [3448], 'acquiring': [2151], 'objectspecific': [1508], 'saccade': [2891], 'dusk': [1721], 'loong': [2857], 'xu': [3075, 3076, 1033, 2057, 12, 20, 2586, 2588, 3105, 3108, 551, 2090, 555, 3114, 2094, 1590, 2617, 1596, 3139, 2632, 1100, 1622, 1625, 94, 2144, 609, 3169, 612, 105, 620, 628, 124, 2176, 3206, 650, 2698, 1176, 2713, 158, 3235, 1700, 3240, 1707, 178, 691, 3251, 2233, 2234, 2748, 192, 2242, 2243, 731, 2781, 1246, 1758, 3294, 1762, 