In [1]:
import os
import re
import sys
import json

In [55]:
# Load the paper metadata from disk; later cells index the records stored
# under data['cvpr'].
with open('../metadata.json', 'r') as f:
    data = json.loads(f.read())

In [56]:
# Show the loaded metadata as this cell's output.
# NOTE(review): this renders the entire dict; a slice such as
# data['cvpr'][:3] would keep the saved output short.
data

{'cvpr': [{'author': 'Jian Cheng, Cong Leng, Jiaxiang Wu, Hainan Cui, Hanqing Lu',
   'links': 'http://openaccess.thecvf.com/content_cvpr_2014/papers/Cheng_Fast_and_Accurate_2014_CVPR_paper.pdf',
   'pages': 8,
   'subject': '2013 IEEE Conference on Computer Vision and Pattern Recognition',
   'title': 'Fast and Accurate Image Matching with Cascade Hashing for 3D Reconstruction',
   'year': 2014},
  {'author': 'Wilfried Hartmann, Michal Havlena, Konrad Schindler',
   'links': 'http://openaccess.thecvf.com/content_cvpr_2014/papers/Hartmann_Predicting_Matchability_2014_CVPR_paper.pdf',
   'pages': 8,
   'subject': '2013 IEEE Conference on Computer Vision and Pattern Recognition',
   'title': 'Predicting Matchability',
   'year': 2014},
  {'author': 'Jean Ponce, Martial Hebert',
   'links': 'http://openaccess.thecvf.com/content_cvpr_2014/papers/Ponce_Trinocular_Geometry_Revisited_2014_CVPR_paper.pdf',
   'pages': 8,
   'subject': '2013 IEEE Conference on Computer Vision and Pattern Recogn

In [57]:
class Inverted_ID:
    """
        A class that constructs the inverted index for cvpr metadata.

        Every indexed field value is converted to a string, split on
        whitespace, stripped of the punctuation matched by ``_PUNCT_RE`` and
        lower-cased. Each resulting token longer than one character maps to
        the set of positions (indexes into ``dic['cvpr']``) of the records
        whose field contained it. Digit-only tokens such as years are kept.

        input:
            dic : A dictionary that stores the cvpr research papers. e.g.
                {'cvpr':
                [{'subject': '2013 IEEE Conference on Computer Vision and Pattern Recognition',
                   'links': 'http://openaccess.thecvf.com/content_cvpr_2014/papers/Cheng_Fast_and_Accurate_2014_CVPR_paper.pdf',
                   'pages': 8,
                   'title': 'Fast and Accurate Image Matching with Cascade Hashing for 3D Reconstruction',
                   'year': 2014,
                   'author': 'Jian Cheng, Cong Leng, Jiaxiang Wu, Hainan Cui, Hanqing Lu'},
                  ...
                ]
                }
            key_type :
                'all', 'subject', 'title', 'author', 'year'
                'all' indexes the subject, title and author fields; the
                year field is only indexed when key_type is 'year'.
                Any other value leaves the index empty.
    """

    # Characters removed from every token, both when indexing and when
    # searching, so the two sides can never normalise differently.
    _PUNCT_RE = re.compile(r'[/,#$.?!:"<>();&-]')

    # Record fields indexed for each key_type, in the order they are added.
    _FIELDS_BY_KEY_TYPE = {
        'all': ('subject', 'title', 'author'),
        'subject': ('subject',),
        'title': ('title',),
        'year': ('year',),
        'author': ('author',),
    }

    def __init__(self, dic, key_type = 'all'):
        self.word_lst = dic['cvpr']
        self.inverted_idx = {}
        self.key_type = key_type
        self._construct_idx()

    @classmethod
    def _normalize(cls, word):
        """
            Strip the punctuation in ``_PUNCT_RE`` from one token and
            lower-case it.
        """
        return cls._PUNCT_RE.sub('', word).lower()

    def _construct_idx(self):
        """
            Fill ``self.inverted_idx`` from the fields selected by
            ``self.key_type``; fields missing from a record are skipped.
        """
        fields = self._FIELDS_BY_KEY_TYPE.get(self.key_type, ())

        for position, record in enumerate(self.word_lst):
            for field in fields:
                if field in record:
                    self._add(record[field], position)

    def _add(self, words, index):
        """
            Register every token of ``words`` as occurring in record ``index``.
            input:
                words : A field value; it is passed through str() so that
                        non-string values such as an integer year work.
                index : Position of the record in ``self.word_lst``.
        """
        for raw in str(words).split():
            token = self._normalize(raw)

            # Empty strings and single characters are not indexed.
            if len(token) <= 1:
                continue

            self.inverted_idx.setdefault(token, set()).add(index)

    def search(self, words, vis = True):
        """
            return a index list where each record of that index contains
            at least one of the words (the per-word matches are unioned).
            input:
                words : The words to search: either one whitespace-separated
                        string, or an iterable of such strings. Any other
                        single value (e.g. an integer year) is searched as
                        str(value).
                vis : If True: print the records out.

            return:
                List of index, sorted in ascending order.
        """
        # A bare string or a non-iterable value is treated as one query item.
        if isinstance(words, str) or not hasattr(words, '__iter__'):
            words = [words]

        matches = set()
        for item in words:
            for raw in str(item).split():
                matches.update(self.inverted_idx.get(self._normalize(raw), ()))

        # Sort so that both the printed records and the returned list have a
        # deterministic order.
        result_lst = sorted(matches)

        if vis:
            for ele in result_lst:
                print(self.word_lst[ele])

        return result_lst

    def get_inverted_idx(self):
        """
            return the underlying index dictionary itself (not a copy), so
            changes made to it by the caller affect this object.
        """
        return self.inverted_idx

In [58]:
# Build one inverted index per searchable field, plus the combined 'all' one.
(inverted_all,
 inverted_subject,
 inverted_author,
 inverted_title,
 inverted_year) = (
    Inverted_ID(data, key_type=field)
    for field in ('all', 'subject', 'author', 'title', 'year')
)

In [59]:
# Pull the underlying index dictionary out of each Inverted_ID object.
# NOTE(review): four of these names are rebound from Inverted_ID instances to
# plain dicts, so re-running this cell without re-running the previous one
# fails (a dict has no get_inverted_idx).
(inverted_index_all,
 inverted_subject,
 inverted_author,
 inverted_title,
 inverted_year) = (
    index.get_inverted_idx()
    for index in (inverted_all,
                  inverted_subject,
                  inverted_author,
                  inverted_title,
                  inverted_year)
)

In [60]:
def func(dic):
    """
        Replace every value of ``dic`` with a list built from it, in place.

        input:
            dic : A dictionary whose values are iterables (this notebook
                  passes the inverted indexes, whose values are sets).
        return:
            The same dictionary object, now holding lists.
    """
    # Re-assigning existing keys does not change the dict's size, so it is
    # safe to do while iterating over its items.
    for key, members in dic.items():
        dic[key] = list(members)
    return dic

In [61]:
# Turn every set of record positions into a list (json cannot serialise sets).
# NOTE(review): func works in place and get_inverted_idx returns the live
# dict, so the indexes held inside the Inverted_ID objects now contain lists too.
(inverted_index_all,
 inverted_subject,
 inverted_author,
 inverted_title,
 inverted_year) = map(func, (inverted_index_all,
                             inverted_subject,
                             inverted_author,
                             inverted_title,
                             inverted_year))

In [53]:
# Attach each index directly as a top-level key of the metadata dict.
# NOTE(review): this cell's execution count (53) is older than the cell that
# reloads `data` (55), and the same indexes are stored again under
# data['indexes'] in the next cell - confirm whether this cell is still wanted.
data.update({
    'all': inverted_index_all,
    'subject': inverted_subject,
    'author': inverted_author,
    'title': inverted_title,
    'year': inverted_year,
})

In [62]:
# Group the five indexes under a single 'indexes' entry.
# NOTE(review): the subject index is stored under the key 'conference' here -
# confirm that consumers of the JSON expect that name rather than 'subject'.
data['indexes'] = {
    'all': inverted_index_all,
    'conference': inverted_subject,
    'author': inverted_author,
    'title': inverted_title,
    'year': inverted_year,
}

In [63]:
# Wrap the metadata and its indexes under a single 'root' key and write the
# result out as JSON.
root = {'root': data}
with open('../metadata_sep.json', 'w+') as f:
    json.dump(root, f)