In [14]:
import os
import re
import sys
import json

In [150]:
with open('../metadata.json', 'r') as f:
    data = json.load(f)

In [151]:
data

{'cvpr': [{'subject': '2013 IEEE Conference on Computer Vision and Pattern Recognition',
   'links': 'http://openaccess.thecvf.com/content_cvpr_2014/papers/Cheng_Fast_and_Accurate_2014_CVPR_paper.pdf',
   'pages': 8,
   'title': 'Fast and Accurate Image Matching with Cascade Hashing for 3D Reconstruction',
   'year': 2014,
   'author': 'Jian Cheng, Cong Leng, Jiaxiang Wu, Hainan Cui, Hanqing Lu'},
  {'subject': '2013 IEEE Conference on Computer Vision and Pattern Recognition',
   'links': 'http://openaccess.thecvf.com/content_cvpr_2014/papers/Hartmann_Predicting_Matchability_2014_CVPR_paper.pdf',
   'pages': 8,
   'title': 'Predicting Matchability',
   'year': 2014,
   'author': 'Wilfried Hartmann, Michal Havlena, Konrad Schindler'},
  {'subject': '2013 IEEE Conference on Computer Vision and Pattern Recognition',
   'links': 'http://openaccess.thecvf.com/content_cvpr_2014/papers/Ponce_Trinocular_Geometry_Revisited_2014_CVPR_paper.pdf',
   'pages': 8,
   'title': 'Trinocular Geometry Re

In [152]:
class Inverted_ID:
    """
        A class that constructs the inverted index for cvpr metadata.
        input:
            dic : A dictionary that stores the cvpr research papers. e.g.
                {'cvpr': 
                [{'subject': '2013 IEEE Conference on Computer Vision and Pattern Recognition',
                   'links': 'http://openaccess.thecvf.com/content_cvpr_2014/papers/Cheng_Fast_and_Accurate_2014_CVPR_paper.pdf',
                   'pages': 8,
                   'title': 'Fast and Accurate Image Matching with Cascade Hashing for 3D Reconstruction',
                   'year': 2014,
                   'author': 'Jian Cheng, Cong Leng, Jiaxiang Wu, Hainan Cui, Hanqing Lu'},
                  {'subject': '2013 IEEE Conference on Computer Vision and Pattern Recognition',
                   'links': 'http://openaccess.thecvf.com/content_cvpr_2014/papers/Hartmann_Predicting_Matchability_2014_CVPR_paper.pdf',
                   'pages': 8,
                   'title': 'Predicting Matchability',
                   'year': 2014,
                   'author': 'Wilfried Hartmann, Michal Havlena, Konrad Schindler'}...
                ]
                }
    """
    
    def __init__(self, dic):
        self.word_lst = dic['cvpr']
        self.inverted_idx = {}
        self._construct_idx()

        
        
    def _construct_idx(self):
        
        for i in range(len(self.word_lst)):
            cur_dict = self.word_lst[i]
            
            if 'subject' in cur_dict:
                self._add(cur_dict['subject'], i)
            
            #if 'pages' in cur_dict:
            #    self._add(cur_dict['pages'], i)
            
            #if 'year' in cur_dict:
            #    self._add(cur_dict['year'], i)
            
            if 'author' in cur_dict:
                self._add(cur_dict['author'], i)
                
                
    
    def _add(self, words, index):
        for word in str(words).split():
            word = re.sub(r'[/,.?!"<>();&-]', '', word)
            word = word.lower()
            
            if word.isdigit():
                continue
                        
            if word not in self.inverted_idx:
                self.inverted_idx[word] = set()
            self.inverted_idx[word].add(index)
            
    
    def search(self, words, vis = True):
        """
            return a index list where each record of that index contains the words.
            input:
                words : A word list to search.
                vis : If True: print the records out.
            
            return:
                List of index.
        """
        result_lst = set()
        words = words.split()
        
        for word in words:
            word = re.sub(r'[/,.?!"<>();&-]', '', word)
            word = word.lower()
            if word in self.inverted_idx:
                for idx in self.inverted_idx[word]:
                    result_lst.add(idx)
        
        #print(result_lst)
        if vis:
            for ele in result_lst:
                print(self.word_lst[ele])
        
        return list(result_lst)
    
    def get_inverted_idx(self):
        return self.inverted_idx

In [153]:
invertedID = Inverted_ID(data)

In [154]:
inverted_index = invertedID.get_inverted_idx()

In [155]:
for key in inverted_index.keys():
    inverted_index[key] = list(inverted_index[key])
inverted_index

{'ieee': [0,
  1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  33,
  34,
  35,
  36,
  37,
  38,
  39,
  40,
  41,
  42,
  43,
  44,
  45,
  46,
  47,
  48,
  49,
  50,
  51,
  52,
  53,
  54,
  55,
  56,
  57,
  58,
  59,
  60,
  61,
  62,
  63,
  64,
  65,
  66,
  67,
  68,
  69,
  70,
  71,
  72,
  73,
  74,
  75,
  76,
  77,
  78,
  79,
  80,
  81,
  82,
  83,
  84,
  85,
  86,
  87,
  88,
  89,
  90,
  91,
  92,
  93,
  94,
  95,
  96,
  97,
  98,
  99,
  100,
  101,
  102,
  103,
  104,
  105,
  106,
  107,
  108,
  109,
  110,
  111,
  112,
  113,
  114,
  115,
  116,
  117,
  118,
  119,
  120,
  121,
  122,
  123,
  124,
  125,
  126,
  127,
  128,
  129,
  130,
  131,
  132,
  133,
  134,
  135,
  136,
  137,
  138,
  139,
  140,
  141,
  142,
  143,
  144,
  145,
  146,
  147,
  148,
  149,
  150,
  151,
  152,
  153,
  154,
  155,
  156,
  1

In [156]:
data['invertedIndexes'] = inverted_index

In [157]:
with open('../new_metadata.json', 'w+') as f:
    root = {'root' : data}
    json.dump(root, f)

In [13]:
'test.,;/.,'.replace('./;','')

'test.,;/.,'

In [16]:
re.sub(r'[/,.?!"<>();&-]', "", 'test.,;/.,')

'test'

In [26]:
ns = {1,2,3}

In [28]:
for ele in ns:
    print(ele)

1
2
3


In [11]:
'TeSt'.lower()

'test'

In [9]:
'test,'.replace(',', '')

'test'

In [8]:
'2013 IEEE Conference on Computer Vision and Pattern Recognition'.split()

['2013',
 'IEEE',
 'Conference',
 'on',
 'Computer',
 'Vision',
 'and',
 'Pattern',
 'Recognition']