In [1]:
import os
import json 
import glob
from pathlib import Path
from sqlitedict import SqliteDict

In [2]:
n = 2
large_encoder = True

l = 'Large' if large_encoder else 'Small'
project_dir = Path(f'TableTest{l}{n:03d}/')
os.makedirs(project_dir, exist_ok=True)

In [3]:
dicto_tables = SqliteDict(f'{project_dir}/dictOtables.sqlite')

In [5]:
# Prepare tsv:   int_id     sent

with open('Data/tableTest.jsonl', 'r') as tt:
    for line in tt:
        table = json.loads(line)
        id_sent_pairs = list()
        dicto_tables[table['i']] = table['table_record']   
        
        for j, sent in enumerate(table['context_vals']):
            if sent: 
                sent = str(sent).replace('\n', ' ').replace('\t', ' ')
                while '  ' in sent:
                    sent = sent.replace('  ', ' ')                

                int_id = int(table['i']) * 100 + j
                id_sent_pairs.append(f'{int_id}\t{sent}\n')

        with open(f'{project_dir}/tableTest.tsv', 'a') as tt2:
            for pair in id_sent_pairs:
                tt2.write(pair)


In [6]:
tsv_files = glob.glob(f'{project_dir}/*.tsv')
tsv_files

['TableTestLarge002/tableTest2.tsv']

In [8]:
from SimSent.indexer.index_builder import IndexBuilder
from SimSent.vectorizer.sentence_vectorizer import SentenceVectorizer

W0514 11:35:25.424232 4790859200 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


In [9]:
sv = SentenceVectorizer(large=large_encoder)

Loading model: /Users/lukasferrer/Documents/SimSent/SimSent/vectorizer/model/96e8f1d3d4d90ce86b2db128249eb8143a91db73/
Initializing TF Session...


In [10]:
ibdr = IndexBuilder(project_dir, sentence_vectorizer=sv)

In [11]:
for tsv in tsv_files:
    ibdr.tsv_to_index(tsv)

In [12]:
from SimSent.server.service_handler import QueryHandler

from SimSent.vectorizer.sentence_vectorizer import DockerVectorizer
from SimSent.indexer.deploy_handler import RangeShards

In [13]:
# Ensure correct docker is running the vectorizer container in background 
dv = DockerVectorizer(large=large_encoder)
rs = RangeShards(project_dir, nprobe=8, get_nested=True)

In [14]:
qp = QueryHandler(dv, rs, project_dir=project_dir, get_nested=True)

In [15]:
# Searches by keys (useful for any type of category/range-search)
keys = list(rs.shards.keys())
keys

['tableTest2']

In [106]:
query01 = 'Tesla has revolutionized the automotive industry'
qhits01 = qp.query_corpus(query01, keys, k=5, radius=3.0)

  Query vectorized in --- 0.0371s
  Index searched in ----- 0.0001s
  Payload formatted in -- 0.0055s



In [107]:
for Key, Vals in qhits01.items():
    print(f'\n{Key}:')
    for v in Vals:
        print(f'   * ID:    {v[1]}\n'
              f'   * Score: {v[0]}\n'
              f'   * Text:  {v[2]}\n')


tableTest2:
   * ID:    110303
   * Score: 0.9472362399101257
   * Text:  The Toyota Camry TS-01 is a concept car based on the Toyota MCV30 Camry. It was designed and built mainly using resources of Toyota Australia, and was unveiled at the 2005 Melbourne International Motor Show in Australia.

   * ID:    174003
   * Score: 0.9918836355209351
   * Text:  The Electric Aircraft Corporation ElectraFlyer-C is an American experimental electric aircraft that was designed by Randall Fishman and produced by his company Electric Aircraft Corporation in 2008. The aircraft is a converted Monnett Moni motor glider intended to test electric propulsion technology for the future Electric Aircraft Corporation ElectraFlyer-X.[1][2][3][4][5][6]

   * ID:    110300
   * Score: 0.9975507259368896
   * Text:  Toyota Camry TS-01

   * ID:    110301
   * Score: 0.9975507259368896
   * Text:  Toyota Camry TS-01

   * ID:    90800
   * Score: 1.0352352857589722
   * Text:  Ferrari F2003-GA


top_hits:
   * I

In [108]:
# Check actual table with Table_ID:
# Table_ID = str(divmod(ID, 100)[0])

dicto_tables['110']

{'Mill location': 'Rolvenden , Kent',
 'Grid reference': 'TQ 838 315',
 'Coordinates': '(Show location on an interactive map) 51°3′13″N  0°37′17″E Coordinates : (Show location on an interactive map) 51°3′13″N  0°37′17″E',
 'Year built': '1772',
 'Purpose': 'Corn milling',
 'Type': 'Post mill',
 'Roundhouse storeys': 'Single storey',
 'No. of sails': 'Four',
 'Type of sails': 'Last worked on two Common sails and two Spring sails. Now carries four Common sails',
 'Windshaft': 'Wood with cast iron poll end.',
 'Winding': 'Tailpole',
 'No. of pairs of millstones': 'Two pairs, arranged Head and Tail',
 'Other information': 'Mill may be the one that was standing in 1596.'}

In [109]:
dicto_tables['1740']

{'Role': 'Experimental  electric aircraft',
 'National origin': 'United States',
 'Manufacturer': 'Electric Aircraft Corporation',
 'Designer': 'Randall Fishman',
 'First flight': '2008',
 'Introduction': '2008',
 'Status': 'Developmental prototype only',
 'Produced': '2008',
 'Number built': 'one',
 'Developed from': 'Electric Aircraft Corporation ElectraFlyer Trike and Monnett Moni'}

In [110]:
dicto_tables['908']

{'Races': '12', 'Wins': '7', 'Podiums': '13', 'Poles': '5', 'F.Laps': '5'}

In [145]:
# Try finding this table

dicto_tables['15']

{'Born': 'Akbar Hassani Rad April 27, 1945 (age\xa073) Tehran , Iran',
 'Alma\xa0mater': 'University of Tehran Cardiff University',
 'Occupation': 'Actor',
 'Years\xa0active': '1966–present'}

In [146]:
query02 = 'Iranian Actor from Tehran'
qhits02 = qp.query_corpus(query02, keys, k=5, radius=3.0)

for Key, Vals in qhits02.items():
    print(f'\n{Key}:')
    for v in Vals:
        print(f'   * ID:    {v[1]}\n'
              f'   * Score: {v[0]}\n'
              f'   * Text:  {v[2]}\n')
    break

  Query vectorized in --- 0.0342s
  Index searched in ----- 0.0031s
  Payload formatted in -- 0.0053s


tableTest2:
   * ID:    230701
   * Score: 0.5689262747764587
   * Text:  24th Prime Minister of Iran

   * ID:    118802
   * Score: 0.5867224931716919
   * Text:  Ali Abbas (footballer)

   * ID:    230704
   * Score: 0.6341533064842224
   * Text:  Member of the Parliament of Iran

   * ID:    84501
   * Score: 0.6578137278556824
   * Text:  8th General Director of the Abkhazian State TV and Radio

   * ID:    37502
   * Score: 0.6630013585090637
   * Text:  British-French actress and singer



In [148]:
dicto_tables['2307']

{'Monarch': 'Reza Shah',
 'Preceded by': 'Mahmoud Jam',
 'Succeeded by': 'Ali Mansur',
 'Constituency': 'Meshkin Shahr',
 'Born': '23 January 1897 Tehran , Iran',
 'Died': '26 June 1971 (aged\xa074) Tehran, Iran',
 'Political party': 'Monarchist Party [ citation needed ]'}

In [149]:
dicto_tables['1188']

{'Full name': 'Ali Abbas Mshehed Al-Hilfi',
 'Date of birth': '30 August 1986 (age\xa032)',
 'Place of birth': 'Baghdad, Iraq',
 'Height': '1.70\xa0m (5\xa0ft 7\xa0in)',
 'Playing position': 'Left Winger , center midfielder , left back',
 'Years': 'Team',
 '2005–2006': 'Al-Talaba',
 '2007–2008': 'Al-Quwa Al-Jawiya',
 '2008–2009': 'Marconi Stallions',
 '2009–2012': 'Newcastle Jets',
 '2012–2016': 'Sydney FC',
 '2016–2017': 'Pohang Steelers',
 '2017–2018': 'Wellington Phoenix',
 '2018': 'Wellington Phoenix Reserves',
 '2007': 'Iraq U-23',
 '2007–': 'Iraq'}

In [151]:
dicto_tables['845']

{'Preceded by·Emma Khojava': '',
 'Succeeded by·Tali Japua': '',
 'Prime Minister·Anri Jergenia Gennadi Gagulia Raul Khajimba Nodar Khashba': 'Minister for Culture',
 'Preceded by·Vladimir Zantaria': '',
 'Succeeded by·Nugzar Logua': '',
 'Born·1960 (age\xa058–59) Mgudzurkhva': 'Personal details',
 'Nationality·Abkhaz': ''}

In [152]:
dicto_tables['375']

{'Born': 'Charlotte Lucy Gainsbourg 21 July 1971 (age\xa047) London , England',
 'Nationality': 'British  French',
 'Occupation': 'Actress  singer',
 'Years\xa0active': '1984–present',
 'Partner(s)': 'Yvan Attal (1991–present; engaged)',
 'Children': '3',
 'Parent(s)': 'Serge Gainsbourg  Jane Birkin',
 'Relatives': 'Kate Barry (half-sister)  Lou Doillon (half-sister)  Judy Campbell (grandmother)  Andrew Birkin (uncle)  David Birkin (cousin)  Anno Birkin (cousin)',
 'Genres': 'Alternative rock indie pop dream pop',
 'Labels': 'Phonogram  Because  Atlantic  Vice  Elektra',
 'Associated acts': 'Beck  Nigel Godrich  Sebastian',
 'Website': 'charlottegainsbourg .com'}