# Table Tutorial

## Part I: Preprocessing

In [1]:
# If necessary: delete the 'snorkel.db' file left over from an earlier run.
# The existence check keeps this cell from raising on a fresh checkout,
# where there is nothing to delete yet.
import os

if os.path.exists('snorkel.db'):
    os.remove('snorkel.db')

In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')

from snorkel import SnorkelSession
session = SnorkelSession()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Parse the Train `Corpus`

In [3]:
from snorkel.parser import CorpusParser, HTMLParser, OmniParser

# Directory holding the HTML documents to parse.
html_dir = 'data/hardware/hardware100_html/'

doc_parser = HTMLParser(path=html_dir)
context_parser = OmniParser()

# Combine the document and context parsers.
# NOTE(review): max_docs=100 presumably caps how many documents are read -- confirm.
cp = CorpusParser(doc_parser, context_parser, max_docs=100)

In [4]:
# Parse the documents into a corpus named 'Hardware'; %time reports how long the call took.
%time corpus = cp.parse_corpus(name='Hardware', session=session)


CPU times: user 3min 36s, sys: 12.5 s, total: 3min 49s
Wall time: 9min 31s


In [5]:
# Show the first three parsed documents as a quick sanity check.
# print(...) with a single argument gives the same output on Python 2 and 3.
for doc in corpus.documents[:3]:
    print(doc)

Document TWSCS04757-1
Document JCSTS01155-1
Document SGSTS13702-1


### Saving the `Corpus`
Next, we persist the parsed corpus in Snorkel's database backend:

In [6]:
# Register the parsed corpus with the session, then commit so it is persisted.
session.add(corpus)
session.commit()

### Reloading the `Corpus`
If the corpus has already been parsed, load it here:

In [7]:
from snorkel.models import Corpus

# Fetch the stored corpus named 'Hardware'.
# NOTE(review): assuming a SQLAlchemy-style query, .one() raises unless exactly
# one row matches -- confirm.
corpus = session.query(Corpus).filter(Corpus.name == 'Hardware').one()

# Parenthesised single-argument print gives the same output on Python 2 and 3.
print("%s contains %d Documents" % (corpus, len(corpus)))

Corpus (Hardware) contains 98 Documents


## Parse the Dev/Test `Corpus`

In [8]:
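# NOTE: placeholder. As written, these lines repeat the training parse above
# (same path and settings); in this notebook the development set comes from
# splitting the 'Hardware' corpus below instead.
# doc_parser = HTMLParser(path='data/hardware/hardware100_html/')
# context_parser = OmniParser()
# cp = CorpusParser(doc_parser, context_parser, max_docs=100)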

In [9]:
# %time corpus = cp.parse_corpus(name='Hardware', session=session)

In [10]:
# session.add(corpus)
# session.commit()

### Split the `Corpus` into Train/Dev/Test

In [11]:
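# TEMP: uncomment to delete the existing 'Hardware Training', 'Hardware Development'
# and 'Hardware Test' corpora before re-running the split below.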
# from snorkel.utils import get_ORM_instance
# from snorkel.models import Corpus

# corpus = get_ORM_instance(Corpus, session, 'Hardware Training')
# session.delete(corpus)

# corpus = get_ORM_instance(Corpus, session, 'Hardware Development')
# session.delete(corpus)

# corpus = get_ORM_instance(Corpus, session, 'Hardware Test')
# session.delete(corpus)

# session.commit()

In [12]:
# Corpus is imported here so that this cell also runs when the optional
# "Reloading the Corpus" cell was skipped.
from snorkel.models import Corpus
from snorkel.queries import split_corpus
from snorkel.utils import get_ORM_instance

# Look up the full 'Hardware' corpus and split it 80/20 into training and
# development sets, with no test share.
# NOTE(review): seed=1 is presumably there to make the split repeatable -- confirm.
corpus = get_ORM_instance(Corpus, session, 'Hardware')
split_corpus(session, corpus, train=0.8, development=0.2, test=0, seed=1)

78 Documents added to corpus Hardware Training
20 Documents added to corpus Hardware Development


In [13]:
from snorkel.models import Corpus
from snorkel.utils import get_ORM_instance

# Report the size of the training and development corpora.
# The lookup result is assigned to `corpus` on each pass, so the name is left
# bound to the development corpus once the loop finishes.
for split_name in ('Hardware Training', 'Hardware Development'):
    corpus = get_ORM_instance(Corpus, session, split_name)
    # Parenthesised single-argument print gives the same output on Python 2 and 3.
    print("%s contains %d Documents" % (corpus, len(corpus)))

Corpus (Hardware Training) contains 78 Documents
Corpus (Hardware Development) contains 20 Documents


In [14]:
# If necessary: copy the database file to 'snorkel.db corpus' as a snapshot.
import os
import shutil

# shutil.copy replaces the shelled-out `cp`: it needs no POSIX shell, raises on a
# failed copy instead of discarding the exit status, and avoids the invalid
# backslash-space escape. The destination name (including its space) is unchanged.
if os.path.exists('snorkel.db'):
    shutil.copy('snorkel.db', 'snorkel.db corpus')
else:
    print("snorkel.db not found; nothing to back up")

Next, in Part II, we will look at how to extract `Candidate` relations from our saved `Corpus`.