In [25]:
import torch
import torch.nn.functional as F
import torchtext
import time
import random
import pandas as pd

# Ask cuDNN to pick deterministic kernels so repeated GPU runs are reproducible.
torch.backends.cudnn.deterministic = True

In [26]:
# Seed for PyTorch's RNG (Python's `random` module is seeded separately).
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)

# Upper bound passed as `max_size` when the text vocabulary is built.
VOCABULARY_SIZE = 20000
# Step size intended for the optimizer.
LEARNING_RATE = 0.005
# Number of reviews per mini-batch.
BATCH_SIZE = 128
# Number of full passes over the training split.
NUM_EPOCHS = 15
# First CUDA device when available, otherwise the CPU.
DEVICE = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Model sizes: token-embedding width, recurrent hidden-state width, and
# number of output logits.
EMBEDDING_DIM = 128
HIDDEN_DIM = 256
NUM_CLASSES = 2

In [27]:
# Load the review table (one text and one label per row) and preview it.
df = pd.read_csv('movie_data.csv')
df.head()

Unnamed: 0,TEXT_COLUMN_NAME,LABEL_COLUMN_NAME
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [28]:
# Give the two columns the names the torchtext field list uses, write the
# table back without the index column, then reload it as a sanity check.
# NOTE: this overwrites 'movie_data.csv' in place.
df.columns = ['TEXT_COLUMN_NAME', 'LABEL_COLUMN_NAME']
df.to_csv('movie_data.csv', index=False)
df = pd.read_csv('movie_data.csv')
df.head()

Unnamed: 0,TEXT_COLUMN_NAME,LABEL_COLUMN_NAME
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0
3,hi for all the people who have seen this wonde...,1
4,"I recently bought the DVD, forgetting just how...",0


In [29]:
# The DataFrame was only needed to fix the CSV header; drop it, since the
# dataset is loaded again from the file path rather than from this object.
del df

In [30]:
### Defining the feature processing

# Review text is tokenized with spaCy's small English pipeline.
TEXT = torchtext.legacy.data.Field(
    tokenize='spacy', # default splits on whitespace
    tokenizer_language='en_core_web_sm'
)

### Defining the label processing

# Labels are produced as int64 class indices, the target dtype used with
# F.cross_entropy in the training loop.
LABEL = torchtext.legacy.data.LabelField(dtype=torch.long)

In [31]:
# Pair each CSV column, in file order, with the Field that processes it.
fields = [('TEXT_COLUMN_NAME', TEXT), ('LABEL_COLUMN_NAME', LABEL)]

# skip_header=True keeps the header row from being parsed as an example.
dataset = torchtext.legacy.data.TabularDataset(
    path='movie_data.csv', format='csv',
    skip_header=True, fields=fields)

In [32]:
# 80/20 train/test split.
# random.seed() returns None, so passing its result as `random_state` only
# worked through the seeding side effect. Seed first, then pass the RNG
# state explicitly; the resulting shuffle is the same.
random.seed(RANDOM_SEED)
train_data, test_data = dataset.split(
    split_ratio=[0.8, 0.2],
    random_state=random.getstate())

print(f'Num Train: {len(train_data)}')
print(f'Num Test: {len(test_data)}')

Num Train: 40000
Num Test: 10000


In [33]:
# Carve a 15% validation split out of the training data.
# random.seed() returns None, so passing its result as `random_state` only
# worked through the seeding side effect. Seed first, then pass the RNG
# state explicitly; the resulting shuffle is the same.
random.seed(RANDOM_SEED)
train_data, valid_data = train_data.split(
    split_ratio=[0.85, 0.15],
    random_state=random.getstate())

print(f'Num Train: {len(train_data)}')
print(f'Num Validation: {len(valid_data)}')



Num Train: 34000
Num Validation: 6000


In [34]:
# Inspect one training example as a dict of field name -> processed value;
# the text field is already a list of tokens at this point.
print(vars(train_data.examples[0]))

{'TEXT_COLUMN_NAME': ['For', 'some', 'unknown', 'reason', ',', '7', 'years', 'ago', ',', 'I', 'watched', 'this', 'movie', 'with', 'my', 'mother', 'and', 'sister', '.', 'I', 'do', "n't", 'think', 'I', "'ve", 'ever', 'laughed', 'as', 'hard', 'with', 'them', 'before', '.', 'This', 'movie', 'was', 'sooooo', 'bad', '.', 'How', 'sequels', 'were', 'produced', 'is', 'beyond', 'me', '.', 'Its', 'been', 'awhile', 'since', 'I', 'last', 'saw', 'this', '"', 'movie', '"', ',', 'but', 'the', 'one', 'impression', 'that', 'it', 'has', 'stuck', 'with', 'me', 'over', 'the', 'years', 'has', 'been', ',', '"', 'They', 'must', 'have', 'found', 'the', 'script', 'in', 'a', 'dumpster', 'in', 'the', 'backlot', 'of', 'a', 'cheap', 'movie', 'studio', ',', 'made', 'into', 'a', '"', 'movie', '"', ',', 'and', 'decided', 'that', 'it', 'did', "n't", 'suck', 'enough', ',', 'and', 'made', 'it', 'worse', '.', 'I', "'m", 'pretty', 'sure', 'that', 'they', 'spent', 'all', 'the', 'budget', 'on', 'camera', 'work', 'and', 'the'

In [35]:

# Build both vocabularies from the training split only; the text vocabulary
# is capped via max_size=VOCABULARY_SIZE.
TEXT.build_vocab(train_data, max_size=VOCABULARY_SIZE)
LABEL.build_vocab(train_data)

# The reported text vocabulary is VOCABULARY_SIZE + 2; presumably the two
# extra entries are torchtext's default <unk> and <pad> tokens (TODO confirm).
print(f'Vocabulary size: {len(TEXT.vocab)}')
print(f'Number of classes: {len(LABEL.vocab)}')

Vocabulary size: 20002
Number of classes: 2


In [36]:
# One iterator per split. `sort_key` exposes each review's token count to
# BucketIterator; examples are not re-sorted inside a batch
# (sort_within_batch=False). Batches are created on DEVICE.
train_loader, valid_loader, test_loader = \
    torchtext.legacy.data.BucketIterator.splits(
        (train_data, valid_data, test_data),
         batch_size=BATCH_SIZE,
         sort_within_batch=False,
         sort_key=lambda x: len(x.TEXT_COLUMN_NAME),
         device=DEVICE
    )

In [37]:
# Peek at the first mini-batch of every split to check the tensor shapes:
# the text matrix is [sequence length, batch size], the targets [batch size].
for split_banner, split_loader in (('Train', train_loader),
                                   ('\nValid:', valid_loader),
                                   ('\nTest:', test_loader)):
    print(split_banner)
    for batch in split_loader:
        print(f'Text matrix size: {batch.TEXT_COLUMN_NAME.size()}')
        print(f'Target vector size: {batch.LABEL_COLUMN_NAME.size()}')
        break

Train
Text matrix size: torch.Size([1136, 128])
Target vector size: torch.Size([128])

Valid:
Text matrix size: torch.Size([55, 128])
Target vector size: torch.Size([128])

Test:
Text matrix size: torch.Size([52, 128])
Target vector size: torch.Size([128])


In [38]:
class RNN(torch.nn.Module):
    """Sequence classifier: embedding -> single-layer LSTM -> linear head.

    Args:
        input_dim: number of rows in the embedding table (vocabulary size).
        embedding_dim: width of each token embedding.
        hidden_dim: width of the LSTM hidden state.
        output_dim: number of logits produced per example.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = torch.nn.Embedding(input_dim, embedding_dim)
        # An LSTM is used here instead of a plain
        # torch.nn.RNN(embedding_dim, hidden_dim, nonlinearity='relu').
        self.rnn = torch.nn.LSTM(embedding_dim, hidden_dim)
        self.fc = torch.nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Map token indices to class logits.

        Args:
            text: integer tensor of shape [sentence length, batch size].

        Returns:
            Tensor of shape [batch size, output_dim] with unnormalised logits.
        """
        # [sentence length, batch size, embedding dim]
        token_vectors = self.embedding(text)

        # Only the final hidden state is needed: [1, batch size, hidden dim].
        _, (final_hidden, _) = self.rnn(token_vectors)

        # Drop the leading axis -> [batch size, hidden dim], then classify.
        return self.fc(final_hidden.squeeze(0))

In [39]:
# Re-seed so the weight initialisation is identical every time this cell runs.
torch.manual_seed(RANDOM_SEED)
model = RNN(input_dim=len(TEXT.vocab),
            embedding_dim=EMBEDDING_DIM,
            hidden_dim=HIDDEN_DIM,
            output_dim=NUM_CLASSES # could use 1 for binary classification
)

model = model.to(DEVICE)
# Use the configured LEARNING_RATE instead of repeating the literal 0.005,
# so changing the config constant actually affects training.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [40]:
def compute_accuracy(model, data_loader, device):
    """Return the classification accuracy of `model` over `data_loader`, in percent.

    Args:
        model: callable mapping a batch of features to per-class logits with
            the class axis at dim 1.
        data_loader: iterable whose items unpack into (features, targets).
        device: device each batch is moved to before the forward pass.

    Returns:
        0-dim float tensor holding the accuracy as a percentage.

    Raises:
        ValueError: if `data_loader` yields no examples (accuracy undefined).
    """
    # Evaluate in eval mode, then restore training mode if the model was in
    # it. Plain callables without a `training` flag are left alone.
    was_training = getattr(model, 'training', False)
    if was_training:
        model.eval()

    correct_pred, num_examples = 0, 0
    try:
        with torch.no_grad():
            for features, targets in data_loader:
                features = features.to(device)
                targets = targets.to(device)

                logits = model(features)
                _, predicted_labels = torch.max(logits, 1)

                num_examples += targets.size(0)
                correct_pred += (predicted_labels == targets).sum()
    finally:
        if was_training:
            model.train()

    # Previously an empty loader failed with an opaque AttributeError
    # (int has no .float()); fail with an explicit message instead.
    if num_examples == 0:
        raise ValueError('data_loader yielded no examples; accuracy is undefined')

    return correct_pred.float() / num_examples * 100

In [None]:
start_time = time.time()

for epoch in range(NUM_EPOCHS):
    model.train()
    for batch_idx, batch_data in enumerate(train_loader):

        # Move the token-index matrix and the class labels to the device.
        text = batch_data.TEXT_COLUMN_NAME.to(DEVICE)
        labels = batch_data.LABEL_COLUMN_NAME.to(DEVICE)

        # Forward pass and loss.
        logits = model(text)
        loss = F.cross_entropy(logits, labels)

        # Backward pass: clear stale gradients, backpropagate, update weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Progress report on every 50th mini-batch.
        if batch_idx % 50 == 0:
            progress = (f'Epoch: {epoch+1:03d}/{NUM_EPOCHS:03d} | '
                        f'Batch {batch_idx:03d}/{len(train_loader):03d} | '
                        f'Loss: {loss:.4f}')
            print(progress)

    # End-of-epoch evaluation (training split first, then validation).
    with torch.no_grad():
        train_acc = compute_accuracy(model, train_loader, DEVICE)
        valid_acc = compute_accuracy(model, valid_loader, DEVICE)
    print(f'training accuracy: {train_acc:.2f}%'
          f'\nvalid accuracy: {valid_acc:.2f}%')

    print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')

print(f'Total Training Time: {(time.time() - start_time)/60:.2f} min')
print(f'Test accuracy: {compute_accuracy(model, test_loader, DEVICE):.2f}%')

Epoch: 001/015 | Batch 000/266 | Loss: 0.7060
Epoch: 001/015 | Batch 050/266 | Loss: 0.6915
Epoch: 001/015 | Batch 100/266 | Loss: 0.6959
Epoch: 001/015 | Batch 150/266 | Loss: 0.6940
Epoch: 001/015 | Batch 200/266 | Loss: 0.7001
Epoch: 001/015 | Batch 250/266 | Loss: 0.6879
training accuracy: 50.09%
valid accuracy: 49.27%
Time elapsed: 243.50 min
Epoch: 002/015 | Batch 000/266 | Loss: 0.6922


In [42]:

import requests
from bs4 import BeautifulSoup
import re
import os
import unicodedata
import string
import importlib
import spacy
from nltk.stem.snowball import SnowballStemmer

In [44]:
import sys
# Show which Python environment this kernel is running from.
sys.prefix

'C:\\Users\\jhu\\Anaconda3\\envs\\torch'

In [45]:
# Download the recipe page and pull out the ordered list(s) of steps.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors (404, 403, ...) here rather than
# letting an error page be parsed as if it were the recipe.
page = requests.get(
    'https://cooking.nytimes.com/recipes/1018442-chicken-soup-from-scratch',
    timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')
steps = soup.findAll("ol", {"class": "recipe-steps"})

print(steps)

[<ol class="recipe-steps">
<li>Place the chicken, celery, carrots, onions, parsnip (if using), parsley, peppercorns, bay leaves and salt in a large soup pot and cover with cold water by 1 inch.</li>
<li>Bring to a boil over high heat, then immediately reduce the heat to very low. Adjust the heat until the soup is “smiling”: barely moving on the surface, with an occasional bubble breaking through. Cook uncovered, until the chicken is very tender and falling off the bone, 1 to 1 1/2 hours.</li>
<li>When cool enough to handle, use tongs to transfer chicken from the pot to a container. Taste the broth and continue to simmer it until it is concentrated and tasty. Strain broth through a fine sieve (or a colander lined with cheesecloth) into a separate container. Discard all the solids from the strainer (or reserve the vegetables, chill and serve with vinaigrette, if you wish).</li>
<li>Refrigerate chicken pieces and broth separately for at least 8 hours (or up to 3 days), until a thick layer

In [46]:
def cleanhtml(raw_html):
    """Strip HTML tags from `raw_html` and flatten the text to one line.

    Args:
        raw_html: string containing HTML markup.

    Returns:
        The text with every `<...>` tag removed, newlines replaced by single
        spaces, and leading/trailing whitespace trimmed.
    """
    # DOTALL lets '.' match newlines, so a tag whose attributes wrap onto
    # several lines is removed too (previously such tags were left in).
    without_tags = re.sub(r'<.*?>', '', raw_html, flags=re.DOTALL)
    return without_tags.replace('\n', ' ').strip()

# Clean the first matching <ol>.
# NOTE(review): steps[0] raises IndexError when the scrape found no
# "recipe-steps" list (e.g. if the page layout has changed).
cleansteps = cleanhtml(str(steps[0]))
print(cleansteps)

Place the chicken, celery, carrots, onions, parsnip (if using), parsley, peppercorns, bay leaves and salt in a large soup pot and cover with cold water by 1 inch. Bring to a boil over high heat, then immediately reduce the heat to very low. Adjust the heat until the soup is “smiling”: barely moving on the surface, with an occasional bubble breaking through. Cook uncovered, until the chicken is very tender and falling off the bone, 1 to 1 1/2 hours. When cool enough to handle, use tongs to transfer chicken from the pot to a container. Taste the broth and continue to simmer it until it is concentrated and tasty. Strain broth through a fine sieve (or a colander lined with cheesecloth) into a separate container. Discard all the solids from the strainer (or reserve the vegetables, chill and serve with vinaigrette, if you wish). Refrigerate chicken pieces and broth separately for at least 8 hours (or up to 3 days), until a thick layer of yellow fat has risen to the top of the broth. When rea

In [48]:
# Save the cleaned recipe text as 'sample_data.txt' in the working directory
# (os.path.join with an empty first component yields just the file name).
# NOTE(review): no encoding is passed, so the platform default is used; the
# code that reads this file back has to rely on the same default.
with open(os.path.join('', 'sample_data.txt'), 'w') as fptr:
    fptr.write(cleansteps)

In [51]:

# Run the deon command-line tool to write its checklist to ETHICS.md.
! deon -o ETHICS.md

Checklist successfully written to file ETHICS.md.


In [52]:

# Read our data back in
# (the whole file is loaded as one string, using the platform's default
# text encoding because none is specified).
with open(os.path.join('', 'sample_data.txt'), 'r') as fptr:
    article = fptr.read()
print(article)

Place the chicken, celery, carrots, onions, parsnip (if using), parsley, peppercorns, bay leaves and salt in a large soup pot and cover with cold water by 1 inch. Bring to a boil over high heat, then immediately reduce the heat to very low. Adjust the heat until the soup is “smiling”: barely moving on the surface, with an occasional bubble breaking through. Cook uncovered, until the chicken is very tender and falling off the bone, 1 to 1 1/2 hours. When cool enough to handle, use tongs to transfer chicken from the pot to a container. Taste the broth and continue to simmer it until it is concentrated and tasty. Strain broth through a fine sieve (or a colander lined with cheesecloth) into a separate container. Discard all the solids from the strainer (or reserve the vegetables, chill and serve with vinaigrette, if you wish). Refrigerate chicken pieces and broth separately for at least 8 hours (or up to 3 days), until a thick layer of yellow fat has risen to the top of the broth. When rea

In [54]:

# Load spaCy's small English pipeline once so it can be reused.
spacy_nlp = spacy.load('en_core_web_sm')

In [55]:
# Run the pipeline over the article and collect the text of every token;
# punctuation comes out as separate tokens.
doc = spacy_nlp(article)
tokens = [token.text for token in doc]
print(tokens)

['Place', 'the', 'chicken', ',', 'celery', ',', 'carrots', ',', 'onions', ',', 'parsnip', '(', 'if', 'using', ')', ',', 'parsley', ',', 'peppercorns', ',', 'bay', 'leaves', 'and', 'salt', 'in', 'a', 'large', 'soup', 'pot', 'and', 'cover', 'with', 'cold', 'water', 'by', '1', 'inch', '.', 'Bring', 'to', 'a', 'boil', 'over', 'high', 'heat', ',', 'then', 'immediately', 'reduce', 'the', 'heat', 'to', 'very', 'low', '.', 'Adjust', 'the', 'heat', 'until', 'the', 'soup', 'is', '“', 'smiling', '”', ':', 'barely', 'moving', 'on', 'the', 'surface', ',', 'with', 'an', 'occasional', 'bubble', 'breaking', 'through', '.', 'Cook', 'uncovered', ',', 'until', 'the', 'chicken', 'is', 'very', 'tender', 'and', 'falling', 'off', 'the', 'bone', ',', '1', 'to', '1', '1/2', 'hours', '.', 'When', 'cool', 'enough', 'to', 'handle', ',', 'use', 'tongs', 'to', 'transfer', 'chicken', 'from', 'the', 'pot', 'to', 'a', 'container', '.', 'Taste', 'the', 'broth', 'and', 'continue', 'to', 'simmer', 'it', 'until', 'it', 'i

In [58]:
# Print each token's text on its own line.
for token in doc:
    print(token.text)

Place
the
chicken
,
celery
,
carrots
,
onions
,
parsnip
(
if
using
)
,
parsley
,
peppercorns
,
bay
leaves
and
salt
in
a
large
soup
pot
and
cover
with
cold
water
by
1
inch
.
Bring
to
a
boil
over
high
heat
,
then
immediately
reduce
the
heat
to
very
low
.
Adjust
the
heat
until
the
soup
is
“
smiling
”
:
barely
moving
on
the
surface
,
with
an
occasional
bubble
breaking
through
.
Cook
uncovered
,
until
the
chicken
is
very
tender
and
falling
off
the
bone
,
1
to
1
1/2
hours
.
When
cool
enough
to
handle
,
use
tongs
to
transfer
chicken
from
the
pot
to
a
container
.
Taste
the
broth
and
continue
to
simmer
it
until
it
is
concentrated
and
tasty
.
Strain
broth
through
a
fine
sieve
(
or
a
colander
lined
with
cheesecloth
)
into
a
separate
container
.
Discard
all
the
solids
from
the
strainer
(
or
reserve
the
vegetables
,
chill
and
serve
with
vinaigrette
,
if
you
wish
)
.
Refrigerate
chicken
pieces
and
broth
separately
for
at
least
8
hours
(
or
up
to
3
days
)
,
until
a
thick
layer
of
yellow
fat
has
risen

In [59]:
# Whitelist of characters kept by unicode_to_ascii: ASCII letters, a little
# punctuation (space . , ; ') and the decimal digits.
all_letters_numbers = string.ascii_letters + " .,;'" + string.digits
n_letters = len(all_letters_numbers)

# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicode_to_ascii(s):
    """Return `s` reduced to the characters in `all_letters_numbers`.

    The string is NFD-normalised first, so an accented letter splits into its
    base letter plus combining marks; the marks (category 'Mn') are discarded
    and every remaining character outside the whitelist is dropped.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            continue
        if ch in all_letters_numbers:
            kept.append(ch)
    return ''.join(kept)

In [60]:
# Reduce the article to the ASCII whitelist. Characters outside it are simply
# removed, so e.g. '1 1/2' becomes '1 12' and parentheses/quotes disappear.
ascii_article = unicode_to_ascii(article)
print(ascii_article)

Place the chicken, celery, carrots, onions, parsnip if using, parsley, peppercorns, bay leaves and salt in a large soup pot and cover with cold water by 1 inch. Bring to a boil over high heat, then immediately reduce the heat to very low. Adjust the heat until the soup is smiling barely moving on the surface, with an occasional bubble breaking through. Cook uncovered, until the chicken is very tender and falling off the bone, 1 to 1 12 hours. When cool enough to handle, use tongs to transfer chicken from the pot to a container. Taste the broth and continue to simmer it until it is concentrated and tasty. Strain broth through a fine sieve or a colander lined with cheesecloth into a separate container. Discard all the solids from the strainer or reserve the vegetables, chill and serve with vinaigrette, if you wish. Refrigerate chicken pieces and broth separately for at least 8 hours or up to 3 days, until a thick layer of yellow fat has risen to the top of the broth. When ready to finish

In [61]:

# Lemmatize unless it's a special case, e.g. '-PRON-' replacing 'it'
# NOTE(review): the check is "lemma contains '-'", so any lemma with a hyphen
# (not only the '-PRON-' placeholder) falls back to the raw token text.
lemmatized_tokens = [token.lemma_ if '-' not in token.lemma_ else token.text for token in doc]
print(lemmatized_tokens)

['place', 'the', 'chicken', ',', 'celery', ',', 'carrot', ',', 'onion', ',', 'parsnip', '(', 'if', 'use', ')', ',', 'parsley', ',', 'peppercorns', ',', 'bay', 'leave', 'and', 'salt', 'in', 'a', 'large', 'soup', 'pot', 'and', 'cover', 'with', 'cold', 'water', 'by', '1', 'inch', '.', 'bring', 'to', 'a', 'boil', 'over', 'high', 'heat', ',', 'then', 'immediately', 'reduce', 'the', 'heat', 'to', 'very', 'low', '.', 'adjust', 'the', 'heat', 'until', 'the', 'soup', 'be', '"', 'smile', '"', ':', 'barely', 'move', 'on', 'the', 'surface', ',', 'with', 'an', 'occasional', 'bubble', 'break', 'through', '.', 'Cook', 'uncover', ',', 'until', 'the', 'chicken', 'be', 'very', 'tender', 'and', 'fall', 'off', 'the', 'bone', ',', '1', 'to', '1', '1/2', 'hour', '.', 'when', 'cool', 'enough', 'to', 'handle', ',', 'use', 'tong', 'to', 'transfer', 'chicken', 'from', 'the', 'pot', 'to', 'a', 'container', '.', 'taste', 'the', 'broth', 'and', 'continue', 'to', 'simmer', 'it', 'until', 'it', 'be', 'concentrated',

In [62]:
# Raw lemmas without the fallback: pronouns appear as the '-PRON-' placeholder.
print([token.lemma_ for token in doc])

['place', 'the', 'chicken', ',', 'celery', ',', 'carrot', ',', 'onion', ',', 'parsnip', '(', 'if', 'use', ')', ',', 'parsley', ',', 'peppercorns', ',', 'bay', 'leave', 'and', 'salt', 'in', 'a', 'large', 'soup', 'pot', 'and', 'cover', 'with', 'cold', 'water', 'by', '1', 'inch', '.', 'bring', 'to', 'a', 'boil', 'over', 'high', 'heat', ',', 'then', 'immediately', 'reduce', 'the', 'heat', 'to', 'very', 'low', '.', 'adjust', 'the', 'heat', 'until', 'the', 'soup', 'be', '"', 'smile', '"', ':', 'barely', 'move', 'on', 'the', 'surface', ',', 'with', 'an', 'occasional', 'bubble', 'break', 'through', '.', 'Cook', 'uncover', ',', 'until', 'the', 'chicken', 'be', 'very', 'tender', 'and', 'fall', 'off', 'the', 'bone', ',', '1', 'to', '1', '1/2', 'hour', '.', 'when', 'cool', 'enough', 'to', 'handle', ',', 'use', 'tong', 'to', 'transfer', 'chicken', 'from', 'the', 'pot', 'to', 'a', 'container', '.', 'taste', 'the', 'broth', 'and', 'continue', 'to', 'simmer', '-PRON-', 'until', '-PRON-', 'be', 'concen

In [63]:
# Keep only the tokens spaCy does not flag as stop words; punctuation tokens
# are not stop words and therefore remain in the list.
no_stop_words = [token.text for token in doc if not token.is_stop]
print(no_stop_words)

['Place', 'chicken', ',', 'celery', ',', 'carrots', ',', 'onions', ',', 'parsnip', '(', ')', ',', 'parsley', ',', 'peppercorns', ',', 'bay', 'leaves', 'salt', 'large', 'soup', 'pot', 'cover', 'cold', 'water', '1', 'inch', '.', 'Bring', 'boil', 'high', 'heat', ',', 'immediately', 'reduce', 'heat', 'low', '.', 'Adjust', 'heat', 'soup', '“', 'smiling', '”', ':', 'barely', 'moving', 'surface', ',', 'occasional', 'bubble', 'breaking', '.', 'Cook', 'uncovered', ',', 'chicken', 'tender', 'falling', 'bone', ',', '1', '1', '1/2', 'hours', '.', 'cool', 'handle', ',', 'use', 'tongs', 'transfer', 'chicken', 'pot', 'container', '.', 'Taste', 'broth', 'continue', 'simmer', 'concentrated', 'tasty', '.', 'Strain', 'broth', 'fine', 'sieve', '(', 'colander', 'lined', 'cheesecloth', ')', 'separate', 'container', '.', 'Discard', 'solids', 'strainer', '(', 'reserve', 'vegetables', ',', 'chill', 'serve', 'vinaigrette', ',', 'wish', ')', '.', 'Refrigerate', 'chicken', 'pieces', 'broth', 'separately', '8', 'h

In [64]:

# Stem every token's text with NLTK's English Snowball stemmer. The stems are
# lower-cased and need not be dictionary words (e.g. 'celeri', 'larg').
stemmer = SnowballStemmer("english")
stemmed_tokens = [stemmer.stem(token.text) for token in doc]
print(stemmed_tokens)

['place', 'the', 'chicken', ',', 'celeri', ',', 'carrot', ',', 'onion', ',', 'parsnip', '(', 'if', 'use', ')', ',', 'parsley', ',', 'peppercorn', ',', 'bay', 'leav', 'and', 'salt', 'in', 'a', 'larg', 'soup', 'pot', 'and', 'cover', 'with', 'cold', 'water', 'by', '1', 'inch', '.', 'bring', 'to', 'a', 'boil', 'over', 'high', 'heat', ',', 'then', 'immedi', 'reduc', 'the', 'heat', 'to', 'veri', 'low', '.', 'adjust', 'the', 'heat', 'until', 'the', 'soup', 'is', '“', 'smile', '”', ':', 'bare', 'move', 'on', 'the', 'surfac', ',', 'with', 'an', 'occasion', 'bubbl', 'break', 'through', '.', 'cook', 'uncov', ',', 'until', 'the', 'chicken', 'is', 'veri', 'tender', 'and', 'fall', 'off', 'the', 'bone', ',', '1', 'to', '1', '1/2', 'hour', '.', 'when', 'cool', 'enough', 'to', 'handl', ',', 'use', 'tong', 'to', 'transfer', 'chicken', 'from', 'the', 'pot', 'to', 'a', 'contain', '.', 'tast', 'the', 'broth', 'and', 'continu', 'to', 'simmer', 'it', 'until', 'it', 'is', 'concentr', 'and', 'tasti', '.', 'str

Collecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Collecting python-crfsuite>=0.8.3
  Downloading python_crfsuite-0.9.7-cp38-cp38-win_amd64.whl (156 kB)
Collecting tabulate
  Using cached tabulate-0.8.9-py3-none-any.whl (25 kB)
Installing collected packages: python-crfsuite, tabulate, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.7 sklearn-crfsuite-0.3.6 tabulate-0.8.9
