# Sentence Compression

In [1]:
#Let's download the sentence compression dataset by Filippova et al.

!git clone https://github.com/google-research-datasets/sentence-compression.git #run in the terminal (!)

Cloning into 'sentence-compression'...
remote: Counting objects: 41, done.[K
remote: Total 41 (delta 0), reused 0 (delta 0), pack-reused 41[K
Unpacking objects: 100% (41/41), done.
Checking connectivity... done.


In [3]:
!ls ./sentence-compression/data

comp-data.eval.json.gz	   sent-comp.train04.json.gz  sent-comp.train08.json.gz
sent-comp.train01.json.gz  sent-comp.train05.json.gz  sent-comp.train09.json.gz
sent-comp.train02.json.gz  sent-comp.train06.json.gz  sent-comp.train10.json.gz
sent-comp.train03.json.gz  sent-comp.train07.json.gz


In [1]:
#The file cannot be handed to json.loads as it is, so the loading procedure
#is somewhat complicated: a comma is put in front of every record, the comma
#in front of the first record is dropped again, and the whole text is wrapped
#in [ ] so that it parses as one JSON array.

import gzip, json

# 'rt' gives us decoded text instead of bytes. The context manager closes the
# file handle again, and the explicit encoding keeps the result independent of
# the platform's default text encoding.
with gzip.open('./sentence-compression/data/sent-comp.train01.json.gz', 'rt', encoding='utf-8') as inf:
    contents = inf.read()

# Each record is expected to start with '{', a newline and an indented "graph" key.
contents = contents.replace('{\n  "graph":', ',{\n  "graph":')

# contents[1:] removes the comma that was inserted before the very first record.
contents = '[' + contents[1:] + ']'

data = json.loads(contents)

In [2]:
# List the fields available on a single example record.
data[0].keys()

dict_keys(['graph', 'compression', 'headline', 'compression_ratio', 'doc_id', 'source_tree', 'compression_untransformed'])

In [3]:
# Look at the 'graph' field of the first example (the code further down reads
# its 'node' list to get the words of the sentence).
data[0]['graph']

{'edge': [{'child_id': 30, 'label': 'subj', 'parent_id': 50},
  {'child_id': 35, 'label': 'dep', 'parent_id': 30},
  {'child_id': 36, 'label': 'advmod', 'parent_id': 35},
  {'child_id': 37, 'label': 'subj', 'parent_id': 39},
  {'child_id': 39, 'label': 'rcmod', 'parent_id': 35},
  {'child_id': 39, 'label': 'ROOT', 'parent_id': -1},
  {'child_id': 42, 'label': 'in', 'parent_id': 39},
  {'child_id': 44, 'label': 'conj', 'parent_id': 39},
  {'child_id': 46, 'label': 'in', 'parent_id': 44},
  {'child_id': 50, 'label': 'ROOT', 'parent_id': -1},
  {'child_id': 51, 'label': 'amod', 'parent_id': 52},
  {'child_id': 52, 'label': 'dobj', 'parent_id': 50},
  {'child_id': 55, 'label': 'conj', 'parent_id': 50},
  {'child_id': 58, 'label': 'for', 'parent_id': 55},
  {'child_id': 60, 'label': 'in', 'parent_id': 55},
  {'child_id': 62, 'label': 'tmod', 'parent_id': 55},
  {'child_id': 65, 'label': 'dobj', 'parent_id': 55},
  {'child_id': 66, 'label': 'advmod', 'parent_id': 74},
  {'child_id': 67, 'lab

In [4]:
# Look at the 'compression_untransformed' field of the first example; its
# 'edge' list is what the labelling code further down uses.
data[0]['compression_untransformed']

{'edge': [{'child_id': 29, 'parent_id': 30},
  {'child_id': 30, 'parent_id': 50},
  {'child_id': 48, 'parent_id': 50},
  {'child_id': 49, 'parent_id': 50},
  {'child_id': 50, 'parent_id': -1},
  {'child_id': 51, 'parent_id': 52},
  {'child_id': 52, 'parent_id': 50},
  {'child_id': 53, 'parent_id': 50},
  {'child_id': 54, 'parent_id': 55},
  {'child_id': 55, 'parent_id': 50},
  {'child_id': 59, 'parent_id': 55},
  {'child_id': 60, 'parent_id': 59}],
 'text': 'Serge Ibaka has been granted Spanish citizenship and will play in EuroBasket.'}

In [5]:
#How on earth do we turn this into something we can learn?
#Here's a way: print every word of the source sentence together with a flag
#telling whether its id occurs in the edges of the compression.

xxx = data[0]

# Ids referenced by the compression edges. A set is used so that the
# membership test below does not rescan a list (with duplicates) per word.
kept_tokens = set()
for x in xxx["compression_untransformed"]['edge']:
    kept_tokens.add(x['parent_id'])
    kept_tokens.add(x['child_id'])

for x in xxx['graph']['node']:
    for w in x['word']:
        # entries whose form is 'ROOT' are skipped, only the other words are shown
        if w['form'] != 'ROOT':
            print (w['form'], w['id'] in kept_tokens)


Serge True
Ibaka True
the False
Oklahoma False
City False
Thunder False
forward False
who False
was False
born False
in False
the False
Congo False
but False
played False
in False
Spain False
has True
been True
granted True
Spanish True
citizenship True
and True
will True
play True
for False
the False
country False
in True
EuroBasket True
this False
summer False
the False
event False
where False
spots False
in False
the False
2012 False
Olympics False
will False
be False
decided False


In [6]:
#So, let's just parse the whole data
import glob, gzip, json

X = []
Y = []

# Upper bound on the total number of examples collected into X / Y.
# The ten training files hold 200000 examples in total and the
# train/validation split further down slices at 180000, so the bound is set to
# keep all of them; lower it for a quicker experiment.
example_limit = 200000


def read_examples(path):
    """Read one gzipped training file and return its records as a list of dicts.

    The file is not a single JSON document, so a comma is inserted in front of
    every record (each is expected to start with '{', a newline and an
    indented "graph" key), the comma before the first record is dropped, and
    the text is wrapped in [ ] before it is handed to json.loads.
    """
    # The context manager closes the handle even if reading fails.
    with gzip.open(path, 'rt', encoding='utf-8') as inf:
        contents = inf.read()

    contents = contents.replace('{\n  "graph":', ',{\n  "graph":')
    return json.loads('[' + contents[1:] + ']')


def label_tokens(example):
    """Return (tokens, keep_flags) for one example record.

    tokens     -- the 'form' of every word in example['graph']['node'],
                  except entries whose form is 'ROOT'
    keep_flags -- for each token, True if its id is referenced by an edge of
                  example['compression_untransformed'], else False
    """
    # A set gives constant-time membership tests in the loop below.
    kept_ids = set()
    for edge in example["compression_untransformed"]['edge']:
        kept_ids.add(edge['parent_id'])
        kept_ids.add(edge['child_id'])

    tokens = []
    keep_flags = []
    for node in example['graph']['node']:
        for word in node['word']:
            if word['form'] != 'ROOT':
                tokens.append(word['form'])
                keep_flags.append(word['id'] in kept_ids)
    return tokens, keep_flags


# glob returns the files in arbitrary order; sorting makes the order of the
# examples (and therefore the later train/validation split) reproducible.
for input_file in sorted(glob.glob('./sentence-compression/data/sent-comp.train*.json.gz')):

    # The limit applies to the examples collected so far, not to the size of
    # a single file.
    remaining = example_limit - len(X)
    if remaining <= 0:
        break

    for example in read_examples(input_file)[:remaining]:
        tokens, keep_flags = label_tokens(example)
        X.append(tokens)
        Y.append(keep_flags)

In [7]:
# Show one example: its tokens, its keep/drop flags, the full sentence and
# the sentence that remains when only the flagged tokens are kept.
idx = 55
print(X[idx])
print(Y[idx])
print()
print(' '.join(X[idx]))
print(' '.join(token for token, keep in zip(X[idx], Y[idx]) if keep))

['Toktogul', 'reservoir', 'water', 'volume', 'accumulated', 'remains', 'stable', 'according', 'press', 'to', 'the', 'service', 'of', 'Electric', 'Stations', 'Company']
[True, True, True, True, False, True, True, False, False, False, False, False, False, False, False, False]

Toktogul reservoir water volume accumulated remains stable according press to the service of Electric Stations Company
Toktogul reservoir water volume remains stable


This looks a lot like the POS-tagging example from the previous lecture: the input is a list of English tokens and the output is a list of tags. Could we load these into the previous POS-tagging model and get ourselves a neural sentence compressor?

Obviously! Let's get busy.

In [8]:
# Number of examples that were loaded.
len(X)

200000

In [9]:
#Let's cut this to train and validation data
import json, os

# Examples before this index are used for training, the rest for validation.
train_size = 180000

train_X = X[:train_size]
train_Y = Y[:train_size]

validation_X = X[train_size:]
validation_Y = Y[train_size:]

# One record per sentence: the tokens and, per token, the keep/drop flag.
training_data = [{'text': x, 'tags': y} for x, y in zip(train_X, train_Y)]

# The validation examples go into their own list so that the eval file below
# is not written empty.
validation_data = [{'text': x, 'tags': y} for x, y in zip(validation_X, validation_Y)]

# Make sure the output directory exists before writing into it.
os.makedirs('./data', exist_ok=True)

# The context managers flush and close the files even if dumping fails.
with open('./data/sent-comp-train.json','w') as outf:
    json.dump(training_data, outf)

with open('./data/sent-comp-eval.json','w') as outf:
    json.dump(validation_data, outf)
