In [1]:
from common import load_data
import nltk
from pathlib import Path

In [2]:
# Location of the annotation CSV, wrapped in a Path right away.
path = Path('../../data/LSMDC/task1/LSMDC16_annos_training_someone.csv')
# `load_data` is the project helper imported from `common`; later cells use
# its result like a dict (`.keys()`, `.items()`).
data = load_data(path)

In [3]:
# Take the value stored under the first key of `data` as a working example.
sample = data[list(data.keys())[0]]

In [4]:
# Display the example entry (a caption string, per the output below).
sample

'Now outside, SOMEONE kicks over a trashcan.'

In [5]:
from pytorch_transformers import GPT2Tokenizer

# Load the pretrained 'gpt2' tokenizer, overriding its unknown-token string with "__".
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', unk_token="__")
# Register the literal 'SOMEONE' so it survives tokenization as a single piece
# (the tokenize call below returns ['SOMEONE', 'comes']).
# NOTE(review): 'someone' is not one of the usual special-token attribute names
# (e.g. 'additional_special_tokens'); confirm the installed library version accepts it.
tokenizer.add_special_tokens({'someone': 'SOMEONE'})
# Sanity check; as the last expression of the cell its result is displayed.
tokenizer.tokenize('SOMEONE comes')

['SOMEONE', 'comes']

In [6]:
# Re-bind `sample` from the raw caption to its list of tokenizer pieces.
# NOTE: not idempotent - re-running this cell alone would tokenize the list, not the caption.
sample = tokenizer.tokenize(sample)

In [7]:
# Tokenize every entry: keys are kept, each value becomes its list of tokenizer pieces.
data = {
    key: tokenizer.tokenize(caption)
    for key, caption in data.items()
}
# U+0121 ('ġ'); not referenced by any other visible cell.
small_prefix = chr(0x121)
# U+0120 ('Ġ'), the marker that prefixes pieces such as 'Ġoutside' in the tokenizer output.
big_prefix = chr(0x120)

In [8]:
# Display the example again; at this point it holds the list of tokenizer pieces.
sample

['Now',
 'Ġoutside',
 ',',
 'SOMEONE',
 'k',
 'icks',
 'Ġover',
 'Ġa',
 'Ġtrash',
 'can',
 '.']

In [9]:
# POS-tag every piece of the example after stripping the 'Ġ' marker.
# Empty pieces and a bare marker are skipped - the same guard the dataset-wide
# tagging cell applies - so the tagger is never handed an empty string.
# NOTE(review): each piece is tagged on its own (a one-element list), so the
# tagger sees no sentence context; the dict also keeps one entry per distinct piece.
sample = {
    k: nltk.pos_tag([k.replace(big_prefix, "")])[0]
    for k in sample
    if k != '' and k != big_prefix
}
sample

{'Now': ('Now', 'RB'),
 'Ġoutside': ('outside', 'IN'),
 ',': (',', ','),
 'SOMEONE': ('SOMEONE', 'NN'),
 'k': ('k', 'NN'),
 'icks': ('icks', 'NNS'),
 'Ġover': ('over', 'IN'),
 'Ġa': ('a', 'DT'),
 'Ġtrash': ('trash', 'NN'),
 'can': ('can', 'MD'),
 '.': ('.', '.')}

In [11]:
def pos_filter(pos):
    """Tell whether a POS tag should be kept as a keyword candidate.

    Args:
        pos: POS tag string, e.g. 'NN', 'VBG' or 'IN'.

    Returns:
        True when the tag starts with 'NN', 'VB' or 'PR' (presumably the
        noun, verb and pronoun families of the tag set), otherwise False.
    """
    kept_prefixes = ('NN', 'VB', 'PR')
    return pos.startswith(kept_prefixes)

In [15]:
# Show the first entry of `data` (here: its list of tokenizer pieces).
list(data.values())[0]

['Now',
 'Ġoutside',
 ',',
 'SOMEONE',
 'k',
 'icks',
 'Ġover',
 'Ġa',
 'Ġtrash',
 'can',
 '.']

In [16]:
# For every entry, map each piece to the (word, tag) pair NLTK returns for the
# piece with its 'Ġ' marker removed. Empty pieces and a bare marker are skipped.
# Because the inner dict is keyed by piece, a piece repeated within one entry
# is kept only once.
data = {
    key: {
        piece: nltk.pos_tag([piece.replace(big_prefix, "")])[0]
        for piece in pieces
        if piece not in ('', big_prefix)
    }
    for key, pieces in data.items()
}

In [17]:
# Show the first entry of `data` (here: its piece -> (word, tag) mapping).
list(data.values())[0]

{'Now': ('Now', 'RB'),
 'Ġoutside': ('outside', 'IN'),
 ',': (',', ','),
 'SOMEONE': ('SOMEONE', 'NN'),
 'k': ('k', 'NN'),
 'icks': ('icks', 'NNS'),
 'Ġover': ('over', 'IN'),
 'Ġa': ('a', 'DT'),
 'Ġtrash': ('trash', 'NN'),
 'can': ('can', 'MD'),
 '.': ('.', '.')}

In [18]:
# Keep only the pieces whose tag (second element of the pair) passes `pos_filter`.
data = {
    key: {
        piece: tagged
        for piece, tagged in tagged_pieces.items()
        if pos_filter(tagged[1])
    }
    for key, tagged_pieces in data.items()
}

In [26]:
# Show the second entry of `data` after the tag filter.
list(data.values())[1]

{'S': ('S', 'NN'),
 'itting': ('itting', 'VBG'),
 'Ġbench': ('bench', 'NN'),
 'Ġnerd': ('nerd', 'NN'),
 'Ġlooks': ('looks', 'NNS')}

In [47]:
from itertools import chain

# Flatten all per-entry mappings into one stream of (piece, (word, tag)) items.
# The result is a one-shot iterator: it can be consumed only once.
words = chain.from_iterable(tagged.items() for tagged in data.values())

In [48]:
# Materialize (piece, stripped word) pairs, dropping the tag.
# NOTE: re-binds `words`, so re-running this cell alone would re-process
# its own output instead of the flattened items.
words = [(piece, tagged[0]) for piece, tagged in words]

In [49]:
# Spot-check one (piece, stripped word) pair.
words[2]

('icks', 'icks')

In [52]:
from collections import Counter

# Tally the first element (the piece, marker included) of every pair in `words`.
c = Counter(pair[0] for pair in words)

In [53]:
# Display the 1000 most frequent pieces as (piece, count) pairs, highest count first.
# NOTE(review): this prints a very long list; a smaller n would be enough for inspection.
c.most_common(1000)

[('SOMEONE', 72129),
 ('Ġhis', 17007),
 ('s', 14478),
 ('Ġher', 10590),
 ('Ġhim', 6386),
 ('Ġit', 4733),
 ('Ġis', 3279),
 ('es', 3113),
 ('look', 3066),
 ('Ġdoor', 3009),
 ('is', 2902),
 ('st', 2750),
 ('t', 2647),
 ('Ġhead', 2640),
 ('ly', 2525),
 ('Ġhand', 2512),
 ('Ġeyes', 2471),
 ('They', 2470),
 ('Ġthem', 2278),
 ('Ġman', 2140),
 ('Ġroom', 2102),
 ('Ġface', 2075),
 ('akes', 2032),
 ('w', 1853),
 ('Ġtheir', 1810),
 ('turn', 1801),
 ('its', 1770),
 ('ing', 1764),
 ('Ġcar', 1590),
 ('ks', 1576),
 ('ers', 1573),
 ('Ġare', 1570),
 ('Ġthey', 1563),
 ('ares', 1518),
 ('Ġhands', 1451),
 ('sh', 1427),
 ('g', 1372),
 ('Ġtable', 1350),
 ('p', 1337),
 ('Ġfront', 1304),
 ('Ġlooks', 1294),
 ('Ġwindow', 1286),
 ('wal', 1242),
 ('stand', 1211),
 ('Ġside', 1202),
 ('sm', 1202),
 ('His', 1170),
 ('Ġgaze', 1159),
 ('Ġlook', 1131),
 ('gl', 1121),
 ('atches', 1094),
 ('ches', 1068),
 ('Ġturns', 1056),
 ('Ġwoman', 1046),
 ('Ġbed', 1023),
 ('le', 1017),
 ('l', 1012),
 ('iles', 1010),
 ('ed', 1000),
 ('Ġ

In [54]:
# piece -> count for the 1000 most frequent pieces, in descending-count order.
keywords = {piece: count for piece, count in c.most_common(1000)}
keywords

{'SOMEONE': 72129,
 'Ġhis': 17007,
 's': 14478,
 'Ġher': 10590,
 'Ġhim': 6386,
 'Ġit': 4733,
 'Ġis': 3279,
 'es': 3113,
 'look': 3066,
 'Ġdoor': 3009,
 'is': 2902,
 'st': 2750,
 't': 2647,
 'Ġhead': 2640,
 'ly': 2525,
 'Ġhand': 2512,
 'Ġeyes': 2471,
 'They': 2470,
 'Ġthem': 2278,
 'Ġman': 2140,
 'Ġroom': 2102,
 'Ġface': 2075,
 'akes': 2032,
 'w': 1853,
 'Ġtheir': 1810,
 'turn': 1801,
 'its': 1770,
 'ing': 1764,
 'Ġcar': 1590,
 'ks': 1576,
 'ers': 1573,
 'Ġare': 1570,
 'Ġthey': 1563,
 'ares': 1518,
 'Ġhands': 1451,
 'sh': 1427,
 'g': 1372,
 'Ġtable': 1350,
 'p': 1337,
 'Ġfront': 1304,
 'Ġlooks': 1294,
 'Ġwindow': 1286,
 'wal': 1242,
 'stand': 1211,
 'Ġside': 1202,
 'sm': 1202,
 'His': 1170,
 'Ġgaze': 1159,
 'Ġlook': 1131,
 'gl': 1121,
 'atches': 1094,
 'ches': 1068,
 'Ġturns': 1056,
 'Ġwoman': 1046,
 'Ġbed': 1023,
 'le': 1017,
 'l': 1012,
 'iles': 1010,
 'ed': 1000,
 'Ġfloor': 999,
 'Ġsmile': 997,
 'Ġwater': 982,
 'ances': 960,
 'It': 948,
 'Ġits': 947,
 'Ġtowards': 930,
 'gr': 929,
 'r

In [55]:
import json

# Persist the piece -> count mapping held in `keywords` as JSON.
out_path = Path('../../data/LSMDC/task1/keywords/keywords_gpt_pos_top_1000.json')
# Make sure the output folder exists; otherwise opening the file for writing
# raises FileNotFoundError on a fresh checkout.
out_path.parent.mkdir(parents=True, exist_ok=True)
with out_path.open('w') as f:
    json.dump(keywords, f)

