In [1]:
import re
from random import uniform, choice
from collections import defaultdict
import numpy as np
from fuzzywuzzy import process
import pickle
import argparse

In [2]:
# symbols to keep - regex
# r_alphabet extracts candidate tokens from a line: either a run of ASCII
# letters / digits / hyphens, or a run of punctuation marks.
r_alphabet = re.compile(r'[a-zA-Z0-9-]+|[.,:;?!]+')
# Raw string literals are used so that \w and \d reach the regex engine as-is
# instead of being (invalid) Python string escapes.
# r_filter is applied with re.match in gen_tokens: it rejects tokens that begin
# with optional word characters and an optional hyphen followed by digits.
r_filter = r'(\w+)?[-]?\d+[-,.]?(\w+)?[\d+]?'
# one_more_filter (also used with re.match) rejects tokens starting with hyphens.
one_more_filter = r'-+\w+'

# Tokens of three characters or fewer are dropped by gen_tokens unless they
# are listed here.
# NOTE(review): '!' and '?' are not listed, so gen_tokens discards them even
# though gen_trigrams treats them as sentence ends - confirm this is intended.
exceptions = [
    'as', 'at', 'but', 'by', 'for', 'in', 'of', 'off', 'on', 'out', 'per',
    'to', 'up', 'via', 'a', 'the', 'and', '.', ',', ':', ';', 'pi', 'new',
    'bar', 'sum', 'sea', 'low', 'gas',
]

# generator - all lines to lower register
def gen_lines(corpus):
    """Yield every line of a text file, lower-cased.

    Parameters
    ----------
    corpus : str
        Path of the file to read; it is decoded as UTF-8.

    Yields
    ------
    str
        Each line of the file converted to lower case (line endings are
        kept as read).

    The file is opened lazily on the first iteration and is closed as soon
    as the generator is exhausted or closed, so no handle is leaked.
    """
    with open(corpus, encoding='utf-8') as data:
        for line in data:
            yield line.lower()

# generator - tokens
def gen_tokens(lines):
    """Yield the tokens of ``lines`` that survive the notebook's filters.

    Each line is split with ``r_alphabet``; a candidate token is then
    discarded when

    * ``r_filter`` matches at its start, or
    * ``one_more_filter`` matches at its start, or
    * it is three characters long or shorter and is not in ``exceptions``.

    Everything else is yielded in the order it appears.
    """
    for text in lines:
        for candidate in r_alphabet.findall(text):
            rejected = (
                re.match(r_filter, candidate)
                or re.match(one_more_filter, candidate)
            )
            if rejected:
                continue
            # Keep long tokens, plus the short ones that are white-listed.
            if len(candidate) > 3 or candidate in exceptions:
                yield candidate

# generator - trigrams (three words in a row)         
def gen_trigrams(tokens):
    """Yield overlapping 3-tuples of consecutive tokens.

    '$' is used as the boundary marker: the stream starts with two of them,
    and after a sentence-ending token two closing trigrams padded with '$'
    are emitted before the window is reset.

    A token counts as sentence-ending when it is a substring of '.!?'
    (a substring test, so e.g. '.!' qualifies as well as '.', '!' and '?').
    """
    boundary = '$'
    older = newer = boundary
    for word in tokens:
        yield older, newer, word
        if word not in '.!?':
            # Ordinary token: slide the two-token window forward.
            older, newer = newer, word
            continue
        # Sentence end: close the sentence, then start from a clean window.
        yield newer, word, boundary
        yield word, boundary, boundary
        older = newer = boundary

In [3]:
def train(corpus):
    """Build a trigram transition table from the text file ``corpus``.

    The file is streamed through gen_lines -> gen_tokens -> gen_trigrams and
    the occurrences of every token pair and token triple are counted.

    Returns
    -------
    dict
        Maps each pair ``(t0, t1)`` to a list of ``(t2, p)`` tuples, where
        ``p`` is count(t0, t1, t2) / count(t0, t1). A plain dict is
        returned, so looking up an unseen pair raises KeyError.
    """
    pair_counts = defaultdict(float)
    triple_counts = defaultdict(float)

    for first, second, third in gen_trigrams(gen_tokens(gen_lines(corpus))):
        pair_counts[first, second] += 1
        triple_counts[first, second, third] += 1

    model = {}
    for (first, second, third), count in triple_counts.items():
        share = count / pair_counts[first, second]
        model.setdefault((first, second), []).append((third, share))
    return model

In [4]:
# Build the trigram transition table from the titles corpus; generate_title
# reads this module-level `model`.
model = train('paper_titles_new.csv')

In [5]:
# Read the corpus again, this time keeping every lower-cased line in memory,
# and collect the distinct filtered tokens; run() fuzzy-matches unknown
# keywords against tokens_set.
lines = list(gen_lines('paper_titles_new.csv'))
tokens_set = set(gen_tokens(lines))

In [6]:
# Number of lines read from the corpus file.
len(lines)

1771038

In [7]:
# Persist the trained transition table to the file 'model' with pickle.
with open('model', 'wb') as f:
    pickle.dump(model, f)

In [8]:
# Persist the set of distinct tokens to the file 'tokens_set' with pickle.
with open('tokens_set', 'wb') as f:
    pickle.dump(tokens_set, f)

In [9]:
def unirand(seq):
    """Return one item of ``seq`` chosen uniformly at random.

    Parameters
    ----------
    seq : iterable of (item, freq) pairs
        The ``freq`` component is ignored: every item is equally likely,
        whatever weight is stored next to it.

    Raises
    ------
    IndexError
        Propagated from ``random.choice`` when ``seq`` is empty.
    """
    return choice([candidate for candidate, _weight in seq])

In [10]:
def generate_title(first_word, exception = '', query = ''):
    """Generate a title by walking the module-level trigram ``model``.

    Parameters
    ----------
    first_word : str
        Word the chain starts from (lower-cased before use); the initial
        state is ``('$', first_word)``.
    exception : str, optional
        When non-empty, the text starts with ``query + ' ' + exception``
        and ``first_word`` only seeds the chain without appearing in it.
    query : str, optional
        Text placed before ``exception``; only used when ``exception`` is
        non-empty.

    Returns
    -------
    str
        The generated text passed through ``str.capitalize``. Words are
        appended until the text is at least 100 characters long or the
        boundary marker '$' is drawn.

    Raises
    ------
    KeyError
        When the current token pair is not a key of ``model``.
    """
    first_word = first_word.lower()
    previous, current = '$', first_word

    if len(exception) == 0:
        phrase = '' + first_word
    else:
        phrase = '' + query + ' ' + exception

    while len(phrase) < 100:
        # The right-hand side is evaluated with the old pair before rebinding.
        previous, current = current, unirand(model[previous, current])
        if current == '$':
            break
        # Punctuation (substring test) and sentence starts attach without a space.
        attach_directly = current in ('.!?,;:') or previous == '$'
        phrase += current if attach_directly else ' ' + current
    return phrase.capitalize()

In [14]:
def run(keyword):
    """Print a generated title for ``keyword``.

    If generation raises KeyError, a notice is printed, ``keyword`` is
    fuzzy-matched against ``tokens_set`` with ``process.extract``, one of the
    returned candidates is picked at random, and a title is generated from
    a random article ('a' or 'the') with the keyword and the matched token
    placed at the front. Only element ``[0]`` of the picked candidate is
    used - presumably the matched token; verify against fuzzywuzzy.
    """
    try:
        title = generate_title(keyword)
    except KeyError:
        print('Hmmmm... let me think...')
        candidates = process.extract(keyword.lower(), tokens_set)
        picked = choice(candidates)
        article = choice(['a', 'the'])
        title = generate_title(article, picked[0], keyword)
    print(title)

In [15]:
# Demo call; the recorded output below shows the fuzzy-match fallback
# message being printed before the title.
run('apple')

Hmmmm... let me think...
Apple dapple substantial energy output of population gradients of nitrogen dioxide measurement in same-sign
