# A very bad article spinner using trigrams.

In [1]:
# Very basic article spinner for NLP class, which can be found at:
# https://deeplearningcourses.com/c/data-science-natural-language-processing-in-python
# https://www.udemy.com/data-science-natural-language-processing-in-python

# Author: http://lazyprogrammer.me

import nltk
import random
import numpy as np

from bs4 import BeautifulSoup

# load the reviews
# data courtesy of http://www.cs.jhu.edu/~mdredze/datasets/sentiment/index2.html
# Read the file inside a `with` block so the handle is closed as soon as the
# contents are in memory, instead of being left open for the garbage collector.
with open('./tmp/dataset/sorted_data_acl/electronics/positive.review') as review_file:
    # NOTE(review): no parser is named here, so BeautifulSoup falls back to
    # whichever parser is installed - consider pinning one for reproducibility.
    soup = BeautifulSoup(review_file.read())
# keep only the <review_text> elements; later cells read `.text` from each one
positive_reviews = soup.findAll('review_text')

In [2]:
# Collect every trigram in the corpus, indexed by its outer words:
# (first word, third word) -> list of all second words seen between them
# (duplicates are kept so that frequencies can be counted later).
trigrams = {}
for review in positive_reviews:
    tokens = nltk.tokenize.word_tokenize(review.text.lower())
    # zipping the token list against two shifted copies yields each 3-token window
    for left, middle, right in zip(tokens, tokens[1:], tokens[2:]):
        trigrams.setdefault((left, right), []).append(middle)

In [3]:
# Convert each list of observed middle words into a distribution:
# (w1, w3) -> {middle word: relative frequency}
trigram_probabilities = {}
for pair, middles in trigrams.items():
    # a pair with only one distinct middle word offers nothing to swap in - skip it
    if len(set(middles)) <= 1:
        continue
    total = len(middles)
    # tally occurrences of every candidate middle word
    counts = {}
    for word in middles:
        counts[word] = counts.get(word, 0) + 1
    # divide by the number of observations to turn counts into probabilities
    trigram_probabilities[pair] = {
        word: float(count) / total for word, count in counts.items()
    }

In [5]:
# Display the (w1, w3) -> {middle word: probability} mapping built above.
# NOTE(review): this echoes the whole dictionary; the saved output below is
# cut off part-way through the first entry.
trigram_probabilities

{('i', 'this'): {'purchased': 0.12422360248447205,
  'bought': 0.3105590062111801,
  'recomend': 0.006211180124223602,
  'made': 0.012422360248447204,
  'picked': 0.018633540372670808,
  'say': 0.006211180124223602,
  'use': 0.055900621118012424,
  'had': 0.006211180124223602,
  'got': 0.055900621118012424,
  'think': 0.037267080745341616,
  'ordered': 0.012422360248447204,
  'matched': 0.006211180124223602,
  'noticed': 0.006211180124223602,
  'thought': 0.006211180124223602,
  'recommend': 0.024844720496894408,
  'choose': 0.006211180124223602,
  'like': 0.012422360248447204,
  'found': 0.055900621118012424,
  'did': 0.018633540372670808,
  'find': 0.024844720496894408,
  'set': 0.006211180124223602,
  'love': 0.024844720496894408,
  'hold': 0.006211180124223602,
  'received': 0.012422360248447204,
  'have': 0.018633540372670808,
  'buy': 0.006211180124223602,
  'chose': 0.012422360248447204,
  'put': 0.006211180124223602,
  'used': 0.012422360248447204,
  'give': 0.00621118012422360

In [6]:
def random_sample(d):
    """Draw one key from ``d`` at random, weighted by its value.

    Args:
        d: dictionary mapping each candidate to its probability. The values
            are expected to sum to (approximately) 1.

    Returns:
        One key of ``d``, chosen with probability proportional to its value,
        or ``None`` when ``d`` has no entry with a positive probability.
    """
    r = random.random()
    cumulative = 0
    fallback = None
    for w, p in d.items():
        cumulative += p
        if r < cumulative:
            return w
        if p > 0:
            # remember the last word that could legitimately be drawn
            fallback = w
    # Floating-point rounding can leave the running total a hair below 1.0
    # (ten 0.1s add up to 0.9999999999999999), so a draw very close to 1 may
    # never satisfy `r < cumulative`. Return the last eligible word in that
    # case rather than silently falling off the end with None.
    return fallback

In [8]:
def test_spinner(replace_prob=0.2):
    """Print a randomly chosen positive review and a "spun" version of it.

    Each interior token (one that has a neighbour on both sides) is
    considered in turn. With probability ``replace_prob`` its (previous word,
    next word) pair is looked up in ``trigram_probabilities``; if present,
    the token is replaced by a middle word sampled from that distribution.
    Replacements are made in place, so a swapped-in word becomes part of the
    context for the positions that follow.

    Args:
        replace_prob: chance of attempting a replacement at each position.
            Defaults to 0.2 (20%).

    Returns:
        None. The original and spun texts are written to stdout.
    """
    review = random.choice(positive_reviews)
    s = review.text.lower()
    print("Original:", s)
    tokens = nltk.tokenize.word_tokenize(s)
    for i in range(len(tokens) - 2):
        if random.random() < replace_prob:
            k = (tokens[i], tokens[i+2])
            if k in trigram_probabilities:
                w = random_sample(trigram_probabilities[k])
                # random_sample may return None when no word is selected;
                # keep the original token then, so the join below cannot fail
                if w is not None:
                    tokens[i+1] = w
    print("Spun:")
    # re-attach punctuation that the tokenizer split off into separate tokens
    spun = " ".join(tokens)
    for spaced, tight in ((" .", "."), (" '", "'"), (" ,", ","), ("$ ", "$"), (" !", "!")):
        spun = spun.replace(spaced, tight)
    print(spun)


# Run a single demo spin when this code is executed directly rather than imported.
if __name__ == '__main__':
    test_spinner()

Original: 
excellent sound from phones and microphone. comfortable on head and ears. plenty of cable. on cable controls are handy.

Spun:
excellent sound from phones and portability. comfortable on bluetooth and wpc55ag. control of cable. on cable controls are handy.
