## Pointwise mutual information for a random sample of N tokens in Shakespeare's works. The result is PMI data for the tokens that co-occur with the sample tokens, restricted to pairs whose number of co-occurrences is at least a given threshold.

In [1]:
#from thresholdPMI import PMI
from simple_tokenize import simple_tokenize
from math import log
from itertools import permutations
from pyspark import SparkContext, SparkConf
# Shared Spark context used by every cell below; "local[2]" runs Spark
# in-process with two worker threads.
sc = SparkContext(appName="MyApp", master="local[2]")

In [2]:
def PMI(threshold):
    """Compute line-level pointwise mutual information for token pairs.

    Reads 'Shakespeare.txt' through the module-level SparkContext ``sc`` and
    tokenizes every line with ``simple_tokenize``. A token is counted at most
    once per line, so all counts are "number of lines containing ...".

    Args:
        threshold: minimum number of lines on which an ordered pair must
            co-occur for the pair to be kept.

    Returns:
        A list of tuples with the following format:
        ((token1, token2), pmi, co-occurrence_count, token1_count, token2_count)
        where pmi = log10(p(x, y) / (p(x) * p(y))) and each probability is a
        line count divided by the total number of lines. Pairs are ordered,
        so both (a, b) and (b, a) appear in the result.
    """
    # Unique tokens of every line. Cached because the line count, the pair
    # counts and the token counts are all derived from this one RDD.
    lineTokens = sc.textFile('Shakespeare.txt') \
                   .map(lambda line: set(simple_tokenize(line))) \
                   .cache()

    try:
        # One output element per input line, so this equals the line count.
        # Kept as a float so the divisions below are true divisions even
        # where "/" on two ints would floor.
        nlines = float(lineTokens.count())

        # count line co-occurrences for each ordered pair
        # filter for co-occurrences >= threshold
        pairCount = lineTokens.flatMap(lambda tokens: permutations(tokens, 2)) \
                              .map(lambda pair: (pair, 1)) \
                              .reduceByKey(lambda x, y: x + y) \
                              .filter(lambda kv: kv[1] >= threshold)

        # count line occurrences for each token
        tokenCount = lineTokens.flatMap(lambda tokens: tokens) \
                               .map(lambda token: (token, 1)) \
                               .reduceByKey(lambda x, y: x + y)

        # pmi(nxy, nx, ny) takes n(x,y), n(x) and n(y) and produces the PMI
        # pmi: Int Int Int --> Float
        def pmi(nxy, nx, ny):
            pxy = nxy / nlines
            px = nx / nlines
            py = ny / nlines
            return log(pxy / (px * py), 10)

        # ((t1, t2), nxy) --> (t1, (t2, nxy)), ready to join on token 1
        def keyByFirst(record):
            (first, second), nxy = record
            return first, (second, nxy)

        # (t1, ((t2, nxy), nx)) --> (t2, (t1, nxy, nx)), ready to join on token 2
        def keyBySecond(record):
            first, ((second, nxy), nx) = record
            return second, (first, nxy, nx)

        # (t2, ((t1, nxy, nx), ny)) --> ((t1, t2), pmi, nxy, nx, ny)
        def toRow(record):
            second, ((first, nxy, nx), ny) = record
            return ((first, second), pmi(nxy, nx, ny), nxy, nx, ny)

        # join pairCount and tokenCount for each pair
        #  algorithm:
        #   join tokenCount on token 1
        #   join tokenCount on token 2
        #   compute pmi and organize results
        pmiData = pairCount.map(keyByFirst).join(tokenCount) \
                           .map(keyBySecond).join(tokenCount) \
                           .map(toRow)

        return pmiData.collect()
    finally:
        # Release the cached partitions once the results are on the driver.
        lineTokens.unpersist()

In [3]:
def PMI_one_token(threshold, samp_size):
    """Return PMI data for a random sample of distinct tokens.

    Args:
        threshold: minimum co-occurrence count, passed through to PMI().
        samp_size: number of distinct tokens to draw (without replacement)
            from 'Shakespeare.txt'.

    Returns:
        A list of tuples, one per sampled token, with the following format:
        (token, [ list_of_cooccurring_tokens ])
        where list_of_cooccurring_tokens is of the form
        [((token1, token2), pmi, cooc_count, token1_count, token2_count), ...]
        and token1 is always the sampled token. The list is empty when no
        pair starting with the token reaches the threshold.
    """
    # read in text file as RDD
    lines = sc.textFile('Shakespeare.txt')

    # distinct tokens
    distTokens = lines.flatMap(simple_tokenize).distinct()

    # N random tokens
    sample = distTokens.takeSample(False, samp_size)

    # Group the PMI rows by their first token in a single pass. The rows are
    # already on the driver, so a dict lookup per row replaces both the list
    # membership test and the per-token re-scan of every row.
    rowsByToken = {token: [] for token in sample}
    for row in PMI(threshold):
        bucket = rowsByToken.get(row[0][0])
        if bucket is not None:
            bucket.append(row)

    # Report tokens in the order they were sampled.
    return [(token, rowsByToken[token]) for token in sample]

In [5]:
# Draw 10 random tokens and list, for each, the pairs that co-occur on at
# least 3 lines; the sampled tokens come from takeSample inside the function.
PMI_one_token(threshold=3, samp_size=10)

[('wick', []),
 ('bere', []),
 ('handless', []),
 ('adversely', []),
 ('fallible', []),
 ('footstool', []),
 ('gargrave',
  [(('gargrave', 'thomas'), 2.9698878501256165, 4, 7, 75),
   (('gargrave', 'sir'), 1.4251585274109542, 4, 7, 2629)]),
 ('pro', []),
 ('perigouna', []),
 ('command',
  [(('command', 'as'), 0.31196606821341405, 14, 170, 4917),
   (('command', 'when'), 0.2479438316001176, 5, 170, 2035),
   (('command', 'are'), -0.18332770748007804, 3, 170, 3296),
   (('command', 'of'), -0.2492539370809278, 13, 170, 16624),
   (('command', 'let'), 0.24011849408816105, 5, 170, 2072),
   (('command', 'no'), 0.0855834918613885, 6, 170, 3549),
   (('command', 'we'), -0.03525178952679398, 4, 170, 3125),
   (('command', 'have'), 0.05271756210417533, 9, 170, 5742),
   (('command', 'king'), -0.11404682436007972, 3, 170, 2810),
   (('command', 'away'), 0.5441983969410301, 4, 170, 823),
   (('command', 'must'), 0.2952453763688629, 4, 170, 1460),
   (('command', 'by'), -0.004473810424878968, 5, 1