In [1]:
import operator
import math
import re

from pyspark import SparkConf, SparkContext

In [2]:
# getOrCreate returns the already-running SparkContext when this cell is
# executed again in the same kernel, instead of failing on a second
# SparkContext(...) construction. The configuration below is only applied
# when a new context is actually created.
sc = SparkContext.getOrCreate(
    conf=SparkConf()
    .setAppName("Spark assignment 2: Collocations")
    .setMaster("local")
)

In [3]:
def parse_article(line):
    """Split one "<article_id><TAB><text>" line into lowercase words.

    Leading and trailing non-word characters are stripped from the text, and
    the remainder is split on whitespace together with any punctuation
    adjacent to that whitespace.

    Args:
        line: one input line, as text or as UTF-8 encoded bytes.

    Returns:
        A list of lowercase words. The list is empty when the line has no
        TAB separator or when the text contains no word characters.
    """
    # Byte strings are decoded explicitly as UTF-8. Relying on the implicit
    # ASCII decoding would raise UnicodeDecodeError (a ValueError subclass)
    # on non-ASCII input and silently drop the whole article.
    if isinstance(line, bytes):
        line = line.decode('utf-8', 'replace')

    _article_id, separator, text = line.rstrip().partition('\t')
    if not separator:
        # Malformed line: there is no text column.
        return []

    text = re.sub(r"^\W+|\W+$", "", text, flags=re.UNICODE)
    words = re.split(r"\W*\s+\W*", text, flags=re.UNICODE)
    # Splitting an empty string yields [''], which is not a word.
    return [word.lower() for word in words if word]

def process_article(words):
    """Yield ((previous_word, word), 1) for every pair of adjacent words.

    Args:
        words: list of words in article order.

    Yields:
        ((first, second), 1) for each consecutive pair. Nothing is yielded
        for an empty or single-word list, so an article that parsed to []
        no longer raises IndexError.
    """
    # Zipping the list with itself shifted by one pairs each word with its
    # successor and produces nothing for lists shorter than two items.
    for pair in zip(words, words[1:]):
        yield pair, 1

In [4]:
# Load the article dump as an RDD of lines, with a suggested minimum of
# 16 partitions.
articles = sc.textFile(
    "/data/wiki/en_articles_part/articles-part",
    minPartitions=16,
)

In [5]:
# Occurrence count of every word in the corpus, as (word, count) pairs.
# The RDD is cached because it is consumed by two separate actions: the sum
# just below and the collectAsMap() that builds words_map. Without caching
# the corpus would be parsed and reduced twice.
words_count = (
    articles
    .flatMap(parse_article)
    .map(lambda word: (word, 1))
    .reduceByKey(operator.add)
    .cache()
)
# Total number of word occurrences, kept as a float so that later
# count / total_words divisions are true divisions on Python 2.
total_words = float(words_count.values().sum())

In [6]:
# Read the stop-word list (one entry per line) into a set, normalised to
# lowercase with surrounding whitespace removed.
with open('/datasets/stop_words_en.txt', 'r') as infile:
    stop_words = {entry.strip().lower() for entry in infile}

In [7]:
# Bring the counts of all non-stop words to the driver as a dict
# (word -> count) so they can be looked up when scoring bigrams.
words_map = (
    words_count
    .filter(lambda word_and_count: word_and_count[0] not in stop_words)
    .collectAsMap()
)

In [8]:
word_pairs = (
    articles
    .map(parse_article)
    .flatMap(process_article)
    # drop any bigram in which at least one of the two words is a stop word
    .filter(
        lambda pair_count: not (
            pair_count[0][0] in stop_words or pair_count[0][1] in stop_words
        )
    )
    .reduceByKey(lambda left, right: left + right)
    # keep only bigrams that occur more than 500 times
    .filter(lambda pair_count: pair_count[1] > 500)
)

In [9]:
def calc_PMI(row, word_counts=None, total=None):
    """Compute the normalised pointwise mutual information of a bigram.

    PMI(a, b)  = ln(P(ab) / (P(a) * P(b)))
    NPMI(a, b) = PMI(a, b) / -ln(P(ab))

    Every probability is the corresponding count divided by ``total``.

    Args:
        row: ((word_a, word_b), count), where count is the number of
            occurrences of the bigram.
        word_counts: mapping word -> occurrence count. Defaults to the
            module-level ``words_map``.
        total: denominator used for every probability. Defaults to the
            module-level ``total_words``.

    Returns:
        ((word_a, word_b), npmi)

    Raises:
        KeyError: if either word is missing from ``word_counts``.
    """
    # Fall back to the notebook globals so that rdd.map(calc_PMI) keeps working.
    if word_counts is None:
        word_counts = words_map
    if total is None:
        total = total_words
    # Force true division on Python 2 even if an integer total is passed.
    total = float(total)

    (word_a, word_b), count = row
    p_ab = count / total
    p_a = word_counts[word_a] / total
    p_b = word_counts[word_b] / total

    pmi = math.log(p_ab / (p_a * p_b))
    normaliser = -math.log(p_ab)
    if normaliser == 0:
        # P(ab) == 1: the ratio is 0/0. Return 1.0, the NPMI value for
        # complete co-occurrence, instead of raising ZeroDivisionError.
        return (word_a, word_b), 1.0
    return (word_a, word_b), pmi / normaliser
    

In [10]:
# Score every frequent bigram and order the result by NPMI, highest first.
pmis_word_pairs = (
    word_pairs
    .map(calc_PMI)
    .sortBy(lambda scored: scored[1], ascending=False)
)

In [11]:
# Print the 39 highest-NPMI bigrams as "first_second".
# The format string is unicode: formatting unicode words into a byte-string
# template raises UnicodeEncodeError on Python 2 for any non-ASCII word.
# print(...) with a single argument behaves the same on Python 2 and 3.
for row in pmis_word_pairs.take(39):
    print(u'{key[0]}_{key[1]}'.format(key=row[0]))

los_angeles
external_links
united_states
prime_minister
san_francisco
et_al
new_york
supreme_court
19th_century
20th_century
references_external
soviet_union
air_force
baseball_player
university_press
united_kingdom
roman_catholic
north_america
new_zealand
notes_references
civil_war
world_war
catholic_church
war_ii
south_africa
roman_empire
united_nations
took_place
american_singer-songwriter
high_school
american_actor
american_actress
american_baseball
york_city
american_football
years_later
north_american
