Calculate tf*idf for each pair (word, article) from the Wikipedia dump. Apply the stop words filter to speed up calculations. Term frequency (tf) is a function depending on a term (word) and a document (article):

tf(term, doc_id) = Nt/N,

where Nt is the number of occurrences of the particular term in the document, and N is the total number of terms in the document (stop words excluded).

Inverse document frequency (idf) is a function that depends only on a term:

idf(term) = 1/log(1 + Dt),

where Dt - number of documents in the dataset with the particular term.

You can find more information here: https://en.wikipedia.org/wiki/Tf%E2%80%93idf, but use only the formulas given above.

Output: tf*idf for term=’labor’ and article_id=12


In [1]:
%%writefile mapper.py

import sys
import re

# Make UTF-8 the interpreter-wide default encoding so that the unicode(...)
# conversion of each input line further down can handle non-ASCII article
# text. The reload(sys) call has to come first, immediately before
# setdefaultencoding is used.
reload(sys)
sys.setdefaultencoding('utf-8') # required to convert to unicode


def read_vocabulary(file_path):
    """Load a one-entry-per-line vocabulary file into a set.

    file_path -- path of the text file to read.

    Returns a set holding every distinct line of the file, with the line
    terminators removed. Lines are otherwise kept as-is (no whitespace
    stripping, no case folding).
    """
    with open(file_path, 'r') as vocab_file:
        contents = vocab_file.read()
    entries = set()
    entries.update(contents.splitlines())
    return entries

stop_words = read_vocabulary("stop_words.txt")


for line in sys.stdin:
    try:
        article_id, text = unicode(line.strip()).split('\t', 1)
    except ValueError as e:
        continue

    text = re.sub("^\W+|\W+$", "", text, flags=re.UNICODE)
    words = re.split("\W*\s+\W*", text, flags=re.UNICODE)

    for word in words:
        if word.lower() in stop_words:
            continue
        print >> sys.stderr, "reporter:counter:Wiki stats,Total words,%d" % 1
        print '{article_id}\t{word}\t{count}'.format(article_id=article_id, word=word.lower(), count=1)
        print 'all\t{article_id}\t{count}'.format(article_id=article_id, count=1)


Overwriting mapper.py


In [2]:
%%writefile reducer.py

import sys

current_key = None
word_sum = 0

for line in sys.stdin:
    try:
        article_id, word, count = line.strip().split('\t', 2)
        key = article_id, word
        count = int(count)
    except ValueError as e:
        continue
    if current_key != key:
        if current_key:
            print '{key[0]}\t{key[1]}\t{count}'.format(key=current_key, count=word_sum)
        word_sum = 0
        current_key = key
    word_sum += count

if current_key:
    print '{key[0]}\t{key[1]}\t{count}'.format(key=current_key, count=word_sum)


Overwriting reducer.py


In [43]:
%%writefile reducer_idf.py

import sys
from math import log

current_key = None
word_sum = 0

for line in sys.stdin:
    try:
        article_id, word, _ = line.strip().split('\t', 2)
        key = word
    except ValueError as e:
        continue
    if current_key != key:
        if current_key:
            print '{key}\t{count}'.format(key=current_key, count=1. / log(1. + word_sum))
        word_sum = 0
        current_key = key
    word_sum += 1

if current_key:
    print '{key}\t{count}'.format(key=current_key, count=1. / log(1. + word_sum))


Overwriting reducer_idf.py


In [66]:
%%writefile mapper_tf.py

import sys
import os
import re
import subprocess

# Make UTF-8 the interpreter-wide default encoding for implicit
# str <-> unicode conversions in this script. The reload(sys) call has to
# come first, immediately before setdefaultencoding is used.
reload(sys)
sys.setdefaultencoding('utf-8') # required to convert to unicode

# Cache of article_id -> total term count (kept as the raw string read from
# HDFS). It is emptied once it holds more than 100 entries.
articles = {}


def get_article_count(article_id):
    """Return the total number of counted terms of an article as a float.

    The value is looked up in the "all\t<article_id>\t<count>" records stored
    under the HDFS directory named by the HDFS_COUNT_DIR environment variable
    and is cached in the module-level ``articles`` dict.

    article_id -- id of the article, as it appears in the count records.

    Raises subprocess.CalledProcessError when the shell pipeline exits with a
    non-zero status (egrep does so when no record matches) and ValueError when
    the extracted field is not a number.
    """
    if article_id not in articles:
        # The trailing \s anchors the id: without it the pattern for article
        # "12" would also match "123", "1234", ... and the count of whichever
        # matching line came last would be used.
        cmd = r'hdfs dfs -text {dirname}/* | egrep "^all\s{article_id}\s"'.format(
            dirname=os.environ.get('HDFS_COUNT_DIR'), article_id=article_id)
        res = subprocess.check_output(cmd, shell=True).strip().split('\t')[-1]
        if len(articles) > 100:
            # Clearing in place keeps the cache bounded without rebinding the
            # module-level name, so no ``global`` statement is needed.
            articles.clear()
        articles[article_id] = res
    return float(articles[article_id])

for line in sys.stdin:
    try:
        article_id, word, count = line.strip().split('\t', 2)
        count = int(count)
    except ValueError as e:
        continue
    if article_id == 'all':
        continue
    tf = count / get_article_count(article_id)
    print '{article_id}\t{word}\t{count}'.format(article_id=article_id, word=word, count=tf)


Overwriting mapper_tf.py


In [None]:
%%bash

# Driver for the tf*idf pipeline. Three Hadoop streaming jobs are defined:
#   1. word counts per (article, word) plus per-article totals -> COUNT_DIR
#   2. idf per word, computed from COUNT_DIR                    -> IDF_DIR
#   3. tf per (article, word), computed from COUNT_DIR          -> TF_DIR
# Jobs 1 and 2 are currently commented out, so only job 3 runs and it relies
# on COUNT_DIR already existing from an earlier run.

# Only referenced by the (commented-out) word count job.
NUM_REDUCERS=8

IN_DIR="/data/wiki/en_articles_part"
COUNT_DIR="tf_idf_count"  # $(date +"%s%6N")
IDF_DIR="idf_out"  # $(date +"%s%6N")
TF_DIR="tf_out"  # $(date +"%s%6N")

# Remove previous outputs. COUNT_DIR is deliberately kept (its removal is
# commented out) because the tf job below reads it as input.
# NOTE(review): IDF_DIR is removed here although the job that recreates it is
# commented out - confirm this is intended.
# hdfs dfs -rm -r -f -skipTrash ${COUNT_DIR} > /dev/null
hdfs dfs -rm -r -f -skipTrash ${IDF_DIR} > /dev/null
hdfs dfs -rm -r -f -skipTrash ${TF_DIR} > /dev/null

# Job 1 (disabled): count words per article with mapper.py / reducer.py,
# using reducer.py as the combiner as well.
# # yarn jar /opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/hadoop-streaming.jar \
#     -D mapreduce.job.name="Prepare word counts" \
#     -D mapreduce.job.reduces=${NUM_REDUCERS} \
#     -D mapreduce.job.output.key.comparator.class=org.apache.hadoop.mapreduce.lib.partition.KeyFieldBasedComparator \
#     -D mapreduce.map.output.key.field.separator=$'\t' \
#     -D stream.num.map.output.key.fields=2 \
#     -D mapreduce.partition.keycomparator.options='-k2 -k1' \
#     -files mapper.py,reducer.py,/datasets/stop_words_en.txt#stop_words.txt \
#     -mapper "python mapper.py" \
#     -combiner "python reducer.py" \
#     -reducer "python reducer.py" \
#     -input ${IN_DIR} \
#     -output ${COUNT_DIR} > /dev/null

# Job 2 (disabled): the mapper drops the "all" total records, a single
# reducer running reducer_idf.py turns the remaining records into idf values.
# # calc idf for each word
# yarn jar /opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/hadoop-streaming.jar \
#     -D mapreduce.job.name="Calc idf words" \
#     -D mapreduce.job.reduces=1 \
#     -D mapreduce.job.output.key.comparator.class=org.apache.hadoop.mapreduce.lib.partition.KeyFieldBasedComparator \
#     -D mapreduce.map.output.key.field.separator=$'\t' \
#     -D stream.num.map.output.key.fields=2 \
#     -D mapreduce.partition.keycomparator.options='-k2' \
#     -files reducer_idf.py \
#     -mapper "egrep '^all' --invert" \
#     -reducer "python reducer_idf.py" \
#     -input ${COUNT_DIR} \
#     -output ${IDF_DIR} > /dev/null

# Job 3: map-only job (zero reducers). mapper_tf.py receives the location of
# the count records through the HDFS_COUNT_DIR environment variable; the
# /user/jovyan/ prefix is hard-coded here.
# calc tf for each word
yarn jar /opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/hadoop-streaming.jar \
    -D mapreduce.job.name="Calc tf words" \
    -D mapreduce.job.reduces=0 \
    -files mapper_tf.py \
    -cmdenv HDFS_COUNT_DIR='/user/jovyan/'${COUNT_DIR} \
    -mapper "python mapper_tf.py" \
    -input ${COUNT_DIR} \
    -output ${TF_DIR} > /dev/null

# print the answer
