Calculate tf*idf for each pair (word, article) from the Wikipedia dump. Apply the stop words filter to speed up calculations. Term frequency (tf) is a function depending on a term (word) and a document (article):

tf(term, doc_id) = Nt/N,

where Nt - quantity of particular term in the document, N - the total number of terms in the document (without stop words)

Inverse document frequency (idf) is a function depends on a term:

idf(term) = 1/log(1 + Dt),

where Dt - number of documents in the dataset with the particular term.

You can find more information here: https://en.wikipedia.xn--org/wiki/Tfidf-q82h but use just the formulas mentioned above.

Output: tf*idf for term=’labor’ and article_id=12


In [1]:
%%writefile mapper.py

import sys
import re

reload(sys)
sys.setdefaultencoding('utf-8') # required to convert to unicode


def read_vocabulary(file_path):
    with open(file_path, 'r') as infile:
        return set(infile.read().splitlines())

stop_words = read_vocabulary("stop_words.txt")


for line in sys.stdin:
    try:
        article_id, text = unicode(line.strip()).split('\t', 1)
    except ValueError as e:
        continue

    text = re.sub("^\W+|\W+$", "", text, flags=re.UNICODE)
    words = re.split("\W*\s+\W*", text, flags=re.UNICODE)

    for word in words:
        if word.lower() in stop_words:
            continue
        print >> sys.stderr, "reporter:counter:Wiki stats,Total words,%d" % 1
        print '{article_id}\t{word}\t{count}'.format(article_id=article_id, word=word.lower(), count=1)
        print '0\t{article_id}\t{count}'.format(article_id=article_id, count=1)


Overwriting mapper.py


In [2]:
%%writefile reducer.py

import sys

current_key = None
word_sum = 0

for line in sys.stdin:
    try:
        article_id, word, count = line.strip().split('\t', 2)
        key = article_id, word
        count = int(count)
    except ValueError as e:
        continue
    if current_key != key:
        if current_key:
            print '{key[0]}\t{key[1]}\t{count}'.format(key=current_key, count=word_sum)
        word_sum = 0
        current_key = key
    word_sum += count

if current_key:
    print '{key[0]}\t{key[1]}\t{count}'.format(key=current_key, count=word_sum)


Overwriting reducer.py


In [3]:
%%writefile reducer_idf.py

import sys
import os
from math import log

current_key = None
word_sum = 0
total = float(os.environ.get('TOTAL_ARTICLES'))

for line in sys.stdin:
    try:
        word, _ = line.strip().split('\t', 1)
        key = word
    except ValueError as e:
        continue
    print 'tf\t{}'.format(line.strip())
    if current_key != key:
        if current_key:
            print 'idf\t{key}\t{count:.12f}'.format(key=current_key, count=log(total / word_sum))
        word_sum = 0
        current_key = key
    word_sum += 1

if current_key:
    print 'idf\t{key}\t{count:.12f}'.format(key=current_key, count=log(total / word_sum))


Overwriting reducer_idf.py


In [4]:
%%writefile mapper_tf.py

import sys
import os
import re
import subprocess

reload(sys)
sys.setdefaultencoding('utf-8') # required to convert to unicode

articles = {}
with open ('articles.txt', 'r') as infile:
    for row in infile:
        key, value = row.strip().split()
        articles[key] = float(value)

for line in sys.stdin:
    try:
        article_id, word, count = line.strip().split('\t', 2)
        count = int(count)
    except ValueError as e:
        continue
    if article_id == '0':
        continue
    print '{word}\t{article_id}\t{count:.12f}'.format(
        article_id=article_id, word=word, count=count / articles[article_id])


Overwriting mapper_tf.py


In [5]:
%%bash

NUM_REDUCERS=8

IN_DIR="/data/wiki/en_articles_part"
COUNT_DIR="tf_idf_count" # $(date +"%s%6N")
ARTICLES='articles_count.txt' # $(date +"%s%6N")
TF_IDF_DIR="tf_idf_out" # $(date +"%s%6N")

hdfs dfs -rm -r -f -skipTrash ${COUNT_DIR} > /dev/null
hdfs dfs -rm -r -f -skipTrash ${TF_IDF_DIR} > /dev/null

yarn jar /opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/hadoop-streaming.jar \
    -D mapreduce.job.name="Prepare word counts" \
    -D mapreduce.job.reduces=${NUM_REDUCERS} \
    -D stream.num.map.output.key.fields=2 \
    -files mapper.py,reducer.py,/datasets/stop_words_en.txt#stop_words.txt \
    -mapper "python mapper.py" \
    -combiner "python reducer.py" \
    -reducer "python reducer.py" \
    -input ${IN_DIR} \
    -output ${COUNT_DIR} > /dev/null

# calc idf * tf for each word
hdfs dfs -text ${COUNT_DIR}/part* | egrep "^0\s" | cut -d$'\t' -f2,3 > ${ARTICLES}
total_articles=`wc -l ${ARTICLES} | cut -d' ' -f1`

yarn jar /opt/cloudera/parcels/CDH/lib/hadoop-mapreduce/hadoop-streaming.jar \
    -D mapreduce.job.name="Calc tf idf" \
    -D mapreduce.job.reduces=${NUM_REDUCERS} \
    -files mapper_tf.py,reducer_idf.py,${ARTICLES}#articles.txt \
    -cmdenv TOTAL_ARTICLES=$total_articles \
    -mapper "python mapper_tf.py" \
    -reducer "python reducer_idf.py" \
    -input ${COUNT_DIR} \
    -output ${TF_IDF_DIR} > /dev/null

# print answ
echo `hdfs dfs -text ${TF_IDF_DIR}/part* | grep '^tf\slabor\s12\s'` \
    `hdfs dfs -text ${TF_IDF_DIR}/part* | egrep '^idf\slabor\s'` \
    | awk '{printf ("%f", $4 * $7) }'


0.005326

17/10/30 15:01:43 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032
17/10/30 15:01:43 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032
17/10/30 15:01:43 INFO mapred.FileInputFormat: Total input files to process : 1
17/10/30 15:01:43 INFO mapreduce.JobSubmitter: number of splits:2
17/10/30 15:01:43 INFO mapreduce.JobSubmitter: Submitting tokens for job: job_1509357171066_0043
17/10/30 15:01:44 INFO impl.YarnClientImpl: Submitted application application_1509357171066_0043
17/10/30 15:01:44 INFO mapreduce.Job: The url to track the job: http://7e19e138f0cc:8088/proxy/application_1509357171066_0043/
17/10/30 15:01:44 INFO mapreduce.Job: Running job: job_1509357171066_0043
17/10/30 15:01:50 INFO mapreduce.Job: Job job_1509357171066_0043 running in uber mode : false
17/10/30 15:01:50 INFO mapreduce.Job:  map 0% reduce 0%
17/10/30 15:02:08 INFO mapreduce.Job:  map 21% reduce 0%
17/10/30 15:02:14 INFO mapreduce.Job:  map 31% reduce 0%
17/10/30 15:02:20 INFO 