In [1]:
from __future__ import print_function
import pyspark
import glob
import os
import sys
from operator import add
from math import log

## Data inputs - Change input path for the files you want to process
## To run, simply run all cells

In [14]:
input_path = '/mnt/wiktorskit-danielb-ns0000k/home/notebook/group04/clean_data/wiki_test.csv'
multi_path = '/mnt/wiktorskit-danielb-ns0000k/home/notebook/group04/clean_data/enwiki*.csv'
nowiki_path = '/mnt/wiktorskit-danielb-ns0000k/home/notebook/group04/clean_data/nowiki*.csv'

output_path_tf_idf = '/mnt/wiktorskit-danielb-ns0000k/home/notebook/group04/TF-IDF/tf_idf_21'
nowiki_output_path_tf_idf = '/mnt/wiktorskit-danielb-ns0000k/home/notebook/group04/TF-IDF/tf_idf_nowiki_final_111'

## Helper Functions

In [3]:
def getPages(path):
    """
    Reads a text file into a spark RDD if path is valid
    
    :param path: path of text file
    :return: RDD containing the text file
    """
    # Validity check of the path
    for file in glob.glob(path):
        if not os.path.isfile(file) or '/mnt/' not in file:
            print('Not a valid path')
            sys.exit(-1)
    return sc.textFile('file://'+path)

def getTerms(page):
    """
    This function retrieves the terms from any given page
    
    :param page: a page represented as a tab seperated string
    :return: list of terms belonging to page
    """
    len_links = int(page.strip().split('\t')[3])
    term_list = page.strip().split('\t')[4:-len_links:]
    term_list = [x.lower() for x in term_list]
    return term_list

def get_IdTerm_tuple(key, terms):
    """

    """
    num_terms = len(terms)
    if num_terms> 0:
        for term in terms:
            yield (key, term), 1

In [4]:
def procRDD(rdd, cache=False, part=True, hashp=True, npart=12):
    """
    Helper to handle caching/partioning
    
    Function taken from:
    https://stackoverflow.com/questions/31659404/spark-iteration-time-increasing-exponentially-when-using-join
    
    :param rdd: pyspark RDD
    :param cache: boolean (default=True)
    :param part: boolean (default=True)
    :param hashp: boolean (default=True)
    :param npart: number of partitions (default=12 suggested to be 2*(number of cores))
    :return: rdd or rdd.cache()
    """
    rdd = rdd if not part else rdd.repartition(npart)
    rdd = rdd if not hashp else rdd.partitionBy(npart)
    return rdd if not cache else rdd.cache()

## TF-IDF implementation

In [5]:
pyspark.SparkContext.setSystemProperty('spark.executor.memory', '7g')
# pyspark.SparkContext.setSystemProperty('spark.cores.max', '2')
pyspark.SparkContext.setSystemProperty('spark.driver.cores', '2')
pyspark.SparkContext.setSystemProperty('spark.driver.memory', '7g')
sc = pyspark.SparkContext(appName='tf_idf')

In [6]:
"""
We want to find 4 parameters:
n = term frequency in article
N = total terms in article
d = term frequency in corpus
D = total size of corpus (number of articles)

We calculate:
TF * IDF = (n/N) * log (D/d)

Alternative:

TF * IDF = n * log(D/d)

bias towards large documents
"""
npart = 16
pages = getPages(nowiki_path)

id_and_terms = pages \
    .filter(lambda line: len(line.strip()) > 0 and \
           not (line.strip().split('\t')[1].find('File:') == 0) and \
           not (line.strip().split('\t')[1].find('Wikipedia:') == 0)) \
    .map(lambda line: (line.strip().split('\t')[0], \
                       getTerms(line)))

# Create TF
id_and_N = id_and_terms \
    .map(lambda key_value: (key_value[0], len(key_value[1])))
TF = id_and_terms \
    .flatMap(lambda key_value : get_IdTerm_tuple(key_value[0], key_value[1])) \
    .reduceByKey(add) \
    .map(lambda key_value: (key_value[0][0], (key_value[0][1],key_value[1]))) \
    .join(id_and_N, npart) \
    .map(lambda key_value: 
         (key_value[1][0][0], 
          (key_value[0], key_value[1][0][1] / key_value[1][1])))

# Create IDF
D = id_and_terms.count()
IDF = TF \
    .map(lambda key_value: (key_value[0], 1)) \
    .reduceByKey(add) \
    .map(lambda key_value: (key_value[0], log(D / key_value[1])))

# Create TF_IDF
TF_IDF = TF \
    .join(IDF, npart) \
    .map(lambda key_value: 
         (key_value[0], 
          (key_value[1][0][0], key_value[1][0][1] * key_value[1][1])))
TF_IDF.saveAsTextFile(nowiki_output_path_tf_idf)

In [28]:
sc.stop()

In [5]:
pyspark.SparkContext.setSystemProperty('spark.executor.memory', '7g')
# pyspark.SparkContext.setSystemProperty('spark.cores.max', '2')
pyspark.SparkContext.setSystemProperty('spark.driver.cores', '2')
pyspark.SparkContext.setSystemProperty('spark.driver.memory', '7g')
sc = pyspark.SparkContext(appName='tf_idf')

# We do not have enough resources to use caching here

In [6]:
# pyspark.SparkContext.setSystemProperty('spark.cores.max', '2')
"""
We want to find 4 parameters:
n = term frequency in article
N = total terms in article
d = term frequency in corpus
D = total size of corpus (number of articles)

We calculate:
TF * IDF = (n/N) * log (D/d)

Alternative:

TF * IDF = n * log(D/d)

bias towards large documents
"""
npart = 16
pages = getPages(nowiki_path)

id_and_terms = pages \
    .filter(lambda line: len(line.strip()) > 0 and \
           not (line.strip().split('\t')[1].find('File:') == 0) and \
           not (line.strip().split('\t')[1].find('Wikipedia:') == 0)) \
    .map(lambda line: (line.strip().split('\t')[0], \
                       getTerms(line)))
D = id_and_terms.count()
# Create TF
id_and_N = procRDD(id_and_terms \
    .map(lambda key_value: (key_value[0], len(key_value[1]))), 
                   npart=npart)
TF = id_and_terms \
    .flatMap(lambda key_value : get_IdTerm_tuple(key_value[0], key_value[1])) \
    .reduceByKey(add) \
    .map(lambda key_value: (key_value[0][0], (key_value[0][1],key_value[1]))) \
    .join(id_and_N, npart) \
    .map(lambda key_value: 
         (key_value[1][0][0], 
          (key_value[0], key_value[1][0][1] / key_value[1][1])))

# Create IDF

IDF = procRDD(TF \
    .map(lambda key_value: (key_value[0], 1)) \
    .reduceByKey(add) \
    .map(lambda key_value: (key_value[0], log(D / key_value[1]))),
             npart=npart)

# Create TF_IDF
TF_IDF = TF \
    .join(IDF, npart) \
    .map(lambda key_value: 
         (key_value[0], 
          (key_value[1][0][0], key_value[1][0][1] * key_value[1][1])))
TF_IDF.saveAsTextFile(nowiki_output_path_tf_idf)

### TF-IDF is the final output. It looks like this: (term, (article_id, tf-idf.score)), the key term is repeated for different article-ids
### output to file might be term -> [(id_i,score_i),....,(id_n,score_n)

In [16]:
def getTFIDF(line):
    tf_idf = line.split(',')
    if len(tf_idf) == 3:
        return tf_idf[0][2:-1], (tf_idf[1].strip()[2:-1], float(tf_idf[2][2:-1].strip()[:-2]))
    else:
        return ','.join(tf_idf[:-2])[2:-1], (tf_idf[-2].strip()[2:-1], float(tf_idf[-1][2:-1].strip()[:-2]))

In [2]:
pyspark.SparkContext.setSystemProperty('spark.executor.memory', '7g')
pyspark.SparkContext.setSystemProperty('spark.driver.cores', '2')
pyspark.SparkContext.setSystemProperty('spark.driver.memory', '7g')
pyspark.SparkContext.setSystemProperty('spark.cleaner.ttl', '600')
sc = pyspark.SparkContext(appName='pagerank')

In [5]:
tf_idf = sc.textFile('file:///mnt/wiktorskit-danielb-ns0000k/home/notebook/group04/TF-IDF/tf_idf_nowiki_final/part*')

In [6]:
tf_idf.count(), tf_idf.take(5)

(75436783,
 ["('oversvømmet', ('847877', 0.0028516679896155024))",
  "('oversvømmet', ('1520687', 0.0661727219557499))",
  "('oversvømmet', ('1577312', 0.002659114650395747))",
  "('oversvømmet', ('30336', 0.0027072676319924507))",
  "('oversvømmet', ('351260', 0.008515898817090176))"])

In [7]:
tf_idf.take(1)[0]

"('oversvømmet', ('847877', 0.0028516679896155024))"

In [17]:
TF_IDF = tf_idf.map(lambda line: getTFIDF(line))
TF_IDF.count(), TF_IDF.take(5)

(75436783,
 [('oversvømmet', ('847877', 0.002851667989615502)),
  ('oversvømmet', ('1520687', 0.066172721955749)),
  ('oversvømmet', ('1577312', 0.00265911465039574)),
  ('oversvømmet', ('30336', 0.00270726763199245)),
  ('oversvømmet', ('351260', 0.00851589881709017))])

In [13]:
x = sc.parallelize([("a", ('2', 1)), ("b", 1), ("a", (3, 1))])
a = x.groupByKey().collect()
print(a)
# for i in a:
#     for j in i:
#         print(j)
#         if not isinstance(j, str):
#             for k in j:
#                 print(k)

[('b', <pyspark.resultiterable.ResultIterable object at 0x7fbbb20a89e8>), ('a', <pyspark.resultiterable.ResultIterable object at 0x7fbbb20a8ac8>)]


In [11]:
type(TF_IDF), type(x)

(pyspark.rdd.PipelinedRDD, pyspark.rdd.RDD)

In [15]:
TF_IDF.saveAsTextFile(nowiki_output_path_tf_idf)

In [18]:
grouped = TF_IDF.groupByKey().mapValues(list)
grouped.take(5)

[("13.48,88'", [('531769', 0.0253389780250443)]),
 ("149,115'", [('23711', 0.01527833714820648)]),
 ("OppLillehammerLitrim20083603,6Sjur'", [('657417', 0.0189684440017987)]),
 ("32.31,55'", [('533924', 0.0402085579439326)]),
 ("xooooxxx4,10'", [('1626423', 0.0344350214186)])]

In [21]:
grouped.take(15)

KeyboardInterrupt: 

In [22]:
sc.stop()