In [1]:
from operator import add
import re
from collections import OrderedDict
from operator import itemgetter 
import itertools
from pyspark.sql import SparkSession

# New API
spark_session = SparkSession\
        .builder\
        .master("spark://192.168.2.250:7077") \
        .appName("TianruZ_common_crawl")\
        .config("spark.executor.cores",4)\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.dynamicAllocation.shuffleTracking.enabled", True)\
        .config("spark.shuffle.service.enabled", False)\
        .config("spark.dynamicAllocation.executorIdleTimeout","30s")\
        .config("spark.driver.port",9999)\
        .config("spark.blockManager.port",10005)\
        .getOrCreate()


# RDD API 
spark_context = spark_session.sparkContext

spark_context.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/29 13:19:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:

# (*/*) - out of memory›
# ~6.4mins for 39496 files. (...00000/)  (takes 1 minute with 40 partitions)
# ~5 secs for 10 files (...00000/0*) 
# ~20 secs for 11110 files (...00000/1*) 


rdd = spark_context.newAPIHadoopFile(
    'hdfs://192.168.2.250:9000/Common_Crawl/CC-MAIN-20231128083443-20231128113443-00000.warc.wet',
    'org.apache.hadoop.mapreduce.lib.input.TextInputFormat',
    'org.apache.hadoop.io.LongWritable',
    'org.apache.hadoop.io.Text',
    conf={'textinputformat.record.delimiter': 'WARC/1.0'}
)\
.cache() # Keep this RDD in memory!

rdd.count()
# Only one job (previous .cache() did not trigger a job)

                                                                                

36541

In [3]:
rdd.take(3)
# [(line_number, partition)]

[(0, ''),
 (8,
  '\r\nWARC-Type: warcinfo\r\nWARC-Date: 2023-12-12T01:49:07Z\r\nWARC-Filename: CC-MAIN-20231128083443-20231128113443-00000.warc.wet.gz\r\nWARC-Record-ID: <urn:uuid:965b7f0e-e769-4c47-9e28-f7d9120e1027>\r\nContent-Type: application/warc-fields\r\nContent-Length: 382\r\n\r\nSoftware-Info: ia-web-commons.1.1.10-SNAPSHOT-20231102093126\r\nExtracted-Date: Tue, 12 Dec 2023 01:49:07 GMT\r\nrobots: checked via crawler-commons 1.5-SNAPSHOT (https://github.com/crawler-commons/crawler-commons)\r\nisPartOf: CC-MAIN-2023-50\r\noperator: Common Crawl Admin (info@commoncrawl.org)\r\ndescription: Wide crawl of the web for November/December 2023\r\npublisher: Common Crawl\r\n\r\n\r\n\r\n'),
 (657,
  '\r\nWARC-Type: conversion\r\nWARC-Target-URI: http://0-50.ru/news/line/2014-07-13/id_43733.html\r\nWARC-Date: 2023-11-28T11:34:01Z\r\nWARC-Record-ID: <urn:uuid:ea93f3a5-6e42-46bb-b2da-bbc91ff80ef0>\r\nWARC-Refers-To: <urn:uuid:60a1e8f3-68a9-437c-937c-e52edd95c91c>\r\nWARC-Block-Digest: sha1

In [4]:
rdd.getNumPartitions()

3

In [5]:
print(spark_context.uiWebUrl)

http://host-192-168-2-88-de1:4040


In [6]:
## Example #1 - Filter by Top_level Domain and compute most common words ##

# Try .ac.uk, .ru, .se, .com
p = re.compile('WARC-Target-URI: \S+\.ac.uk', re.IGNORECASE)


# Note: .partition(..) returns a 3-tuple: the string before the separator (index 0), 
# the separotor (index 1), and the part of the string afterwards (index 2) -- which is the part we want.
all_words = rdd\
    .filter(lambda doc: bool(p.search(doc[1])))\
    .map(lambda web_text: web_text[1].partition('\r\n\r\n')[2])\
    .flatMap(lambda t: t.split(' '))\
    .flatMap(lambda w: w.split('\n'))\



all_words_and_count = all_words.map(lambda w: w.strip())\
    .map(lambda w: (w,1))


word_counts = all_words_and_count.reduceByKey(add)

print(word_counts.takeOrdered(60, key=lambda x: -x[1]))





[('and', 1929), ('the', 1920), ('of', 1601), ('to', 1128), ('in', 820), ('a', 589), ('-', 486), ('gif', 414), ('for', 405), ('16x16', 384), ('The', 377), ('from', 346), ('on', 321), ('&', 314), ('Research', 294), ('by', 251), ('will', 235), ('with', 227), ('is', 224), ('Sir', 221), ('you', 217), ('as', 211), ('University', 205), ('', 203), ('that', 175), ('at', 169), ('this', 158), ('be', 154), ('1', 152), ('A.', 150), ('/', 146), ('are', 139), ('or', 133), ('your', 132), ('our', 129), ('an', 126), ('Buchanan,', 112), ('Study', 109), ('This', 107), ('International', 105), ('Our', 105), ('About', 102), ('have', 99), ('Letter', 98), ('students', 98), ('us', 98), ('Contact', 98), ('Group', 94), ('[J.', 93), ('Baron', 92), ('Northumbria', 92), ('about', 92), ('Students', 92), ('2nd', 92), ('A', 90), ('module', 89), ('Bloomfield', 88), ('Bloomfield],', 88), ('we', 87), ('Student', 87)]


                                                                                

In [7]:
## Example #2 - Group by TLD and compute most common words for each ##

ex = "WARC-Type: conversion\
WARC-Target-URI: http://news.bbc.co.uk/2/hi/africa/3414345.stm\
WARC-Date: 2014-08-02T09:52:13Z"

p = re.compile('WARC-Target-URI: \S+\.([a-zA-Z]{2,3})/', re.IGNORECASE)
# print(p.search(ex).group(1))
# uk

def get_tld(content):
    match = p.search(content)
    if match is not None:
        return match.group(1)
    else:
        return None

# discard the line number
# partition() -- python function -- split on the first occurance, returns (before,split,after)
# filter out those with no TLD

    
words_by_tld_rdd = rdd\
    .map(lambda filename_content: filename_content[1])\
    .map(lambda content: (get_tld(content), content.partition('\r\n\r\n')[2]))\
    .filter(lambda tld_content: tld_content[0] is not None)\
    .flatMapValues(lambda words: words.split(' '))\
    .flatMapValues(lambda words: words.split('\n'))\
    .mapValues(lambda word: word.strip())
    #.take(10)

# print(words_by_tld_rdd.take(10))

tlds = words_by_tld_rdd.countByKey()
#print(tlds)

tlds = OrderedDict(sorted(tlds.items(), key = itemgetter(1), reverse = True))
# print(tlds)  

top_tlds = dict(itertools.islice(tlds.items(), 10))

# print(top_tlds)

print("Results:")

for tld in top_tlds:
    print(tld)
    top_words_for_tld = words_by_tld_rdd\
        .filter(lambda tld_word: tld_word[0] == tld)\
        .values()\
        .map(lambda w: (w,1))\
        .reduceByKey(add)\
        .takeOrdered(20, key=lambda x: -x[1])
    print(top_words_for_tld)

                                                                                

Results:
com


                                                                                

[('the', 205930), ('to', 171298), ('and', 163704), ('of', 129213), ('a', 124676), ('', 124622), ('-', 100788), ('in', 97609), ('de', 94143), ('for', 73505), ('&', 61972), ('is', 61280), ('on', 47521), ('you', 45383), ('with', 45299), ('|', 43149), ('your', 42377), ('The', 42033), ('►', 40714), ('that', 36216)]
ru


                                                                                

[('и', 39462), ('в', 29784), ('', 26258), ('на', 18284), ('для', 17914), ('с', 14339), ('-', 11307), ('по', 8604), ('не', 8586), ('В', 7019), ('от', 6357), ('из', 4856), ('—', 4625), ('к', 4574), ('1', 4315), ('что', 4293), ('/', 3896), ('–', 3848), ('или', 3476), ('0', 3209)]
org


                                                                                

[('the', 36365), ('and', 26813), ('of', 25211), ('to', 24733), ('a', 18072), ('in', 16839), ('de', 14641), ('', 13019), ('for', 10472), ('is', 8871), ('-', 7264), ('on', 7223), ('The', 6461), ('by', 6219), ('with', 6022), ('–', 5869), ('that', 5710), ('la', 5667), ('&', 4977), ('are', 4378)]
de


                                                                                

[('und', 24108), ('der', 16703), ('die', 15091), ('', 13512), ('in', 13118), ('-', 10455), ('für', 9446), ('von', 8486), ('zu', 7711), ('&', 7182), ('Sie', 6705), ('den', 6443), ('mit', 6375), ('auf', 5533), ('oder', 4769), ('im', 4653), ('the', 4512), ('ist', 4305), ('des', 4273), ('to', 4166)]
net


                                                                                

[('in', 11785), ('the', 11752), ('to', 9676), ('and', 8992), ('(1)', 8684), ('a', 8682), ('', 7353), ('►', 7323), ('of', 7194), ('-', 5867), ('de', 5738), ('is', 4441), ('for', 4066), ('on', 3845), ('|', 3459), ('(2)', 3374), ('2023', 2796), ('you', 2779), ('your', 2673), ('by', 2601)]
uk


                                                                                

[('the', 16107), ('and', 14054), ('to', 13592), ('of', 10428), ('a', 7402), ('in', 7095), ('&', 5743), ('for', 5426), ('-', 4566), ('is', 4102), ('The', 3684), ('on', 3661), ('(0)', 3389), ('with', 3212), ('you', 3172), ('your', 3101), ('by', 2651), ('are', 2597), ('', 2384), ('that', 2383)]
fr


                                                                                

[('de', 25593), ('et', 10347), ('à', 8845), ('la', 8825), ('-', 7000), ('des', 6742), ('les', 6312), ('le', 6001), ('en', 5343), ('du', 5085), (':', 4961), ('pour', 4533), ('/', 3523), ('sur', 3332), ('un', 3217), ('vous', 2913), ('1', 2784), ('au', 2577), ('par', 2504), ('une', 2428)]
hu


                                                                                

[('-', 25140), ('a', 21692), ('keresőoptimalizálás,', 16002), ('és', 13508), ('keresőoptimalizálás', 11643), ('havidíjas', 11371), ('az', 10006), ('Weboldal', 6840), ('A', 6797), ('Google', 6464), ('Komplex', 6306), ('Web+', 6205), ('Budapest', 5996), ('havi', 5426), ('Prémium', 5417), ('linképítés,', 5413), ('Az', 4970), ('hogy', 4076), ('weboldal', 3610), ('készítés', 3230)]
it


                                                                                

[('di', 15316), ('e', 11487), ('in', 5682), ('per', 5682), ('il', 4749), ('a', 4726), ('la', 4447), ('-', 4056), ('del', 3613), ('che', 3237), ('un', 2887), ('da', 2726), ('con', 2650), ('|', 2594), ('è', 2535), ('i', 2504), ('le', 2456), ('della', 2264), ('al', 2251), ('–', 2119)]
pl




[('', 10220), ('i', 9023), ('w', 8578), ('do', 6921), ('z', 6140), ('na', 5930), ('A;', 4950), ('-', 3873), ('>', 3655), ('się', 3238), ('–', 2539), ('to', 2252), ('dla', 2193), ('o', 1993), ('jest', 1789), ('zł', 1665), ('nie', 1652), ('skup', 1428), ('łożysk', 1412), ('a', 1135)]


                                                                                

In [8]:
spark_session.stop()