In [1]:
from pyspark.sql import SparkSession
from pyspark import StorageLevel
import os

# Python interpreter that PySpark should use.
# NOTE(review): machine-specific absolute path - confirm before running elsewhere.
os.environ["PYSPARK_PYTHON"] = "/home/pc/g5_env/bin/python39"

# Local-mode session ("local[16]"); console progress bars are switched off.
spark = (
    SparkSession.builder
    .master("local[16]")
    .appName("ms_wiki_extract")
    .config('spark.executor.memory', '20g')
    .config('spark.driver.maxResultSize', '10g')
    .config('spark.driver.memory', '20g')
    .config('spark.ui.showConsoleProgress', False)
    .getOrCreate()
)

sparkContext = spark.sparkContext

# Display the session summary as the cell output.
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/05 20:35:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# HDFS path of the input parquet dataset that is read in the following cell.
file_path = 'hdfs://g5.bigtop.it:8020/user/root/wikidata_en_preprocessed.parquet'

In [3]:
# Parquet files carry their own schema; "header" is a CSV-only reader option
# and has no effect here, so it is not passed.
df1 = spark.read.parquet(file_path)
# Preview the first rows without truncating the text column.
df1.show(truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|_1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  

In [4]:
# Number of partitions produced by the parquet read.
df1.rdd.getNumPartitions()

16

In [5]:
# Redistribute the rows across 64 partitions (the read produced 16).
# NOTE(review): this rebinds df1, so re-running the cell repartitions again.
df1 = df1.repartition(64)

In [6]:
# Preview the first 20 rows after repartitioning (long values are truncated).
df1.show()

+--------------------+
|                  _1|
+--------------------+
|mannerheim explai...|
|renaud jean le tr...|
|pete doherty was ...|
|during the electi...|
|clothing products...|
|of cipa played at...|
|it later emerged ...|
|they were hms ove...|
|the latter drove ...|
|            ipromide|
|thumb a view of s...|
|as part of phase ...|
|the contest was c...|
|he then lectured ...|
|the female defend...|
|elliot junior uyi...|
|novices wear the ...|
|in three videos o...|
|renovation of the...|
|the preaching eff...|
+--------------------+
only showing top 20 rows



In [7]:
from pyspark.sql.functions import col,isnan, when, count

# Count the rows whose text column is NULL; count() only tallies the
# non-null values produced by when(), i.e. the rows where the test holds.
c = '_1'
null_rows = count(when(col(c).isNull(), c)).alias(c)
df1.select([null_rows]).show()

+---+
| _1|
+---+
|  0|
+---+



In [8]:
from nltk.tokenize import word_tokenize
import re
def word_token(text):
    """Tokenize the ``'_1'`` field of a row with NLTK's ``word_tokenize``.

    Args:
        text: a row-like object that supports ``text['_1']`` and holds the
            raw text under that key.

    Returns:
        Whatever ``word_tokenize`` returns for that text.
    """
    return word_tokenize(text['_1'])

def f(x):
    """Identity function: return ``x`` unchanged (passed to ``flatMapValues``)."""
    return x

def exchangePosition(text, index):
    """Return the two arguments as a tuple in swapped order.

    Args:
        text: first element of the incoming pair.
        index: second element of the incoming pair.

    Returns:
        tuple: ``(index, text)``.
    """
    swapped = (index, text)
    return swapped


In [9]:
# Tokenize every row once and keep the token lists in memory, since several
# later cells reuse this RDD.
tokenized_rows = df1.rdd.map(word_token)
RDD = tokenized_rows.persist(StorageLevel.MEMORY_ONLY)

In [10]:
# Word count over all token lists: flatten each list into individual tokens,
# pair every token with 1 and sum the ones per token.
# This yields the same (word, count) pairs as the previous chain, which
# swapped tuple positions twice around flatMapValues to get the same result.
RDD2 = (
    RDD.flatMap(lambda tokens: tokens)
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
    .persist(StorageLevel.MEMORY_ONLY)
)

In [11]:
# Number of distinct tokens (reduceByKey leaves one pair per key).
# count() is an action, so it also computes the persisted RDD2.
RDD2.count()

3704067

In [12]:
# Turn the (word, count) pairs into a DataFrame with named columns.
Column = ['word', 'word_count']
df2 = RDD2.toDF(Column)

22/10/05 20:38:05 WARN BlockManager: Task 248 already completed, not releasing lock for rdd_44_0


In [13]:
# Preview the per-word counts (first 20 rows, unordered).
df2.show()

+------------+----------+
|        word|word_count|
+------------+----------+
|         out|    216565|
|        bani|       496|
|    ipromide|         1|
|sufficiently|      2013|
|      family|    172612|
|   continued|     56495|
|   standards|     14272|
|   something|     16978|
|  kilometers|      5226|
|        wwwf|        79|
|         wwf|      1730|
|         och|      1236|
|      belief|      6713|
|      common|     54279|
|       where|    236524|
|particularly|     26032|
|  production|     70878|
|          sc|      7248|
|     promote|     12284|
|      result|     50587|
+------------+----------+
only showing top 20 rows



In [14]:
import advertools as adv
# English entry of the stop-word collections provided by advertools.
eng_STOPWORDS=adv.stopwords['english']

In [15]:
# Ten most frequent words once English stop words are excluded.
by_frequency = df2.orderBy(col("word_count").desc())
is_stopword = col("word").isin(eng_STOPWORDS)
by_frequency.filter(~is_stopword).show(10)

+----------+----------+
|      word|word_count|
+----------+----------+
|       new|    595956|
|    school|    317618|
|      time|    305875|
|university|    293830|
|  national|    281605|
|     style|    277952|
|        de|    273528|
|     world|    266668|
|     years|    265312|
|     state|    258455|
+----------+----------+
only showing top 10 rows



In [16]:
def text_extract(text):
    """Keep ``text`` only if it contains at least one education keyword.

    Membership is tested with ``in``: for a list of tokens this means an
    exact token match, for a plain string it means a substring match.

    Args:
        text: container to inspect (a token list in this notebook).

    Returns:
        ``text`` itself when any keyword is found, otherwise the empty
        string ``''`` as a "no match" marker.
    """
    keywords = ('secondary', 'school', 'tertiary', 'university', 'national', 'private')
    if any(keyword in text for keyword in keywords):
        return text
    return ''


# Keep only the token lists that mention a keyword; text_extract marks the
# others with '' and those markers are dropped here.
keyword_hits = RDD.map(text_extract).filter(lambda tokens: tokens != '')
keyword_filter = keyword_hits.persist(StorageLevel.MEMORY_ONLY)

In [17]:
# Number of token lists that contain at least one keyword.
keyword_filter.count()

724074

In [18]:
# Count how often each identical token sequence occurs. The lists are turned
# into tuples because reduceByKey needs hashable keys.
sentence_ones = keyword_filter.map(lambda tokens: (tuple(tokens), 1))
data1 = sentence_ones.reduceByKey(lambda left, right: left + right).persist(StorageLevel.MEMORY_ONLY)

In [19]:
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, ArrayType
# Explicit schema for the (token sequence, occurrence count) pairs.
schema = StructType([
    StructField('token_sentence', ArrayType(StringType()), True),
    StructField('sent_count', IntegerType(), True),
])

# NOTE: df1 is rebound here; from this point on it no longer refers to the
# raw text DataFrame that was read from parquet.
df1 = spark.createDataFrame(data1, schema)
df1.show()

+--------------------+----------+
|      token_sentence|sent_count|
+--------------------+----------+
|[there, are, prop...|       144|
|[his, candidature...|         1|
|[abingdon, is, th...|         1|
|[in, he, returned...|         1|
|[while, most, cul...|         1|
|[degree, from, th...|         1|
|[high, schools, s...|         1|
|[he, also, attemp...|         1|
|[the, school, ini...|         1|
|[he, initially, e...|         1|
|[in, national, el...|         1|
|[thesis, universi...|         3|
|[he, also, set, t...|         1|
|[in, her, english...|         1|
|[he, attended, ne...|         1|
|[tobias, went, to...|         1|
|[standing, as, th...|         1|
|[wales, quoted, i...|         1|
|[buddleja, podara...|         1|
|[the, inability, ...|         1|
+--------------------+----------+
only showing top 20 rows



In [20]:
# Imported as F rather than f: the name f is already bound to the identity
# helper that the word-count cell passes to flatMapValues, and rebinding it
# to this module would break that cell if it were run again afterwards.
import pyspark.sql.functions as F

# Number of tokens in each token sequence.
df1 = df1.withColumn('word_count', F.size(F.col('token_sentence')))

In [21]:
# Preview the rows together with the new word_count column.
df1.show()

+--------------------+----------+----------+
|      token_sentence|sent_count|word_count|
+--------------------+----------+----------+
|[there, are, prop...|       144|        13|
|[his, candidature...|         1|        15|
|[abingdon, is, th...|         1|         7|
|[in, he, returned...|         1|        20|
|[while, most, cul...|         1|        23|
|[degree, from, th...|         1|         6|
|[high, schools, s...|         1|        68|
|[he, also, attemp...|         1|        14|
|[the, school, ini...|         1|        31|
|[he, initially, e...|         1|        31|
|[in, national, el...|         1|         9|
|[thesis, universi...|         3|         5|
|[he, also, set, t...|         1|        15|
|[in, her, english...|         1|        47|
|[he, attended, ne...|         1|        29|
|[tobias, went, to...|         1|        28|
|[standing, as, th...|         1|        48|
|[wales, quoted, i...|         1|        14|
|[buddleja, podara...|         1|        31|
|[the, ina

In [22]:
# Inspect the rows whose token sequence has exactly one element.
df1.filter(df1.word_count == 1).show()

+--------------+----------+----------+
|token_sentence|sent_count|word_count|
+--------------+----------+----------+
|    [national]|         3|         1|
|      [school]|        11|         1|
|  [university]|         8|         1|
|     [private]|         2|         1|
|   [secondary]|         2|         1|
+--------------+----------+----------+



In [23]:
# Keep only rows with more than one token.
# NOTE: this rebinds df2, which previously held the word-frequency table.
df2 = df1.filter(df1.word_count > 1)

In [24]:
# Preview the rows that remain after dropping single-token sequences.
df2.show()

+--------------------+----------+----------+
|      token_sentence|sent_count|word_count|
+--------------------+----------+----------+
|[there, are, prop...|       144|        13|
|[his, candidature...|         1|        15|
|[abingdon, is, th...|         1|         7|
|[in, he, returned...|         1|        20|
|[while, most, cul...|         1|        23|
|[degree, from, th...|         1|         6|
|[high, schools, s...|         1|        68|
|[he, also, attemp...|         1|        14|
|[the, school, ini...|         1|        31|
|[he, initially, e...|         1|        31|
|[in, national, el...|         1|         9|
|[thesis, universi...|         3|         5|
|[he, also, set, t...|         1|        15|
|[in, her, english...|         1|        47|
|[he, attended, ne...|         1|        29|
|[tobias, went, to...|         1|        28|
|[standing, as, th...|         1|        48|
|[wales, quoted, i...|         1|        14|
|[buddleja, podara...|         1|        31|
|[the, ina

In [25]:
# Number of distinct token sequences with more than one token.
df2.count()

689245

### Ngram

In [26]:
from pyspark.ml.feature import NGram
ngramDataFrame = df2

# Add one output column per n-gram size: 2grams, 3grams, 4grams and 5grams,
# all built from the token_sentence column.
for n in range(2, 6):
    output_name = str(n) + 'grams'
    ngram = NGram(n=n, inputCol='token_sentence', outputCol=output_name)
    ngramDataFrame = ngram.transform(ngramDataFrame)
ngramDataFrame.show()

+--------------------+----------+----------+--------------------+--------------------+--------------------+--------------------+
|      token_sentence|sent_count|word_count|              2grams|              3grams|              4grams|              5grams|
+--------------------+----------+----------+--------------------+--------------------+--------------------+--------------------+
|[there, are, prop...|       144|        13|[there are, are p...|[there are proper...|[there are proper...|[there are proper...|
|[his, candidature...|         1|        15|[his candidature,...|[his candidature ...|[his candidature ...|[his candidature ...|
|[abingdon, is, th...|         1|         7|[abingdon is, is ...|[abingdon is the,...|[abingdon is the ...|[abingdon is the ...|
|[in, he, returned...|         1|        20|[in he, he return...|[in he returned, ...|[in he returned t...|[in he returned t...|
|[while, most, cul...|         1|        23|[while most, most...|[while most culti...|[while most

In [27]:
# How many rows have fewer than five tokens.
ngramDataFrame.filter(ngramDataFrame.word_count < 5).count()

4175

In [28]:
# Inspect the four-token rows.
ngramDataFrame.filter(ngramDataFrame.word_count == 4).show()

+--------------------+----------+----------+--------------------+--------------------+--------------------+------+
|      token_sentence|sent_count|word_count|              2grams|              3grams|              4grams|5grams|
+--------------------+----------+----------+--------------------+--------------------+--------------------+------+
|[university, of, ...|        54|         4|[university of, o...|[university of to...|[university of to...|    []|
|[melbourne, unive...|         2|         4|[melbourne univer...|[melbourne univer...|[melbourne univer...|    []|
|[university, of, ...|        47|         4|[university of, o...|[university of il...|[university of il...|    []|
|[university, of, ...|         2|         4|[university of, o...|[university of al...|[university of al...|    []|
|[university, of, ...|         1|         4|[university of, o...|[university of ta...|[university of ta...|    []|
|[university, of, ...|        22|         4|[university of, o...|[university of 

In [29]:
# Inspect the three-token rows.
ngramDataFrame.filter(ngramDataFrame.word_count == 3).show()

+--------------------+----------+----------+--------------------+--------------------+------+------+
|      token_sentence|sent_count|word_count|              2grams|              3grams|4grams|5grams|
+--------------------+----------+----------+--------------------+--------------------+------+------+
|[national, mariti...|         7|         3|[national maritim...|[national maritim...|    []|    []|
|[paris, national,...|         1|         3|[paris national, ...|[paris national l...|    []|    []|
|[ipek, university...|         1|         3|[ipek university,...|[ipek university ...|    []|    []|
|[cornell, univers...|        56|         3|[cornell universi...|[cornell universi...|    []|    []|
|[anderson, high, ...|         1|         3|[anderson high, h...|[anderson high sc...|    []|    []|
|[whitney, high, s...|         2|         3|[whitney high, hi...|[whitney high sch...|    []|    []|
|[syracuse, univer...|         1|         3|[syracuse univers...|[syracuse univers...|    [

In [30]:
# Inspect the two-token rows.
ngramDataFrame.filter(ngramDataFrame.word_count == 2).show()

+--------------------+----------+----------+--------------------+------+------+------+
|      token_sentence|sent_count|word_count|              2grams|3grams|4grams|5grams|
+--------------------+----------+----------+--------------------+------+------+------+
|[national, arbore...|         1|         2|[national arboretum]|    []|    []|    []|
|[marquette, unive...|         5|         2|[marquette univer...|    []|    []|    []|
|[denison, univers...|         1|         2|[denison university]|    []|    []|    []|
| [rebro, university]|         1|         2|  [rebro university]|    []|    []|    []|
|[secondary, educa...|         1|         2|[secondary educat...|    []|    []|    []|
|[secondary, malig...|         1|         2|[secondary malign...|    []|    []|    []|
|[national, encycl...|         1|         2|[national encyclo...|    []|    []|    []|
|[utrecht, univers...|         3|         2|[utrecht university]|    []|    []|    []|
|    [normal, school]|         2|         2

In [31]:
# NOTE(review): the filtered DataFrame built here is neither assigned nor
# displayed, so this line has no visible effect - candidate for removal.
ngramDataFrame.filter(ngramDataFrame.word_count == 2)

def find_keyword(text):
    """Return the n-grams in ``text`` that mention at least one keyword.

    Matching is by substring, so an n-gram such as ``'high schools'`` counts
    as a hit for ``'school'``.

    Args:
        text: iterable of n-gram strings (one row's n-gram column).

    Returns:
        list: the matching n-grams in their original order. An n-gram is
        listed once per occurrence in ``text``, even when it contains more
        than one keyword.
    """
    keywords = ('secondary', 'school', 'tertiary', 'university', 'national', 'private')

    # any() stops at the first keyword found, so an n-gram like
    # 'national university' is kept once rather than once per keyword.
    return [gram for gram in text if any(keyword in gram for keyword in keywords)]


In [32]:
column = ['token_sentence', 'sent_count', 'word_count', '2grams', '3grams', '4grams', '5grams']

# Rows with more than four tokens: every n-gram column is reduced to the
# n-grams that find_keyword accepts.
gram5 = (
    ngramDataFrame.filter(ngramDataFrame.word_count > 4)
    .rdd.map(lambda row: (
        row[0], row[1], row[2],
        find_keyword(row[3]), find_keyword(row[4]),
        find_keyword(row[5]), find_keyword(row[6]),
    ))
    .persist(StorageLevel.MEMORY_ONLY)
)

In [33]:
# DataFrame.show() prints the rows and returns None, so the DataFrame is
# assigned first and displayed in a separate statement; otherwise df_5gram
# would be None.
df_5gram = gram5.toDF(column)
df_5gram.show()

22/10/05 20:53:02 WARN BlockManager: Task 647 already completed, not releasing lock for rdd_97_0
22/10/05 20:53:02 WARN BlockManager: Task 648 already completed, not releasing lock for rdd_97_0
+--------------------+----------+----------+--------------------+--------------------+--------------------+--------------------+
|      token_sentence|sent_count|word_count|              2grams|              3grams|              4grams|              5grams|
+--------------------+----------+----------+--------------------+--------------------+--------------------+--------------------+
|[there, are, prop...|       144|        13|[the national, na...|[on the national,...|[listed on the na...|[districts listed...|
|[his, candidature...|         1|        15|[indian national,...|[the indian natio...|[by the indian na...|[supported by the...|
|[abingdon, is, th...|         1|         7|        [aaa school]|[smallest aaa sch...|[2nd smallest aaa...|[the 2nd smallest...|
|[in, he, returned...|         1

In [34]:
# Four-token rows: filter the 2-, 3- and 4-gram columns by keyword and pass
# the 5grams column through unchanged.
gram4 = (
    ngramDataFrame.filter(ngramDataFrame.word_count == 4)
    .rdd.map(lambda row: (
        row[0], row[1], row[2],
        find_keyword(row[3]), find_keyword(row[4]), find_keyword(row[5]),
        row[6],
    ))
    .persist(StorageLevel.MEMORY_ONLY)
)

In [35]:
# Three-token rows: filter the 2- and 3-gram columns by keyword and pass the
# 4grams and 5grams columns through unchanged.
gram3 = (
    ngramDataFrame.filter(ngramDataFrame.word_count == 3)
    .rdd.map(lambda row: (
        row[0], row[1], row[2],
        find_keyword(row[3]), find_keyword(row[4]),
        row[5], row[6],
    ))
    .persist(StorageLevel.MEMORY_ONLY)
)

In [36]:
# Two-token rows: filter the 2grams column by keyword. find_keyword is also
# applied to the 3grams column, exactly as for three-token rows; the 4grams
# and 5grams columns are passed through unchanged.
gram2 = (
    ngramDataFrame.filter(ngramDataFrame.word_count == 2)
    .rdd.map(lambda row: (
        row[0], row[1], row[2],
        find_keyword(row[3]), find_keyword(row[4]),
        row[5], row[6],
    ))
    .persist(StorageLevel.MEMORY_ONLY)
)

In [37]:
# Stitch the four per-length RDDs back together and convert the result to a
# cached DataFrame with the shared column names.
all_grams = gram5.union(gram4).union(gram3).union(gram2)
Final_gram = all_grams.toDF(column).persist(StorageLevel.MEMORY_ONLY)

22/10/05 20:53:03 WARN BlockManager: Task 649 already completed, not releasing lock for rdd_97_0


In [38]:
# Preview the combined, keyword-filtered n-gram table.
Final_gram.show()

+--------------------+----------+----------+--------------------+--------------------+--------------------+--------------------+
|      token_sentence|sent_count|word_count|              2grams|              3grams|              4grams|              5grams|
+--------------------+----------+----------+--------------------+--------------------+--------------------+--------------------+
|[there, are, prop...|       144|        13|[the national, na...|[on the national,...|[listed on the na...|[districts listed...|
|[his, candidature...|         1|        15|[indian national,...|[the indian natio...|[by the indian na...|[supported by the...|
|[abingdon, is, th...|         1|         7|        [aaa school]|[smallest aaa sch...|[2nd smallest aaa...|[the 2nd smallest...|
|[in, he, returned...|         1|        20|[stanford univers...|[the stanford uni...|[to the stanford ...|[returned to the ...|
|[while, most, cul...|         1|        23|[a secondary, sec...|[considered a sec...|[are consid

In [39]:
# Row count of the combined table; it equals the df2.count() result shown
# earlier (689245), i.e. the union covers every multi-token row.
Final_gram.count()

689245

In [40]:
# Spot-check the four-token rows after keyword filtering.
Final_gram.filter(Final_gram.word_count == 4).show()

+--------------------+----------+----------+--------------------+--------------------+--------------------+------+
|      token_sentence|sent_count|word_count|              2grams|              3grams|              4grams|5grams|
+--------------------+----------+----------+--------------------+--------------------+--------------------+------+
|[university, of, ...|        54|         4|     [university of]|[university of to...|[university of to...|    []|
|[melbourne, unive...|         2|         4|[melbourne univer...|[melbourne univer...|[melbourne univer...|    []|
|[university, of, ...|        47|         4|     [university of]|[university of il...|[university of il...|    []|
|[university, of, ...|         2|         4|     [university of]|[university of al...|[university of al...|    []|
|[university, of, ...|         1|         4|     [university of]|[university of ta...|[university of ta...|    []|
|[university, of, ...|        22|         4|     [university of]|[university of 

In [41]:
# Reduce to a single partition before writing - presumably so the parquet
# output consists of one part file (confirm this is intended).
Final_gram = Final_gram.coalesce(1)

In [42]:
# Persist the result to HDFS as parquet.
# NOTE(review): no save mode is set, so Spark's default applies, which
# raises an error if the target path already exists - a second run of the
# notebook will stop here unless the output is removed or a mode is chosen.
Final_gram.write.parquet("hdfs://g5.bigtop.it:8020/user/root/filtered_en_wiki.parquet")