In [1]:
from pyspark.sql import SparkSession
from pyspark import StorageLevel
import os

os.environ["PYSPARK_PYTHON"]="/home/pc/g5_env/bin/python39"

spark = SparkSession.builder.master("local[16]")\
            .appName("ms_wiki_extract")\
            .config('spark.executor.memory', '20g')\
            .config('spark.driver.maxResultSize', '10g')\
            .config('spark.driver.memory', '20g')\
            .config('spark.ui.showConsoleProgress', False)\
            .getOrCreate()

sparkContext = spark.sparkContext

spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/05 19:00:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
file_path = 'hdfs://g5.bigtop.it:8020/user/root/wikidata_ms_preprocessed.parquet/part-00000-a8275472-ea44-4f76-b4d6-78236b446b1d-c000.snappy.parquet'

In [3]:
df1 = spark.read.option("header",True).parquet(file_path)
df1.show()

+--------------------+
|                  _1|
+--------------------+
|sipaku area merup...|
|lbixan merupakan ...|
|pembahagian penta...|
|syawal merupakan ...|
|1430h satu gempa ...|
|buk merupakan seb...|
|lapangan terbang ...|
|lapangan terbang ...|
|the owls nest wel...|
|           el bolson|
|kzlcaky kk merupa...|
|     daerah di turki|
|islam in panama m...|
|menurut laporan p...|
|kaum islam pertam...|
|orang mandinka ad...|
|sekumpulan daripa...|
|mereka memilih se...|
|mereka membentuk ...|
|bayano diperolehi...|
+--------------------+
only showing top 20 rows



In [4]:
df1.rdd.getNumPartitions()

16

In [5]:
df1 = df1.repartition(64)

In [6]:
df1.show()

+--------------------+
|                  _1|
+--------------------+
|seluruh rakyat ak...|
|pada julai bahagi...|
|radar pertahanan ...|
|beliau dibesarkan...|
|seiyu masaki tera...|
|filem ini dibinta...|
|pada awalnya hany...|
|kejayaan terbesar...|
|makanan ini kemud...|
|georges dumzil me...|
|pengeluaran getah...|
|lapangan terbang ...|
|dan aku berharap ...|
|narcy ialah komun...|
|thumb left logo p...|
|sebaik sahaja ana...|
|beliau meminjamka...|
|t r reid the chip...|
| i do not know nu...|
|disember pasukan ...|
+--------------------+
only showing top 20 rows



In [7]:
from pyspark.sql.functions import col,isnan, when, count

c = '_1'
df1.select([count(when(col(c).isNull(), c)).alias(c)]).show()

+---+
| _1|
+---+
|  0|
+---+



In [8]:
from nltk.tokenize import word_tokenize
import re
def word_token(text):
    string =text['_1']
    # print(type(text))
    return word_tokenize(string)

def f(x): return x

def exchangePosition(text, index):
    return index, (text)


In [9]:
RDD = df1.rdd.map(lambda x: word_token(x))\
.persist(StorageLevel.MEMORY_ONLY)

In [10]:
RDD2=RDD.map(lambda x:(x,1)).map(lambda x: exchangePosition(x[0], x[1]))\
.flatMapValues(f)\
.map(lambda x: exchangePosition(x[0], x[1]))\
.reduceByKey(lambda a,b: a+b)\
.persist(StorageLevel.MEMORY_ONLY)

In [11]:
RDD2.count()

1006863

In [12]:
Column = ['word', 'word_count']
df2 = RDD2.toDF(Column)

In [13]:
df2.show()

+-------------+----------+
|         word|word_count|
+-------------+----------+
|       daerah|    109528|
|       masaki|       110|
|     kemudian|     34021|
|        parti|     21520|
|       eeeeee|        22|
|mengakibatkan|      2908|
|        bulan|     29769|
|    berpindah|      5703|
|     syarikat|     66195|
|       jerman|     31750|
|diisytiharkan|      1480|
|        kombo|        60|
|       dacian|        10|
|      menteri|     26778|
| menghentikan|      1635|
|        bahru|      2014|
|     diadakan|     11980|
|      prinsip|      2096|
| pengeluarang|         1|
|      fizikal|      3290|
+-------------+----------+
only showing top 20 rows



In [14]:
import advertools as adv
indo_STOPWORDS=adv.stopwords['indonesian']

In [15]:
df2.orderBy("word_count", ascending=False)\
.filter(~col("word").isin(indo_STOPWORDS))\
.show(10)

+--------+----------+
|    word|word_count|
+--------+----------+
|     the|    211841|
|terletak|    201670|
|  beliau|    140548|
|      of|    138484|
| kawasan|    130848|
|malaysia|    110286|
|  daerah|    109528|
|   orang|    103072|
| kampung|     98386|
|   komun|     94137|
+--------+----------+
only showing top 10 rows



In [16]:
def text_extract(text):
    keywords =  ['terletak', 'kawasan', 'malaysia',' tentera', 'daerah',  'kabupaten']
    for x in keywords:
        if x in text:
            return text
        
    return ''


keyword_filter = RDD.map(lambda x: text_extract(x)).filter(lambda x: x != '').persist(StorageLevel.MEMORY_ONLY)

In [17]:
keyword_filter.count()

390700

In [18]:
data1 = keyword_filter.map(lambda x : (tuple(x), 1))\
.reduceByKey(lambda a,b: a+b).persist(StorageLevel.MEMORY_ONLY)

In [19]:
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, ArrayType
schema = StructType([ \
    StructField('token_sentence',ArrayType(StringType()),True), \
    StructField('sent_count',IntegerType(),True), \
  ])


df1 = spark.createDataFrame(data = data1, schema=schema)
df1.show()

+--------------------+----------+
|      token_sentence|sent_count|
+--------------------+----------+
|[medan, selera, p...|         1|
|[terutung, payung...|         1|
|[empangan, bakun,...|         1|
|[lapangan, terban...|         1|
|[daerah, preov, w...|        91|
|[shahree2005sumbe...|         1|
|[anarjan, bostana...|         1|
|[gkeli, bozyk, me...|         1|
|[kronburg, merupa...|         1|
|[nama, ini, telah...|         1|
|[pudeng, merupaka...|         1|
|[umretha, merupak...|         1|
|[langhamn, merupa...|         1|
|[buku, resipi, ke...|         1|
|[daerah, topoany,...|        54|
|[kiri, bodegraven...|         1|
|[bakauheni, merup...|         1|
|[sekolah, menenga...|         1|
|[ovack, lleburgaz...|         1|
|[tuhemberua, meru...|         1|
+--------------------+----------+
only showing top 20 rows



In [20]:
import pyspark.sql.functions as f
df1 = df1.withColumn('word_count', f.size(f.col('token_sentence')))

In [21]:
df1.show()

+--------------------+----------+----------+
|      token_sentence|sent_count|word_count|
+--------------------+----------+----------+
|[medan, selera, p...|         1|         8|
|[terutung, payung...|         1|        17|
|[empangan, bakun,...|         1|         7|
|[lapangan, terban...|         1|        31|
|[daerah, preov, w...|        91|        11|
|[shahree2005sumbe...|         1|        27|
|[anarjan, bostana...|         1|         9|
|[gkeli, bozyk, me...|         1|         9|
|[kronburg, merupa...|         1|        10|
|[nama, ini, telah...|         1|        14|
|[pudeng, merupaka...|         1|        14|
|[umretha, merupak...|         1|         8|
|[langhamn, merupa...|         1|        10|
|[buku, resipi, ke...|         1|        18|
|[daerah, topoany,...|        54|        11|
|[kiri, bodegraven...|         1|        12|
|[bakauheni, merup...|         1|        10|
|[sekolah, menenga...|         1|        25|
|[ovack, lleburgaz...|         1|         9|
|[tuhember

In [22]:
df1.filter(df1.word_count == 1).show()

+--------------+----------+----------+
|token_sentence|sent_count|word_count|
+--------------+----------+----------+
|     [kawasan]|         1|         1|
|      [daerah]|         6|         1|
|    [malaysia]|        35|         1|
+--------------+----------+----------+



In [23]:
df2 = df1.filter(df1.word_count > 1)

In [24]:
df2.show()

+--------------------+----------+----------+
|      token_sentence|sent_count|word_count|
+--------------------+----------+----------+
|[medan, selera, p...|         1|         8|
|[terutung, payung...|         1|        17|
|[empangan, bakun,...|         1|         7|
|[lapangan, terban...|         1|        31|
|[daerah, preov, w...|        91|        11|
|[shahree2005sumbe...|         1|        27|
|[anarjan, bostana...|         1|         9|
|[gkeli, bozyk, me...|         1|         9|
|[kronburg, merupa...|         1|        10|
|[nama, ini, telah...|         1|        14|
|[pudeng, merupaka...|         1|        14|
|[umretha, merupak...|         1|         8|
|[langhamn, merupa...|         1|        10|
|[buku, resipi, ke...|         1|        18|
|[daerah, topoany,...|        54|        11|
|[kiri, bodegraven...|         1|        12|
|[bakauheni, merup...|         1|        10|
|[sekolah, menenga...|         1|        25|
|[ovack, lleburgaz...|         1|         9|
|[tuhember

In [25]:
df2.count()

328852

### Ngram

In [26]:
from pyspark.ml.feature import NGram
ngramDataFrame = df2

for x in range(2,6):
    ngram = NGram(n=x, inputCol='token_sentence', outputCol= str(x) +'grams')
    ngramDataFrame = ngram.transform(ngramDataFrame) 
ngramDataFrame.show(1)

+--------------------+----------+----------+--------------------+--------------------+--------------------+--------------------+
|      token_sentence|sent_count|word_count|              2grams|              3grams|              4grams|              5grams|
+--------------------+----------+----------+--------------------+--------------------+--------------------+--------------------+
|[medan, selera, p...|         1|         8|[medan selera, se...|[medan selera pul...|[medan selera pul...|[medan selera pul...|
+--------------------+----------+----------+--------------------+--------------------+--------------------+--------------------+
only showing top 1 row



In [27]:
ngramDataFrame.filter(ngramDataFrame.word_count < 5).count()

863

In [28]:
ngramDataFrame.filter(ngramDataFrame.word_count == 4).show()

+--------------------+----------+----------+--------------------+--------------------+--------------------+------+
|      token_sentence|sent_count|word_count|              2grams|              3grams|              4grams|5grams|
+--------------------+----------+----------+--------------------+--------------------+--------------------+------+
|[pejabat, daerah,...|        13|         4|[pejabat daerah, ...|[pejabat daerah b...|[pejabat daerah b...|    []|
|[terengganu, daru...|         3|         4|[terengganu darul...|[terengganu darul...|[terengganu darul...|    []|
|[bahtera, perkahw...|         1|         4|[bahtera perkahwi...|[bahtera perkahwi...|[bahtera perkahwi...|    []|
|[album, artis, ma...|         1|         4|[album artis, art...|[album artis mala...|[album artis mala...|    []|
|[institut, terjem...|         2|         4|[institut terjema...|[institut terjema...|[institut terjema...|    []|
|[suruhanjaya, pil...|         2|         4|[suruhanjaya pili...|[suruhanjaya pi

In [29]:
ngramDataFrame.filter(ngramDataFrame.word_count == 3).show()

+--------------------+----------+----------+--------------------+--------------------+------+------+
|      token_sentence|sent_count|word_count|              2grams|              3grams|4grams|5grams|
+--------------------+----------+----------+--------------------+--------------------+------+------+
|[tokohtokoh, perb...|         6|         3|[tokohtokoh perbu...|[tokohtokoh perbu...|    []|    []|
|[thumb, yakult, m...|         1|         3|[thumb yakult, ya...|[thumb yakult mal...|    []|    []|
|[lkim, gov, malay...|         1|         3|[lkim gov, gov ma...| [lkim gov malaysia]|    []|    []|
|[jabatan, standar...|         1|         3|[jabatan standard...|[jabatan standard...|    []|    []|
|[kuala, lumpur, m...|         3|         3|[kuala lumpur, lu...|[kuala lumpur mal...|    []|    []|
|[heinemann, malay...|         1|         3|[heinemann malays...|[heinemann malays...|    []|    []|
|  [ia, terletak, di]|         4|         3|[ia terletak, ter...|    [ia terletak di]|    [

In [30]:
ngramDataFrame.filter(ngramDataFrame.word_count == 2).show()

+--------------------+----------+----------+--------------------+------+------+------+
|      token_sentence|sent_count|word_count|              2grams|3grams|4grams|5grams|
+--------------------+----------+----------+--------------------+------+------+------+
|  [malaysia, wanita]|         1|         2|   [malaysia wanita]|    []|    []|    []|
| [longman, malaysia]|         3|         2|  [longman malaysia]|    []|    []|    []|
| [malaysia, belanda]|         1|         2|  [malaysia belanda]|    []|    []|    []|
|  [tukang, malaysia]|         1|         2|   [tukang malaysia]|    []|    []|    []|
|       [malaysia, b]|         2|         2|        [malaysia b]|    []|    []|    []|
|      [di, malaysia]|         4|         2|       [di malaysia]|    []|    []|    []|
|[undangundang, ma...|         2|         2|[undangundang mal...|    []|    []|    []|
| [ukraine, malaysia]|         1|         2|  [ukraine malaysia]|    []|    []|    []|
|[kebudayaan, mala...|         1|         2

In [31]:
ngramDataFrame.filter(ngramDataFrame.word_count == 2)

def find_keyword(text):
    keywords =  ['terletak', 'kawasan', 'malaysia',' tentera', 'daerah',  'kabupaten']
    
    empList = []
    for x in text:
        for y in keywords:
            if y in x:
                empList.append(x)
    return empList


In [32]:
column =  ['token_sentence','sent_count','word_count','2grams','3grams','4grams','5grams']
gram5= ngramDataFrame.filter(ngramDataFrame.word_count > 4)\
.rdd.map(lambda x: (x[0], x[1], x[2],find_keyword(x[3])\
,find_keyword(x[4]), find_keyword(x[5]), find_keyword(x[6]))).persist(StorageLevel.MEMORY_ONLY)

In [33]:
df_5gram = gram5.toDF(column).show()

22/10/05 19:01:44 WARN BlockManager: Task 671 already completed, not releasing lock for rdd_97_0
22/10/05 19:01:44 WARN BlockManager: Task 672 already completed, not releasing lock for rdd_97_0
+--------------------+----------+----------+--------------------+--------------------+--------------------+--------------------+
|      token_sentence|sent_count|word_count|              2grams|              3grams|              4grams|              5grams|
+--------------------+----------+----------+--------------------+--------------------+--------------------+--------------------+
|[medan, selera, p...|         1|         8|[pula terletak, t...|[selera pula terl...|[medan selera pul...|[medan selera pul...|
|[terutung, payung...|         1|        17|[yang terletak, t...|[gampong yang ter...|[sebuah gampong y...|[merupakan sebuah...|
|[empangan, bakun,...|         1|         7|[juga terletak, t...|[bakun juga terle...|[empangan bakun j...|[empangan bakun j...|
|[lapangan, terban...|         1

In [34]:
gram4= ngramDataFrame.filter(ngramDataFrame.word_count == 4)\
.rdd.map(lambda x: (x[0], x[1], x[2],find_keyword(x[3])\
,find_keyword(x[4]), find_keyword(x[5]), x[6])).persist(StorageLevel.MEMORY_ONLY)

In [35]:
gram3= ngramDataFrame.filter(ngramDataFrame.word_count == 3)\
.rdd.map(lambda x: (x[0], x[1], x[2],find_keyword(x[3])\
,find_keyword(x[4]), x[5], x[6])).persist(StorageLevel.MEMORY_ONLY)

In [36]:
gram2= ngramDataFrame.filter(ngramDataFrame.word_count == 2)\
.rdd.map(lambda x: (x[0], x[1], x[2],find_keyword(x[3])\
,find_keyword(x[4]), x[5], x[6])).persist(StorageLevel.MEMORY_ONLY)

In [37]:
Final_gram = gram5.union(gram4).union(gram3).union(gram2).toDF(column).persist(StorageLevel.MEMORY_ONLY)

22/10/05 19:01:44 WARN BlockManager: Task 673 already completed, not releasing lock for rdd_97_0


In [38]:
Final_gram.show()

+--------------------+----------+----------+--------------------+--------------------+--------------------+--------------------+
|      token_sentence|sent_count|word_count|              2grams|              3grams|              4grams|              5grams|
+--------------------+----------+----------+--------------------+--------------------+--------------------+--------------------+
|[medan, selera, p...|         1|         8|[pula terletak, t...|[selera pula terl...|[medan selera pul...|[medan selera pul...|
|[terutung, payung...|         1|        17|[yang terletak, t...|[gampong yang ter...|[sebuah gampong y...|[merupakan sebuah...|
|[empangan, bakun,...|         1|         7|[juga terletak, t...|[bakun juga terle...|[empangan bakun j...|[empangan bakun j...|
|[lapangan, terban...|         1|        31|[maputo terletak,...|[terbang maputo t...|[lapangan terbang...|[lapangan terbang...|
|[daerah, preov, w...|        91|        11|      [daerah preov]|[daerah preov wil...|[daerah pre

In [39]:
Final_gram.count()

328852

In [40]:
Final_gram.filter(Final_gram.word_count == 4).show()

+--------------------+----------+----------+--------------------+--------------------+--------------------+------+
|      token_sentence|sent_count|word_count|              2grams|              3grams|              4grams|5grams|
+--------------------+----------+----------+--------------------+--------------------+--------------------+------+
|[pejabat, daerah,...|        13|         4|[pejabat daerah, ...|[pejabat daerah b...|[pejabat daerah b...|    []|
|[terengganu, daru...|         3|         4|     [iman malaysia]|[darul iman malay...|[terengganu darul...|    []|
|[bahtera, perkahw...|         1|         4| [mingguan malaysia]|[perkahwinan ming...|[bahtera perkahwi...|    []|
|[album, artis, ma...|         1|         4|[artis malaysia, ...|[album artis mala...|[album artis mala...|    []|
|[institut, terjem...|         2|         4|   [negara malaysia]|[terjemahan negar...|[institut terjema...|    []|
|[suruhanjaya, pil...|         2|         4|     [raya malaysia]|[pilihan raya m

In [41]:
Final_gram = Final_gram.coalesce(1)

Final_gram.write.parquet("hdfs://g5.bigtop.it:8020/user/root/filtered_ms_wiki.parquet")