In [1]:
from pyspark.sql import SparkSession
from pyspark import StorageLevel
import os

# Python executable PySpark should use; set before the session is created.
os.environ["PYSPARK_PYTHON"] = "/home/pc/g5_env/bin/python39"

# Local-mode session with five worker threads and the console progress bar disabled.
spark = (
    SparkSession.builder
    .master("local[5]")
    .appName("en_pos_tag")
    .config('spark.executor.memory', '20g')
    .config('spark.driver.maxResultSize', '10g')
    .config('spark.driver.memory', '20g')
    .config('spark.ui.showConsoleProgress', False)
    .getOrCreate()
)

sparkContext = spark.sparkContext

# Bare expression so the notebook renders the session summary.
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/07 23:36:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/10/07 23:36:45 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/10/07 23:36:45 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
22/10/07 23:36:45 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.
22/10/07 23:36:45 WARN Utils: Service 'SparkUI' could not bind on port 4043. Attempting port 4044.
22/10/07 23:36:45 WARN Utils: Service 'SparkUI' could not bind on port 4044. Attempting port 4045.


In [2]:
# HDFS location of the single Parquet part file read by this notebook.
file_path = (
    'hdfs://g5.bigtop.it:8020/user/root/filtered_en_wiki.parquet/'
    'part-00000-b7931c44-7a8c-4f89-8265-6d16a044997a-c000.snappy.parquet'
)

In [3]:
# Load the tokenised sentences with their pre-computed 2- to 5-gram arrays.
# Parquet files carry their own schema; the CSV-style "header" option has no
# meaning for this reader, so it is not set.
df1 = spark.read.parquet(file_path)
df1.show()

+--------------------+----------+----------+--------------------+--------------------+--------------------+--------------------+
|      token_sentence|sent_count|word_count|              2grams|              3grams|              4grams|              5grams|
+--------------------+----------+----------+--------------------+--------------------+--------------------+--------------------+
|[there, are, prop...|       144|        13|[the national, na...|[on the national,...|[listed on the na...|[districts listed...|
|[his, candidature...|         1|        15|[indian national,...|[the indian natio...|[by the indian na...|[supported by the...|
|[abingdon, is, th...|         1|         7|        [aaa school]|[smallest aaa sch...|[2nd smallest aaa...|[the 2nd smallest...|
|[in, he, returned...|         1|        20|[stanford univers...|[the stanford uni...|[to the stanford ...|[returned to the ...|
|[while, most, cul...|         1|        23|[a secondary, sec...|[considered a sec...|[are consid

In [4]:
from pyspark.sql.functions import col

# One single-column frame per n-gram size, all exposing the column "n-grams" so
# they can be unioned afterwards. Sentences too short to contain an n-gram of a
# given size are excluded via word_count. The filter runs BEFORE the projection
# so it does not rely on Spark re-resolving a column the select() has dropped.
gram2 = df1.select(col("2grams").alias("n-grams"))
gram3 = df1.filter(col("word_count") > 2).select(col("3grams").alias("n-grams"))
gram4 = df1.filter(col("word_count") > 3).select(col("4grams").alias("n-grams"))
gram5 = df1.filter(col("word_count") > 4).select(col("5grams").alias("n-grams"))

In [5]:
# Stack the per-size frames (5-grams first, 2-grams last) and keep the result in memory.
n_gram = (
    gram5
    .union(gram4)
    .union(gram3)
    .union(gram2)
    .persist(StorageLevel.MEMORY_ONLY)
)

In [6]:
# Preview the unioned frame; each row still holds an array of n-gram strings.
n_gram.show()

+--------------------+
|             n-grams|
+--------------------+
|[districts listed...|
|[supported by the...|
|[the 2nd smallest...|
|[returned to the ...|
|[cultivator are c...|
|[degree from the ...|
|[high schools ser...|
|[recover taxes ow...|
|[the school initi...|
|[british army as ...|
|[in national elec...|
|[thesis universit...|
|[he also set the ...|
|[of kerala won th...|
|[he attended new ...|
|[tobias went to s...|
|[standing as the ...|
|[in oxford dictio...|
|[podaras while at...|
|[establishment of...|
+--------------------+
only showing top 20 rows



In [7]:
from pyspark.sql.functions import explode
# Flatten the arrays: one output row per n-gram string, in a column named "ngrams".
ngrams = n_gram.select(
    explode(col('n-grams')).alias("ngrams")
)

In [8]:
# Preview after explode: one n-gram string per row.
ngrams.show()

+--------------------+
|              ngrams|
+--------------------+
|districts listed ...|
|listed on the nat...|
|on the national r...|
|the national regi...|
|national register...|
|supported by the ...|
|by the indian nat...|
|the indian nation...|
|indian national c...|
|national congress...|
|the 2nd smallest ...|
|returned to the s...|
|to the stanford u...|
|the stanford univ...|
|the stanford univ...|
|stanford universi...|
|stanford universi...|
|university gradua...|
|university gradua...|
|graduate school o...|
+--------------------+
only showing top 20 rows



In [9]:
def f(x):
    """Identity function: hand the argument back unchanged."""
    return x
def exchangePosition(x, y):
    """Return the two arguments as a tuple in reversed order, ``(y, x)``."""
    swapped = (y, x)
    return swapped

In [10]:
# Count how often each n-gram occurs: emit (value, 1) for every field of every
# row, then sum the ones per key. The pairs are identical to what the previous
# tag / flatten / swap sequence produced, but this cell no longer depends on the
# global helper `f`, whose name is rebound to the pyspark.sql.functions module by
# a later `import ... as f` (which made re-running this cell fail).
ngrams2 = (
    ngrams.rdd
    .flatMap(lambda row: [(value, 1) for value in row])
    .reduceByKey(lambda a, b: a + b)
    .persist(StorageLevel.MEMORY_ONLY)
)

In [32]:
# Number of records in ngrams2 (one per distinct n-gram after the reduceByKey above).
# NOTE(review): this cell's execution counter (In [32]) is out of sequence with its
# neighbours, so the output shown comes from a later re-run.
ngrams2.count()

5888569

In [12]:
# Column names for the (n-gram, count) pairs.
column = ['ngram', 'gram_count']
# NOTE(review): this rebinds `ngrams2` from the pair RDD built earlier to a
# DataFrame, so the cell is presumably not safe to run twice — confirm.
ngrams2 = ngrams2.toDF(column)

22/10/07 23:37:11 WARN BlockManager: Task 56 already completed, not releasing lock for rdd_42_0


In [13]:
# Preview the n-gram / occurrence-count pairs.
ngrams2.show()

22/10/07 23:37:11 WARN BlockManager: Task 57 already completed, not releasing lock for rdd_42_0
+--------------------+----------+
|               ngram|gram_count|
+--------------------+----------+
|secondary tillage...|         1|
|high schools serv...|         1|
|city students fre...|         1|
|other high school...|         1|
|kerala won the na...|         1|
|dictionary of nat...|         1|
|dictionary of nat...|         3|
|educated at westm...|        42|
|the university of...|         1|
|houstons faculty ...|         1|
|national railways...|        14|
|the national regi...|      6404|
|negro national le...|         2|
|team members were...|         1|
|the tertiary sect...|        85|
|a secondary role ...|         1|
|a national inspir...|         1|
|national inspirat...|         1|
|environment herit...|         1|
|southern methodis...|         1|
+--------------------+----------+
only showing top 20 rows



In [14]:
# Keep only the n-gram strings. The counts are discarded here, so the earlier
# reduceByKey effectively serves to de-duplicate the n-grams.
ngram2 = ngrams2.drop('gram_count')

In [15]:
# Preview the de-duplicated n-gram strings.
ngram2.show()

22/10/07 23:37:11 WARN BlockManager: Task 58 already completed, not releasing lock for rdd_42_0
+--------------------+
|               ngram|
+--------------------+
|secondary tillage...|
|high schools serv...|
|city students fre...|
|other high school...|
|kerala won the na...|
|dictionary of nat...|
|dictionary of nat...|
|educated at westm...|
|the university of...|
|houstons faculty ...|
|national railways...|
|the national regi...|
|negro national le...|
|team members were...|
|the tertiary sect...|
|a secondary role ...|
|a national inspir...|
|national inspirat...|
|environment herit...|
|southern methodis...|
+--------------------+
only showing top 20 rows



In [16]:
import pyspark.sql.functions as f
# Add the number of space-separated tokens of each n-gram as "word_count",
# keep the result in memory, and count its rows.
ngram2 = (
    ngram2
    .withColumn(
        'word_count',
        f.size(f.split(f.col('ngram'), ' ')),
    )
    .persist(StorageLevel.MEMORY_ONLY)
)
ngram2.count()

5888569

In [17]:
# Show ten rows with the n-gram text printed in full (no column truncation).
ngram2.show(n=10, truncate=False)

+----------------------------------------------+----------+
|ngram                                         |word_count|
+----------------------------------------------+----------+
|secondary tillage implement active cultivators|5         |
|high schools serving frederick city           |5         |
|city students frederick high school           |5         |
|other high schools in frederick               |5         |
|kerala won the national award                 |5         |
|dictionary of national biography lilian       |5         |
|dictionary of national biography john         |5         |
|educated at westminster school and            |5         |
|the university of houstons faculty            |5         |
|houstons faculty includes national medal      |5         |
+----------------------------------------------+----------+
only showing top 10 rows



In [18]:
# Words an n-gram must contain (at an accepted position) to be kept.
keywords = ['secondary', 'school', 'tertiary', 'university', 'national', 'private']


def keyword_position(text, n_gram):
    """Return ``text`` if a keyword sits at an accepted position, else ``None``.

    ``text`` is split on single spaces and every token equal to an entry of the
    module-level ``keywords`` list is located. The n-gram is kept when:

    * ``n_gram`` is 2 or 3 and any token is a keyword;
    * ``n_gram`` is 4 and a keyword is at index 1 or 2;
    * ``n_gram`` is 5 and a keyword is at index 2.

    Every other case (no keyword at all, keyword only at another index, or any
    other ``n_gram`` value) yields ``None``.

    Args:
        text: Space-separated n-gram string.
        n_gram: Declared number of words in ``text``; selects the rule above.

    Returns:
        ``text`` unchanged when the rule for ``n_gram`` is satisfied, otherwise
        ``None``.
    """
    # Zero-based positions of all tokens that are keywords.
    keyword_indices = {
        index
        for index, token in enumerate(text.split(" "))
        if token in keywords
    }
    if not keyword_indices:
        return None

    # Short n-grams qualify as soon as they contain any keyword.
    if n_gram in (2, 3):
        return text

    # Longer n-grams need a keyword at one of the listed positions.
    if n_gram == 5:
        accepted_indices = {2}
    elif n_gram == 4:
        accepted_indices = {1, 2}
    else:
        return None

    return text if keyword_indices & accepted_indices else None

In [19]:
from pyspark.sql import Row

row = Row("ngram")

# Keep only the n-grams accepted by keyword_position. Rejected n-grams come back
# as None and are removed on the RDD *before* the DataFrame is built, and the
# schema is given explicitly: schema inference looks at the leading rows and
# cannot determine a type if those are all None.
ngram3 = (
    ngram2.rdd
    .map(lambda x: keyword_position(x[0], x[1]))
    .filter(lambda kept: kept is not None)
    .map(row)
    .toDF("ngram: string")
    .persist(StorageLevel.MEMORY_ONLY)
)

In [20]:
from nltk import pos_tag, word_tokenize

def en_pos_tag(text):
    """Return the part-of-speech tags of ``text``, one tag per token.

    ``text`` is tokenised with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag(..., lang='eng')``; only the tags are returned, in token
    order.

    Args:
        text: The string to tag.

    Returns:
        A list with the tag of every token.
    """
    # Resolved locally on purpose: a later cell rebinds the global name
    # `pos_tag` to an RDD, so looking the tagger up through that global would
    # break this function on any subsequent use.
    import nltk

    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text), lang='eng')
    return [tag for _token, tag in tagged_tokens]

In [21]:
# Partition count of the RDD underlying ngram3.
ngram3.rdd.getNumPartitions()

20

In [22]:
# Redistribute the filtered n-grams over 48 partitions (the previous cell reported 20).
# NOTE(review): the repartitioned DataFrame is not persisted here, unlike the one
# it replaces; the rationale for 48 is not shown — confirm.
ngram3 = ngram3.repartition(48)

In [23]:
# Number of n-grams that passed the keyword filter.
ngram3.count()

2253091

In [24]:
# Pair every n-gram string with its POS-tag list and keep the result in memory.
# NOTE(review): this rebinds `pos_tag`, the name imported from nltk in an earlier
# cell, to an RDD — from here on `pos_tag` no longer refers to the nltk tagger.
pos_tag = ngram3.rdd.map(lambda x: (x[0], en_pos_tag(x[0]))).persist(StorageLevel.MEMORY_ONLY)

In [25]:
# Count the tagged records (same total as the filtered n-grams above).
pos_tag.count()

2253091

In [26]:
# Column names for the (n-gram, tag list) pairs.
columns = ['sentence', 'pos_tag']

# Convert the tagged RDD into a DataFrame with those column names.
df_pos_tag = pos_tag.toDF(columns)

22/10/07 23:39:38 WARN BlockManager: Task 220 already completed, not releasing lock for rdd_110_0


In [27]:
# Preview the n-grams next to their POS-tag arrays.
df_pos_tag.show()

+--------------------+--------------------+
|            sentence|             pos_tag|
+--------------------+--------------------+
|condemned the uni...|   [VBD, DT, NN, IN]|
|high school in be...|    [JJ, NN, IN, NN]|
|mens national cyc...|       [NNS, JJ, NN]|
|key national port...|        [JJ, JJ, NN]|
|of the private al...|[IN, DT, JJ, NN, NN]|
|roads the univers...|[NNS, DT, NN, IN,...|
|     play the school|        [VB, DT, NN]|
|in pacific nation...|[IN, JJ, JJ, VBD,...|
|dalarna universit...|        [NN, NN, IN]|
|act us national a...|   [VB, PRP, JJ, NN]|
|and public school...|[CC, JJ, NN, PRP,...|
|primary school gr...|        [JJ, NN, NN]|
|joined school choirs|       [JJ, NN, NNS]|
|reports universit...| [NNS, NN, NNS, VBP]|
|sai international...|[JJ, JJ, NN, IN, DT]|
|festival american...|   [JJ, JJ, JJ, NNS]|
|school district r...|        [NN, NN, JJ]|
|the herkomer scho...|    [DT, JJ, NN, IN]|
|school for collec...|        [NN, IN, NN]|
|     of bjp national|        [I

In [28]:
import pyspark.sql.functions as f
# Add the number of space-separated tokens of each n-gram as column "n-gram".
df_final = df_pos_tag.withColumn(
    'n-gram',
    f.size(f.split(f.col('sentence'), ' ')),
)

In [29]:
# Preview the tagged n-grams together with their size.
df_final.show()

+--------------------+--------------------+------+
|            sentence|             pos_tag|n-gram|
+--------------------+--------------------+------+
|condemned the uni...|   [VBD, DT, NN, IN]|     4|
|high school in be...|    [JJ, NN, IN, NN]|     4|
|mens national cyc...|       [NNS, JJ, NN]|     3|
|key national port...|        [JJ, JJ, NN]|     3|
|of the private al...|[IN, DT, JJ, NN, NN]|     5|
|roads the univers...|[NNS, DT, NN, IN,...|     5|
|     play the school|        [VB, DT, NN]|     3|
|in pacific nation...|[IN, JJ, JJ, VBD,...|     5|
|dalarna universit...|        [NN, NN, IN]|     3|
|act us national a...|   [VB, PRP, JJ, NN]|     4|
|and public school...|[CC, JJ, NN, PRP,...|     5|
|primary school gr...|        [JJ, NN, NN]|     3|
|joined school choirs|       [JJ, NN, NNS]|     3|
|reports universit...| [NNS, NN, NNS, VBP]|     4|
|sai international...|[JJ, JJ, NN, IN, DT]|     5|
|festival american...|   [JJ, JJ, JJ, NNS]|     4|
|school district r...|        [

In [30]:
from pyspark.sql.functions import col, concat_ws
# Replace the tag array with a single string of tags joined by ", ".
df_final = df_final.withColumn(
    'pos_tag',
    concat_ws(', ', col('pos_tag')),
)

In [31]:
# Preview the final frame with the tags rendered as one comma-separated string.
df_final.show()

+--------------------+--------------------+------+
|            sentence|             pos_tag|n-gram|
+--------------------+--------------------+------+
|condemned the uni...|     VBD, DT, NN, IN|     4|
|high school in be...|      JJ, NN, IN, NN|     4|
|mens national cyc...|         NNS, JJ, NN|     3|
|key national port...|          JJ, JJ, NN|     3|
|of the private al...|  IN, DT, JJ, NN, NN|     5|
|roads the univers...| NNS, DT, NN, IN, NN|     5|
|     play the school|          VB, DT, NN|     3|
|in pacific nation...| IN, JJ, JJ, VBD, DT|     5|
|dalarna universit...|          NN, NN, IN|     3|
|act us national a...|     VB, PRP, JJ, NN|     4|
|and public school...|CC, JJ, NN, PRP, VBD|     5|
|primary school gr...|          JJ, NN, NN|     3|
|joined school choirs|         JJ, NN, NNS|     3|
|reports universit...|   NNS, NN, NNS, VBP|     4|
|sai international...|  JJ, JJ, NN, IN, DT|     5|
|festival american...|     JJ, JJ, JJ, NNS|     4|
|school district r...|         