## Language Detection

In [1]:
from pyspark.sql import SparkSession
from pyspark import StorageLevel
import os

os.environ["PYSPARK_PYTHON"]="/home/pc/g5_env/bin/python39"

spark = SparkSession.builder.master("local[*]")\
            .appName("Language Detection")\
            .config('spark.executor.memory', '30g')\
            .config('spark.driver.maxResultSize', '30g')\
            .config('spark.driver.memory', '30g')\
            .getOrCreate()
            # .config('spark.ui.showConsoleProgress', False)\
           

sparkContext = spark.sparkContext

spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/09 21:10:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/10/09 21:10:41 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
file_path = "hdfs://g5.bigtop.it:8020/user/root/filtered_social_media.parquet/part-00000-44f708f2-eb56-4495-8536-dd03b0326bbc-c000.snappy.parquet"

In [3]:
df1 = spark.read.option("header",True).parquet(file_path)
df1.show()



+--------------------+----------+----------+--------------------+--------------------+--------------------+--------------------+
|      token_sentence|sent_count|word_count|              2grams|              3grams|              4grams|              5grams|
+--------------------+----------+----------+--------------------+--------------------+--------------------+--------------------+
|[kafi, ghatia, pr...|         1|         5|                  []|                  []|                  []|                  []|
|[pelayanan, yg, r...|         1|         5|                  []|                  []|                  []|                  []|
|[aron, got, to, h...|         1|        23|[top quality, qua...|[getting top qual...|[we getting top q...|[many we getting ...|
|[great, clothes, ...|         1|         6|[great quality, q...|[clothes great qu...|[great clothes gr...|[great clothes gr...|
|[good, product, q...|      1555|         7|[product quality,...|[good product qua...|[good produ

                                                                                

In [4]:
df1.rdd.getNumPartitions()

22

In [5]:
df1 = df1.repartition(110)

In [6]:
df1.rdd.getNumPartitions()



110

In [7]:
df1.show()



+--------------------+----------+----------+--------------------+--------------------+--------------------+--------------------+
|      token_sentence|sent_count|word_count|              2grams|              3grams|              4grams|              5grams|
+--------------------+----------+----------+--------------------+--------------------+--------------------+--------------------+
|[fast, delivery, ...|         2|         9|[high quality, qu...|[delivery high qu...|[fast delivery hi...|[fast delivery hi...|
|[good, pruduct, q...|         1|        10|[pruduct quality,...|[good pruduct qua...|[good pruduct qua...|[good pruduct qua...|
|[nice, penghantar...|         1|        26|                  []|                  []|                  []|                  []|
|[barang, dah, sam...|         1|        10|                  []|                  []|                  []|                  []|
|[barang, sampai, ...|         1|         7|                  []|                  []|           

                                                                                

In [8]:
from pyspark.sql.functions import col

gram2 = df1.select(col("2grams")).withColumnRenamed("2grams","n-grams")
gram3 = df1.select(col("3grams")).filter(df1.word_count > 2).withColumnRenamed("3grams","n-grams")
gram4 = df1.select(col("4grams")).filter(df1.word_count > 3).withColumnRenamed("4grams","n-grams")
gram5 = df1.select(col("5grams")).filter(df1.word_count > 4).withColumnRenamed("5grams","n-grams")

In [9]:
n_gram = gram5.union(gram4).union(gram3).union(gram2).persist(StorageLevel.MEMORY_ONLY)

In [10]:
n_gram.show(1, False)



+-------------------------------------------------------------------------+
|n-grams                                                                  |
+-------------------------------------------------------------------------+
|[good quality material bolela agak, quality material bolela agak berbulu]|
+-------------------------------------------------------------------------+
only showing top 1 row



                                                                                

In [11]:
from pyspark.sql.functions import explode
ngrams = n_gram.select(explode(col('n-grams'))).withColumnRenamed("col","ngrams")

In [12]:
ngrams.show()

+--------------------+
|              ngrams|
+--------------------+
|good quality mate...|
|quality material ...|
|qualityworth the ...|
|the money and hig...|
|money and high qu...|
|and high quality ...|
|kerjabungkusan wa...|
|quality tudung te...|
|barang dah sampi ...|
|dah sampi bungkus...|
|sampi bungkusan o...|
|bungkusan okkemas...|
|okkemas xde ug ko...|
|kalau nak nampak ...|
|nak nampak lagi k...|
|nampak lagi kemas...|
|lagi kemas kena t...|
|kemas kena tekan ...|
|quality ok but se...|
|very impress with...|
+--------------------+
only showing top 20 rows



In [13]:
def f(x): return x   
def exchangePosition(x, y):
    return y, x

In [14]:
ngrams2 = ngrams.rdd.map(lambda x: (1,x))\
.flatMapValues(f)\
.map(lambda x: exchangePosition(x[0], x[1]))\
.reduceByKey(lambda a,b: a+b)\
.persist(StorageLevel.MEMORY_ONLY)

In [15]:
ngrams2.count()

                                                                                

1431722

In [16]:
column = ['ngram', 'gram_count']
ngrams2 = ngrams2.toDF(column)

In [17]:
ngrams2.show()

+------------------------+----------+
|                   ngram|gram_count|
+------------------------+----------+
|    barang quality be...|         1|
|    quality this is w...|         1|
|    selamat bungkusan...|         1|
|    you courier servi...|         1|
|    poor product qual...|         1|
|    wraplong expiry d...|         2|
|    pantas dan kemasb...|         1|
|    has a good qualit...|        37|
|    box quality okay ...|         1|
|    per describe and ...|        11|
|    penghantaran cepa...|         1|
|    always been good ...|         1|
|    for product quali...|         1|
|    baik good product...|         2|
|药水 卖家 已经 答应 下次|         1|
|    dan dibungkus den...|         4|
|    kuat pembungkusan...|         1|
|    这 卖家 真的 是 可以|         1|
|    quality ok je ras...|         1|
|    really good quali...|         2|
+------------------------+----------+
only showing top 20 rows



In [18]:
ngram2 = ngrams2.drop('gram_count')

In [19]:
ngram2.show()

+------------------------+
|                   ngram|
+------------------------+
|    barang quality be...|
|    quality this is w...|
|    selamat bungkusan...|
|    you courier servi...|
|    poor product qual...|
|    wraplong expiry d...|
|    pantas dan kemasb...|
|    has a good qualit...|
|    box quality okay ...|
|    per describe and ...|
|    penghantaran cepa...|
|    always been good ...|
|    for product quali...|
|    baik good product...|
|药水 卖家 已经 答应 下次|
|    dan dibungkus den...|
|    kuat pembungkusan...|
|    这 卖家 真的 是 可以|
|    quality ok je ras...|
|    really good quali...|
+------------------------+
only showing top 20 rows



In [20]:
ngram2.printSchema()

root
 |-- ngram: string (nullable = true)



In [21]:
import pyspark.sql.functions as f
ngram2 = ngram2.withColumn('word_count', f.size(f.split(f.col('ngram'), ' '))).persist(StorageLevel.MEMORY_ONLY)
ngram2.show(10, False)

+-------------------------------------------+----------+
|ngram                                      |word_count|
+-------------------------------------------+----------+
|barang quality berbaloi dengan hargamungkin|5         |
|quality this is worth to                   |5         |
|selamat bungkusan kemas sangat berpuas     |5         |
|you courier services and hope              |5         |
|poor product quality this product          |5         |
|wraplong expiry dategood product quality   |5         |
|pantas dan kemasbarang mampu milik         |5         |
|has a good quality and                     |5         |
|box quality okay dah selalu                |5         |
|per describe and quality is                |5         |
+-------------------------------------------+----------+
only showing top 10 rows



In [22]:
ngram2.count()

                                                                                

1431722

In [23]:
keywords =  ['quality', ' service', '购买', '卖家', 'kemas']

def keyword_position(text, n_gram):

    keyword = []
    index_of_keyword =[]
    
    tempList = list(text.split(" "))
    
    for x in keywords:
        i = 0
        for y in tempList:
            if x == y:
                keyword.append(x)
                index_of_keyword.append(i)
            i = i + 1
    
    if keyword == []:
        return
    
    if n_gram == 3 or n_gram == 2:
        return text
    
    if n_gram == 5:
        if 2 in index_of_keyword:
            return text
    
    if n_gram == 4:
        if 1 in index_of_keyword or 2 in index_of_keyword:
            return text

    return

In [24]:
from pyspark.sql import Row

row = Row("ngram")
ngram3 = ngram2.rdd.map(lambda x: (keyword_position(x[0], x[1]))).map(row).toDF().dropna(how='any').persist(StorageLevel.MEMORY_ONLY)

In [25]:
ngram3.count()

                                                                                

534349

In [26]:
from pyspark.sql.types import StringType
from lingua import Language, LanguageDetectorBuilder
from pyspark.sql.functions import col 

def lang_detect_word(text):
    languages = [Language.ENGLISH, Language.MALAY, Language.CHINESE]
    detector = LanguageDetectorBuilder.from_languages(*languages)\
    .with_minimum_relative_distance(0.1)\
    .build()
    
    empList = []
    
    for x in text.split(' '):
        language = detector.detect_language_of(x)
        if language == None:
            empList.append('OOV')
        elif language.name == 'ENGLISH':
            empList.append('EN')
        elif language.name == 'MALAY':
            empList.append('MS')
        elif language.name == 'CHINESE':
            empList.append('ZH')    

    return empList

rdd_lang_detect = ngram3.rdd.map(lambda x:  (x[0],lang_detect_word(x[0]))).persist(StorageLevel.MEMORY_ONLY)
rdd_lang_detect.count()

                                                                                

534349

In [27]:
rdd_lang_detect.toDF().show(truncate = False)

+--------------------------------------+----------------------+
|_1                                    |_2                    |
+--------------------------------------+----------------------+
|selamat bungkusan kemas sangat berpuas|[MS, MS, MS, MS, MS]  |
|poor product quality this product     |[EN, EN, EN, EN, EN]  |
|for product quality star with         |[EN, EN, EN, OOV, EN] |
|really good quality fast delivery     |[EN, EN, EN, EN, EN]  |
|usual product quality is great        |[EN, EN, EN, EN, EN]  |
|and nice quality will purchase        |[EN, EN, EN, EN, EN]  |
|money and quality looks great         |[EN, EN, EN, EN, EN]  |
|glossy good quality suitable for      |[EN, EN, EN, EN, EN]  |
|selamat bungkusan kemas kualiti baik  |[MS, MS, MS, OOV, MS] |
|好 是 卖家 还有 回复                  |[ZH, ZH, ZH, ZH, ZH]  |
|yang cepat quality yang sangat        |[MS, MS, EN, MS, MS]  |
|dapat high quality mcm ni             |[MS, EN, EN, OOV, OOV]|
|money good quality product fast       |[EN, EN,

In [28]:
def lang_detect_sentence(text):
    languages = [Language.ENGLISH, Language.MALAY, Language.CHINESE]
    detector = LanguageDetectorBuilder.from_languages(*languages)\
    .build()
    
    language = detector.detect_language_of(text)
    if language == None:
        return 'None'
    elif language.name == 'ENGLISH':
        
        return 'EN'
    elif language.name == 'MALAY':
        return 'MS'
    elif language.name == 'CHINESE':
        return 'ZH'
    return 'None'

In [29]:
lang_detect = rdd_lang_detect.map(lambda x: (x[0], x[1], lang_detect_sentence(x[0]))).persist(StorageLevel.MEMORY_ONLY)

In [30]:
lang_detect.count()

                                                                                

534349

In [31]:
column = ['sentence', 'language_word', 'language_sentence']

In [32]:
df_lang_detect = lang_detect.toDF(column)

In [33]:
df_lang_detect.show()

+--------------------+--------------------+-----------------+
|            sentence|       language_word|language_sentence|
+--------------------+--------------------+-----------------+
|selamat bungkusan...|[MS, MS, MS, MS, MS]|               MS|
|poor product qual...|[EN, EN, EN, EN, EN]|               EN|
|for product quali...|[EN, EN, EN, OOV,...|               EN|
|really good quali...|[EN, EN, EN, EN, EN]|               EN|
|usual product qua...|[EN, EN, EN, EN, EN]|               EN|
|and nice quality ...|[EN, EN, EN, EN, EN]|               EN|
|money and quality...|[EN, EN, EN, EN, EN]|               EN|
|glossy good quali...|[EN, EN, EN, EN, EN]|               EN|
|selamat bungkusan...|[MS, MS, MS, OOV,...|               MS|
|好 是 卖家 还有 回复|[ZH, ZH, ZH, ZH, ZH]|               ZH|
|yang cepat qualit...|[MS, MS, EN, MS, MS]|               MS|
|dapat high qualit...|[MS, EN, EN, OOV,...|               EN|
|money good qualit...|[EN, EN, EN, EN, EN]|               EN|
|a proud quality

In [34]:
from pyspark.sql.functions import col, concat_ws
df_final = df_lang_detect.withColumn('language_word', concat_ws(', ', col('language_word')))

In [35]:
df_final.show()

+--------------------+--------------------+-----------------+
|            sentence|       language_word|language_sentence|
+--------------------+--------------------+-----------------+
|selamat bungkusan...|  MS, MS, MS, MS, MS|               MS|
|poor product qual...|  EN, EN, EN, EN, EN|               EN|
|for product quali...| EN, EN, EN, OOV, EN|               EN|
|really good quali...|  EN, EN, EN, EN, EN|               EN|
|usual product qua...|  EN, EN, EN, EN, EN|               EN|
|and nice quality ...|  EN, EN, EN, EN, EN|               EN|
|money and quality...|  EN, EN, EN, EN, EN|               EN|
|glossy good quali...|  EN, EN, EN, EN, EN|               EN|
|selamat bungkusan...| MS, MS, MS, OOV, MS|               MS|
|好 是 卖家 还有 回复|  ZH, ZH, ZH, ZH, ZH|               ZH|
|yang cepat qualit...|  MS, MS, EN, MS, MS|               MS|
|dapat high qualit...|MS, EN, EN, OOV, OOV|               EN|
|money good qualit...|  EN, EN, EN, EN, EN|               EN|
|a proud quality