# <font color='red'>Social Media Data Preproccessing</font> 

## <font color='red'>Start Spark Session</font> 

In [1]:
from pyspark.sql import SparkSession
from pyspark import StorageLevel
import os

os.environ["PYSPARK_PYTHON"]="/home/pc/g5_env/bin/python39"

spark = SparkSession.builder.master("local[22]")\
            .appName("preproccesing")\
            .config('spark.executor.memory', '20g')\
            .config('spark.driver.maxResultSize', '10g')\
            .config('spark.driver.memory', '20g')\
            .config('spark.ui.showConsoleProgress', False)\
            .getOrCreate()

sparkContext = spark.sparkContext

spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/10/06 23:52:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
spark.conf.get('spark.sql.files.maxPartitionBytes')

'134217728b'

In [3]:
spark.conf.set('spark.sql.files.maxPartitionBytes', 64000000)
spark.conf.get('spark.sql.files.maxPartitionBytes')

'64000000'

## <font color='red'>Import Social Media Data</font> 

In [4]:
dataDF = spark.read.option("header",True).csv("/home/pc/data/parsed_data/4data-comment_only.csv")
dataDF.show(10)

+--------------------+
|             Comment|
+--------------------+
|#sephorahaul #sep...|
|Yahoo! ❤️😍 enjoy...|
|            NAKK😍😍|
|Congratulations g...|
|@stacienotinokey ...|
|What an inspiring...|
|Always the best d...|
|@dpb1982 I’ll mak...|
|@brentlys_bbq tha...|
|@dpb1982 I defini...|
+--------------------+
only showing top 10 rows



In [5]:
dataDF.rdd.getNumPartitions()

22

In [6]:
dataDF = dataDF.repartition(66)

In [7]:
dataDF.rdd.getNumPartitions()

66

## <font color='red'>Dealing With Null Value</font>

In [8]:
### Checking total number of null value
from pyspark.sql.functions import col,isnan, when, count

c = 'Comment'
dataDF.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c)]).show()

+-------+
|Comment|
+-------+
|    699|
+-------+



In [9]:
### Remove null value

dataDF = dataDF.dropna(how='any')

### <font color='red'>Sentence Tokenize</font>

In [10]:
from nltk.tokenize import sent_tokenize, word_tokenize
def sentence_tokenize(text, index):
    return index,sent_tokenize(text)

def f(x): return x

In [11]:
# Adding indexes to all sentence
# Swapping the column
# Splitting the sentence
RDD = dataDF.rdd.map(lambda x: x[0]).zipWithIndex()\
.map(lambda x: sentence_tokenize(x[0], x[1]))\
.flatMapValues(f)\
.persist(StorageLevel.MEMORY_ONLY)

## <font color='red'>Text Cleaning</font>


#### Remove HTML tag

In [12]:
def remove_html_tags(text):
    #Remove html tags from a string
    import re
    clean = re.compile('<.*?>')
    return re.sub(clean, ' ', text)

#### Remove contractions

In [13]:
import contractions

def contractions_word(text):
    # using contractions.fix to expand the shortened words
    try:
        text = contractions.fix(text)
    except:
        return text
    return text


#### Remove Emoticons

In [14]:
from emot.emo_unicode import EMOTICONS_EMO

def remove_emoticons(text):
    for emot in EMOTICONS_EMO:
        text = text.replace(emot, '')
    return text


In [15]:
RDD2 = RDD.map(lambda x : remove_html_tags(x[1]))\
.map(lambda x : contractions_word(x))\
.map(lambda x: remove_emoticons(x))\
.persist(StorageLevel.MEMORY_ONLY)



In [16]:
RDD3 = RDD2.zipWithIndex()\
.map(lambda x: sentence_tokenize(x[0], x[1]))\
.flatMapValues(f)\
.persist(StorageLevel.MEMORY_ONLY)

#### 1) Remove Emoji, URL, Phone Number, Currency, Punctuations, Digit, E-mail
#### 2) To Lower

In [17]:
from cleantext import clean

def clean_text(text): 
    return str(clean(text,
            fix_unicode=True,               # fix various unicode errors
            to_ascii=False,                  # transliterate to closest ASCII representation
            lower=True,                     # lowercase text
            no_line_breaks=True,           # fully strip line breaks as opposed to only normalizing them
            no_urls=True,                  # replace all URLs with a special token
            no_emails=True,                # replace all email addresses with a special token
            no_phone_numbers=True,         # replace all phone numbers with a special token
            no_numbers=True,               # replace all numbers with a special token
            no_digits=True,                # replace all digits with a special token
            no_currency_symbols=True,      # replace all currency symbols with a special token
            no_punct=True,                 # remove punctuations
            no_emoji=True,
            replace_with_punct="",          # instead of removing punctuations you may replace them
            replace_with_url="",
            replace_with_email="",
            replace_with_phone_number="",
            replace_with_number="",
            replace_with_digit="",
            replace_with_currency_symbol="",
            lang="en"                       # set to 'de' for German special handling
            ))


#### Remove Symbols

In [18]:
def remove_symbols(text):
    symbols = ['+', '^', '|', '~', '>', '<', '=', '`']
    for x in symbols:
        text = text.replace(x, ' ')
    return text

In [19]:
RDD4 = RDD3.map(lambda x : (clean_text(x)))\
.filter(lambda x : x != '')\
.map(lambda x : (remove_symbols(x)))\
.filter(lambda x : x != '')\
.persist(StorageLevel.MEMORY_ONLY)

## <font color='red'>Word Tokenize</font>

In [20]:
import jieba
import logging

def my_jieba(text):
    jieba.setLogLevel(logging.WARNING)
    return jieba.lcut(text, cut_all=False)

def remove_space(text):
    empList = []

    for x in text: 
        if x != '':
            if x != ' ':
                empList.append(x)
    return empList

def exchangePosition(text, index):
    return index, (text)


In [21]:
RDD5 = RDD4.map(lambda x: my_jieba(x))\
.map(lambda x: remove_space(x)).persist(StorageLevel.MEMORY_ONLY)

In [22]:
#Possible Ngram final output
data = RDD5.map(lambda x: (x,1))\
.map(lambda x: exchangePosition(x[0], x[1]))\
.flatMapValues(f)\
.map(lambda x: exchangePosition(x[0], x[1]))\
.reduceByKey(lambda a,b: a+b)

In [23]:
from pyspark.sql.types import StructType,StructField, StringType, IntegerType

schema = StructType([ \
    StructField('word',StringType(),True), \
    StructField('word_count',IntegerType(),True), \
  ])


df = spark.createDataFrame(data = data, schema=schema)

In [24]:
df.show()

+-----------+----------+
|       word|word_count|
+-----------+----------+
|       even|     63300|
|    protest|       381|
|       baap|        21|
|    looking|     14077|
|       used|     24162|
|        yes|     10949|
|      jimin|      2923|
|      there|     45774|
|  literally|      2240|
|        bro|      7233|
|    bangeet|        21|
|     akulag|         1|
|       stop|      5552|
|       fool|       192|
|    kalinya|       419|
|       dari|    143055|
|   sickness|        77|
|  adilsleef|         1|
|tinggkatkan|         1|
|    willing|      1773|
+-----------+----------+
only showing top 20 rows



In [25]:
import advertools as adv
from nltk.corpus import stopwords as stop_words

zh_STOPWORDS, indo_STOPWORDS, eng_STOPWORDS = adv.stopwords['chinese'], adv.stopwords['indonesian'], stop_words.words('english')

In [26]:
df.orderBy("word_count", ascending=False)\
.filter(~col("word").isin(eng_STOPWORDS))\
.filter(~col("word").isin(zh_STOPWORDS))\
.filter(~col("word").isin(indo_STOPWORDS))\
.show(10)

+--------+----------+
|    word|word_count|
+--------+----------+
|    good|   3640420|
|  seller|   2594390|
|  barang|   1233846|
|delivery|   1192163|
|    fast|   1166771|
|      tq|   1004259|
| terbaik|    955579|
|   thank|    917710|
|    beli|    800359|
|      ok|    793587|
+--------+----------+
only showing top 10 rows



## <font color='red'>Data Extraction</font> 

In [27]:
def text_extract(text):
    keywords =  ['quality', ' service', '购买', '卖家', 'kemas']

    for x in keywords:
        if x in text:
            return text
        
    return ''


keyword_filter = RDD5.map(lambda x: text_extract(x)).filter(lambda x: x != '').persist(StorageLevel.MEMORY_ONLY)

In [28]:
keyword_filter.count()

846664

In [29]:
data1 = keyword_filter.map(lambda x : (tuple(x), 1))\
.reduceByKey(lambda a,b: a+b).persist(StorageLevel.MEMORY_ONLY)

In [30]:
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, ArrayType
schema = StructType([ \
    StructField('token_sentence',ArrayType(StringType()),True), \
    StructField('sent_count',IntegerType(),True), \
  ])


df1 = spark.createDataFrame(data = data1, schema=schema)
df1.show()

+--------------------+----------+
|      token_sentence|sent_count|
+--------------------+----------+
|[aron, got, to, h...|         1|
|[great, clothes, ...|         1|
|[good, product, q...|      1555|
|[the, former, qua...|         1|
|[the, quality, is...|         4|
|[penghantaran, pa...|         2|
|  [skru, tak, kemas]|         2|
|[the, quality, is...|         1|
|[terbaik, seller,...|         1|
|[foldable, car, o...|         2|
|[quality, is, great]|        55|
|[good, product, q...|         5|
|[bagus, bagus, pe...|         1|
|[penghantaran, yg...|         2|
|[the, goods, land...|         2|
|[powerbank, murah...|         1|
|[the, quality, is...|        81|
|[the, quality, mi...|         1|
|[thankyou, seller...|         4|
|[the, quality, is...|       153|
+--------------------+----------+
only showing top 20 rows



In [31]:
import pyspark.sql.functions as f
df1 = df1.withColumn('word_count', f.size(f.col('token_sentence')))

In [32]:
df1.show()

+--------------------+----------+----------+
|      token_sentence|sent_count|word_count|
+--------------------+----------+----------+
|[aron, got, to, h...|         1|        23|
|[great, clothes, ...|         1|         6|
|[good, product, q...|      1555|         7|
|[the, former, qua...|         1|        14|
|[the, quality, is...|         4|         4|
|[penghantaran, pa...|         2|        17|
|  [skru, tak, kemas]|         2|         3|
|[the, quality, is...|         1|         9|
|[terbaik, seller,...|         1|         5|
|[foldable, car, o...|         2|        10|
|[quality, is, great]|        55|         3|
|[good, product, q...|         5|        10|
|[bagus, bagus, pe...|         1|         9|
|[penghantaran, yg...|         2|         7|
|[the, goods, land...|         2|        30|
|[powerbank, murah...|         1|        13|
|[the, quality, is...|        81|         4|
|[the, quality, mi...|         1|        13|
|[thankyou, seller...|         4|        25|
|[the, qua

In [33]:
df1.filter(df1.word_count == 1).show()

+--------------+----------+----------+
|token_sentence|sent_count|word_count|
+--------------+----------+----------+
|     [quality]|     21082|         1|
|       [kemas]|      1250|         1|
|        [购买]|         3|         1|
|        [卖家]|        91|         1|
+--------------+----------+----------+



In [34]:
df2 = df1.filter(df1.word_count > 1)

In [35]:
df2.count()

301905

### NGRAM

In [36]:
from pyspark.ml.feature import NGram
ngramDataFrame = df2

for x in range(2,6):
    ngram = NGram(n=x, inputCol='token_sentence', outputCol= str(x) +'grams')
    ngramDataFrame = ngram.transform(ngramDataFrame) 
ngramDataFrame.show()

+--------------------+----------+----------+--------------------+--------------------+--------------------+--------------------+
|      token_sentence|sent_count|word_count|              2grams|              3grams|              4grams|              5grams|
+--------------------+----------+----------+--------------------+--------------------+--------------------+--------------------+
|[aron, got, to, h...|         1|        23|[aron got, got to...|[aron got to, got...|[aron got to hand...|[aron got to hand...|
|[great, clothes, ...|         1|         6|[great clothes, c...|[great clothes gr...|[great clothes gr...|[great clothes gr...|
|[good, product, q...|      1555|         7|[good product, pr...|[good product qua...|[good product qua...|[good product qua...|
|[the, former, qua...|         1|        14|[the former, form...|[the former quali...|[the former quali...|[the former quali...|
|[the, quality, is...|         4|         4|[the quality, qua...|[the quality is, ...|[the qualit

In [37]:
ngramDataFrame.filter(ngramDataFrame.word_count < 5).count()

26631

In [38]:
ngramDataFrame.filter(ngramDataFrame.word_count == 4).show()

+--------------------+----------+----------+--------------------+--------------------+--------------------+------+
|      token_sentence|sent_count|word_count|              2grams|              3grams|              4grams|5grams|
+--------------------+----------+----------+--------------------+--------------------+--------------------+------+
|[the, quality, is...|         4|         4|[the quality, qua...|[the quality is, ...|[the quality is s...|    []|
|[the, quality, is...|        81|         4|[the quality, qua...|[the quality is, ...|[the quality is g...|    []|
|[the, quality, is...|       153|         4|[the quality, qua...|[the quality is, ...| [the quality is ok]|    []|
|[very, very, good...|        39|         4|[very very, very ...|[very very good, ...|[very very good q...|    []|
|[good, quality, s...|         2|         4|[good quality, qu...|[good quality sgt...|[good quality sgt...|    []|
|[awning, nampak, ...|         1|         4|[awning nampak, n...|[awning nampak 

In [39]:
ngramDataFrame.filter(ngramDataFrame.word_count == 3).show()

+--------------------+----------+----------+--------------------+--------------------+------+------+
|      token_sentence|sent_count|word_count|              2grams|              3grams|4grams|5grams|
+--------------------+----------+----------+--------------------+--------------------+------+------+
|  [skru, tak, kemas]|         2|         3|[skru tak, tak ke...|    [skru tak kemas]|    []|    []|
|[quality, is, great]|        55|         3|[quality is, is g...|  [quality is great]|    []|    []|
|[quality, as, exp...|       131|         3|[quality as, as e...|[quality as expec...|    []|    []|
|[pen, good, quality]|         2|         3|[pen good, good q...|  [pen good quality]|    []|    []|
|[jahitan, kemas, je]|        11|         3|[jahitan kemas, k...|  [jahitan kemas je]|    []|    []|
|[tq, sellergood, ...|         2|         3|[tq sellergood, s...|[tq sellergood qu...|    []|    []|
|[goid, produat, q...|         1|         3|[goid produat, pr...|[goid produat qua...|    [

In [40]:
ngramDataFrame.filter(ngramDataFrame.word_count == 2).show()

+--------------------+----------+----------+--------------------+------+------+------+
|      token_sentence|sent_count|word_count|              2grams|3grams|4grams|5grams|
+--------------------+----------+----------+--------------------+------+------+------+
|[pemnbungkusan, k...|         1|         2|[pemnbungkusan ke...|    []|    []|    []|
|       [quality, 行]|         1|         2|        [quality 行]|    []|    []|    []|
|   [quality, medium]|         7|         2|    [quality medium]|    []|    []|    []|
|    [quality, boomm]|         1|         2|     [quality boomm]|    []|    []|    []|
|[qualitylow, qual...|        11|         2|[qualitylow quality]|    []|    []|    []|
|     [fine, quality]|         5|         2|      [fine quality]|    []|    []|    []|
|   [quality, boekkk]|         1|         2|    [quality boekkk]|    []|    []|    []|
|  [quality, unknown]|         3|         2|   [quality unknown]|    []|    []|    []|
|[kemas, lajuterbaik]|         1|         2| 

In [41]:
def find_keyword(text):
    keywords =  ['quality', ' service', '购买', '卖家', 'kemas']
    
    empList = []
    for x in text:
        for y in keywords:
            if y in x:
                empList.append(x)
    return empList


In [42]:
column =  ['token_sentence','sent_count','word_count','2grams','3grams','4grams','5grams']
gram5= ngramDataFrame.filter(ngramDataFrame.word_count > 4)\
.rdd.map(lambda x: (x[0], x[1], x[2],find_keyword(x[3])\
,find_keyword(x[4]), find_keyword(x[5]), find_keyword(x[6]))).persist(StorageLevel.MEMORY_ONLY)

In [43]:
df_5gram = gram5.toDF(column).show()

22/10/07 00:01:33 WARN BlockManager: Task 738 already completed, not releasing lock for rdd_104_0
+--------------------+----------+----------+--------------------+--------------------+--------------------+--------------------+
|      token_sentence|sent_count|word_count|              2grams|              3grams|              4grams|              5grams|
+--------------------+----------+----------+--------------------+--------------------+--------------------+--------------------+
|[aron, got, to, h...|         1|        23|[top quality, qua...|[getting top qual...|[we getting top q...|[many we getting ...|
|[great, clothes, ...|         1|         6|[great quality, q...|[clothes great qu...|[great clothes gr...|[great clothes gr...|
|[good, product, q...|      1555|         7|[product quality,...|[good product qua...|[good product qua...|[good product qua...|
|[the, former, qua...|         1|        14|[former quality, ...|[the former quali...|[the former quali...|[the former quali...|

In [44]:
gram4= ngramDataFrame.filter(ngramDataFrame.word_count == 4)\
.rdd.map(lambda x: (x[0], x[1], x[2],find_keyword(x[3])\
,find_keyword(x[4]), find_keyword(x[5]), x[6])).persist(StorageLevel.MEMORY_ONLY)

In [45]:
gram3= ngramDataFrame.filter(ngramDataFrame.word_count == 3)\
.rdd.map(lambda x: (x[0], x[1], x[2],find_keyword(x[3])\
,find_keyword(x[4]), x[5], x[6])).persist(StorageLevel.MEMORY_ONLY)

In [46]:
gram2= ngramDataFrame.filter(ngramDataFrame.word_count == 2)\
.rdd.map(lambda x: (x[0], x[1], x[2],find_keyword(x[3])\
,find_keyword(x[4]), x[5], x[6])).persist(StorageLevel.MEMORY_ONLY)

In [47]:
Final_gram = gram5.union(gram4).union(gram3).union(gram2).toDF(column).persist(StorageLevel.MEMORY_ONLY)

22/10/07 00:01:33 WARN BlockManager: Task 740 already completed, not releasing lock for rdd_104_0


In [48]:
Final_gram.show()

+--------------------+----------+----------+--------------------+--------------------+--------------------+--------------------+
|      token_sentence|sent_count|word_count|              2grams|              3grams|              4grams|              5grams|
+--------------------+----------+----------+--------------------+--------------------+--------------------+--------------------+
|[aron, got, to, h...|         1|        23|[top quality, qua...|[getting top qual...|[we getting top q...|[many we getting ...|
|[great, clothes, ...|         1|         6|[great quality, q...|[clothes great qu...|[great clothes gr...|[great clothes gr...|
|[good, product, q...|      1555|         7|[product quality,...|[good product qua...|[good product qua...|[good product qua...|
|[the, former, qua...|         1|        14|[former quality, ...|[the former quali...|[the former quali...|[the former quali...|
|[penghantaran, pa...|         2|        17|[jahitan kemas, k...|[ok jahitan kemas...|[baju ok ja

In [49]:
Final_gram.count()

301905

In [51]:
Final_gram.filter(Final_gram.word_count > 4).show()

+--------------------+----------+----------+--------------------+--------------------+--------------------+--------------------+
|      token_sentence|sent_count|word_count|              2grams|              3grams|              4grams|              5grams|
+--------------------+----------+----------+--------------------+--------------------+--------------------+--------------------+
|[aron, got, to, h...|         1|        23|[top quality, qua...|[getting top qual...|[we getting top q...|[many we getting ...|
|[great, clothes, ...|         1|         6|[great quality, q...|[clothes great qu...|[great clothes gr...|[great clothes gr...|
|[good, product, q...|      1555|         7|[product quality,...|[good product qua...|[good product qua...|[good product qua...|
|[the, former, qua...|         1|        14|[former quality, ...|[the former quali...|[the former quali...|[the former quali...|
|[penghantaran, pa...|         2|        17|[jahitan kemas, k...|[ok jahitan kemas...|[baju ok ja

### Change Format

### To CSV