In [None]:
!apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz
!tar -xvf spark-3.1.1-bin-hadoop2.7.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop2.7"
import findspark
findspark.init()

In [None]:
# Mount Google Drive into the Colab VM at /content/gdrive.
from google.colab import drive
# force_remount=True is passed to the mount call — presumably so a stale mount
# from an earlier run is replaced; confirm against the google.colab docs.
drive.mount("/content/gdrive", force_remount=True)
# Switch the working directory to the course data folder on Drive.
# NOTE(review): this is a user-specific absolute path; anyone else running the
# notebook has to adjust it.
%cd '/content/gdrive/MyDrive/LDS9_K265_TranHoangBach/Week_4/data_day_7'

Mounted at /content/gdrive
/content/gdrive/MyDrive/LDS9_K265_TranHoangBach/Week_4/data_day_7


In [None]:
from pyspark import SparkContext
from pyspark.conf import SparkConf
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline

In [None]:
# Create the Spark entry points through the session builder.
# getOrCreate() reuses an already-running session/context, so this cell can be
# re-executed without hitting "Cannot run multiple SparkContexts at once",
# which a second direct SparkContext(...) call would raise.
spark = (
    SparkSession.builder
    .master("local")
    .appName("New Spark Context")
    .getOrCreate()
)
# Keep `sc` available for any code that works with the SparkContext directly.
sc = spark.sparkContext

In [None]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Three-row toy corpus: two space-separated sentences and one comma-separated
# sentence, used below to compare the tokenizers.
text_df = spark.createDataFrame(
    data=[
        (0, 'Hi I heard about Spark'),
        (1, 'I know Spark can work well with NLP'),
        (2, 'Logistic,regression,models,are,suppervised'),
    ],
    schema=['id', 'sentence'],
)

# truncate=False prints the full sentence text.
text_df.show(truncate=False)

+---+------------------------------------------+
|id |sentence                                  |
+---+------------------------------------------+
|0  |Hi I heard about Spark                    |
|1  |I know Spark can work well with NLP       |
|2  |Logistic,regression,models,are,suppervised|
+---+------------------------------------------+



In [None]:
# Two tokenizers over the same input column: the plain Tokenizer and a
# RegexTokenizer that splits on the pattern '\W' (non-word characters).
tokenizer = Tokenizer(inputCol='sentence', outputCol='words')
regex_tokenizer = RegexTokenizer(inputCol='sentence', outputCol='words', pattern='\\W')


def count_tokens(words_col):
    """Return a Column holding the number of elements in an array column.

    Uses Spark's built-in ``size`` (available through the
    ``pyspark.sql.functions`` import in this cell) instead of a Python UDF
    around ``len``, so rows are not shipped to a Python worker just to be
    counted.

    Args:
        words_col: Column (or column name) of array type, e.g. ``col('words')``.

    Returns:
        An integer Column with the array length of each row.

    Note:
        For a null array ``size`` yields -1 or null (depending on Spark
        configuration) rather than failing the way ``len(None)`` would.
    """
    return size(words_col)


# Plain Tokenizer: the comma-separated sentence stays a single token.
tokenized = tokenizer.transform(text_df)
tokenized.withColumn('tokens', count_tokens(col('words'))).show(truncate=False)

+---+------------------------------------------+--------------------------------------------+------+
|id |sentence                                  |words                                       |tokens|
+---+------------------------------------------+--------------------------------------------+------+
|0  |Hi I heard about Spark                    |[hi, i, heard, about, spark]                |5     |
|1  |I know Spark can work well with NLP       |[i, know, spark, can, work, well, with, nlp]|8     |
|2  |Logistic,regression,models,are,suppervised|[logistic,regression,models,are,suppervised]|1     |
+---+------------------------------------------+--------------------------------------------+------+



In [None]:
# Tokenise again with the regex tokenizer; this rebinds `tokenized`, which is
# the frame the following cells build on.
tokenized = regex_tokenizer.transform(text_df)
(
    tokenized
    .withColumn('tokens', count_tokens(col('words')))
    .show(truncate=False)
)

+---+------------------------------------------+------------------------------------------------+------+
|id |sentence                                  |words                                           |tokens|
+---+------------------------------------------+------------------------------------------------+------+
|0  |Hi I heard about Spark                    |[hi, i, heard, about, spark]                    |5     |
|1  |I know Spark can work well with NLP       |[i, know, spark, can, work, well, with, nlp]    |8     |
|2  |Logistic,regression,models,are,suppervised|[logistic, regression, models, are, suppervised]|5     |
+---+------------------------------------------+------------------------------------------------+------+



In [None]:
from pyspark.ml.feature import StopWordsRemover

# Filter stop words out of `words` into a new `filtered` column. No stopWords
# argument is given, so the transformer's default list is used.
remover = StopWordsRemover(
    outputCol='filtered',
    inputCol='words',
)
removered = remover.transform(tokenized)
removered.show(truncate=False)

+---+------------------------------------------+------------------------------------------------+-------------------------------------------+
|id |sentence                                  |words                                           |filtered                                   |
+---+------------------------------------------+------------------------------------------------+-------------------------------------------+
|0  |Hi I heard about Spark                    |[hi, i, heard, about, spark]                    |[hi, heard, spark]                         |
|1  |I know Spark can work well with NLP       |[i, know, spark, can, work, well, with, nlp]    |[know, spark, work, well, nlp]             |
|2  |Logistic,regression,models,are,suppervised|[logistic, regression, models, are, suppervised]|[logistic, regression, models, suppervised]|
+---+------------------------------------------+------------------------------------------------+-------------------------------------------+



In [None]:
from pyspark.ml.feature import NGram

# Build bigrams (n=2). The input is the unfiltered `words` column, not
# `filtered`, so stop words still appear inside the n-grams.
ngram = NGram(
    inputCol='words',
    outputCol='ngrams',
    n=2,
)

ngram_df = ngram.transform(removered)
ngram_df.show(truncate=False)

+---+------------------------------------------+------------------------------------------------+-------------------------------------------+-------------------------------------------------------------------------+
|id |sentence                                  |words                                           |filtered                                   |ngrams                                                                   |
+---+------------------------------------------+------------------------------------------------+-------------------------------------------+-------------------------------------------------------------------------+
|0  |Hi I heard about Spark                    |[hi, i, heard, about, spark]                    |[hi, heard, spark]                         |[hi i, i heard, heard about, about spark]                                |
|1  |I know Spark can work well with NLP       |[i, know, spark, can, work, well, with, nlp]    |[know, spark, work, well, nlp]         

In [None]:
from pyspark.ml.feature import HashingTF, IDF

# Hash each token list into a 5-slot term-frequency vector. With only five
# slots, several distinct words share an index (visible in the output counts).
hashing_tf = HashingTF(
    numFeatures=5,
    inputCol='words',
    outputCol='rawFeatures',
)
featurized_df = hashing_tf.transform(tokenized)
(
    featurized_df
    .select('words', 'rawFeatures')
    .show(truncate=False)
)

+------------------------------------------------+-------------------------------------+
|words                                           |rawFeatures                          |
+------------------------------------------------+-------------------------------------+
|[hi, i, heard, about, spark]                    |(5,[1,3],[3.0,2.0])                  |
|[i, know, spark, can, work, well, with, nlp]    |(5,[0,1,2,3,4],[1.0,2.0,2.0,1.0,2.0])|
|[logistic, regression, models, are, suppervised]|(5,[1,2,3,4],[1.0,1.0,1.0,2.0])      |
+------------------------------------------------+-------------------------------------+



In [None]:
# Fit IDF weights on the hashed term frequencies, then rescale the same frame.
idf = IDF(outputCol='features', inputCol='rawFeatures')
idf_model = idf.fit(featurized_df)
rescaled_df = idf_model.transform(featurized_df)
# Show tokens, raw counts and IDF-weighted vectors side by side.
(
    rescaled_df
    .select('words', 'rawFeatures', 'features')
    .show(truncate=False)
)

+------------------------------------------------+-------------------------------------+----------------------------------------------------------------------------------+
|words                                           |rawFeatures                          |features                                                                          |
+------------------------------------------------+-------------------------------------+----------------------------------------------------------------------------------+
|[hi, i, heard, about, spark]                    |(5,[1,3],[3.0,2.0])                  |(5,[1,3],[0.0,0.0])                                                               |
|[i, know, spark, can, work, well, with, nlp]    |(5,[0,1,2,3,4],[1.0,2.0,2.0,1.0,2.0])|(5,[0,1,2,3,4],[0.6931471805599453,0.0,0.5753641449035617,0.0,0.5753641449035617])|
|[logistic, regression, models, are, suppervised]|(5,[1,2,3,4],[1.0,1.0,1.0,2.0])      |(5,[1,2,3,4],[0.0,0.28768207245178085,0.0,0.57536414

In [None]:
from pyspark.ml.feature import CountVectorizer

# Learn a vocabulary (at most 50 terms, minDF=1) from the regex-tokenised
# frame and encode every row as a term-count vector.
count_vec = CountVectorizer(
    minDF=1,
    vocabSize=50,
    inputCol='words',
    outputCol='features',
)

count_vec_model = count_vec.fit(tokenized)
result_count_vec = count_vec_model.transform(tokenized)
(
    result_count_vec
    .select('words', 'features')
    .show(truncate=False)
)

+------------------------------------------------+------------------------------------------------------------+
|words                                           |features                                                    |
+------------------------------------------------+------------------------------------------------------------+
|[hi, i, heard, about, spark]                    |(16,[0,1,3,6,8],[1.0,1.0,1.0,1.0,1.0])                      |
|[i, know, spark, can, work, well, with, nlp]    |(16,[0,1,4,5,10,11,14,15],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|[logistic, regression, models, are, suppervised]|(16,[2,7,9,12,13],[1.0,1.0,1.0,1.0,1.0])                    |
+------------------------------------------------+------------------------------------------------------------+



In [None]:
# Vocabulary learned by the fitted CountVectorizer model. A term's position in
# this list matches its index in the `features` vectors shown above
# (e.g. 'spark' is index 0, 'i' is index 1).
count_vec_model.vocabulary

['spark',
 'i',
 'are',
 'about',
 'can',
 'work',
 'heard',
 'logistic',
 'hi',
 'models',
 'with',
 'know',
 'regression',
 'suppervised',
 'nlp',
 'well']

In [None]:
# Number of vocabulary terms; equals the size of the `features` vectors (16).
len(count_vec_model.vocabulary)

16