In [68]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('nlp1').getOrCreate()
print('Spark is Connected')

Spark is Connected


In [69]:
# tools for nlp as follows

from pyspark.ml.feature import Tokenizer,RegexTokenizer
from pyspark.ml.functions import *
from pyspark.sql.types import *


In [70]:
ex_df = spark.createDataFrame([(0,'Hi,I heard about spark'),(1,'I wish could use java classes for project'),
                               (2,'Logistic Regression is base of ML')],['id','exm'])

ex_df.show()

+---+--------------------+
| id|                 exm|
+---+--------------------+
|  0|Hi,I heard about ...|
|  1|I wish could use ...|
|  2|Logistic Regressi...|
+---+--------------------+



In [71]:
tokenizer = Tokenizer(inputCol='exm',outputCol='words')
regextokenizer = RegexTokenizer(inputCol='exm',outputCol='words',pattern='\\W')

In [72]:

from pyspark.sql.functions import *

count_tokens = udf(lambda words : len(words),IntegerType())

In [73]:
tokenized = tokenizer.transform(ex_df)
tokenized.show()

+---+--------------------+--------------------+
| id|                 exm|               words|
+---+--------------------+--------------------+
|  0|Hi,I heard about ...|[hi,i, heard, abo...|
|  1|I wish could use ...|[i, wish, could, ...|
|  2|Logistic Regressi...|[logistic, regres...|
+---+--------------------+--------------------+



In [74]:
tokenized.withColumn('tokens',count_tokens(col('words'))).show()

+---+--------------------+--------------------+------+
| id|                 exm|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi,I heard about ...|[hi,i, heard, abo...|     4|
|  1|I wish could use ...|[i, wish, could, ...|     8|
|  2|Logistic Regressi...|[logistic, regres...|     6|
+---+--------------------+--------------------+------+



In [75]:
rgtokenized = regextokenizer.transform(ex_df)
rgtokenized.withColumn('tokens',count_tokens(col('words'))).show()

+---+--------------------+--------------------+------+
| id|                 exm|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi,I heard about ...|[hi, i, heard, ab...|     5|
|  1|I wish could use ...|[i, wish, could, ...|     8|
|  2|Logistic Regressi...|[logistic, regres...|     6|
+---+--------------------+--------------------+------+



In [76]:
# stopwords removal
from pyspark.ml.feature import StopWordsRemover

In [77]:
# stopwords are used to remove the articles i the and etc

remover = StopWordsRemover(inputCol='exm1',outputCol='filtered')




In [78]:
ex1 = spark.createDataFrame([(1,['I','girl','and ','On','The','floor','And','saw','An','path']),(2,['An','universe' ,'made','Of','The','particle'])],['id','exm1'])

In [79]:
ex1.show()

+---+--------------------+
| id|                exm1|
+---+--------------------+
|  1|[I, girl, and , O...|
|  2|[An, universe, ma...|
+---+--------------------+



In [80]:
remover.transform(ex1).show()

+---+--------------------+--------------------+
| id|                exm1|            filtered|
+---+--------------------+--------------------+
|  1|[I, girl, and , O...|[girl, and , floo...|
|  2|[An, universe, ma...|[universe, made, ...|
+---+--------------------+--------------------+



In [81]:
from pyspark.ml.feature import NGram


In [82]:
ngram = NGram(n=2,inputCol='exm1',outputCol='grams')

In [83]:
ngram.transform(ex1).show()

+---+--------------------+--------------------+
| id|                exm1|               grams|
+---+--------------------+--------------------+
|  1|[I, girl, and , O...|[I girl, girl and...|
|  2|[An, universe, ma...|[An universe, uni...|
+---+--------------------+--------------------+



In [84]:
from pyspark.ml.feature import *

In [85]:
# implementing Tfidf and hashingtf

words_data = tokenizer.transform(ex_df)

words_data.show()

+---+--------------------+--------------------+
| id|                 exm|               words|
+---+--------------------+--------------------+
|  0|Hi,I heard about ...|[hi,i, heard, abo...|
|  1|I wish could use ...|[i, wish, could, ...|
|  2|Logistic Regressi...|[logistic, regres...|
+---+--------------------+--------------------+



In [86]:
hashingtf = HashingTF(inputCol='words',outputCol='rawFeatures')

featured_data = hashingtf.transform(words_data)

In [87]:
featured_data.show()

+---+--------------------+--------------------+--------------------+
| id|                 exm|               words|         rawFeatures|
+---+--------------------+--------------------+--------------------+
|  0|Hi,I heard about ...|[hi,i, heard, abo...|(262144,[18700,48...|
|  1|I wish could use ...|[i, wish, could, ...|(262144,[19036,20...|
|  2|Logistic Regressi...|[logistic, regres...|(262144,[46243,10...|
+---+--------------------+--------------------+--------------------+



In [88]:
idf = IDF(inputCol='rawFeatures',outputCol='features')

In [89]:
idfmodel = idf.fit(featured_data)

In [90]:
rescaled = idfmodel.transform(featured_data)


In [92]:
rescaled.select('id','features').show(truncate=False)

24/01/12 13:07:35 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
24/01/12 13:07:35 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB


+---+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|id |features                                                                                                                                                                                                              |
+---+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0  |(262144,[18700,48163,66273,173558],[0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453])                                                                                                     |
|1  |(262144,[19036,20719,50886,55551,58672,98717,106776,109547],[0.6931471805599453,0.6931471805599453,0.6931471805

24/01/12 13:07:35 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB


In [93]:
cv = CountVectorizer(inputCol='exm1',outputCol='features',vocabSize=3,minDF=2.0)


In [94]:
cvm = cv.fit(ex1)

In [95]:
res = cvm.transform(ex1)
res.show()

+---+--------------------+-------------------+
| id|                exm1|           features|
+---+--------------------+-------------------+
|  1|[I, girl, and , O...|(2,[0,1],[1.0,1.0])|
|  2|[An, universe, ma...|(2,[0,1],[1.0,1.0])|
+---+--------------------+-------------------+

