In [1]:
import os

import findspark

# Locate the Spark installation. Prefer the SPARK_HOME environment variable so
# the notebook is not tied to a single machine; fall back to the original
# hardcoded install path when the variable is unset or empty.
findspark.init(os.environ.get('SPARK_HOME') or '/home/ubuntu/spark-2.1.1-bin-hadoop2.7/')

In [2]:
from pyspark.sql import SparkSession
# Get the existing session or create a new one named 'NLP I'.
spark = (
    SparkSession.builder
    .appName('NLP I')
    .getOrCreate()
)

In [3]:
from pyspark.ml.feature import Tokenizer,RegexTokenizer
from pyspark.sql.functions import col,udf

In [5]:
from pyspark.sql.types import IntegerType

In [4]:
# Toy corpus: two space-separated sentences and one comma-separated sentence.
# The comma-separated row is what distinguishes the two tokenizers later on.
sen_df = spark.createDataFrame(
    [
        (0, 'Hi I heard about spark'),
        (1, 'I wish Java could use case classes'),
        (2, 'Logistic,regression,model,are,neat'),
    ],
    schema=['id', 'sentences'],
)

In [5]:
# Preview the example sentences; show() truncates long strings by default.
sen_df.show()

+---+--------------------+
| id|           sentences|
+---+--------------------+
|  0|Hi I heard about ...|
|  1|I wish Java could...|
|  2|Logistic,regressi...|
+---+--------------------+



In [6]:
# Basic tokenizer: reads 'sentences' and writes an array column 'words'.
# Per the output further down, it lower-cases and does not split on commas.
tokenizer = Tokenizer(
    inputCol='sentences',
    outputCol='words',
)

In [10]:
# Regex tokenizer: the pattern \W treats every non-word character as a
# delimiter, so commas split tokens as well as spaces.
regex_tokenizer = RegexTokenizer(
    inputCol='sentences',
    outputCol='words',
    pattern='\\W',
)

In [13]:
def _token_count(words):
    """Return the number of tokens in ``words``, or None when the array is null.

    Spark passes SQL NULL to a Python UDF as ``None``; calling ``len`` on it
    would raise ``TypeError`` and fail the whole job, so null is passed through.
    """
    return None if words is None else len(words)


# UDF wrapper so the token count can be added as an integer column.
count_tokens = udf(_token_count, IntegerType())

In [14]:
# Apply the basic tokenizer to the corpus; this adds the 'words' column.
tokenized  = tokenizer.transform(sen_df)

In [15]:
# Inspect the tokenizer output: 'words' holds the lower-cased tokens.
tokenized.show()

+---+--------------------+--------------------+
| id|           sentences|               words|
+---+--------------------+--------------------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|
|  1|I wish Java could...|[i, wish, java, c...|
|  2|Logistic,regressi...|[logistic,regress...|
+---+--------------------+--------------------+



In [17]:
# Count tokens per row. The comma-separated sentence (id 2) counts as a single
# token here because the basic Tokenizer does not split on commas.
(
    tokenized
    .withColumn('tokens', count_tokens(col('words')))
    .show()
)

+---+--------------------+--------------------+------+
| id|           sentences|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|     5|
|  1|I wish Java could...|[i, wish, java, c...|     7|
|  2|Logistic,regressi...|[logistic,regress...|     1|
+---+--------------------+--------------------+------+



In [18]:
# Apply the regex tokenizer to the same corpus for comparison.
tokenized2 = regex_tokenizer.transform(sen_df)

In [19]:
# Same count with the regex tokenizer: the comma-separated sentence (id 2)
# now yields 5 tokens instead of 1.
(
    tokenized2
    .withColumn('tokens', count_tokens(col('words')))
    .show()
)

+---+--------------------+--------------------+------+
| id|           sentences|               words|tokens|
+---+--------------------+--------------------+------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|     5|
|  1|I wish Java could...|[i, wish, java, c...|     7|
|  2|Logistic,regressi...|[logistic, regres...|     5|
+---+--------------------+--------------------+------+



In [20]:
from pyspark.ml.feature import StopWordsRemover

In [23]:
# Already-tokenized sentences used as input for stop-word removal.
sent2df = spark.createDataFrame(
    [
        (0, ['I', 'saw', 'the', 'green', 'horse']),
        (1, ['mary', 'had', 'a', 'little', 'lamb']),
    ],
    schema=['id', 'tokens'],
)

In [24]:
# Preview the pre-tokenized input for the stop-word remover.
sent2df.show()

+---+--------------------+
| id|              tokens|
+---+--------------------+
|  0|[I, saw, the, gre...|
|  1|[mary, had, a, li...|
+---+--------------------+



In [27]:
# Stop-word filter: reads 'tokens' and writes 'filtered'. No custom word list
# is passed, so the remover's default stop words apply.
remover = StopWordsRemover(
    inputCol='tokens',
    outputCol='filtered',
)

In [28]:
# Adds a 'filtered' column alongside the original 'tokens'.
sent2df_filtered = remover.transform(sent2df)

In [29]:
# Stop words ('I', 'the', 'had', 'a') are gone from the 'filtered' column.
sent2df_filtered.show()

+---+--------------------+--------------------+
| id|              tokens|            filtered|
+---+--------------------+--------------------+
|  0|[I, saw, the, gre...| [saw, green, horse]|
|  1|[mary, had, a, li...|[mary, little, lamb]|
+---+--------------------+--------------------+



In [30]:
# N-grams: sequences of n consecutive tokens

In [31]:
from pyspark.ml.feature import NGram

In [32]:
# Recap of the regex-tokenized DataFrame that feeds the n-gram step.
tokenized2.show()

+---+--------------------+--------------------+
| id|           sentences|               words|
+---+--------------------+--------------------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|
|  1|I wish Java could...|[i, wish, java, c...|
|  2|Logistic,regressi...|[logistic, regres...|
+---+--------------------+--------------------+



In [34]:
# n=2 produces bigrams: each output string joins two consecutive words with a
# space (e.g. 'hi i', 'i heard').
ngram = NGram(
    n=2,
    inputCol='words',
    outputCol='bigrams',
)

In [35]:
# Adds the 'bigrams' column computed from 'words'.
ngrammed = ngram.transform(tokenized2)

In [36]:
# 'bigrams' holds space-joined pairs of adjacent words.
ngrammed.show()

+---+--------------------+--------------------+--------------------+
| id|           sentences|               words|             bigrams|
+---+--------------------+--------------------+--------------------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|[hi i, i heard, h...|
|  1|I wish Java could...|[i, wish, java, c...|[i wish, wish jav...|
|  2|Logistic,regressi...|[logistic, regres...|[logistic regress...|
+---+--------------------+--------------------+--------------------+



In [37]:
# Term frequency & inverse document frequency (TF-IDF)

In [6]:
from pyspark.ml.feature import HashingTF,IDF,Tokenizer,RegexTokenizer

In [7]:
# truncate=False prints each sentence in full instead of cutting it off with '...'.
sen_df.show(truncate=False)

+---+----------------------------------+
|id |sentences                         |
+---+----------------------------------+
|0  |Hi I heard about spark            |
|1  |I wish Java could use case classes|
|2  |Logistic,regression,model,are,neat|
+---+----------------------------------+



In [8]:
# Regex tokenizer for the TF-IDF section: splits 'sentences' on non-word
# characters and writes the result to 'tokens'.
rtokenizer = RegexTokenizer(
    inputCol='sentences',
    outputCol='tokens',
    pattern='\\W',
)

In [9]:
# Tokenize the sentences into a 'tokens' column for the TF-IDF steps.
new_df = rtokenizer.transform(sen_df)

In [10]:
# 'tokens' now holds the lower-cased, regex-split words.
new_df.show()

+---+--------------------+--------------------+
| id|           sentences|              tokens|
+---+--------------------+--------------------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|
|  1|I wish Java could...|[i, wish, java, c...|
|  2|Logistic,regressi...|[logistic, regres...|
+---+--------------------+--------------------+



In [11]:
# Confirm that 'tokens' is an array of strings before passing it to HashingTF.
new_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- sentences: string (nullable = true)
 |-- tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [51]:
# Hashing term-frequency transformer: reads 'tokens' and writes 'hashedTF'.
# numFeatures is left at its default (the resulting vectors have size 262144).
hashingtf = HashingTF(
    inputCol='tokens',
    outputCol='hashedTF',
)

In [52]:
# Adds 'hashedTF': a sparse term-frequency vector for each sentence.
featurized_data = hashingtf.transform(new_df)

In [53]:
# IDF estimator: reads the 'hashedTF' vectors and writes the result to 'features'.
idf = IDF(
    inputCol='hashedTF',
    outputCol='features',
)

In [54]:
# Fit the IDF estimator on the term-frequency vectors, then apply the fitted
# model to those same vectors.
idf_data = (
    idf
    .fit(featurized_data)
    .transform(featurized_data)
)

In [55]:
# 'hashedTF' is the HashingTF output; 'features' is the IDF-transformed version.
idf_data.show()

+---+--------------------+--------------------+--------------------+--------------------+
| id|           sentences|              tokens|            hashedTF|            features|
+---+--------------------+--------------------+--------------------+--------------------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|(262144,[24417,49...|(262144,[24417,49...|
|  1|I wish Java could...|[i, wish, java, c...|(262144,[20719,24...|(262144,[20719,24...|
|  2|Logistic,regressi...|[logistic, regres...|(262144,[13671,91...|(262144,[13671,91...|
+---+--------------------+--------------------+--------------------+--------------------+



In [56]:
# CountVectorizer

In [57]:
from pyspark.ml.feature import CountVectorizer

In [58]:
# Two tiny documents, already split into words, for the CountVectorizer demo.
sdf = spark.createDataFrame(
    [
        (0, "a b c".split(" ")),
        (1, "a b b c a".split(" ")),
    ],
    schema=["id", "words"],
)

In [59]:
# Two documents over the words a, b and c.
sdf.show()

+---+---------------+
| id|          words|
+---+---------------+
|  0|      [a, b, c]|
|  1|[a, b, b, c, a]|
+---+---------------+



In [60]:
# vocabSize=3 caps the vocabulary at three terms; minDF=2.0 keeps only terms
# that appear in at least two documents.
c_vect = CountVectorizer(
    inputCol='words',
    outputCol='features',
    vocabSize=3,
    minDF=2.0,
)

In [61]:
# NOTE: despite its name, `model` is the transformed DataFrame (it is shown
# with .show() next), not the fitted CountVectorizer model.
model = (
    c_vect
    .fit(sdf)
    .transform(sdf)
)

In [64]:
# Each feature vector is sparse: (vocabulary size, term indices, per-term counts).
model.show(truncate=False)

+---+---------------+-------------------------+
|id |words          |features                 |
+---+---------------+-------------------------+
|0  |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
|1  |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
+---+---------------+-------------------------+

