In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain the Spark session for this notebook, registered under the app name 'nlp'.
spark = (
    SparkSession.builder
    .appName('nlp')
    .getOrCreate()
)

In [3]:
from pyspark.ml.feature import HashingTF, IDF
from pyspark.ml.feature import NGram
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, size, udf
from pyspark.sql.types import IntegerType

In [4]:
# Three-row demo frame: two space-separated sentences and one
# comma-separated sentence, each paired with an integer id.
sen_df = spark.createDataFrame(
    data=[
        (0, 'Hi I heard about Spark'),
        (1, 'I wish java could use case classes'),
        (2, 'Logistic,regression,models,are,neat'),
    ],
    schema=['id', 'sentence'],
)

In [5]:
# Print the raw sentences (show()'s default arguments spelled out).
sen_df.show(n=20, truncate=True)

In [6]:
# Tokenizer that reads the 'sentence' column and writes its tokens to 'words'.
tokenizer = Tokenizer(
    inputCol='sentence',
    outputCol='words',
)

In [7]:
# Regex-driven tokenizer over the same 'sentence' -> 'words' columns.
# The pattern '\\W' is the regex class for a single non-word character.
regex_tokenizer = RegexTokenizer(
    pattern='\\W',
    inputCol='sentence',
    outputCol='words',
)

In [8]:
def count_tokens(words):
    """Return a Column holding the number of elements in an array column.

    Uses Spark's built-in ``size`` function rather than a Python UDF, so the
    count is evaluated by Spark itself instead of a per-row Python lambda.

    Args:
        words: An array-typed Column (or column name), e.g. ``col('words')``.

    Returns:
        An integer Column with the element count of ``words`` for each row.

    Note:
        The previous ``len``-based lambda raised ``TypeError`` on a null array.
        ``size`` does not raise; what it returns for null input depends on
        Spark configuration -- TODO confirm if null arrays can occur here.
    """
    return size(words)

In [9]:
# Apply the plain tokenizer to the demo sentences, producing the 'words' column.
tokenized = tokenizer.transform(dataset=sen_df)

In [10]:
# Add a 'tokens' column with the per-row token count and print the result.
(
    tokenized
    .withColumn('tokens', count_tokens(col('words')))
    .show()
)

In [11]:
# Same steps as for the plain tokenizer, but with the regex tokenizer:
# tokenize the sentences, then print them alongside their token counts.
regex_tokenized = regex_tokenizer.transform(dataset=sen_df)
(
    regex_tokenized
    .withColumn('tokens', count_tokens(col('words')))
    .show()
)

In [12]:
# Stop-word filter reading 'words' and writing the surviving tokens to
# 'filtered'; apply it to the regex-tokenized frame and print the outcome.
remover = StopWordsRemover(
    inputCol='words',
    outputCol='filtered',
)
(
    remover
    .transform(dataset=regex_tokenized)
    .show()
)

In [13]:
# 2-gram builder: reads the token arrays in 'words' and writes to 'grams'.
ngram = NGram(
    n=2,
    inputCol='words',
    outputCol='grams',
)

In [14]:
# Build the 2-grams for the regex-tokenized sentences and print only the
# 'grams' column, untruncated so every gram is fully visible.
(
    ngram
    .transform(dataset=regex_tokenized)
    .select('grams')
    .show(truncate=False)
)

In [15]:
# Re-display the regex-tokenized frame that feeds the HashingTF / IDF steps
# (show()'s default arguments spelled out).
regex_tokenized.show(n=20, truncate=True)

In [16]:
# HashingTF stage: reads the token arrays in 'words' and writes its output
# vectors to 'rawFeatures'. No feature count is passed, so the library
# default applies.
hashing_tf = HashingTF(
    inputCol='words',
    outputCol='rawFeatures',
)

In [17]:
# Run HashingTF over the regex-tokenized sentences, adding 'rawFeatures'.
featurized_data = hashing_tf.transform(dataset=regex_tokenized)

In [18]:
# IDF estimator: consumes 'rawFeatures' and emits the rescaled 'features'.
idf = IDF(
    inputCol='rawFeatures',
    outputCol='features',
)

In [19]:
# Fit the IDF estimator on the hashed vectors to obtain a reusable model.
idf_model = idf.fit(dataset=featurized_data)

In [20]:
# Apply the fitted IDF model to the same data, adding the 'features' column.
rescaled_data = idf_model.transform(dataset=featurized_data)

In [21]:
# Print each sentence id with its final feature vector, untruncated.
(
    rescaled_data
    .select('id', 'features')
    .show(truncate=False)
)