# Tools for NLP

In [None]:
from pyspark.sql import SparkSession
# Obtain the Spark session used by every cell below, registered under the
# application name 'nlp'.
spark = (
    SparkSession.builder
    .appName('nlp')
    .getOrCreate()
)

In [None]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

In [None]:
# Sample sentences for the tokenizer demos: ordinary prose, a sentence with a
# semicolon, and comma-separated words with no spaces at all.
sen_df = spark.createDataFrame(
    [
        (0, 'Hello, I heard about Spark in Python.'),
        (1, 'I have two cats; Minerva and Filius'),
        (2, 'I,do,not,know,anyone,who,types,like,this.'),
    ],
    schema=['id', 'sentence'],
)


In [None]:
# Print the result of DataFrame.describe() (per-column summary statistics)
# for the sample sentences.
sen_df.describe().show()

In [None]:
# Plain tokenizer: reads the 'sentence' column and writes its tokens to 'words'.
tokenizer = Tokenizer(
    inputCol='sentence',
    outputCol='words',
)

In [None]:
# Regex-driven tokenizer over the same columns. The regex \W matches any
# non-word character; how the pattern is applied (delimiter vs. token match)
# follows RegexTokenizer's `gaps` setting, left at its default here.
regextokenizer = RegexTokenizer(
    inputCol='sentence',
    outputCol='words',
    pattern='\\W',
)

In [None]:
# Spark UDF that returns the length of a token array as an integer column.
count_tokens = udf(
    lambda tokens: len(tokens),
    returnType=IntegerType(),
)

In [None]:
# Apply the plain Tokenizer to the sample sentences; the result carries the
# original columns plus the 'words' output column.
tokenized = tokenizer.transform(sen_df)

In [None]:
# Append a 'tokens' column holding the number of entries in 'words', then
# print the resulting rows.
(
    tokenized
    .withColumn('tokens', count_tokens(col('words')))
    .show()
)

In [None]:
# Apply the RegexTokenizer to the same sample sentences for comparison with
# the plain Tokenizer output.
regex_tokenized = regextokenizer.transform(sen_df)

In [None]:
# Print the regex-tokenized rows.
regex_tokenized.show()

In [None]:
# Same token count as for the plain tokenizer, this time on the
# regex-tokenized rows.
(
    regex_tokenized
    .withColumn('tokens', count_tokens(col('words')))
    .show()
)

In [None]:
from pyspark.ml.feature import StopWordsRemover

In [None]:
# Demo input: two already-tokenized sentences stored in the "raw" column.
sentenceData = spark.createDataFrame(
    [
        (0, ["I", "saw", "the", "red", "balloon"]),
        (1, ["Mary", "had", "a", "little", "lamb"]),
    ],
    schema=["id", "raw"],
)

# Run StopWordsRemover from "raw" into a new "filtered" column and print the
# rows without truncating the arrays.
remover = StopWordsRemover(
    inputCol="raw",
    outputCol="filtered",
)
(
    remover
    .transform(sentenceData)
    .show(truncate=False)
)


In [None]:
from pyspark.ml.feature import NGram

In [None]:
# Demo input: three tokenized sentences in the "words" column.
wordDataFrame = spark.createDataFrame(
    [
        (0, ["Hi", "I", "heard", "about", "Spark"]),
        (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
        (2, ["Logistic", "regression", "models", "are", "neat"]),
    ],
    schema=["id", "words"],
)

# Build n-grams with n=2 from "words" into the "ngrams" column.
ngram = NGram(
    n=2,
    inputCol="words",
    outputCol="ngrams",
)
ngramDataFrame = ngram.transform(wordDataFrame)

# Print only the generated n-grams, untruncated.
(
    ngramDataFrame
    .select("ngrams")
    .show(truncate=False)
)

In [None]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

# Labelled sentences for the TF-IDF walkthrough. This rebinds `sentenceData`
# from the stop-words example above.
sentenceData = spark.createDataFrame(
    [
        (0.0, "Hi I heard about Spark"),
        (0.0, "I wish Java could use case classes"),
        (1.0, "Logistic regression models are neat"),
    ],
    schema=["label", "sentence"],
)

sentenceData.show()

In [None]:
# Tokenize the labelled sentences into a "words" column. This rebinds
# `tokenizer` from the earlier tokenizer demo.
tokenizer = Tokenizer(
    inputCol="sentence",
    outputCol="words",
)
wordsData = tokenizer.transform(sentenceData)

wordsData.show()

In [None]:
# Term frequencies: hash each token list in "words" into a "rawFeatures"
# vector with numFeatures=20.
# Alternatively, CountVectorizer can also be used to get term frequency vectors.
hashingTF = HashingTF(
    inputCol="words",
    outputCol="rawFeatures",
    numFeatures=20,
)
featurizedData = hashingTF.transform(wordsData)

# Inverse document frequency: fit IDF on the raw term frequencies, then use
# the fitted model to produce the "features" column.
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
)
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

# Show each label next to its TF-IDF feature vector.
(
    rescaledData
    .select("label", "features")
    .show()
)

In [None]:
from pyspark.ml.feature import CountVectorizer

# Input data: each row is a bag of words with an ID.
df = spark.createDataFrame(
    [
        (0, "a b c".split(" ")),
        (1, "a b b c a".split(" ")),
    ],
    schema=["id", "words"],
)

# Configure the CountVectorizer (vocabSize=3, minDF=2.0), reading "words" and
# writing "features".
cv = CountVectorizer(
    inputCol="words",
    outputCol="features",
    vocabSize=3,
    minDF=2.0,
)

# Fit a CountVectorizerModel from the corpus.
model = cv.fit(df)

# Vectorize the same corpus with the fitted model and print it untruncated.
result = model.transform(df)
result.show(truncate=False)