# NLP Tools Part One

### (Henri's Version)

Before we jump into the code-along project, let's explore a few of the tools Spark has for dealing with text data.

Then we'll be able to use them easily in our project!

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Get the active Spark session, or create one for this notebook under the app name "nlp".
spark = (
    SparkSession.builder
    .appName("nlp")
    .getOrCreate()
)

### First discuss tokenizers and tokenization.

Tokenization is the process of taking text, such as a sentence, and breaking it into individual terms, which are usually words.

In [3]:
from pyspark.sql.functions import col, udf
# pyspark.sql.functions.col(col) --> Returns a Column based on the given column name.

# pyspark.sql.functions.udf(f=None, returnType=StringType) --> Creates a user defined function (UDF).
# --- f: Python function if used as a standalone function.
# --- returnType: The return type of the user-defined function.  The value can be either a 
#     pyspark.sql.types.DataType object, or a DDL-formatted type string.

from pyspark.sql.types import IntegerType
# pyspark.sql.types.IntegerType --> Int data type, i.e. a signed 32-bit integer.

In [4]:
# Sample sentences: two use whitespace between words, one uses commas.
sen_df = spark.createDataFrame(
    schema=["id", "sentence"],
    data=[
        (0, "Hi I heard about Spark"),
        (1, "I wish java could use case classes"),
        (2, "Logistic,regression,models,are,neat"),
    ],
)

In [5]:
sen_df.show()
# Nothing has been tokenized yet; this only displays the raw sentences.
# The first 2 sentences separate their words with whitespace.
# The last sentence separates its words with the comma "," character.

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|Hi I heard about ...|
|  1|I wish java could...|
|  2|Logistic,regressi...|
+---+--------------------+



In [6]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
# pyspark.ml.feature.Tokenizer(inputCol=None, outputCol=None) --> A tokenizer that converts the input string to
# lowercase and then splits it by white space.

# pyspark.ml.feature.RegexTokenizer(minTokenLength=1, gaps=True, pattern='\s+', inputCol=None, outputCol=None, toLowercase=True)
# --> A regex based tokenizer that extracts tokens either by using the provided regex pattern (in Java dialect)
# to split the text (default) or repeatedly matching the regex (if gaps is false). Optional parameters also allow
# filtering tokens using a minimal length.  It returns an array of strings that can be empty.
# --- gaps: Whether regex splits on gaps (True) or matches tokens (False).
# --- pattern:  Regex pattern (Java dialect) used for tokenizing.

In [7]:
# Whitespace tokenizer: reads the "sentence" column and writes the tokens to "words".
tokenizer = Tokenizer(
    inputCol="sentence",
    outputCol="words",
)

In [8]:
# Regex tokenizer: splits on non-word characters (\W), so commas act as separators too.
regex_tokenizer = RegexTokenizer(
    inputCol="sentence",
    outputCol="words",
    pattern="\\W",
)

In [9]:
# UDF that returns the number of tokens in an array column as an integer.
# A Python UDF receives None for a SQL NULL, and len(None) would raise a
# TypeError and fail the whole job, so null arrays are mapped to null counts.
count_tokens = udf(
    lambda words: len(words) if words is not None else None,
    returnType=IntegerType(),
)

In [10]:
# Apply the whitespace tokenizer; the result gains a "words" column.
tokenized = tokenizer.transform(dataset=sen_df)

In [11]:
# Show the full result first, then only the new token column.
tokenized.show()
tokenized.select("words").show()

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|
|  1|I wish java could...|[i, wish, java, c...|
|  2|Logistic,regressi...|[logistic,regress...|
+---+--------------------+--------------------+

+--------------------+
|               words|
+--------------------+
|[hi, i, heard, ab...|
|[i, wish, java, c...|
|[logistic,regress...|
+--------------------+



In [12]:
(
    tokenized
    .withColumn("n_tokens", count_tokens(col("words")))
    .show()
)
# The third sentence is treated as one giant token, because it contains no whitespace to split on.

+---+--------------------+--------------------+--------+
| id|            sentence|               words|n_tokens|
+---+--------------------+--------------------+--------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|       5|
|  1|I wish java could...|[i, wish, java, c...|       7|
|  2|Logistic,regressi...|[logistic,regress...|       1|
+---+--------------------+--------------------+--------+



In [13]:
# Apply the regex tokenizer to the same sentences for comparison.
reg_tokenized = regex_tokenizer.transform(dataset=sen_df)

In [14]:
reg_tokenized.show()
# Unlike the whitespace tokenizer, the comma-separated sentence is now broken into separate tokens.

+---+--------------------+--------------------+
| id|            sentence|               words|
+---+--------------------+--------------------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|
|  1|I wish java could...|[i, wish, java, c...|
|  2|Logistic,regressi...|[logistic, regres...|
+---+--------------------+--------------------+



In [15]:
(
    reg_tokenized
    .withColumn("n_tokens", count_tokens(col("words")))
    .show()
)
# With the regex tokenizer the third sentence yields 5 tokens instead of 1.

+---+--------------------+--------------------+--------+
| id|            sentence|               words|n_tokens|
+---+--------------------+--------------------+--------+
|  0|Hi I heard about ...|[hi, i, heard, ab...|       5|
|  1|I wish java could...|[i, wish, java, c...|       7|
|  2|Logistic,regressi...|[logistic, regres...|       5|
+---+--------------------+--------------------+--------+



### Next we want to cover stop-word removal.

These are words that don't carry a lot of meaning, such as: I, the, a, ...

In [16]:
from pyspark.ml.feature import StopWordsRemover
# pyspark.ml.feature.StopWordsRemover(inputCol=None, outputCol=None, stopWords=None, caseSensitive=False, locale=None)
# --> A feature transformer that filters out stop words from the input.
# --- locale: locale of the input, ignored when case sensitive is true.
# --- stopWords: The words to be filtered out.
# --- caseSensitive: Whether to do a case-sensitive comparison over the stop words.

In [17]:
# Already-tokenized sentences to run the stop-word remover on.
sentence_df = spark.createDataFrame(
    schema=["id", "tokens"],
    data=[
        (0, ["I", "saw", "the", "green", "horse"]),
        (1, ["Mary", "had", "a", "little", "lamb"]),
    ],
)

In [18]:
sentence_df.show()
# Each row holds an id and an array of tokens; no stop words have been removed yet.

+---+--------------------+
| id|              tokens|
+---+--------------------+
|  0|[I, saw, the, gre...|
|  1|[Mary, had, a, li...|
+---+--------------------+



In [19]:
# No stopWords list is passed, so the remover falls back to its default list.
remover = StopWordsRemover(
    inputCol="tokens",
    outputCol="filtered",
)

In [20]:
remover.transform(sentence_df).show()
# The "filtered" column drops the stop words: "I" and "the" from the first row, "had" and "a" from the second.

+---+--------------------+--------------------+
| id|              tokens|            filtered|
+---+--------------------+--------------------+
|  0|[I, saw, the, gre...| [saw, green, horse]|
|  1|[Mary, had, a, li...|[Mary, little, lamb]|
+---+--------------------+--------------------+



### Next, we cover n-grams.

An n-gram is a sequence of n consecutive tokens (typically words) for some integer n.

In [21]:
from pyspark.ml.feature import NGram
# pyspark.ml.feature.NGram(n=2, inputCol=None, outputCol=None)

# A feature transformer that converts the input array of strings into an array of n-grams.
# Null values in the input array are ignored.
# It returns an array of n-grams where each n-gram is represented by a space-separated string of words.
# When the input is empty, an empty array is returned.
# When the input array length is less than n (number of elements per n-gram), no n-grams are returned.
# --- n: number of elements per n-gram (>=1).

In [22]:
# Tokenized sentences used as input for the n-gram transformer.
word_df = spark.createDataFrame(
    schema=["id", "words"],
    data=[
        (0, ["Hi", "I", "heard", "about", "Spark"]),
        (1, ["I", "wish", "Java", "could", "use", "case", "classes"]),
        (2, ["Logistic", "regression", "models", "are", "neat"]),
    ],
)

In [23]:
# n=2 produces bigrams: every pair of adjacent words.
ngram = NGram(
    n=2,
    inputCol="words",
    outputCol="grams",
)

In [24]:
ngram.transform(dataset=word_df).show()
# Each gram is a single string made of n consecutive words joined by a space (here n=2).

+---+--------------------+--------------------+
| id|               words|               grams|
+---+--------------------+--------------------+
|  0|[Hi, I, heard, ab...|[Hi I, I heard, h...|
|  1|[I, wish, Java, c...|[I wish, wish Jav...|
|  2|[Logistic, regres...|[Logistic regress...|
+---+--------------------+--------------------+



In [25]:
ngram.transform(dataset=word_df).select("grams").show(truncate=False)
# truncate=False prints the complete "grams" column instead of cutting it off.
# Bigrams are useful when you want to capture the relationship between 2 adjacent words.

+------------------------------------------------------------------+
|grams                                                             |
+------------------------------------------------------------------+
|[Hi I, I heard, heard about, about Spark]                         |
|[I wish, wish Java, Java could, could use, use case, case classes]|
|[Logistic regression, regression models, models are, are neat]    |
+------------------------------------------------------------------+

