# NLP Tools Part Two

### (Henri's Version)

This notebook covers term frequency (TF), inverse document frequency (IDF), and count vectorization.

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Start (or reuse) the Spark session that every later cell works through.
spark = (
    SparkSession.builder
    .appName("nlp")
    .getOrCreate()
)

### First cover tf-idf

In [3]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
# pyspark.ml.feature.HashingTF(numFeatures=262144, binary=False, inputCol=None, outputCol=None)
# --> Maps a sequence of terms to their term frequencies using the hashing trick.
# --- binary: If True, all nonzero counts are set to 1.
#     This is useful for discrete probabilistic models that model binary events rather than integer counts.
#     Default False.
# --- numFeatures: The number of features. Default 262144.

# pyspark.ml.feature.IDF(minDocFreq=0, inputCol=None, outputCol=None)
# --> Computes the inverse document frequency (IDF) given a collection of documents.
# --- minDocFreq: Minimum number of documents in which a term should appear for filtering.
#     Default 0.

# pyspark.ml.feature.Tokenizer(inputCol=None, outputCol=None)
# --> A tokenizer that converts the input string to lowercase and then splits it by white space.

In [4]:
# Three toy labeled sentences used for the TF-IDF walkthrough.
sentence_df = spark.createDataFrame(
    [
        (0.0, "Hi I heard about Spark"),
        (0.0, "I wish java could use case classes"),
        (1.0, "Logistic regression models are neat"),
    ],
    ["label", "sentence"],
)

In [5]:
# Preview the labeled sentences; the default show() output shortens long strings with "...".
sentence_df.show()

+-----+--------------------+
|label|            sentence|
+-----+--------------------+
|  0.0|Hi I heard about ...|
|  0.0|I wish java could...|
|  1.0|Logistic regressi...|
+-----+--------------------+



In [6]:
# Tokenizer reads the "sentence" column and writes the resulting tokens to a new "words" column.
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

In [7]:
# No fit step is used here: transform() is applied directly and adds the "words" column.
words_data = tokenizer.transform(sentence_df)

In [8]:
# Check the new "words" column: tokens are lowercased and split on whitespace.
words_data.show()

+-----+--------------------+--------------------+
|label|            sentence|               words|
+-----+--------------------+--------------------+
|  0.0|Hi I heard about ...|[hi, i, heard, ab...|
|  0.0|I wish java could...|[i, wish, java, c...|
|  1.0|Logistic regressi...|[logistic, regres...|
+-----+--------------------+--------------------+



### Grab the term-frequency.

In [9]:
# Hash each token list in "words" into a term-frequency vector stored in "rawFeatures".
# numFeatures is left at its default, so the vectors have 262144 dimensions.
hashing_tf = HashingTF(inputCol="words", outputCol="rawFeatures")

In [10]:
# Add the "rawFeatures" term-frequency column; transform() is called directly, with no fit step.
featurized_data = hashing_tf.transform(dataset=words_data)
# Bare last expression: displays the DataFrame's column/type summary rather than its rows.
featurized_data

DataFrame[label: double, sentence: string, words: array<string>, rawFeatures: vector]

In [11]:
# Show tokens next to their sparse TF vectors: (vector size, [hashed indices], [counts]).
featurized_data.select("words", "rawFeatures").show(truncate=False)

+------------------------------------------+--------------------------------------------------------------------------------------+
|words                                     |rawFeatures                                                                           |
+------------------------------------------+--------------------------------------------------------------------------------------+
|[hi, i, heard, about, spark]              |(262144,[24417,49304,73197,91137,234657],[1.0,1.0,1.0,1.0,1.0])                       |
|[i, wish, java, could, use, case, classes]|(262144,[20719,24417,55551,116873,147765,162369,192310],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])|
|[logistic, regression, models, are, neat] |(262144,[13671,91006,132713,167122,190884],[1.0,1.0,1.0,1.0,1.0])                     |
+------------------------------------------+--------------------------------------------------------------------------------------+



### Apply the Inverse-Document-Frequency

In [12]:
# IDF reads the raw term frequencies in "rawFeatures" and writes the rescaled (TF-IDF) vectors to "features".
idf = IDF(inputCol="rawFeatures", outputCol="features")

In [13]:
# fit() computes the IDF weights from the whole document collection and returns a fitted model.
idf_fitted = idf.fit(dataset=featurized_data)

In [14]:
# Apply the fitted IDF weights to add the "features" (TF-IDF) column.
rescaled_data = idf_fitted.transform(dataset=featurized_data)

In [15]:
# Keep only what a downstream model needs: the label and the TF-IDF vector.
# The token shared by two sentences ("i", index 24417) gets a lower weight than
# tokens that occur in a single sentence.
rescaled_data.select("label", "features").show(truncate=False)

+-----+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|label|features                                                                                                                                                                                        |
+-----+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|0.0  |(262144,[24417,49304,73197,91137,234657],[0.28768207245178085,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453])                                                     |
|0.0  |(262144,[20719,24417,55551,116873,147765,162369,192310],[0.6931471805599453,0.28768207245178085,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.6931471805599453,0.693147180559945

`rescaled_data` now holds a `label` column and a TF-IDF `features` column, so it is ready to be passed to any ML algorithm.

### Let us now move on to CountVectorizer

In [16]:
from pyspark.ml.feature import CountVectorizer
# pyspark.ml.feature.CountVectorizer(minTF=1.0, minDF=1.0, maxDF=9223372036854775807, vocabSize=262144,
#                                    binary=False, inputCol=None, outputCol=None)
# --> Extracts a vocabulary from document collections and generates a CountVectorizerModel.
# --- binary: Binary toggle to control the output vector values.
#             If True, all nonzero counts (after minTF filter applied) are set to 1.
#             This is useful for discrete probabilistic models that model binary events rather than integer
#             counts. Default False.
# --- maxDF: Specifies the maximum number of different documents a term could appear in to be included in the
#            vocabulary. A term that appears more than the threshold will be ignored.
#            If this is an integer >= 1, this specifies the maximum number of documents the term could appear in.
#            If this is a double in [0, 1), then this specifies the max fraction of documents the term could
#            appear in.
#            Default 2^63 - 1.
# --- minDF: Specifies the minimum number of different documents a term must appear in to be included in the
#            vocabulary. If this is an integer >= 1, this specifies the number of documents the term must appear
#            in; if this is a double in [0, 1), then this specifies the fraction of documents.
#            Default 1.0.
# --- minTF: Filter to ignore rare words in a document. For each document, terms with frequency/count less than
#            the given threshold are ignored. If this is an integer >= 1, then this specifies a count (of times
#            the term must appear in the document); if this is a double in [0, 1), then this specifies a fraction
#            (out of the document's token count).
#            Note that the parameter is only used in transform of CountVectorizerModel and does not affect
#            fitting. Default 1.0.
# --- vocabSize: Max size of the vocabulary. Default 262144.

In [17]:
# Two tiny pre-tokenized documents built from the same three terms.
df = spark.createDataFrame(
    [
        (0, ["a", "b", "c"]),
        (1, ["a", "b", "b", "c", "a"]),
    ],
    ["id", "words"],
)

In [18]:
# Inspect the toy documents before vectorizing them.
df.show()

+---+---------------+
| id|          words|
+---+---------------+
|  0|      [a, b, c]|
|  1|[a, b, b, c, a]|
+---+---------------+



In [19]:
# Cap the vocabulary at 3 terms and keep only terms that occur in at least 2 documents.
count_vect = CountVectorizer(
    inputCol="words",
    outputCol="features",
    vocabSize=3,
    minDF=2,
)

In [20]:
# fit() extracts the vocabulary from the documents and returns a CountVectorizerModel.
cv_fitted = count_vect.fit(df)

In [21]:
# Encode each document as a vector of per-term counts over the learned vocabulary.
result = cv_fitted.transform(df)

In [22]:
result.show(truncate=False)
# Essentially the bag-of-words method that we discussed earlier on:
# each vector holds the per-document count of every vocabulary term.

+---+---------------+-------------------------+
|id |words          |features                 |
+---+---------------+-------------------------+
|0  |[a, b, c]      |(3,[0,1,2],[1.0,1.0,1.0])|
|1  |[a, b, b, c, a]|(3,[0,1,2],[2.0,2.0,1.0])|
+---+---------------+-------------------------+

