# Creating Features Quiz
Use this Jupyter notebook to find the answers to the quiz in the previous section. There is an answer key in the next part of the lesson.

In [18]:
from pyspark.sql import SparkSession

# TODOS: 
# 1) import any other libraries you might need
# 2) run the cells below to read dataset and build body length feature
# 3) write code to answer the quiz questions 

from pyspark.ml.feature import RegexTokenizer, CountVectorizer, \
    IDF, StringIndexer
from pyspark.sql import functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

import re

In [19]:
# Create (or reuse) a local Spark session for this notebook.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Creating Features")
    .getOrCreate()
)

### Read Dataset

In [20]:
# JSON file holding the Stack Overflow questions; given as a bare filename (no directory component).
stack_overflow_data = 'Train_onetag_small.json'

In [21]:
# Read the JSON file into a DataFrame; no schema is passed, so the reader infers it.
df = spark.read.json(stack_overflow_data)
# Mark the DataFrame for caching, since the later cells keep deriving new columns from it.
df.persist()

DataFrame[Body: string, Id: bigint, Tags: string, Title: string, oneTag: string]

### Build Body Length Feature

In [22]:
# Split each question body into word tokens, breaking on non-word characters.
regexTokenizer = RegexTokenizer(
    inputCol="Body",
    outputCol="words",
    pattern="\\W",
)
df = regexTokenizer.transform(df)

In [23]:
def _count_tokens(tokens):
    """Return the number of entries in a token list."""
    return len(tokens)


# UDF that maps an array column to its element count (as an integer).
body_length = udf(_count_tokens, IntegerType())
df = df.withColumn("BodyLength", body_length(df["words"]))

In [24]:
# Peek at the first row to check the new words and BodyLength columns.
df.head()

Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', words=['p', 'i', 'd', 'like', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'file', 'e', 'g', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'or', 'another', 'file', 'the', 'problem', 'is', 'that', 'i', 'm', 'using', 'uploadify', 'to', 'upload', 'the', 'files', 'which', 'changes', 'the', 'mime', 'type', 'and', 'gives', 'a', 'text', 'octal', 'or', 'something', 'as', 'the', 'mime', 'type', 'no', 'matter', 'which

In [25]:
# Print column names and types after adding the tokenized words and BodyLength.
df.printSchema()

root
 |-- Body: string (nullable = true)
 |-- Id: long (nullable = true)
 |-- Tags: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- oneTag: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- BodyLength: integer (nullable = true)



# Question 1
Select the question with Id = 1112. How many words does its body contain (check the BodyLength column)?

In [26]:
# Register the DataFrame as a temp view named "df" so it can be queried through spark.sql.
df.createOrReplaceTempView("df")

In [27]:
# TODO: write your code to answer question 1

# Id is a bigint column, so compare it with an integer literal (and the
# standard "=" operator) instead of relying on an implicit string cast.
spark.sql('''
        SELECT BodyLength FROM df WHERE Id = 1112
          ''').show()

+----------+
|BodyLength|
+----------+
|        63|
+----------+



# Question 2
Create a new column that concatenates the question title and body. Apply the same functions we used before to compute the number of words in this combined column. What's the value in this new column for Id = 5123?

In [28]:
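# Not working - fails with a parse error ("no viable alternative ...").
# Spark SQL expects `ALTER TABLE ... ADD COLUMNS (name type)`, and `df` is only a
# temp view here, so the column is added with DataFrame.withColumn in the next cell instead.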
# spark.sql('''
# ALTER TABLE df ADD QuestBody string
# ''').show()

In [29]:
# TODO: write your code to answer question 2

def concat(title, body):
    """Join two string columns with a single space between them.

    Built from Spark's native column functions rather than a Python UDF, so
    rows are not passed through a Python worker, and a null title or body
    produces a null result instead of a TypeError raised inside the UDF.

    Args:
        title: Column holding the question title.
        body: Column holding the question body.

    Returns:
        A string Column equal to title + " " + body.
    """
    return F.concat(title, F.lit(" "), body)


df = df.withColumn("QuestBody", concat(df.Title, df.Body))

df.printSchema()
df.head()

root
 |-- Body: string (nullable = true)
 |-- Id: long (nullable = true)
 |-- Tags: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- oneTag: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- BodyLength: integer (nullable = true)
 |-- QuestBody: string (nullable = true)



Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', words=['p', 'i', 'd', 'like', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'file', 'e', 'g', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'or', 'another', 'file', 'the', 'problem', 'is', 'that', 'i', 'm', 'using', 'uploadify', 'to', 'upload', 'the', 'files', 'which', 'changes', 'the', 'mime', 'type', 'and', 'gives', 'a', 'text', 'octal', 'or', 'something', 'as', 'the', 'mime', 'type', 'no', 'matter', 'which

In [30]:
# Tokenize the combined title + body text with the same non-word-character split used for Body.
regexTokenizer = RegexTokenizer(inputCol="QuestBody", outputCol="qbWords", pattern="\\W")
df = regexTokenizer.transform(df)

# Count the tokens per row by reusing the body_length UDF.
df = df.withColumn("qbWordsLength", body_length(df.qbWords))

# Re-register the view so SQL sees the columns added since the last registration.
df.createOrReplaceTempView("df")

# Id is a bigint column, so compare it with an integer literal (and the
# standard "=" operator) instead of relying on an implicit string cast.
spark.sql('''
        SELECT qbWordsLength FROM df WHERE Id = 5123
          ''').show()

+-------------+
|qbWordsLength|
+-------------+
|          135|
+-------------+



# Create a Vector
Create a vector from the combined Title + Body length column. In the next few questions, you'll try different normalizer/scaler methods on this new column.

In [31]:
# TODO: write your code to create this vector
from pyspark.ml.feature import VectorAssembler, Normalizer, StandardScaler

# Wrap the single length column in a vector column, the input format the scalers below expect.
assembler = VectorAssembler(
    inputCols=["qbWordsLength"],
    outputCol="qbVector",
)
df = assembler.transform(df)

df.head(1)

[Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', words=['p', 'i', 'd', 'like', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'file', 'e', 'g', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'or', 'another', 'file', 'the', 'problem', 'is', 'that', 'i', 'm', 'using', 'uploadify', 'to', 'upload', 'the', 'files', 'which', 'changes', 'the', 'mime', 'type', 'and', 'gives', 'a', 'text', 'octal', 'or', 'something', 'as', 'the', 'mime', 'type', 'no', 'matter', 'whic

# Question 3
Using the Normalizer method what's the normalized value for question Id = 512?

In [32]:
# TODO: write your code to answer question 3

# Normalize each row's qbVector and store the result in qbVectorNormalized.
scaler = Normalizer(
    inputCol="qbVector",
    outputCol="qbVectorNormalized",
)
df = scaler.transform(df)

In [33]:
df.head(1)

# The normalized value is 1 in every row, because each vector has only one component.

# L1 norm: |x_1| + |x_2| + ... but there is no x_2 etc., so the norm is just |x_1|
# L2 norm: sqrt(x_1^2 + x_2^2 + ...), so just sqrt(x_1^2) = |x_1|
# Either way the normalized value is x_1 / |x_1| = 1 for any positive length.

# Confirmed: 1 is the correct answer

[Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', words=['p', 'i', 'd', 'like', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'file', 'e', 'g', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'or', 'another', 'file', 'the', 'problem', 'is', 'that', 'i', 'm', 'using', 'uploadify', 'to', 'upload', 'the', 'files', 'which', 'changes', 'the', 'mime', 'type', 'and', 'gives', 'a', 'text', 'octal', 'or', 'something', 'as', 'the', 'mime', 'type', 'no', 'matter', 'whic

# Question 4
Using the StandardScaler method (scaling both the mean and the standard deviation) what's the normalized value for question Id = 512?

In [34]:
# TODO: write your code to answer question 4

# Scales according to z = (x - u) / s, with both centering and std scaling switched on.
scaler2 = StandardScaler(
    inputCol="qbVector",
    outputCol="qbVectorStandard",
    withMean=True,
    withStd=True,
)
scalerModel = scaler2.fit(df)
df = scalerModel.transform(df)
df.head(1)

[Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', words=['p', 'i', 'd', 'like', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'file', 'e', 'g', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'or', 'another', 'file', 'the', 'problem', 'is', 'that', 'i', 'm', 'using', 'uploadify', 'to', 'upload', 'the', 'files', 'which', 'changes', 'the', 'mime', 'type', 'and', 'gives', 'a', 'text', 'octal', 'or', 'something', 'as', 'the', 'mime', 'type', 'no', 'matter', 'whic

In [35]:
# Re-register the view so SQL sees the qbVectorStandard column.
df.createOrReplaceTempView("df")

# Id is a bigint column, so compare it with an integer literal (and the
# standard "=" operator) instead of relying on an implicit string cast.
spark.sql('''
        SELECT qbVectorStandard FROM df WHERE Id = 512
          ''').show()

+--------------------+
|    qbVectorStandard|
+--------------------+
|[-0.6417314460998...|
+--------------------+



In [36]:
# TODO: write your code to answer question 4

# Variant with withStd=False. Dropping the std step does change the result,
# since the mean & std are computed over the entire dataset.
scaler2 = StandardScaler(
    inputCol="qbVector",
    outputCol="qbVectorStandardNoStd",
    withMean=True,
    withStd=False,
)
scalerModel = scaler2.fit(df)
df = scalerModel.transform(df)
df.head(1)

[Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', words=['p', 'i', 'd', 'like', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'file', 'e', 'g', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'or', 'another', 'file', 'the', 'problem', 'is', 'that', 'i', 'm', 'using', 'uploadify', 'to', 'upload', 'the', 'files', 'which', 'changes', 'the', 'mime', 'type', 'and', 'gives', 'a', 'text', 'octal', 'or', 'something', 'as', 'the', 'mime', 'type', 'no', 'matter', 'whic

# Question 5
Using the MinMaxScaler method what's the normalized value for question Id = 512?

In [37]:
# TODO: write your code to answer question 5

# Min-max scaling to the range 0 - 1:
#   X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
#
# For any other target range [min, max]:
#   X_scaled = X_std * (max - min) + min


from pyspark.ml.feature import MinMaxScaler

scaler = MinMaxScaler(
    inputCol="qbVector",
    outputCol="qbVectorMinMax",
)
scalerModel = scaler.fit(df)
df = scalerModel.transform(df)
df.head(1)

[Row(Body="<p>I'd like to check if an uploaded file is an image file (e.g png, jpg, jpeg, gif, bmp) or another file. The problem is that I'm using Uploadify to upload the files, which changes the mime type and gives a 'text/octal' or something as the mime type, no matter which file type you upload.</p>\n\n<p>Is there a way to check if the uploaded file is an image apart from checking the file extension using PHP?</p>\n", Id=1, Tags='php image-processing file-upload upload mime-types', Title='How to check if an uploaded file is an image without mime type?', oneTag='php', words=['p', 'i', 'd', 'like', 'to', 'check', 'if', 'an', 'uploaded', 'file', 'is', 'an', 'image', 'file', 'e', 'g', 'png', 'jpg', 'jpeg', 'gif', 'bmp', 'or', 'another', 'file', 'the', 'problem', 'is', 'that', 'i', 'm', 'using', 'uploadify', 'to', 'upload', 'the', 'files', 'which', 'changes', 'the', 'mime', 'type', 'and', 'gives', 'a', 'text', 'octal', 'or', 'something', 'as', 'the', 'mime', 'type', 'no', 'matter', 'whic

In [38]:
# Re-register the view so SQL sees the qbVectorMinMax column.
df.createOrReplaceTempView("df")

# Id is a bigint column, so compare it with an integer literal (and the
# standard "=" operator) instead of relying on an implicit string cast.
spark.sql('''
        SELECT qbVector, qbVectorMinMax FROM df WHERE Id = 512
          ''').show()

+--------+--------------------+
|qbVector|      qbVectorMinMax|
+--------+--------------------+
|  [57.0]|[0.00624833820792...|
+--------+--------------------+



In [46]:
# DataFrame.show() only prints the table and returns None, so assigning its
# result left max_val / min_val empty. Keep the query result, display it, and
# then read the value out of the single aggregated row.
max_df = spark.sql('''
        SELECT MAX(qbVector) FROM df
          ''')
max_df.show()
# qbVector is a one-element vector, hence the second [0] to get the scalar.
max_val = float(max_df.first()[0][0])

min_df = spark.sql('''
        SELECT MIN(qbVector) FROM df
          ''')
min_df.show()
min_val = float(min_df.first()[0][0])

+-------------+
|max(qbVector)|
+-------------+
|     [7532.0]|
+-------------+

+-------------+
|min(qbVector)|
+-------------+
|       [10.0]|
+-------------+



In [40]:
# Manual min-max check for Id = 512: (x - min) / (max - min)
length_512, length_min, length_max = 57, 10.0, 7532.0
(length_512 - length_min) / (length_max - length_min)

0.006248338207923425

In [41]:
# Same result :)

# Other Qs

Build a linear regression model using the length of the combined Title + Body fields. What is the value of r^2 when fitting a model with maxIter=5, regParam=0.0, fitIntercept=False, solver="normal"?

In [None]:
from pyspark.ml.regression import LinearRegression

# Estimator configured with the settings given in the question; it is not fitted in this cell.
lr = LinearRegression(
    maxIter=5,
    regParam=0.0,
    fitIntercept=False,
    solver="normal",
)

# Open question: which column should be fitted on and which predicted? The prompt does not say.

How many times greater is the Description Length of the longest question than the Description Length of the shortest question (rounded to the nearest whole number)?

Tip: Don't forget to import Spark SQL's aggregate functions that can operate on DataFrame columns.

In [42]:
# We already calculated the max & min above
# > 7532 / 10 = 753.2, i.e. about 753 times longer (rounded to the nearest whole number)

What is the mean and standard deviation of the Description length?

In [53]:
# Mean of the combined title + body length via SQL.
spark.sql('''
        SELECT MEAN(qbWordsLength) FROM df
          ''').show()

# Not working: STDEVP is not a Spark SQL function name
# (Spark SQL offers STDDEV_POP / STDDEV_SAMP instead).
#spark.sql('''
#        SELECT STDEVP(qbWordsLength) FROM df
#          ''').show()


# describe() reports count, mean, stddev, min and max in one table.
df.select("qbWordsLength").describe().show()

+------------------+
|avg(qbWordsLength)|
+------------------+
|         180.28187|
+------------------+

+-------+------------------+
|summary|     qbWordsLength|
+-------+------------------+
|  count|            100000|
|   mean|         180.28187|
| stddev|192.10819533505023|
|    min|                10|
|    max|              7532|
+-------+------------------+



Let's use K-means to create 5 clusters of Description Lengths. Set the random seed to 42 and fit a 5-class K-means model on the Description Length column (you can use KMeans().setParams(...) ). What length is the center of the cluster representing the longest questions?

In [63]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# Only the length vector column is needed for clustering.
length_vectors = df.select("qbVector")

# Fit a 5-cluster k-means model with a fixed seed.
kmeans = KMeans(k=5, seed=42, featuresCol="qbVector")
model = kmeans.fit(length_vectors)

# Assign each row to a cluster.
predictions = model.transform(length_vectors)

# Score the clustering with the Silhouette measure.
evaluator = ClusteringEvaluator(featuresCol="qbVector")
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

# Print every cluster center; the largest one is the center of the longest questions.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

Silhouette with squared euclidean distance = 0.7631218189100486
Cluster Centers: 
[ 97.09383642]
[ 1077.93227792]
[ 502.39304611]
[ 2731.08284024]
[ 242.33112488]


In [None]:
# Bare expression: shows the DataFrame's repr as the cell output.
df