In [None]:
import glob
import os
import sys

# Cluster-specific settings for the PySpark driver. These are assigned
# unconditionally, so they override anything already present in the environment.
os.environ["PYSPARK_SUBMIT_ARGS"] = '--conf spark.sql.catalogImplementation=in-memory pyspark-shell'
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the PySpark sources that ship with this Spark installation importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))

# Look the bundled py4j archive up instead of hard-coding its version, so the
# notebook keeps working when the Spark client is upgraded.
py4j_archives = sorted(glob.glob(os.path.join(spark_home, 'python/lib/py4j-*-src.zip')))
if not py4j_archives:
    raise FileNotFoundError(
        'No py4j-*-src.zip found under {}'.format(os.path.join(spark_home, 'python/lib'))
    )
# Normally exactly one archive is present; take the last one in sorted order otherwise.
sys.path.insert(0, py4j_archives[-1])

# Run PySpark's interactive shell bootstrap inside the notebook namespace.
# The file is opened in a `with` block so the handle is closed after reading.
with open(os.path.join(spark_home, 'python/pyspark/shell.py')) as shell_script:
    exec(shell_script.read())

In [None]:
# Display the `spark` object (presumably defined by the shell.py bootstrap executed above — verify).
spark

In [None]:
from IPython.display import Image

## Pipeline

+ **DataFrame**: This ML API uses DataFrame from Spark SQL as an ML dataset, which can hold a variety of data types. E.g., a DataFrame could have different columns storing text, feature vectors, true labels, and predictions.


+ **Transformer**: A Transformer is an algorithm which can transform one DataFrame into another DataFrame. E.g., an ML model is a Transformer which transforms a DataFrame with features into a DataFrame with predictions.


+ **Estimator**: An Estimator is an algorithm which can be fit on a DataFrame to produce a Transformer. E.g., a learning algorithm is an Estimator which trains on a DataFrame and produces a model.


+ **Pipeline**: A Pipeline chains multiple Transformers and Estimators together to specify an ML workflow.


+ **Parameter**: All Transformers and Estimators now share a common API for specifying parameters.

In [None]:
# Show pics/ml-Pipeline.png inline (path is relative to the notebook's working directory).
Image("pics/ml-Pipeline.png")

In [None]:
# Show pics/ml-PipelineModel.png inline.
Image("pics/ml-PipelineModel.png")

In [None]:
from pyspark.ml.linalg import Vectors

In [None]:
from pyspark.ml.classification import LogisticRegression

In [None]:
# Toy training set: four (label, dense 3-element feature vector) rows.
toy_rows = [
    (1.0, Vectors.dense([0.0, 1.1, 0.1])),
    (0.0, Vectors.dense([2.0, 1.0, -1.0])),
    (0.0, Vectors.dense([2.0, 1.3, 1.0])),
    (1.0, Vectors.dense([0.0, 1.2, -0.5])),
]
training = spark.createDataFrame(toy_rows, schema=["label", "features"])

In [None]:
# Estimator configured with maxIter=10 and regParam=0.01; nothing is trained yet.
lr = LogisticRegression(maxIter=10, regParam=0.01)

In [None]:
# Display the estimator's repr.
lr

In [None]:
# List the parameters this estimator exposes.
lr.params

In [None]:
# Look up the value of regParam (explicitly set to 0.01 above).
lr.getOrDefault("regParam")

In [None]:
# Fit the estimator on the toy DataFrame and keep the result as `model`.
model = lr.fit(training)

In [None]:
# Inspect which class `fit` returned.
type(model)

In [None]:
# Coefficients of the fitted model.
model.coefficients

In [None]:
# Intercept of the fitted model, in vector form.
model.interceptVector

In [None]:
# Apply the fitted model to the same rows it was trained on.
predict = model.transform(training)

In [None]:
# Print the first row, one field per line, without truncating values.
predict.show(1, truncate=False, vertical=True)

In [None]:
# Look up the model's "threshold" parameter value.
model.getOrDefault("threshold")

## Toxic Comment Classification Challenge

In [None]:
from pyspark.sql.types import *

In [None]:
# Explicit schema for train.csv: two string columns followed by six integer label flags.
schema = StructType(
    [StructField(column, StringType()) for column in ("id", "comment_text")]
    + [
        StructField(column, IntegerType())
        for column in ("toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate")
    ]
)

In [None]:
# First attempt: read the CSV with the explicit schema, treating the first line as a header.
dataset = spark.read.csv("/user/pavel.klemenkov/lectures/lecture03/data/train.csv", schema=schema, header=True)

In [None]:
# Print the first two rows, one field per line.
dataset.show(2, vertical=True)

## Comments span several lines, so the default CSV reader splits them into separate rows. Multiline CSV support was added in Spark 2.2.0 (https://issues.apache.org/jira/browse/SPARK-19610)

In [None]:
# Peek at the first three physical lines of the raw file.
# NOTE(review): this is a local relative path, unlike the /user/... path passed to
# spark.read.csv — confirm it refers to the same data.
!head -n3 toxic_comment/train.csv

In [None]:
# Second attempt: same read, now with multiLine=True.
dataset = spark.read.csv("/user/pavel.klemenkov/lectures/lecture03/data/train.csv",
                         schema=schema, header=True, multiLine=True)

In [None]:
# Check the first ten ids.
dataset.select("id").show(10)

## You need to add the `escape` parameter!

In [None]:
# Third attempt: multiLine=True plus a double quote as the escape character.
dataset = spark.read.csv("/user/pavel.klemenkov/lectures/lecture03/data/train.csv",
                         schema=schema, header=True, multiLine=True, escape='"')

In [None]:
# Check the first ten ids again.
dataset.select("id").show(10)

In [None]:
# Print two full rows, one field per line, without truncation.
dataset.show(2, vertical=True, truncate=False)

In [None]:
# Number of partitions behind the DataFrame.
dataset.rdd.getNumPartitions()

In [None]:
# Row count before repartitioning.
dataset.count()

In [None]:
# Rebind `dataset` to a 4-partition version marked for caching.
dataset = dataset.repartition(4).cache()

In [None]:
# Count again on the repartitioned frame (presumably this is also what fills the cache — verify).
dataset.count()

## Let's define a binary target (toxic/non-toxic)

In [None]:
from pyspark.sql import functions as f

In [None]:
# Binary target column expression: 0 only when every label flag equals 0, otherwise 1.
flag_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# AND the per-flag "== 0" checks together, left to right.
all_flags_zero = dataset[flag_columns[0]] == 0
for flag_name in flag_columns[1:]:
    all_flags_zero = all_flags_zero & (dataset[flag_name] == 0)

target = f.when(all_flags_zero, 0).otherwise(1)

In [None]:
# Attach the expression built above as a new "target" column.
dataset = dataset.withColumn("target", target)

In [None]:
# Spot-check the new column on the first ten rows.
dataset.select("id", "target").show(10)

In [None]:
dataset.groupBy("target").count().collect()

In [None]:
16225 / (16225 + 143346)

In [None]:
dataset = dataset.drop("toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate").cache()

In [None]:
dataset

In [None]:
dataset.show(2, False, True)

In [None]:
dataset.write.parquet("/user/pavel.klemenkov/lectures/lecture03/data/dataset", mode="overwrite")

## Let's fit the simplest binary-BoW logistic regression

In [None]:
from pyspark.ml.feature import *

## Split comments into words

In [None]:
# Transformer that reads "comment_text" and writes its output to a "words" column.
tokenizer = Tokenizer(inputCol="comment_text", outputCol="words")

In [None]:
# Apply the tokenizer; `dataset2` is `dataset` plus the "words" column.
dataset2 = tokenizer.transform(dataset)

In [None]:
# Inspect the tokens for two rows.
dataset2.select("id", "words").show(2, False, True)

In [None]:
# Check the Python type of the "words" value in a collected row.
type(dataset2.take(1)[0].words)

## Convert texts into binary vectors using Hashing trick

In [None]:
# Hash the tokenizer's output column into a 100-feature binary vector stored in "word_vector".
hasher = HashingTF(numFeatures=100, binary=True, inputCol=tokenizer.getOutputCol(), outputCol="word_vector")
# Rebind `dataset2` to the frame that also carries "word_vector".
dataset2 = hasher.transform(dataset2)

In [None]:
# Inspect the hashed vectors for two rows.
dataset2.select("id", "word_vector").show(2, False, True)

## Now let's split into train and test. Don't forget that we have imbalanced classes, so let's do stratified sampling

In [None]:
train = dataset2.sampleBy("target", fractions={0: 0.8, 1: 0.8}, seed=5757)

In [None]:
train.groupby("target").count().collect()

In [None]:
12906 / (12906 + 114769)

In [None]:
test = dataset2.join(train, on="id", how="leftanti")

In [None]:
test.groupby("target").count().collect()

In [None]:
3319 / (3319 + 28577)

In [None]:
# Partition count of the training split.
train.rdd.getNumPartitions()

In [None]:
# Partition count of the test split (produced by the join).
test.rdd.getNumPartitions()

In [None]:
# Display the training frame's repr.
train

In [None]:
# Drop the text and token columns from train and cache it.
train = train.drop("comment_text", "words").cache()

In [None]:
# Same for test, additionally coalescing to 4 partitions before caching.
test = test.drop("comment_text", "words").coalesce(4).cache()

## Let's fit logistic regression

In [None]:
from pyspark.ml.classification import LogisticRegression

In [None]:
# Rebinds `lr`: features come from the hasher's output column, labels from "target", maxIter=15.
lr = LogisticRegression(featuresCol=hasher.getOutputCol(), labelCol="target", maxIter=15)

In [None]:
# Train on the cached training split.
lr_model = lr.fit(train)

In [None]:
# Display the fitted model's repr.
lr_model

In [None]:
# Score the held-out split.
predictions = lr_model.transform(test)

In [None]:
# Display the predictions frame's repr.
predictions

In [None]:
# Five rows of labels next to the model outputs, one field per line, untruncated.
predictions.select("id", "target", "prediction", "probability", "rawPrediction").show(5, False, True)

In [None]:
predictions.select("target", f.col("prediction").cast("int")).filter("target == prediction").count()

In [None]:
predictions.count()

In [None]:
print("Accuracy is {}".format(28668 / 31896))

In [None]:
# Count rows that are labelled 1 and were also predicted as 1.
(
    predictions
    .select("target", f.col("prediction").cast("int"))
    .where((f.col("target") == 1) & (f.col("prediction") == f.col("target")))
    .count()
)

In [None]:
# Collect the target and integer-cast prediction columns into a pandas DataFrame.
predictions_pd = predictions.select("target", f.col("prediction").cast("int")).toPandas()

In [None]:
# First rows of the collected frame.
predictions_pd.head()

In [None]:
# Look up the "threshold" parameter of the estimator `lr`.
lr.getOrDefault("threshold")

In [None]:
from sklearn.metrics import classification_report, precision_score

In [None]:
# Per-class report computed from the collected targets and predictions.
print(classification_report(predictions_pd.target, predictions_pd.prediction))

## What if we want more sophisticated metrics?

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [None]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction", labelCol="target", metricName='areaUnderROC')

In [None]:
evaluator.evaluate(predictions)

In [None]:
evaluator.setParams(metricName="precision")

In [None]:
evaluator.evaluate(predictions)

## `spark.ml.evaluation.BinaryClassificationEvaluator` supports only ROC AUC and PR AUC. What if we want more?

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Rebinds `evaluator` to a multiclass evaluator comparing "prediction" with "target"; first metric: accuracy.
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", labelCol="target", metricName="accuracy")

In [None]:
# Accuracy on the held-out predictions.
evaluator.evaluate(predictions)

In [None]:
# Switch the same evaluator to weighted precision.
evaluator = evaluator.setMetricName("weightedPrecision")

In [None]:
evaluator.evaluate(predictions)

In [None]:
# Switch the same evaluator to weighted recall.
evaluator = evaluator.setMetricName("weightedRecall")

In [None]:
evaluator.evaluate(predictions)

## Let's define a pipeline!

In [None]:
# Reload the dataset from the parquet copy written earlier in the notebook.
dataset = spark.read.parquet("/user/pavel.klemenkov/lectures/lecture03/data/dataset")

In [None]:
# Display the DataFrame's repr.
dataset

In [None]:
# Partition count of the reloaded frame.
dataset.rdd.getNumPartitions()

In [None]:
from pyspark.ml import Pipeline

In [None]:
# Chain the previously defined stages: tokenizer -> hasher -> logistic regression.
pipeline = Pipeline(stages=[
    tokenizer,
    hasher,
    lr
])

In [None]:
train = dataset.sampleBy("target", fractions={0: 0.8, 1: 0.8}).cache()

In [None]:
test = dataset.join(train, on="id", how="leftanti").cache()

In [None]:
# Fit all pipeline stages on the training split.
pipeline_model = pipeline.fit(train)

In [None]:
# Run the fitted pipeline over the test split.
predictions = pipeline_model.transform(test)

In [None]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction", labelCol="target", metricName='areaUnderROC')

In [None]:
evaluator.evaluate(predictions)

## Okay, maybe some more sophisticated stuff?

In [None]:
from pyspark.ml.classification import GBTClassifier

In [None]:
# Gradient-boosted trees on the hasher's output column, labels from "target", maxIter=10.
gbt = GBTClassifier(featuresCol=hasher.getOutputCol(), labelCol="target", maxIter=10)

In [None]:
# Same pipeline as before with the classifier stage swapped for `gbt`.
pipeline = Pipeline(stages=[
    tokenizer,
    hasher,
    gbt
])

In [None]:
pipeline_model = pipeline.fit(train)

In [None]:
predictions = pipeline_model.transform(test)

In [None]:
# Reuses the `evaluator` object built earlier in the notebook.
evaluator.evaluate(predictions)

## Let's add more degrees of freedom

In [None]:
# Stages of the most recently fitted pipeline model.
pipeline_model.stages

In [None]:
# Refit the same pipeline, overriding the hasher's numFeatures with 1000 for this fit only.
pipeline_model = pipeline.fit(train, params={hasher.numFeatures: 1000})

In [None]:
# Show the parameter map of the second stage (index 1) of the fitted pipeline.
pipeline_model.stages[1].extractParamMap()

In [None]:
predictions = pipeline_model.transform(test)

In [None]:
evaluator.evaluate(predictions)

## Let's remove stopwords

In [None]:
# Load the default stop-word list for "english".
stop_words = StopWordsRemover.loadDefaultStopWords("english")

In [None]:
# Display the loaded list.
stop_words

In [None]:
# Filter the tokenizer's output with that list, writing to "words_filtered".
swr = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="words_filtered", stopWords=stop_words)

In [None]:
# Rebinds `hasher`: it now reads the filtered words and uses 1000 features.
# The output column is still "word_vector", the name `lr` was given as featuresCol.
hasher = HashingTF(numFeatures=1000, binary=True, inputCol=swr.getOutputCol(), outputCol="word_vector")

In [None]:
# tokenizer -> stop-word removal -> hashing -> logistic regression.
pipeline = Pipeline(stages=[
    tokenizer,
    swr,
    hasher,
    lr
])

In [None]:
pipeline_model = pipeline.fit(train)

In [None]:
pipeline_model.stages

In [None]:
predictions = pipeline_model.transform(test)

In [None]:
evaluator.evaluate(predictions)

## Need moar features!

In [None]:
import pyspark.sql.functions as f

In [None]:
# Print the column names and types of the current dataset.
dataset.printSchema()

In [None]:
# Add a "comment_length" column holding the length of comment_text.
dataset = dataset.withColumn("comment_length", f.length(dataset.comment_text))

In [None]:
train = dataset.sampleBy("target", fractions={0: 0.8, 1: 0.8}).cache()
test = dataset.join(train, on="id", how="leftanti").cache()

In [None]:
train

In [None]:
# Combine the hashed word vector and comment_length into a single "features" column.
assembler = VectorAssembler(inputCols=[hasher.getOutputCol(), "comment_length"], outputCol="features")

In [None]:
# Rebinds `lr` without an explicit featuresCol (presumably it defaults to "features",
# the assembler's output column — verify).
lr = LogisticRegression(labelCol="target", maxIter=15)

In [None]:
# tokenizer -> stop-word removal -> hashing -> assembling -> logistic regression.
pipeline = Pipeline(stages=[
    tokenizer,
    swr,
    hasher,
    assembler,
    lr
])

In [None]:
pipeline_model = pipeline.fit(train)

In [None]:
pipeline_model.stages

In [None]:
predictions = pipeline_model.transform(test)

In [None]:
evaluator.evaluate(predictions)

In [None]:
# Last coefficient of the last stage's model (presumably the weight of comment_length,
# the last assembler input — verify).
pipeline_model.stages[-1].coefficients[-1]

## Ok, how do you do it right!? https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/discussion/52557

## Very funny, anyway?

In [None]:
# Replace hashing with a CountVectorizer (binary=True) over the filtered words; output column "word_vector".
count_vectorizer = CountVectorizer(inputCol=swr.getOutputCol(), outputCol="word_vector", binary=True)

In [None]:
# Rebinds `assembler` to take the count vectorizer's output plus comment_length.
assembler = VectorAssembler(inputCols=[count_vectorizer.getOutputCol(), "comment_length"], outputCol="features")

In [None]:
# tokenizer -> stop-word removal -> count vectorizer -> assembling -> logistic regression.
pipeline = Pipeline(stages=[
    tokenizer,
    swr,
    count_vectorizer,
    assembler,
    lr
])

In [None]:
pipeline_model = pipeline.fit(train)

In [None]:
predictions = pipeline_model.transform(test)

In [None]:
evaluator.evaluate(predictions)

In [None]:
# Show pics/Obama_not_bad.png inline.
Image("pics/Obama_not_bad.png")

## Hyperparameter tuning

In [None]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [None]:
paramGrid = ParamGridBuilder().addGrid(count_vectorizer.vocabSize, [100, 500])\
                              .addGrid(lr.regParam, [0.01, 0.05])\
                              .build()

In [None]:
paramGrid

In [None]:
# 3-fold cross-validation of the whole pipeline over the parameter grid, with up to
# 4 models fitted in parallel. The seed fixes the fold assignment so the averaged
# metrics and the selected model are reproducible between runs.
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=3,
    parallelism=4,
    seed=5757,
)

In [None]:
# Run the cross-validated search on the training split.
cv_model = crossval.fit(train)

In [None]:
# Average metric values collected during cross-validation.
cv_model.avgMetrics

In [None]:
# The model selected by the search.
cv_model.bestModel

In [None]:
# Score the held-out split with the cross-validated model.
predictions = cv_model.transform(test)

In [None]:
evaluator.evaluate(predictions)

In [None]:
# Shut the Spark session down; cells that use `spark` cannot be re-run after this.
spark.stop()