In [None]:
import os

# Install java
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Install pyspark
! pip install --ignore-installed pyspark==2.4.4

# Install Spark NLP
! pip install --ignore-installed spark-nlp==2.4.0

In [1]:
! wget -N https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sarcasm/train-balanced-sarcasm.csv -P /tmp

--2020-02-11 19:22:41--  https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sarcasm/train-balanced-sarcasm.csv
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.97.173
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.97.173|:443... connected.
HTTP request sent, awaiting response... 304 Not Modified
File ‘/tmp/train-balanced-sarcasm.csv’ not modified on server. Omitting download.



In [2]:
from pyspark.sql import SparkSession
import sys
import time

packages = [
    'JohnSnowLabs:spark-nlp:2.3.4'
]
spark = SparkSession \
    .builder \
    .appName("ML SQL session") \
    .config('spark.jars.packages', ','.join(packages)) \
    .config('spark.executor.instances','4') \
    .config("spark.executor.memory", "16g") \
    .config("spark.driver.memory","16g") \
    .getOrCreate()

In [3]:
import sparknlp

print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

Spark NLP version:  2.3.4
Apache Spark version:  2.4.4


In [4]:
from pyspark.sql import SQLContext

sql = SQLContext(spark)

trainBalancedSarcasmDF = spark.read.option("header", True).option("inferSchema", True).csv("/tmp/train-balanced-sarcasm.csv")
trainBalancedSarcasmDF.printSchema()

# Let's create a temp view (table) for our SQL queries
trainBalancedSarcasmDF.createOrReplaceTempView('data')

sql.sql('SELECT COUNT(*) FROM data').collect()

root
 |-- label: integer (nullable = true)
 |-- comment: string (nullable = true)
 |-- author: string (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- score: string (nullable = true)
 |-- ups: string (nullable = true)
 |-- downs: string (nullable = true)
 |-- date: string (nullable = true)
 |-- created_utc: string (nullable = true)
 |-- parent_comment: string (nullable = true)



[Row(count(1)=1010826)]

In [5]:
df = sql.sql('select label,concat(parent_comment,"\n",comment) as comment from data where comment is not null and parent_comment is not null limit 100000')
print(type(df))
df.printSchema()
df.show()

<class 'pyspark.sql.dataframe.DataFrame'>
root
 |-- label: integer (nullable = true)
 |-- comment: string (nullable = true)

+-----+--------------------+
|label|             comment|
+-----+--------------------+
|    0|Yeah, I get that ...|
|    0|The blazers and M...|
|    0|They're favored t...|
|    0|deadass don't kil...|
|    0|Yep can confirm I...|
|    0|do you find arian...|
|    0|What's your weird...|
|    0|Probably Sephirot...|
|    0|What to upgrade? ...|
|    0|Probably count Ka...|
|    0|I bet if that mon...|
|    0|James Shields Wil...|
|    0|There's no time t...|
|    0|Team Specific Thr...|
|    0|Ill give you a hi...|
|    0|Star Wars, easy. ...|
|    0|You're adorable.
...|
|    0|He actually acts ...|
|    0|Clinton struggles...|
|    0|Is that the Older...|
+-----+--------------------+
only showing top 20 rows



In [6]:
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

from pyspark.ml import Pipeline


document_assembler = DocumentAssembler() \
    .setInputCol("comment") \
    .setOutputCol("document")
    
sentence_detector = SentenceDetector() \
    .setInputCols(["document"]) \
    .setOutputCol("sentence") \
    .setUseAbbreviations(True)
    
tokenizer = Tokenizer() \
  .setInputCols(["sentence"]) \
  .setOutputCol("token")

stemmer = Stemmer() \
    .setInputCols(["token"]) \
    .setOutputCol("stem")
    
normalizer = Normalizer() \
    .setInputCols(["stem"]) \
    .setOutputCol("normalized")

finisher = Finisher() \
    .setInputCols(["normalized"]) \
    .setOutputCols(["ntokens"]) \
    .setOutputAsArray(True) \
    .setCleanAnnotations(True)

nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, stemmer, normalizer, finisher])
nlp_model = nlp_pipeline.fit(df)
processed = nlp_model.transform(df).repartition(100).persist()
processed.count()
processed.show()

+-----+--------------------+--------------------+
|label|             comment|             ntokens|
+-----+--------------------+--------------------+
|    0|what do you guys ...|[what, do, you, g...|
|    1|Cool. I got a D.
...|[cool, i, got, a,...|
|    0|PSG recently pick...|[psg, recent, pic...|
|    0|I feel like this ...|[i, feel, like, t...|
|    1|I also own that s...|[i, also, own, th...|
|    1|There is NO WAY T...|[there, i, no, wa...|
|    0|What's with the d...|[what, with, the,...|
|    0|470 is struggling...|[i, struggl, i, g...|
|    0|General Discussio...|[gener, discuss, ...|
|    0|Assertion? Why?
D...|[assert, why, debug]|
|    0|1 Suspect Dead, 9...|[suspect, dead, i...|
|    1|You missed out th...|[you, miss, out, ...|
|    0|okay good game bu...|[okai, good, game...|
|    0|"On somewhat of a...|[on, somewhat, of...|
|    1|the organization ...|[the, organ, that...|
|    1|You're preaching ...|[your, preach, to...|
|    0|She put herself i...|[she, put, hersel...|


In [7]:
train, test = processed.randomSplit(weights=[0.7, 0.3], seed=123)
print(train.count())
print(test.count())

70059
29941


In [8]:
from pyspark.ml import feature as spark_ft

stopWords = spark_ft.StopWordsRemover.loadDefaultStopWords('english')
sw_remover = spark_ft.StopWordsRemover(inputCol='ntokens', outputCol='clean_tokens', stopWords=stopWords)
text2vec = spark_ft.Word2Vec(
    vectorSize=50, minCount=5, seed=123, 
    inputCol='ntokens', outputCol='text_vec', 
    windowSize=5, maxSentenceLength=30
)
assembler = spark_ft.VectorAssembler(inputCols=['text_vec'], outputCol='features')
feature_pipeline = Pipeline(stages=[sw_remover, text2vec,assembler])
feature_model = feature_pipeline.fit(train)

train_featurized = feature_model.transform(train).persist()
train_featurized.count()
train_featurized.show()

+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|label|             comment|             ntokens|        clean_tokens|            text_vec|            features|
+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|    0|"""100 level and ...|[level, and, k, e...|[level, k, easfc,...|[0.00901442845093...|[0.00901442845093...|
|    0|"""Philadelphia
P...|[philadelphia, po...|[philadelphia, po...|[-0.0275632207684...|[-0.0275632207684...|
|    0|"""Stop reading! ...|[stop, read, you,...|[stop, read, read...|[-0.0815967304099...|[-0.0815967304099...|
|    0|"""Waiters"" and ...|[waiter, and, hot...|[waiter, hot, han...|[0.00434228136509...|[0.00434228136509...|
|    0|"""powerful enoug...|[power, enough, t...|[power, enough, d...|[0.06532562058418...|[0.06532562058418...|
|    0|"Also mentions ""...|[also, mention, w...|[also, mention, w...|[0.07693217100145...|[0.07

In [9]:
from pyspark.ml import classification as spark_cls


mlpc = spark_cls.MultilayerPerceptronClassifier(
    maxIter=100, seed=123, layers=[50, 25, 10,2]
)

model = mlpc.fit(train_featurized)


In [10]:
test_featurized = feature_model.transform(test)
preds = model.transform(test_featurized)
preds.show()

+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|label|             comment|             ntokens|        clean_tokens|            text_vec|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|    0|"""Add dabbing to...|[add, dab, to, mi...|[add, dab, minecr...|[0.01962281603898...|[0.01962281603898...|[0.27250760867674...|[0.67279076404908...|       0.0|
|    0|"""Chi-pol-tay"" ...|[chipoltai, liter...|[chipoltai, liter...|[0.00619682618171...|[0.00619682618171...|[-0.1024469072464...|[0.54353248079718...|       0.0|
|    0|"""Get in loser
~...|[get, in, loser, ...|[get, loser, wack...|[0.01435403401652...|[0.01435403401652...|[0.08147982811224...|[0.61920518926993...|       0.0|
|   

In [11]:
pred_df = preds.select('comment', 'label', 'prediction').toPandas()

In [12]:
pred_df.head()

Unnamed: 0,comment,label,prediction
0,"""""""Add dabbing to Minecraft""""""\nDAB ON EM",0,0.0
1,"""""""Chi-pol-tay"""" - literally everyone's mom""\n...",0,0.0
2,"""""""Get in loser\n~~Wacky~~ Moderately-Subdued ...",0,0.0
3,"""*Danny reaches for walker* """"I'm getting too ...",0,0.0
4,"""Are you worried about what will happen after ...",0,1.0


In [13]:
import pandas as pd
from sklearn import metrics as skmetrics
pd.DataFrame(
    data=skmetrics.confusion_matrix(pred_df['label'], pred_df['prediction']),
    columns=['pred ' + l for l in ['0','1']],
    index=['true ' + l for l in ['0','1']]
)

Unnamed: 0,pred 0,pred 1
true 0,12110,5207
true 1,6514,6110


In [14]:
print(skmetrics.classification_report(pred_df['label'], pred_df['prediction'], 
                                      target_names=['0','1']))

              precision    recall  f1-score   support

           0       0.65      0.70      0.67     17317
           1       0.54      0.48      0.51     12624

    accuracy                           0.61     29941
   macro avg       0.60      0.59      0.59     29941
weighted avg       0.60      0.61      0.60     29941



In [15]:
spark.stop()