In [4]:
! wget -N https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sarcasm/train-balanced-sarcasm.csv -P /tmp

--2019-03-25 10:33:16--  https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sarcasm/train-balanced-sarcasm.csv
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.178.85
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.178.85|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 255268960 (243M) [text/csv]
Saving to: ‘/tmp/train-balanced-sarcasm.csv’


2019-03-25 10:33:35 (12.6 MB/s) - ‘/tmp/train-balanced-sarcasm.csv’ saved [255268960/255268960]



In [4]:
from pyspark.sql import SparkSession
import sys
import time

packages = [
    'JohnSnowLabs:spark-nlp:2.0.1'
]
spark = SparkSession \
    .builder \
    .appName("ML SQL session") \
    .config('spark.jars.packages', ','.join(packages)) \
    .config('spark.executor.instances','4') \
    .config("spark.executor.memory", "16g") \
    .config("spark.driver.memory","16g") \
    .getOrCreate()

In [5]:
from pyspark.sql import SQLContext

sql = SQLContext(spark)

trainBalancedSarcasmDF = spark.read.option("header", True).option("inferSchema", True).csv("/tmp/train-balanced-sarcasm.csv")
trainBalancedSarcasmDF.printSchema()

# Let's create a temp view (table) for our SQL queries
trainBalancedSarcasmDF.createOrReplaceTempView('data')

sql.sql('SELECT COUNT(*) FROM data').collect()

root
 |-- label: integer (nullable = true)
 |-- comment: string (nullable = true)
 |-- author: string (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- score: string (nullable = true)
 |-- ups: string (nullable = true)
 |-- downs: string (nullable = true)
 |-- date: string (nullable = true)
 |-- created_utc: string (nullable = true)
 |-- parent_comment: string (nullable = true)



[Row(count(1)=1010826)]

In [6]:
df = sql.sql('select label,concat(parent_comment,"\n",comment) as comment from data where comment is not null and parent_comment is not null limit 100000')
print(type(df))
df.printSchema()
df.show()

<class 'pyspark.sql.dataframe.DataFrame'>
root
 |-- label: integer (nullable = true)
 |-- comment: string (nullable = true)

+-----+--------------------+
|label|             comment|
+-----+--------------------+
|    0|Yeah, I get that ...|
|    0|The blazers and M...|
|    0|They're favored t...|
|    0|deadass don't kil...|
|    0|Yep can confirm I...|
|    0|do you find arian...|
|    0|What's your weird...|
|    0|Probably Sephirot...|
|    0|What to upgrade? ...|
|    0|Probably count Ka...|
|    0|I bet if that mon...|
|    0|James Shields Wil...|
|    0|There's no time t...|
|    0|Team Specific Thr...|
|    0|Ill give you a hi...|
|    0|Star Wars, easy. ...|
|    0|You're adorable.
...|
|    0|He actually acts ...|
|    0|Clinton struggles...|
|    0|Is that the Older...|
+-----+--------------------+
only showing top 20 rows



In [7]:
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

from pyspark.ml import Pipeline


document_assembler = DocumentAssembler() \
    .setInputCol("comment") \
    .setOutputCol("document")
    
sentence_detector = SentenceDetector() \
    .setInputCols(["document"]) \
    .setOutputCol("sentence") \
    .setUseAbbreviations(True)
    
tokenizer = Tokenizer() \
  .setInputCols(["sentence"]) \
  .setOutputCol("token")

stemmer = Stemmer() \
    .setInputCols(["token"]) \
    .setOutputCol("stem")
    
normalizer = Normalizer() \
    .setInputCols(["stem"]) \
    .setOutputCol("normalized")

finisher = Finisher() \
    .setInputCols(["normalized"]) \
    .setOutputCols(["ntokens"]) \
    .setOutputAsArray(True) \
    .setCleanAnnotations(True)

nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, stemmer, normalizer, finisher])
nlp_model = nlp_pipeline.fit(df)
processed = nlp_model.transform(df).persist()
processed.count()
processed.show()

+-----+--------------------+--------------------+
|label|             comment|             ntokens|
+-----+--------------------+--------------------+
|    0|Yeah, I get that ...|[yeah, i, get, th...|
|    0|The blazers and M...|[the, blazer, and...|
|    0|They're favored t...|[thei, re, favor,...|
|    0|deadass don't kil...|[deadass, do, nt,...|
|    0|Yep can confirm I...|[yep, can, confir...|
|    0|do you find arian...|[do, you, find, a...|
|    0|What's your weird...|[what, your, weir...|
|    0|Probably Sephirot...|[probabl, sephiro...|
|    0|What to upgrade? ...|[what, to, upgrad...|
|    0|Probably count Ka...|[probabl, count, ...|
|    0|I bet if that mon...|[i, bet, if, that...|
|    0|James Shields Wil...|[jame, shield, wi...|
|    0|There's no time t...|[there, no, time,...|
|    0|Team Specific Thr...|[team, specif, th...|
|    0|Ill give you a hi...|[ill, give, you, ...|
|    0|Star Wars, easy. ...|[star, war, easi,...|
|    0|You're adorable.
...|[you, re, ador, n...|


In [8]:
train, test = processed.randomSplit(weights=[0.7, 0.3], seed=123)
print(train.count())
print(test.count())

70136
29864


In [9]:
from pyspark.ml import feature as spark_ft

stopWords = spark_ft.StopWordsRemover.loadDefaultStopWords('english')
sw_remover = spark_ft.StopWordsRemover(inputCol='ntokens', outputCol='clean_tokens', stopWords=stopWords)
text2vec = spark_ft.Word2Vec(
    vectorSize=50, minCount=5, seed=123, 
    inputCol='ntokens', outputCol='text_vec', 
    windowSize=5, maxSentenceLength=30
)
assembler = spark_ft.VectorAssembler(inputCols=['text_vec'], outputCol='features')
feature_pipeline = Pipeline(stages=[sw_remover, text2vec,assembler])
feature_model = feature_pipeline.fit(train)

train_featurized = feature_model.transform(train).persist()
train_featurized.count()
train_featurized.show()

+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|label|             comment|             ntokens|        clean_tokens|            text_vec|            features|
+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|    0|              !
Goes|               [goe]|               [goe]|[-0.1480705887079...|[-0.1480705887079...|
|    0|!completed
!compl...|  [complet, complet]|  [complet, complet]|[-0.0071502337232...|[-0.0071502337232...|
|    0|""" ""Very Right ...|[veri, right, win...|[veri, right, win...|[-0.1590647930279...|[-0.1590647930279...|
|    0|""" Perhaps you n...|[perhap, you, ne,...|[perhap, ne, stro...|[-0.0207198142095...|[-0.0207198142095...|
|    0|""" This covering...|[thi, cover, not,...|[thi, cover, onli...|[-0.0748528920636...|[-0.0748528920636...|
|    0|"""*Kirk
I am sin...|[kirk, i, am, sin...|[kirk, singl, gue...|[-0.0922344097867...|[-0.0

In [10]:
from pyspark.ml import classification as spark_cls


mlpc = spark_cls.MultilayerPerceptronClassifier(
    maxIter=100, seed=123, layers=[50, 25, 10,2]
)

model = mlpc.fit(train_featurized)


CPU times: user 23.9 ms, sys: 30.2 ms, total: 54.1 ms
Wall time: 1min 21s


In [11]:
test_featurized = feature_model.transform(test)
preds = model.transform(test_featurized)
preds.show()

+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|label|             comment|             ntokens|        clean_tokens|            text_vec|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|    0|!RemindMe 1 week
...|[remindm, week, r...|[remindm, week, r...|[0.11306838598102...|[0.11306838598102...|[0.89789200131038...|[0.77982102193697...|       0.0|
|    0|!Remindme 2 weeks...|[remindm, week, r...|[remindm, week, r...|[0.22176272049546...|[0.22176272049546...|[1.01256287699695...|[0.84369743152748...|       0.0|
|    0|!SH!TPOST!: All t...|[shtpost, all, th...|[shtpost, poor, u...|[-0.1059518517660...|[-0.1059518517660...|[0.12477470407395...|[0.39505276118435...|       1.0|
|   

In [12]:
pred_df = preds.select('comment', 'label', 'prediction').toPandas()

In [13]:
pred_df.head()

Unnamed: 0,comment,label,prediction
0,!RemindMe 1 week\n!RemindMe 2 days,0,0.0
1,!Remindme 2 weeks\n!Remindme 2 weeks,0,0.0
2,!SH!TPOST!: All those poor USA Streamers\nNow ...,0,1.0
3,"""""""**FUCK** Cloud"""" - Link main""\nYep, that's ...",0,0.0
4,"""""""*Komrad\n""*""""Those were just pro-USA rebels...",0,1.0


In [14]:
import pandas as pd
from sklearn import metrics as skmetrics
pd.DataFrame(
    data=skmetrics.confusion_matrix(pred_df['label'], pred_df['prediction']),
    columns=['pred ' + l for l in ['0','1']],
    index=['true ' + l for l in ['0','1']]
)

Unnamed: 0,pred 0,pred 1
true 0,13928,3296
true 1,8415,4225


In [15]:
print(skmetrics.classification_report(pred_df['label'], pred_df['prediction'], 
                                      target_names=['0','1']))

              precision    recall  f1-score   support

           0       0.62      0.81      0.70     17224
           1       0.56      0.33      0.42     12640

   micro avg       0.61      0.61      0.61     29864
   macro avg       0.59      0.57      0.56     29864
weighted avg       0.60      0.61      0.58     29864



In [16]:
spark.stop()