In [1]:
! wget -N https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sarcasm/train-balanced-sarcasm.csv -P /tmp

--2020-02-10 15:32:53--  https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sarcasm/train-balanced-sarcasm.csv
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.239.141
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.239.141|:443... connected.
HTTP request sent, awaiting response... 304 Not Modified
File ‘/tmp/train-balanced-sarcasm.csv’ not modified on server. Omitting download.



In [1]:
from pyspark.sql import SparkSession
import sys
import time
import sparknlp

packages = [
    'JohnSnowLabs:spark-nlp:2.4.0'
]
spark = SparkSession \
    .builder \
    .appName("ML SQL session") \
    .config('spark.jars.packages', ','.join(packages)) \
    .config('spark.executor.instances','1') \
    .config("spark.executor.memory", "2g") \
    .config("spark.driver.memory","2g") \
    .getOrCreate()

In [2]:
print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

Spark NLP version:  2.4.0
Apache Spark version:  2.4.4


In [4]:
from pyspark.sql import SQLContext

sql = SQLContext(spark)

trainBalancedSarcasmDF = spark.read.option("header", True).option("inferSchema", True).csv("/tmp/train-balanced-sarcasm.csv")
trainBalancedSarcasmDF.printSchema()

# Let's create a temp view (table) for our SQL queries
trainBalancedSarcasmDF.createOrReplaceTempView('data')

sql.sql('SELECT COUNT(*) FROM data').collect()

root
 |-- label: integer (nullable = true)
 |-- comment: string (nullable = true)
 |-- author: string (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- score: string (nullable = true)
 |-- ups: string (nullable = true)
 |-- downs: string (nullable = true)
 |-- date: string (nullable = true)
 |-- created_utc: string (nullable = true)
 |-- parent_comment: string (nullable = true)



[Row(count(1)=1010826)]

In [8]:
df = sql.sql('select label,concat(parent_comment,"\n",comment) as comment from data where comment is not null and parent_comment is not null limit 100000')
print(type(df))
print("Amount of rows:", df.count())
df = df.limit(2000) #minimize dataset if you are not running on a cluster
df.printSchema()
df.show()

<class 'pyspark.sql.dataframe.DataFrame'>
Amount of rows: 100000
root
 |-- label: integer (nullable = true)
 |-- comment: string (nullable = true)

+-----+--------------------+
|label|             comment|
+-----+--------------------+
|    0|Yeah, I get that ...|
|    0|The blazers and M...|
|    0|They're favored t...|
|    0|deadass don't kil...|
|    0|Yep can confirm I...|
|    0|do you find arian...|
|    0|What's your weird...|
|    0|Probably Sephirot...|
|    0|What to upgrade? ...|
|    0|Probably count Ka...|
|    0|I bet if that mon...|
|    0|James Shields Wil...|
|    0|There's no time t...|
|    0|Team Specific Thr...|
|    0|Ill give you a hi...|
|    0|Star Wars, easy. ...|
|    0|You're adorable.
...|
|    0|He actually acts ...|
|    0|Clinton struggles...|
|    0|Is that the Older...|
+-----+--------------------+
only showing top 20 rows



In [10]:
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

from pyspark.ml import Pipeline


document_assembler = DocumentAssembler() \
    .setInputCol("comment") \
    .setOutputCol("document")
    
sentence_detector = SentenceDetector() \
    .setInputCols(["document"]) \
    .setOutputCol("sentence") \
    .setUseAbbreviations(True)
    
tokenizer = Tokenizer() \
  .setInputCols(["sentence"]) \
  .setOutputCol("token")

stemmer = Stemmer() \
    .setInputCols(["token"]) \
    .setOutputCol("stem")
    
normalizer = Normalizer() \
    .setInputCols(["stem"]) \
    .setOutputCol("normalized")

finisher = Finisher() \
    .setInputCols(["normalized"]) \
    .setOutputCols(["ntokens"]) \
    .setOutputAsArray(True) \
    .setCleanAnnotations(True)

nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, stemmer, normalizer, finisher])
nlp_model = nlp_pipeline.fit(df)
processed = nlp_model.transform(df).persist()
processed.count()
processed.show()

+-----+--------------------+--------------------+
|label|             comment|             ntokens|
+-----+--------------------+--------------------+
|    0|Yeah, I get that ...|[yeah, i, get, th...|
|    0|The blazers and M...|[the, blazer, and...|
|    0|They're favored t...|[theyr, favor, to...|
|    0|deadass don't kil...|[deadass, dont, k...|
|    0|Yep can confirm I...|[yep, can, confir...|
|    0|do you find arian...|[do, you, find, a...|
|    0|What's your weird...|[what, your, weir...|
|    0|Probably Sephirot...|[probabl, sephiro...|
|    0|What to upgrade? ...|[what, to, upgrad...|
|    0|Probably count Ka...|[probabl, count, ...|
|    0|I bet if that mon...|[i, bet, if, that...|
|    0|James Shields Wil...|[jame, shield, wi...|
|    0|There's no time t...|[there, no, time,...|
|    0|Team Specific Thr...|[team, specif, th...|
|    0|Ill give you a hi...|[ill, give, you, ...|
|    0|Star Wars, easy. ...|[star, war, easi,...|
|    0|You're adorable.
...|  [your, ador, note]|


In [11]:
train, test = processed.randomSplit(weights=[0.7, 0.3], seed=123)
print(train.count())
print(test.count())

1401
599


In [12]:
from pyspark.ml import feature as spark_ft

stopWords = spark_ft.StopWordsRemover.loadDefaultStopWords('english')
sw_remover = spark_ft.StopWordsRemover(inputCol='ntokens', outputCol='clean_tokens', stopWords=stopWords)
text2vec = spark_ft.Word2Vec(
    vectorSize=50, minCount=5, seed=123, 
    inputCol='ntokens', outputCol='text_vec', 
    windowSize=5, maxSentenceLength=30
)
assembler = spark_ft.VectorAssembler(inputCols=['text_vec'], outputCol='features')
feature_pipeline = Pipeline(stages=[sw_remover, text2vec,assembler])
feature_model = feature_pipeline.fit(train)

train_featurized = feature_model.transform(train).persist()
train_featurized.count()
train_featurized.show()

+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|label|             comment|             ntokens|        clean_tokens|            text_vec|            features|
+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|    0|"""Agreed. I thin...|[agr, i, think, w...|[agr, think, issu...|[-0.0277247686728...|[-0.0277247686728...|
|    0|"""It's kind of h...|[it, kind, of, ha...|[kind, hard, turn...|[-0.0212900592346...|[-0.0212900592346...|
|    0|"""Mom
Holy shitb...|[mom, holi, shitb...|[mom, holi, shitb...|[-0.0079348648898...|[-0.0079348648898...|
|    0|"""People""
Umm, ...|[peopl, umm, he, ...|[peopl, umm, cant...|[-0.0139320314240...|[-0.0139320314240...|
|    0|"""Play it cool; ...|[plai, it, cool, ...|[plai, cool, plai...|[-0.0255794662419...|[-0.0255794662419...|
|    0|"""Said it last y...|[said, it, last, ...|[said, last, year...|[-0.0173314982658...|[-0.0

In [13]:
from pyspark.ml import classification as spark_cls


mlpc = spark_cls.MultilayerPerceptronClassifier(
    maxIter=100, seed=123, layers=[50, 25, 10,2]
)

model = mlpc.fit(train_featurized)


In [14]:
test_featurized = feature_model.transform(test)
preds = model.transform(test_featurized)
preds.show()

+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|label|             comment|             ntokens|        clean_tokens|            text_vec|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|    0|"""Did Hillary Cl...|[did, hillari, cl...|[hillari, clinton...|[-0.0360369030890...|[-0.0360369030890...|[0.25204980283576...|[0.87505328308586...|       0.0|
|    0|"""Gingrich
And C...|[gingrich, and, c...|[gingrich, christ...|[-0.0311310966753...|[-0.0311310966753...|[0.32648835421839...|[0.89253965071566...|       0.0|
|    0|"""Hey you wanna ...|[hei, you, wanna,...|[hei, wanna, get,...|[0.00124455507223...|[0.00124455507223...|[0.62729998326438...|[0.94636325127557...|       0.0|
|   

In [15]:
pred_df = preds.select('comment', 'label', 'prediction').toPandas()

In [16]:
pred_df.head()

Unnamed: 0,comment,label,prediction
0,"""""""Did Hillary Clinton break the law?"""" Chaffe...",0,0.0
1,"""""""Gingrich\nAnd Christie will be in charge of...",0,0.0
2,"""""""Hey you wanna get highhh""""""\nOh man, oh man...",0,0.0
3,"""""""QR Code""""\n""For some reason my brain was se...",0,0.0
4,"""""""The Germans bombed Pearl Harbor"""" Not sure ...",0,0.0


In [17]:
import pandas as pd
from sklearn import metrics as skmetrics
pd.DataFrame(
    data=skmetrics.confusion_matrix(pred_df['label'], pred_df['prediction']),
    columns=['pred ' + l for l in ['0','1']],
    index=['true ' + l for l in ['0','1']]
)

Unnamed: 0,pred 0,pred 1
true 0,537,0
true 1,62,0


In [18]:
print(skmetrics.classification_report(pred_df['label'], pred_df['prediction'], 
                                      target_names=['0','1']))

              precision    recall  f1-score   support

           0       0.90      1.00      0.95       537
           1       0.00      0.00      0.00        62

    accuracy                           0.90       599
   macro avg       0.45      0.50      0.47       599
weighted avg       0.80      0.90      0.85       599



  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
spark.stop()