![](https://memesbams.com/wp-content/uploads/2017/11/sheldon-sarcasm-meme.jpg)

https://www.kaggle.com/danofer/sarcasm
<div class="markdown-converter__text--rendered"><h3>Context</h3>

<p>This dataset contains 1.3 million Sarcastic comments from the Internet commentary website Reddit. The dataset was generated by scraping comments from Reddit (not by me :)) containing the <code>\s</code> ( sarcasm) tag. This tag is often used by Redditors to indicate that their comment is in jest and not meant to be taken seriously, and is generally a reliable indicator of sarcastic comment content.</p>

<h3>Content</h3>

<p>Data has balanced and imbalanced (i.e true distribution) versions. (True ratio is about 1:100). The
corpus has 1.3 million sarcastic statements, along with what they responded to as well as many non-sarcastic comments from the same source.</p>

<p>Labelled comments are in the <code>train-balanced-sarcasm.csv</code> file.</p>

<h3>Acknowledgements</h3>

<p>The data was gathered by: Mikhail Khodak and Nikunj Saunshi and Kiran Vodrahalli for their article "<a href="https://arxiv.org/abs/1704.05579" rel="nofollow">A Large Self-Annotated Corpus for Sarcasm</a>". The data is hosted <a href="http://nlp.cs.princeton.edu/SARC/0.0/" rel="nofollow">here</a>.</p>

<p>Citation:</p>

<pre><code>@unpublished{SARC,
  authors={Mikhail Khodak and Nikunj Saunshi and Kiran Vodrahalli},
  title={A Large Self-Annotated Corpus for Sarcasm},
  url={https://arxiv.org/abs/1704.05579},
  year=2017
}
</code></pre>

<p><a href="http://nlp.cs.princeton.edu/SARC/0.0/readme.txt" rel="nofollow">Annotation of files in the original dataset: readme.txt</a>.</p>

<h3>Inspiration</h3>

<ul>
<li>Predicting sarcasm and relevant NLP features (e.g. subjective determinant, racism, conditionals, sentiment heavy words, "Internet Slang" and specific phrases). </li>
<li>Sarcasm vs Sentiment</li>
<li>Unusual linguistic features such as caps, italics, or elongated words. e.g., "Yeahhh, I'm sure THAT is the right answer".</li>
<li>Topics that people tend to react to sarcastically</li>
</ul></div>

In [1]:
import sys
import time
import sparknlp

from pyspark.sql import SparkSession
packages = [
    'JohnSnowLabs:spark-nlp: 2.4.2'
]
spark = SparkSession \
    .builder \
    .appName("ML SQL session") \
    .config('spark.jars.packages', ','.join(packages)) \
    .config('spark.executor.instances','1') \
    .config("spark.executor.memory", "2g") \
    .config("spark.driver.memory","2g") \
    .getOrCreate()

In [2]:
print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

Spark NLP version:   2.4.2
Apache Spark version:  2.4.4


In [3]:
! wget -N https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sarcasm/train-balanced-sarcasm.csv -P /tmp

--2020-02-10 15:35:40--  https://s3.amazonaws.com/auxdata.johnsnowlabs.com/public/resources/en/sarcasm/train-balanced-sarcasm.csv
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.45.30
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.45.30|:443... connected.
HTTP request sent, awaiting response... 304 Not Modified
File ‘/tmp/train-balanced-sarcasm.csv’ not modified on server. Omitting download.



In [4]:
from pyspark.sql import SQLContext

sql = SQLContext(spark)

trainBalancedSarcasmDF = spark.read.option("header", True).option("inferSchema", True).csv("/tmp/train-balanced-sarcasm.csv")
trainBalancedSarcasmDF.printSchema()

# Let's create a temp view (table) for our SQL queries
trainBalancedSarcasmDF.createOrReplaceTempView('data')

sql.sql('SELECT COUNT(*) FROM data').collect()

root
 |-- label: integer (nullable = true)
 |-- comment: string (nullable = true)
 |-- author: string (nullable = true)
 |-- subreddit: string (nullable = true)
 |-- score: string (nullable = true)
 |-- ups: string (nullable = true)
 |-- downs: string (nullable = true)
 |-- date: string (nullable = true)
 |-- created_utc: string (nullable = true)
 |-- parent_comment: string (nullable = true)



[Row(count(1)=1010826)]

In [5]:
sql.sql('select * from data limit 20').show()

+-----+--------------------+------------------+------------------+-----+---+-----+-------+-------------------+--------------------+
|label|             comment|            author|         subreddit|score|ups|downs|   date|        created_utc|      parent_comment|
+-----+--------------------+------------------+------------------+-----+---+-----+-------+-------------------+--------------------+
|    0|          NC and NH.|         Trumpbart|          politics|    2| -1|   -1|2016-10|2016-10-16 23:55:23|Yeah, I get that ...|
|    0|You do know west ...|         Shbshb906|               nba|   -4| -1|   -1|2016-11|2016-11-01 00:24:10|The blazers and M...|
|    0|They were underdo...|          Creepeth|               nfl|    3|  3|    0|2016-09|2016-09-22 21:45:37|They're favored t...|
|    0|"This meme isn't ...|         icebrotha|BlackPeopleTwitter|   -8| -1|   -1|2016-10|2016-10-18 21:03:47|deadass don't kil...|
|    0|I could use one o...|         cush2push|MaddenUltimateTeam|    6| -1|

In [6]:
sql.sql('select label,count(*) as cnt from data group by label order by cnt desc').show()

+-----+------+
|label|   cnt|
+-----+------+
|    0|505413|
|    1|505413|
+-----+------+



In [6]:
sql.sql('select count(*) from data where comment is null').collect()

[Row(count(1)=53)]

In [10]:
df = sql.sql('select label,concat(parent_comment,"\n",comment) as comment from data where comment is not null and parent_comment is not null limit 100000')
print(type(df))
df.printSchema()
print("Amount of rows:", df.count())
df = df.limit(2000) #minimize dataset if you are not running on a cluster
df.show()

<class 'pyspark.sql.dataframe.DataFrame'>
root
 |-- label: integer (nullable = true)
 |-- comment: string (nullable = true)

Amount of rows: 100000
+-----+--------------------+
|label|             comment|
+-----+--------------------+
|    0|Yeah, I get that ...|
|    0|The blazers and M...|
|    0|They're favored t...|
|    0|deadass don't kil...|
|    0|Yep can confirm I...|
|    0|do you find arian...|
|    0|What's your weird...|
|    0|Probably Sephirot...|
|    0|What to upgrade? ...|
|    0|Probably count Ka...|
|    0|I bet if that mon...|
|    0|James Shields Wil...|
|    0|There's no time t...|
|    0|Team Specific Thr...|
|    0|Ill give you a hi...|
|    0|Star Wars, easy. ...|
|    0|You're adorable.
...|
|    0|He actually acts ...|
|    0|Clinton struggles...|
|    0|Is that the Older...|
+-----+--------------------+
only showing top 20 rows



In [11]:
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *

from pyspark.ml import Pipeline


document_assembler = DocumentAssembler() \
    .setInputCol("comment") \
    .setOutputCol("document")
    
sentence_detector = SentenceDetector() \
    .setInputCols(["document"]) \
    .setOutputCol("sentence") \
    .setUseAbbreviations(True)
    
tokenizer = Tokenizer() \
  .setInputCols(["sentence"]) \
  .setOutputCol("token")

stemmer = Stemmer() \
    .setInputCols(["token"]) \
    .setOutputCol("stem")
    
normalizer = Normalizer() \
    .setInputCols(["stem"]) \
    .setOutputCol("normalized")

finisher = Finisher() \
    .setInputCols(["normalized"]) \
    .setOutputCols(["ntokens"]) \
    .setOutputAsArray(True) \
    .setCleanAnnotations(True)

nlp_pipeline = Pipeline(stages=[document_assembler, sentence_detector, tokenizer, stemmer, normalizer, finisher])
nlp_model = nlp_pipeline.fit(df)
processed = nlp_model.transform(df).persist()
processed.count()
processed.show()

+-----+--------------------+--------------------+
|label|             comment|             ntokens|
+-----+--------------------+--------------------+
|    0|Yeah, I get that ...|[yeah, i, get, th...|
|    0|The blazers and M...|[the, blazer, and...|
|    0|They're favored t...|[theyr, favor, to...|
|    0|deadass don't kil...|[deadass, dont, k...|
|    0|Yep can confirm I...|[yep, can, confir...|
|    0|do you find arian...|[do, you, find, a...|
|    0|What's your weird...|[what, your, weir...|
|    0|Probably Sephirot...|[probabl, sephiro...|
|    0|What to upgrade? ...|[what, to, upgrad...|
|    0|Probably count Ka...|[probabl, count, ...|
|    0|I bet if that mon...|[i, bet, if, that...|
|    0|James Shields Wil...|[jame, shield, wi...|
|    0|There's no time t...|[there, no, time,...|
|    0|Team Specific Thr...|[team, specif, th...|
|    0|Ill give you a hi...|[ill, give, you, ...|
|    0|Star Wars, easy. ...|[star, war, easi,...|
|    0|You're adorable.
...|  [your, ador, note]|


In [12]:
train, test = processed.randomSplit(weights=[0.7, 0.3], seed=123)
print(train.count())
print(test.count())

1401
599


In [13]:
from pyspark.ml import feature as spark_ft

stopWords = spark_ft.StopWordsRemover.loadDefaultStopWords('english')
sw_remover = spark_ft.StopWordsRemover(inputCol='ntokens', outputCol='clean_tokens', stopWords=stopWords)
tf = spark_ft.CountVectorizer(vocabSize=500, inputCol='clean_tokens', outputCol='tf')
idf = spark_ft.IDF(minDocFreq=5, inputCol='tf', outputCol='idf')

feature_pipeline = Pipeline(stages=[sw_remover, tf, idf])
feature_model = feature_pipeline.fit(train)

train_featurized = feature_model.transform(train).persist()
train_featurized.count()
train_featurized.show()

+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|label|             comment|             ntokens|        clean_tokens|                  tf|                 idf|
+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|    0|"""Agreed. I thin...|[agr, i, think, w...|[agr, think, issu...|(500,[0,1,7,9,31,...|(500,[0,1,7,9,31,...|
|    0|"""It's kind of h...|[it, kind, of, ha...|[kind, hard, turn...|(500,[4,6,133,135...|(500,[4,6,133,135...|
|    0|"""Mom
Holy shitb...|[mom, holi, shitb...|[mom, holi, shitb...|(500,[414,484],[1...|(500,[414,484],[5...|
|    0|"""People""
Umm, ...|[peopl, umm, he, ...|[peopl, umm, cant...|(500,[7,49],[1.0,...|(500,[7,49],[2.44...|
|    0|"""Play it cool; ...|[plai, it, cool, ...|[plai, cool, plai...|(500,[21,57,77,18...|(500,[21,57,77,18...|
|    0|"""Said it last y...|[said, it, last, ...|[said, last, year...|(500,[1,4,19,25,2...|(500,

In [14]:
train_featurized.groupBy("label").count().show()
train_featurized.printSchema()

+-----+-----+
|label|count|
+-----+-----+
|    0| 1285|
|    1|  116|
+-----+-----+

root
 |-- label: integer (nullable = true)
 |-- comment: string (nullable = true)
 |-- ntokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- clean_tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- tf: vector (nullable = true)
 |-- idf: vector (nullable = true)



In [15]:
from pyspark.ml import classification as spark_cls

rf = spark_cls. RandomForestClassifier(labelCol="label", featuresCol="idf", numTrees=100)

model = rf.fit(train_featurized)

In [16]:
test_featurized = feature_model.transform(test)
preds = model.transform(test_featurized)
preds.show()

+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|label|             comment|             ntokens|        clean_tokens|                  tf|                 idf|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|    0|"""Did Hillary Cl...|[did, hillari, cl...|[hillari, clinton...|(500,[27,108,158,...|(500,[27,108,158,...|[91.6120253196463...|[0.91612025319646...|       0.0|
|    0|"""Gingrich
And C...|[gingrich, and, c...|[gingrich, christ...|(500,[13,107,495]...|(500,[13,107,495]...|[92.6657623563818...|[0.92665762356381...|       0.0|
|    0|"""Hey you wanna ...|[hei, you, wanna,...|[hei, wanna, get,...|(500,[5,6,10,16,7...|(500,[5,6,10,16,7...|[92.0788263446630...|[0.92078826344662...|       0.0|
|   

In [17]:
pred_df = preds.select('comment', 'label', 'prediction').toPandas()

In [18]:
pred_df.head()

Unnamed: 0,comment,label,prediction
0,"""""""Did Hillary Clinton break the law?"""" Chaffe...",0,0.0
1,"""""""Gingrich\nAnd Christie will be in charge of...",0,0.0
2,"""""""Hey you wanna get highhh""""""\nOh man, oh man...",0,0.0
3,"""""""QR Code""""\n""For some reason my brain was se...",0,0.0
4,"""""""The Germans bombed Pearl Harbor"""" Not sure ...",0,0.0


In [19]:
import pandas as pd
from sklearn import metrics as skmetrics
pd.DataFrame(
    data=skmetrics.confusion_matrix(pred_df['label'], pred_df['prediction']),
    columns=['pred ' + l for l in ['0','1']],
    index=['true ' + l for l in ['0','1']]
)

Unnamed: 0,pred 0,pred 1
true 0,537,0
true 1,62,0


In [20]:
print(skmetrics.classification_report(pred_df['label'], pred_df['prediction'], 
                                      target_names=['0','1']))

              precision    recall  f1-score   support

           0       0.90      1.00      0.95       537
           1       0.00      0.00      0.00        62

    accuracy                           0.90       599
   macro avg       0.45      0.50      0.47       599
weighted avg       0.80      0.90      0.85       599



  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
spark.stop()