### LDA with 1 topic 2 words on sample of comments on the month preceding the USA 2016 election

In [28]:
start_date = datetime.date(year=2016, month=10, day=30)
end_date = datetime.date(year=2016, month=11, day=7)

ls = [start_date, end_date]
min(ls)

datetime.date(2016, 10, 30)

In [42]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import *
import pyspark.sql.functions as func
from pyspark.sql.types import *

spark = SparkSession.builder.getOrCreate()
comments = spark.read.load('../data/sample.parquet')
comments = comments.withColumn('created', func.from_unixtime(comments['created_utc'], 'yyyy-MM-dd HH:mm:ss.SS').cast(DateType()))
comments.registerTempTable("comments")
sc = spark.sparkContext

import spacy
import gensim
from spacy.lang.en import English
import nltk
import re
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet as wn
import datetime
en_stop = set(nltk.corpus.stopwords.words('english'))
parser = English()

start_date = datetime.date(year=2016, month=10, day=30)
end_date = datetime.date(year=2016, month=11, day=7)

oct_2016_news_comments = comments.select('link_id','body','created', 'subreddit').filter(comments.created > start_date).filter(comments.created < end_date).filter(comments.subreddit == 'news')

def lda_on_posts(dataset, words_per_topic, n_topics, parser=English(), stop_words=en_stop):
    '''   
    This function performs a LDA (Latent Dirichlet Allocation) model on a set of reddit comments.
    Useful for topic modelling/extraction from a reddit post.
    Parameters
    −−−−−−−−−−
    dataset: pyspark RDD or Dataframe, schema should have only three data type : 
              the post id (link_id), the body of the comment and the creation date in this order.
              
    words_per_topic: number of words that should constitute a topic per post.
    
    n_topics: number of topics to extract by post
    
    parser: the natural language parser used, corresponds to a language normally,
            by default english (as it is the most used language on reddit).
            should be a parser from the spacy.lang library.
    
    stop_words: set of words that constitutes stop words (i.e. that should be
                removed from the tokens)

    Returns
    −−−−−−−
    A RDD with the following pair of data as rows : (<post_id>, <topic (as a list of words)>)) 
    '''
    #useful functions for preprocessing the data for LDA
    def tokenize(text):
        lda_tokens = []
        tokens = parser(text)
        for token in tokens:
            if token.orth_.isspace():
                continue
            elif token.like_url:
                lda_tokens.append('URL')
            elif token.orth_.startswith('@'):
                lda_tokens.append('SCREEN_NAME')
            else:
                lda_tokens.append(token.lower_)
        return lda_tokens

    def get_lemma(word):
        lemma = wn.morphy(word)
        if lemma is None:
            return word
        else:
            return lemma

    def get_lemma2(word):
        return WordNetLemmatizer().lemmatize(word)

    def prepare_text_for_lda(text):
        tokens = tokenize(text)
        tokens = [token for token in tokens if len(token) > 4]
        tokens = [token for token in tokens if token not in en_stop]
        tokens = [get_lemma(token) for token in tokens]
        return tokens
    
    def get_n_topics(text_data):
        dictionary = gensim.corpora.Dictionary(text_data)
        corpus = [dictionary.doc2bow(text) for text in text_data]
        ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = n_topics, id2word=dictionary, passes=15)
        topics = ldamodel.print_topics(num_words=words_per_topic)
        return topics
    
    def extract_key_words(lda_result):
        return re.findall(r'\"(.*?)\"', lda_result)

    #detecting type of the input given.
    if isinstance(dataset, pyspark.sql.dataframe.DataFrame):
        dataset = dataset.rdd
    elif not isinstance(dataset, pyspark.rdd.RDD):
        raise ValueError('Wrong type of dataset, must be either a pyspark RDD or pyspark DataFrame')
    
    #TODO : keep the minimum timestamp (r[2] of the dataset) during computations.
    
    #filtering comments that were removed, to avoid them to pollute the topics extracted
    filter_absent_comm = dataset.filter(lambda r: r[1] != '[removed]' and r[1] != '[deleted]')
    
    #applying text preprocesisng for LDA + filtering all empty sets (without tokens as the result of the LDA preprocessing)
    LDA_preprocessed = filter_absent_comm.map(lambda r: (r[0], list(prepare_text_for_lda(r[1])))).filter(lambda r: r[1])
    
    #groupy every comments by post/thread id.
    post_and_list_token = LDA_preprocessed.groupByKey().map(lambda x : (x[0], list(x[1])))
    
    #generating n topics per post/thread.
    res_lda = post_and_list_token.map(lambda r: (r[0],get_n_topics(r[1]))).flatMap(lambda r: [(r[0], t) for t in r[1]])
    
    return res_lda.map(lambda r: (r[0], ' '.join(extract_key_words(r[1][1])))).toDF().selectExpr("_1 as post_id", "_2 as topic")

res = lda_on_posts(oct_2016_news_comments.rdd.map(lambda r: r), 2, 1)
#hillary_and_trump = res.filter(lambda r: ('trump' in r[1]) or ('hillary' in r[1]) or ('donald' in r[1]) or ('clinton' in r[1]))
res.show()

+---------+--------------------+
|  post_id|               topic|
+---------+--------------------+
|t3_5axjwf|already:/ pennsyl...|
|t3_5aw066|        excuse least|
|t3_5as8g0|    amendment civics|
|t3_5aiyva|       school people|
|t3_5ap0a7|     military payday|
|t3_5apdai|              thanks|
|t3_5b5m2v|      people picture|
|t3_5b3jm9|       dealer option|
|t3_5acf8u|       police arrest|
|t3_5a61rd|       bones chicken|
|t3_5ah1kd|    people insurance|
|t3_5aih4r|        drug looking|
|t3_5b6w4x|      complain going|
|t3_5akrra|     another appoint|
|t3_5b9ik4|       money private|
|t3_5abvof|     believing great|
|t3_5bbo5w|      murderer extra|
|t3_5aqikp|         replacement|
|t3_5b7a5q|       humor officer|
|t3_5b201j|        larger force|
+---------+--------------------+
only showing top 20 rows



In [None]:
start_date = datetime.date(year=2016, month=11, day=5)
end_date = datetime.date(year=2016, month=11, day=7)



In [33]:
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma

def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

def get_n_topics(text_data):
    dictionary = gensim.corpora.Dictionary(text_data)
    corpus = [dictionary.doc2bow(text) for text in text_data]
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = n_topics, id2word=dictionary, passes=15)
    topics = ldamodel.print_topics(num_words=words_per_topic)
    return topics

def extract_key_words(lda_result):
    return re.findall(r'\"(.*?)\"', lda_result)

dataset = oct_2016_news_comments.rdd

#filtering comments that were removed, to avoid them to pollute the topics extracted
filter_absent_comm = dataset.filter(lambda r: r[1] != '[removed]' and r[1] != '[deleted]')



In [35]:
#applying text preprocesisng for LDA + filtering all empty sets (without tokens as the result of the LDA preprocessing)
LDA_preprocessed = filter_absent_comm.map(lambda r: (r[0], list(prepare_text_for_lda(r[1])), r[2])).filter(lambda r: r[1])
LDA_preprocessed.toDF().show()

+---------+--------------------+----------+
|       _1|                  _2|        _3|
+---------+--------------------+----------+
|t3_5az1f9|    [thank, skeltal]|2016-11-04|
|t3_5b1xhn|[suspect, black, ...|2016-11-04|
|t3_5b3i7p|[middle, school, ...|2016-11-04|
|t3_5b1jl9|[credibility, cro...|2016-11-04|
|t3_5b1dlb|[prove, anything,...|2016-11-04|
|t3_5bfcaf|             [write]|2016-11-06|
|t3_5bgsjz|[baltimore, enjoy...|2016-11-06|
|t3_5aw0t2|[guess, relative,...|2016-11-03|
|t3_5aw0t2|[actually, point,...|2016-11-03|
|t3_5aw0t2|[water, water, co...|2016-11-03|
|t3_5b2up5|            [potato]|2016-11-04|
|t3_5b3jm9|[first, sentence,...|2016-11-04|
|t3_5b3thi|[remember, hearin...|2016-11-04|
|t3_5b5m2v|[own, private, na...|2016-11-04|
|t3_5b3i7p|[worse, middle, s...|2016-11-04|
|t3_5b5m2v|[county, spend, f...|2016-11-04|
|t3_5b5380|[going, convict, ...|2016-11-04|
|t3_5abor2|[going, honest, d...|2016-10-31|
|t3_5bbwsp|[imagine, celebra...|2016-11-06|
+---------+--------------------+

In [39]:
#groupy every comments by post/thread id.

zeroValue = (list(), datetime.date(year=3016, month=12, day=30))

combFun = lambda l, r: (l[0]+r[0], min(l[1], r[1]))

seqFun = lambda prev, curr: (prev[0]+[curr[0]], prev[1] if prev[1] < curr[1] else curr[1])

post_and_list_token = LDA_preprocessed.map(lambda r: (r[0], (r[1], r[2]))).aggregateByKey(zeroValue, seqFun, combFun)#.map(lambda x : (x[0], list(x[1]), min(list(x[2]))))

post_and_list_token.toDF().show()

+---------+--------------------+
|       _1|                  _2|
+---------+--------------------+
|t3_5axjwf|[[[pennsylvania, ...|
|t3_5aw066|[[[least, excuse]...|
|t3_5as8g0|[[[marriage, equa...|
|t3_5aiyva|[[[freshman, scho...|
|t3_5ap0a7|[[[military, wife...|
|t3_5apdai|[[[thanks]], 2016...|
|t3_5b5m2v|[[[own, private, ...|
|t3_5b3jm9|[[[first, sentenc...|
|t3_5acf8u|[[[making, narrat...|
|t3_5a61rd|[[[toss, chicken,...|
|t3_5ah1kd|[[[health, insura...|
|t3_5aih4r|[[[breakdown, sit...|
|t3_5b6w4x|[[[reddit, compla...|
|t3_5akrra|[[[appoint, anoth...|
|t3_5b9ik4|[[[several, tribe...|
|t3_5abvof|[[[believing, val...|
|t3_5bbo5w|[[[heavily, downv...|
|t3_5aqikp|[[[replacement]],...|
|t3_5b7a5q|[[[random, person...|
|t3_5b201j|[[[small, scale, ...|
+---------+--------------------+
only showing top 20 rows



In [40]:


#generating n topics per post/thread.
res_lda = post_and_list_token.map(lambda r: (r[0],get_n_topics(r[1]), r[2])).flatMap(lambda r: [(r[0], t, r[2]) for t in r[1]])

res_lda.map(lambda r: (r[0], extract_key_words(r[1][1]), r[2])).toDF().show()



Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 67.0 failed 1 times, most recent failure: Lost task 2.0 in stage 67.0 (TID 1794, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/Applications/spark-2.3.2-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 253, in main
    process()
  File "/Applications/spark-2.3.2-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 248, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/Applications/spark-2.3.2-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/serializers.py", line 379, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/Applications/spark-2.3.2-bin-hadoop2.7/python/pyspark/rdd.py", line 1352, in takeUpToNumLeft
    yield next(iterator)
  File "/Applications/spark-2.3.2-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/util.py", line 55, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-40-d1e1ed4a069f>", line 4, in <lambda>
  File "<ipython-input-33-bafebf25b217>", line 33, in get_n_topics
  File "/anaconda3/lib/python3.6/site-packages/gensim/corpora/dictionary.py", line 79, in __init__
    self.add_documents(documents, prune_at=prune_at)
  File "/anaconda3/lib/python3.6/site-packages/gensim/corpora/dictionary.py", line 195, in add_documents
    self.doc2bow(document, allow_update=True)  # ignore the result, here we only care about updating token ids
  File "/anaconda3/lib/python3.6/site-packages/gensim/corpora/dictionary.py", line 238, in doc2bow
    counter[w if isinstance(w, unicode) else unicode(w, 'utf-8')] += 1
TypeError: decoding to str: need a bytes-like object, list found

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:330)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:470)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:453)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:284)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$$anonfun$3.apply(PythonRDD.scala:152)
	at org.apache.spark.api.python.PythonRDD$$anonfun$3.apply(PythonRDD.scala:152)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1651)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1639)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1638)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1638)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1872)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1821)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1810)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2034)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2055)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2074)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:152)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/Applications/spark-2.3.2-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 253, in main
    process()
  File "/Applications/spark-2.3.2-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/worker.py", line 248, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/Applications/spark-2.3.2-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/serializers.py", line 379, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/Applications/spark-2.3.2-bin-hadoop2.7/python/pyspark/rdd.py", line 1352, in takeUpToNumLeft
    yield next(iterator)
  File "/Applications/spark-2.3.2-bin-hadoop2.7/python/lib/pyspark.zip/pyspark/util.py", line 55, in wrapper
    return f(*args, **kwargs)
  File "<ipython-input-40-d1e1ed4a069f>", line 4, in <lambda>
  File "<ipython-input-33-bafebf25b217>", line 33, in get_n_topics
  File "/anaconda3/lib/python3.6/site-packages/gensim/corpora/dictionary.py", line 79, in __init__
    self.add_documents(documents, prune_at=prune_at)
  File "/anaconda3/lib/python3.6/site-packages/gensim/corpora/dictionary.py", line 195, in add_documents
    self.doc2bow(document, allow_update=True)  # ignore the result, here we only care about updating token ids
  File "/anaconda3/lib/python3.6/site-packages/gensim/corpora/dictionary.py", line 238, in doc2bow
    counter[w if isinstance(w, unicode) else unicode(w, 'utf-8')] += 1
TypeError: decoding to str: need a bytes-like object, list found

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:330)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:470)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:453)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:284)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$$anonfun$3.apply(PythonRDD.scala:152)
	at org.apache.spark.api.python.PythonRDD$$anonfun$3.apply(PythonRDD.scala:152)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2074)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	... 1 more


In [53]:
start_date = datetime.date(year=2016, month=11, day=5)
end_date = datetime.date(year=2016, month=11, day=7)
oct_comments = spark.read.parquet("../data/oct_2016_news_comment.parquet")
#oct_comments.first()[2]
#oct_comments = oct_comments.withColumn('created', func.from_unixtime(comments['created'], 'yyyy-MM-dd HH:mm:ss.SS').cast(DateType()))
day_b4_elec_comm = oct_comments.filter(oct_comments.created > start_date).filter(oct_comments.created < end_date)

topic_day_b4_elec = lda_on_posts(day_b4_elec_comm.rdd.map(lambda r: r), 2, 1)
topic_day_b4_elec.write.mode('overwrite').parquet('../data/test_results_lda_nov06')

In [54]:
res = spark.read.parquet('../data/test_results_lda_nov06')
res.show()

+---------+-------------------+
|  post_id|              topic|
+---------+-------------------+
|t3_5bg8ip|      never display|
|t3_5bb8oc|       would people|
|t3_5bfi4i|     people country|
|t3_5be3qj|     people refugee|
|t3_5bcre9|    pipeline people|
|t3_5bg6hy|        woman would|
|t3_5bfxt1|     would election|
|t3_5bgcq7|  claim palestinian|
|t3_5bepra|     service animal|
|t3_5bfrx1| newspaper democrat|
|t3_5bdemf|    behavior acquit|
|t3_5bfd2h|    operation would|
|t3_5ba8vb|   terrorism agency|
|t3_5b15mv|       pizza relate|
|t3_5b9puh|       queen better|
|t3_5batmk|     speech explain|
|t3_5bgqrx|    people sandwich|
|t3_5bgsjz|request information|
|t3_5b5vht|       disgust roam|
|t3_5b3kss|   education people|
+---------+-------------------+
only showing top 20 rows

