In [164]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [165]:
spark = SparkSession.builder.getOrCreate()

In [166]:
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

In [167]:
data = spark.read.json("sample.json")

In [168]:
with_timestamps = (data
 .where(~(F.isnull("created_at") | F.isnull("text")))
 .select(
     F.to_timestamp("created_at", "EEE MMM dd HH:mm:ss Z yyyy").alias("timestamp"),
     F.split("text", r"\s+").alias("split_text"),
 )
 .where(~F.isnull("timestamp"))
)

In [169]:
with_timestamps.show()

+-------------------+-----------------------------------+
|          timestamp|                         split_text|
+-------------------+-----------------------------------+
|2011-02-25 15:04:49|            [@sai3ki, おやすみ＾＾]|
|2011-02-25 15:04:56|             [感動を返せ王子wwwwww]|
|2011-02-25 15:05:02|               [RT, @IAmSteveHar...|
|2011-02-25 15:05:03|               [@67terremoto, ar...|
|2011-02-25 15:05:07| [え!!どうしたんめっちゃおもろい...|
|2011-02-25 15:05:12|               [RT, @Corinthians...|
|2011-02-25 15:05:22|               [Acompanhem, as, ...|
|2011-02-25 15:05:26|               [GO, FOLLOW, THIS...|
|2011-02-25 15:05:33|               [Genit, ah, RT, @...|
|2011-02-25 15:05:39|               [é, realmente, me...|
|2011-02-25 15:05:42|  [『ヒグチ薬局、本日閉店。ｍ(__...|
|2011-02-25 15:05:51|                 [@izabelll, TGIF!]|
|2011-02-25 15:05:58|              [@barley_candy, あ...|
|2011-02-25 15:06:13|               [@Joanaacantillo2...|
|2011-02-25 15:06:18|[会議ー　…めんどくさいからさ、い...|
|2011-02-25 

In [170]:
words = (with_timestamps
 .select(
     F.month("timestamp").alias("month"),
     F.dayofyear("timestamp").alias("dayofyear"),
     F.explode("split_text").alias("word"),
 )
)

In [171]:
words.show()

+-----+---------+--------------------+
|month|dayofyear|                word|
+-----+---------+--------------------+
|    2|       56|             @sai3ki|
|    2|       56|        おやすみ＾＾|
|    2|       56|感動を返せ王子wwwwww|
|    2|       56|                  RT|
|    2|       56|    @IAmSteveHarvey:|
|    2|       56|              Having|
|    2|       56|                   a|
|    2|       56|        relationship|
|    2|       56|                with|
|    2|       56|                 God|
|    2|       56|                  is|
|    2|       56|                  an|
|    2|       56|                  on|
|    2|       56|               going|
|    2|       56|            process.|
|    2|       56|                 You|
|    2|       56|                have|
|    2|       56|                  to|
|    2|       56|                work|
|    2|       56|                  at|
+-----+---------+--------------------+
only showing top 20 rows



In [176]:
counts = (words
 .groupBy("dayofyear", "word")
 .agg(F.count("*").alias("count"))
 .orderBy(F.desc("count"))
)

In [177]:
counts.show()

+---------+----+-----+
|dayofyear|word|count|
+---------+----+-----+
|       57|  RT| 1996|
|       57|   a| 1161|
|       58|  RT|  991|
|       56|  RT|  951|
|       57|  to|  912|
|       57|   I|  902|
|       57| the|  859|
|       57|  de|  685|
|       56|   a|  596|
|       57| que|  594|
|       57| you|  562|
|       56| the|  523|
|       58|   a|  495|
|       57|  me|  495|
|       57|  in|  483|
|       56|  to|  476|
|       57| and|  471|
|       57|  is|  439|
|       58|  to|  436|
|       56|   I|  435|
+---------+----+-----+
only showing top 20 rows



In [182]:
window = Window.partitionBy("word").orderBy("dayofyear").rowsBetween(Window.currentRow - 1, Window.currentRow)
(counts
 .withColumn(
     "delta",
     F.last("count").over(window) - F.first("count").over(window),
 )
 .orderBy(F.asc("word"))
).show()

+---------+----------+-----+-----+
|dayofyear|      word|count|delta|
+---------+----------+-----+-----+
|       57|$Passei|    1|    0|
|       57|         !|  209|  102|
|       58|         !|   83| -126|
|       56|         !|  107|    0|
|       56|        !!|   17|    0|
|       57|        !!|   54|   37|
|       58|        !!|   12|  -42|
|       58|       !!!|    9|  -18|
|       57|       !!!|   27|   12|
|       56|       !!!|   15|    0|
|       57|      !!!!|    8|    0|
|       58|      !!!!|    2|   -6|
|       57|     !!!!!|    2|   -1|
|       58|     !!!!!|    1|   -1|
|       56|     !!!!!|    3|    0|
|       58|    !!!!!!|    1|   -1|
|       57|    !!!!!!|    2|    0|
|       56|    !!!!!!|    2|    0|
|       57|   !!!!!!!|    1|    0|
|       56|   !!!!!!!|    1|    0|
+---------+----------+-----+-----+
only showing top 20 rows

