In [1]:
from datetime import datetime

from pyspark.sql import SparkSession, DataFrame
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [2]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell executes, so edits to the trending_topics module take
# effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Project-local helper module; the cells below call it through the `trends` alias.
import trending_topics as trends

In [3]:
# Obtain the active Spark session (or start one) with the legacy time parser
# policy enabled.
# NOTE(review): presumably required by trends.parse_timestamps for the date
# format used in the input file -- confirm against that helper.
spark = (
    SparkSession.builder
    .config("spark.sql.legacy.timeParserPolicy", "LEGACY")
    .getOrCreate()
)

In [6]:
# Read the sample file and run the trending_topics preprocessing helpers in
# order, keeping each stage under its own name so it can be inspected.
raw = spark.read.json("../sample.json")
non_null = trends.filter_null(raw)
with_timestamps = trends.parse_timestamps(non_null)
with_topics = trends.get_topics(with_timestamps, hashtags=True)

# Later cells only need these two columns; `df` is the name they read.
df = with_topics.select("timestamp", "topic")
df.show()

# Earliest and latest timestamp present in the data.
time_range = df.select(F.min("timestamp"), F.max("timestamp"))
time_range.show()

+-------------------+---------------+
|          timestamp|          topic|
+-------------------+---------------+
|2011-02-25 15:05:12|    corinthians|
|2011-02-25 15:05:22|         ildivo|
|2011-02-25 15:05:42|     centerkita|
|2011-02-25 15:06:49|   followfriday|
|2011-02-25 15:07:52|      viatumblr|
|2011-02-25 15:09:04|    foreversnsd|
|2011-02-25 15:09:04|      ilovesnsd|
|2011-02-25 15:11:29|constantcontact|
|2011-02-25 15:13:10|           hhrs|
|2011-02-25 15:13:28| vampirediaries|
|2011-02-25 15:13:28|            tvd|
|2011-02-25 15:14:49|     nowplaying|
|2011-02-25 15:15:03|    bacamantera|
|2011-02-25 15:15:08|             ff|
|2011-02-25 15:15:08|         ffsexy|
|2011-02-25 15:15:24|        acidman|
|2011-02-25 15:15:24|        acidman|
|2011-02-25 15:15:35| teamfollowback|
|2011-02-25 15:15:56|          credo|
|2011-02-25 15:15:56|            arg|
+-------------------+---------------+
only showing top 20 rows

+-------------------+-------------------+
|     min(timestamp)

In [7]:
# Count topics per 1-hour window and list the largest counts first.
hourly_counts = trends.count_within_timeframe(df, "1 hour")
hourly_counts.orderBy(F.col("count").desc()).show()

+--------------------+------------+-----+
|              window|       topic|count|
+--------------------+------------+-----+
|[2011-02-26 17:00...|  cambiochat|   37|
|[2011-02-26 13:00...|   ficadiogo|   12|
|[2011-02-25 18:00...|          ff|   12|
|[2011-02-26 05:00...|    clericot|   11|
|[2011-02-26 07:00...| longlivetvd|   10|
|[2011-02-25 19:00...|          ff|   10|
|[2011-02-25 23:00...|          ff|    9|
|[2011-02-25 21:00...|          ff|    8|
|[2011-02-26 01:00...|          ff|    8|
|[2011-02-25 22:00...|          ff|    8|
|[2011-02-25 20:00...|          ff|    8|
|[2011-02-25 17:00...|          ff|    7|
|[2011-02-25 16:00...|          ff|    7|
|[2011-02-26 21:00...|  nowplaying|    7|
|[2011-02-26 02:00...|          ff|    6|
|[2011-02-25 15:00...|          ff|    6|
|[2011-02-26 00:00...|          ff|    5|
|[2011-02-26 21:00...|db40birthday|    5|
|[2011-02-26 03:00...|          ff|    5|
|[2011-02-25 22:00...|     90sswag|    4|
+--------------------+------------

In [8]:
# Count topics per 30-minute window, then attach a slope to each
# (window, topic) row via the trending_topics helper.
counted = trends.count_within_timeframe(df, "30 minutes")
with_slope = trends.calculate_slope(counted)

# Show the rows with the largest slope first.
steepest_first = with_slope.orderBy(F.col("slope").desc())
steepest_first.show()

+--------------------+--------------------+-----+------------------+
|              window|               topic|count|             slope|
+--------------------+--------------------+-----+------------------+
|[2011-02-26 17:00...|          cambiochat|   14|               6.0|
|[2011-02-25 15:30...|                  ff|    5|               4.0|
|[2011-02-26 21:00...|          nowplaying|    5| 3.545454545454545|
|[2011-02-25 21:00...|             90sswag|    3|               2.0|
|[2011-02-25 23:30...|             gaddafi|    3|               2.0|
|[2011-02-26 17:30...|          cambiochat|   23|             1.875|
|[2011-02-26 23:30...|                  np|    3| 1.727272727272727|
|[2011-02-25 18:00...|                  ff|    9|1.6999999999999997|
|[2011-02-25 23:30...|               libya|    3|1.6250000000000002|
|[2011-02-26 22:30...|                  fb|    3|               1.5|
|[2011-02-26 19:30...|                cuse|    2|               1.0|
|[2011-02-27 02:00...|            

In [9]:
# Query the trending topics for a single point in time. `datetime` is imported
# in the notebook's top import cell, not here, so all dependencies are
# visible in one place.
query_time = datetime.fromisoformat("2011-02-25 22:34:50")

# The third argument (5) is passed through to trends.trending_topics unchanged.
# NOTE(review): it looks like a cap on the number of rows or ranks returned --
# confirm against the helper's signature.
trends.trending_topics(with_slope, query_time, 5).show()

+--------------------+---------+-----+-------------------+----------+
|              window|    topic|count|              slope|trend_rank|
+--------------------+---------+-----+-------------------+----------+
|[2011-02-25 22:30...|       ff|    6| 0.3333333333333333|         1|
|[2011-02-25 22:30...|  90sswag|    2|0.14285714285714285|         2|
|[2011-02-25 22:30...|teamzeeti|    1|                0.0|         3|
|[2011-02-25 22:30...|   random|    1|                0.0|         3|
|[2011-02-25 22:30...|  fashion|    1|                0.0|         3|
+--------------------+---------+-----+-------------------+----------+

