Reference: Twitter tweet object data dictionary — https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/tweet-object

- If a tweet quotes another tweet, concatenate the texts of both tweets (done).
- quais as 10 hashtags que foram mais publicadas dentro de uma hora qualquer durante o período da coleta? Isto é, para cada hora, do minuto 00 até o minuto 59, conte quantas vezes cada hashtag aparece. Ao final, identifique quais as 10 hashtags diferentes, o dia/hora em que cada uma foi mais frequente e quantas vezes ela apareceu naquela hora.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [2]:
# Fields of the raw tweet object that the cleaning steps below rely on.
important_columns = [
    "coordinates",
    "entities",
    "id_str",
    "created_at",
    "is_quote_status",
    "quoted_status_id_str",
    "text",
]

In [3]:
# Reuse the active Spark session if there is one; otherwise start a new one.
spark = (
    SparkSession.builder
    .appName('pratical_work')
    .getOrCreate()
)

In [4]:
# Load the raw tweets (JSON) from HDFS; the schema is inferred by Spark.
data = spark.read.json(path="hdfs:///datasets/geo_curitiba")

In [5]:
# Keep only the columns listed in `important_columns`.
data = data.select(*important_columns)

In [6]:
# Peek at a single row to check the selected columns.
data.show(n=1)

+-----------+-----------------+------------------+---------------+---------------+--------------------+--------------------+
|coordinates|         entities|            id_str|     created_at|is_quote_status|quoted_status_id_str|                text|
+-----------+-----------------+------------------+---------------+---------------+--------------------+--------------------+
|       null|[[],, [], [], []]|726561770303840256|[1462060799000]|          false|                null|Olha o universo j...|
+-----------+-----------------+------------------+---------------+---------------+--------------------+--------------------+
only showing top 1 row

+-----------+-----------------+------------------+---------------+---------------+--------------------+--------------------+
|coordinates|         entities|            id_str|     created_at|is_quote_status|quoted_status_id_str|                text|
+-----------+-----------------+------------------+---------------+---------------+-------------------

In [7]:
# Build a driver-side lookup {tweet id -> tweet text} containing only the
# tweets that are quoted by some other tweet in this dataset.
#
# The lookup has to be keyed by the quoted tweet's own `id_str`. Keying it by
# `quoted_status_id_str` pairs each quoted id with the text of the tweet that
# is doing the quoting, so a later lookup by `quoted_status_id_str` would
# return the quoting tweet's text rather than the quoted tweet's text.
#
# Quoted tweets that are not part of this dataset simply have no entry.
# NOTE(review): if the raw JSON also carries an embedded `quoted_status`
# object, reading its text directly would cover those cases too — confirm
# against the input schema.
quoted_ids = data.select(F.col("quoted_status_id_str").alias("id_str"))\
                 .where(F.col("id_str").isNotNull())\
                 .distinct()

# `left_semi` keeps only the tweets whose id appears among the quoted ids,
# so just the rows that are actually needed are collected to the driver.
# Rows without text are dropped so the lookup never yields None.
map_data = data.select("id_str", "text")\
               .where(F.col("text").isNotNull())\
               .join(quoted_ids, on="id_str", how="left_semi")\
               .collect()
map_dict = {row["id_str"]: row["text"] for row in map_data}

In [8]:
# UDF that looks a tweet id up in `map_dict`; ids without an entry
# (including null ids) map to an empty string.
map_func = F.udf(lambda tweet_id: map_dict.get(tweet_id, ""))

In [9]:
# Add a `quoted_text` column by passing each row's quoted-status id through the lookup UDF.
data = data.withColumn(
    "quoted_text",
    map_func(data["quoted_status_id_str"]),
)

In [10]:
# Peek at a single row to check the new `quoted_text` column.
data.show(n=1)

+-----------+-----------------+------------------+---------------+---------------+--------------------+--------------------+-----------+
|coordinates|         entities|            id_str|     created_at|is_quote_status|quoted_status_id_str|                text|quoted_text|
+-----------+-----------------+------------------+---------------+---------------+--------------------+--------------------+-----------+
|       null|[[],, [], [], []]|726561770303840256|[1462060799000]|          false|                null|Olha o universo j...|           |
+-----------+-----------------+------------------+---------------+---------------+--------------------+--------------------+-----------+
only showing top 1 row

+-----------+-----------------+------------------+---------------+---------------+--------------------+--------------------+-----------+
|coordinates|         entities|            id_str|     created_at|is_quote_status|quoted_status_id_str|                text|quoted_text|
+-----------+----

In [11]:
# For quote tweets, append the quoted text to the tweet's own text.
# The concatenation is applied only when a non-empty quoted text is available:
#   * F.concat returns null if any input is null, which would erase `text`;
#   * an empty `quoted_text` would otherwise leave a dangling trailing space.
# Every other row keeps its original `text` unchanged.
has_quoted_text = (
    (F.col("is_quote_status") == F.lit(True))
    & F.col("quoted_text").isNotNull()
    & (F.col("quoted_text") != "")
)
data = data.withColumn(
    "text",
    F.when(
        has_quoted_text,
        F.concat(F.col("text"), F.lit(" "), F.col("quoted_text")),
    ).otherwise(F.col("text")),
)

In [12]:
# Peek at a single row after merging the quoted text into `text`.
data.show(n=1)

+-----------+-----------------+------------------+---------------+---------------+--------------------+--------------------+-----------+
|coordinates|         entities|            id_str|     created_at|is_quote_status|quoted_status_id_str|                text|quoted_text|
+-----------+-----------------+------------------+---------------+---------------+--------------------+--------------------+-----------+
|       null|[[],, [], [], []]|726561770303840256|[1462060799000]|          false|                null|Olha o universo j...|           |
+-----------+-----------------+------------------+---------------+---------------+--------------------+--------------------+-----------+
only showing top 1 row

+-----------+-----------------+------------------+---------------+---------------+--------------------+--------------------+-----------+
|coordinates|         entities|            id_str|     created_at|is_quote_status|quoted_status_id_str|                text|quoted_text|
+-----------+----

In [13]:
# Lift the `hashtags` field out of the `entities` struct into its own column.
data = data.withColumn(
    "hashtags",
    data["entities"]["hashtags"],
)

In [14]:
# Columns kept in the cleaned dataset.
important_columns = [
    "coordinates",
    "id_str",
    "text",
    "hashtags",
    "created_at",
]

In [15]:
# Narrow the frame down to the columns listed in `important_columns`.
data = data.select(*important_columns)

In [16]:
# Peek at a single row of the final column set.
data.show(n=1)

+-----------+------------------+--------------------+--------+---------------+
|coordinates|            id_str|                text|hashtags|     created_at|
+-----------+------------------+--------------------+--------+---------------+
|       null|726561770303840256|Olha o universo j...|      []|[1462060799000]|
+-----------+------------------+--------------------+--------+---------------+
only showing top 1 row

+-----------+------------------+--------------------+--------+---------------+
|coordinates|            id_str|                text|hashtags|     created_at|
+-----------+------------------+--------------------+--------+---------------+
|       null|726561770303840256|Olha o universo j...|      []|[1462060799000]|
+-----------+------------------+--------------------+--------+---------------+
only showing top 1 row



In [17]:
# Persist the cleaned tweets as Parquet.
# The default save mode raises an error when the target path already exists,
# which makes the notebook fail on any re-run; overwrite explicitly so that
# Restart & Run All stays reproducible.
data.write.mode("overwrite").parquet("hdfs:///user/ghra2016/cleaned_data")