Reference https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/tweet-object
- Quais são as 10 hashtags que foram mais publicadas dentro de uma hora qualquer durante o período da coleta? Isto é, para cada hora, do minuto 00 até o minuto 59, conte quantas vezes cada hashtag aparece. Ao final, identifique quais são as 10 hashtags diferentes, o dia/hora em que cada uma foi mais frequente e quantas vezes ela apareceu naquela hora.

In [32]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

from pyspark.ml.feature import VectorAssembler

import matplotlib.pyplot as plt

In [33]:
# Create the Spark session used by every later cell, or reuse one that is
# already running under the same builder settings.
spark = (
    SparkSession.builder
    .appName('pratical_work')
    .getOrCreate()
)

In [34]:
# Load the cleaned tweet dataset from its Parquet location on HDFS.
# NOTE(review): the path is hard-coded to one user's HDFS home directory;
# consider lifting it into a configuration constant.
data = spark.read.parquet("hdfs:///user/ghra2016/cleaned_data")

In [35]:
# Print the column tree of the loaded tweets to check field names and types.
data.printSchema()

root
 |-- id_str: string (nullable = true)
 |-- text: string (nullable = true)
 |-- hashtags: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- indices: array (nullable = true)
 |    |    |    |-- element: long (containsNull = true)
 |    |    |-- text: string (nullable = true)
 |-- created_at: timestamp (nullable = true)
 |-- x: float (nullable = true)
 |-- y: float (nullable = true)



In [36]:
# Peek at a single row to sanity-check the loaded values.
data.show(n=1)

+------------------+--------------------+--------+-------------------+---+---+
|            id_str|                text|hashtags|         created_at|  x|  y|
+------------------+--------------------+--------+-------------------+---+---+
|721527751338024960|To morrendo de so...|      []|2016-04-17 02:36:35|NaN|NaN|
+------------------+--------------------+--------+-------------------+---+---+
only showing top 1 row



In [37]:
# Keep only the hashtag and coordinate columns, then discard every row holding
# a null or NaN in any of them (tweets without a location show x/y as NaN).
w_coordinates = data.select("hashtags", "x", "y").dropna()

In [38]:
# Preview one of the rows that survived the null/NaN filter.
w_coordinates.show(n=1)

+--------------------+--------+----------+
|            hashtags|       x|         y|
+--------------------+--------+----------+
|[[[24, 33], elekf...|-49.2776|-25.384666|
+--------------------+--------+----------+
only showing top 1 row



In [15]:
# The clustering step only needs the two coordinate columns.
clustering_input = w_coordinates.select("x", "y")

In [16]:
# Pack x and y into one vector column named 'features', the column the KMeans
# estimator below is fitted on.
assembler = VectorAssembler(inputCols=["x", "y"], outputCol='features')

# Transform a fresh x/y projection instead of reassigning `clustering_input`
# from itself: VectorAssembler rejects an input that already contains its
# output column, so a self-referential assignment fails as soon as this cell
# is executed a second time. Projecting here keeps the cell idempotent.
clustering_input = assembler.transform(w_coordinates.select("x", "y"))

In [17]:
# Seed handed to KMeans.setSeed when the final model is built.
SEED = 1

# Disabled model-selection sweep, kept for reference. For every k in
# range(3, 30) it fits k-means, then records the cost returned by
# model.computeCost and the score returned by ClusteringEvaluator so the two
# curves can be plotted against k.
# NOTE(review): `computeCost` appears to be deprecated in newer Spark
# releases -- confirm it still exists before re-enabling this block.
# k_list = range(3, 30)
# distances = []
# sillhouettes = []
# for k in k_list:
#     kmeans = KMeans().setK(k).setSeed(SEED)
#     model = kmeans.fit(clustering_input)
#     predictions = model.transform(clustering_input)

#     total_dist = model.computeCost(clustering_input)
#     evaluator = ClusteringEvaluator()

#     distances.append(total_dist)
#     sillhouettes.append(evaluator.evaluate(predictions))

In [18]:
# fig, (ax1, ax2) = plt.subplots(1, 2)

# ax1.plot(k_list, distances)
# ax1.set_title("Distance Sums with Different Ks")

# ax2.plot(k_list, sillhouettes)
# ax2.set_title("Silhouette with Different Ks")

# plt.show()

In [19]:
# Build a 20-cluster k-means estimator with a fixed seed, configured through
# constructor keywords.
kmeans = KMeans(k=20, seed=SEED)

# Fit on the assembled coordinates, then label those same rows with the
# cluster each one was assigned to (added as the `prediction` column).
model = kmeans.fit(clustering_input)
predictions = model.transform(clustering_input)

# Cluster centres, one entry per cluster.
centroids = model.clusterCenters()

In [31]:
# Add an "index" column holding a generated id for every row.
# NOTE(review): this cell's execution counter (31) is lower than that of the
# cell that builds `w_coordinates` (37), and the preview printed after that
# cell shows no "index" column, so the saved state does not reflect a
# top-to-bottom run. Re-check with Restart Kernel -> Run All.
# NOTE(review): `predictions` comes from `clustering_input`, which selects only
# x and y, so it carries no "index". Spark documents
# monotonically_increasing_id() as depending on partitioning; confirm that any
# later join on "index" lines rows up as intended.
w_coordinates = w_coordinates.withColumn("index", F.monotonically_increasing_id())

In [20]:
# Inspect the first two labelled rows (coordinates, feature vector, cluster id).
predictions.show(n=2)

+--------+----------+--------------------+----------+
|       x|         y|            features|prediction|
+--------+----------+--------------------+----------+
|-49.2776|-25.384666|[-49.277599334716...|         0|
|-43.9622|  -19.9278|[-43.962200164794...|         5|
+--------+----------+--------------------+----------+
only showing top 2 rows

