## Libraries

In [1]:
import modules_spark as ModulesSpark
from pyspark.sql import functions as f

## Spark Session

In [2]:
# Build the Spark session through the project's modules_spark helper; the
# resulting `spark` object is what the read_extracts calls receive.
spark = ModulesSpark.create_spark_session()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/20 20:34:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Data lake: read user and tweet tables

In [3]:
# Load the user extract from the silver area of the data lake and print
# its column names and types.
users = ModulesSpark.read_extracts(
    spark,
    "/home/mbrugnar/datalake/silver/aluraonline/user",
)
users.printSchema()

[Stage 0:>                                                          (0 + 6) / 6]                                                                                

root
 |-- created_at: string (nullable = true)
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- username: string (nullable = true)
 |-- process_date: date (nullable = true)



In [4]:
# Load the tweet extract from the silver area of the data lake and print
# its column names and types.
tweets = ModulesSpark.read_extracts(
    spark,
    "/home/mbrugnar/datalake/silver/aluraonline/tweet",
)
tweets.printSchema()

root
 |-- author_id: string (nullable = true)
 |-- conversation_id: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- id: string (nullable = true)
 |-- in_reply_to_user_id: string (nullable = true)
 |-- like_count: long (nullable = true)
 |-- quote_count: long (nullable = true)
 |-- reply_count: long (nullable = true)
 |-- retweet_count: long (nullable = true)
 |-- text: string (nullable = true)
 |-- process_date: date (nullable = true)



## Getting insights

#### Finding the id of Alura's Twitter account and getting the tweets posted by that account

In [5]:
# Look up the account whose username is "AluraOnline". The user table can
# hold the same account more than once, so identical rows are collapsed.
alura_id = (
    users
    .where(f.col("username") == "AluraOnline")
    .select("id", "name", "username")
    .distinct()
)
alura_id.toPandas()

Unnamed: 0,id,name,username
0,1566580880,Alura,AluraOnline


In [6]:
# Keep only the tweets authored by account id 1566580880 (the id returned
# by the AluraOnline lookup).
alura_tweets = tweets.filter(
    f.col("author_id") == "1566580880"
)
alura_tweets.toPandas()

Unnamed: 0,author_id,conversation_id,created_at,id,in_reply_to_user_id,like_count,quote_count,reply_count,retweet_count,text,process_date
0,1566580880,1526576323572924418,2022-05-17T14:52:12.000Z,1526576323572924418,,17,2,1,4,Tem mais uma websérie nova no canal!😍\n\nDessa...,2022-05-17
1,1566580880,1526569088373366784,2022-05-17T14:23:27.000Z,1526569088373366784,,7,0,1,1,E vamos de https://t.co/ouNHpg5vRV! 💥🚀\n\nHoje...,2022-05-17
2,1566580880,1526261285247229957,2022-05-16T18:00:21.000Z,1526261285247229957,,8,0,0,3,O Aluraverso tá ON e a todo vapor na Twitch! 🚀...,2022-05-16
3,1566580880,1526223808650170368,2022-05-16T15:31:26.000Z,1526223808650170368,,9,0,1,2,"Segunda é dia de Alura+! 🤩\n\nNesta semana, vo...",2022-05-16
4,1566580880,1527362671162470421,2022-05-19T18:56:52.000Z,1527362671162470421,,7,0,1,1,Artigo novo sobre React no ar! 😍\n\nO que são ...,2022-05-19
5,1566580880,1527305392472170496,2022-05-19T15:09:16.000Z,1527305392472170496,,9,0,1,2,Segundo ep da nossa websérie sobre memes do Ja...,2022-05-19
6,1566580880,1527000160307380228,2022-05-18T18:56:23.000Z,1527000160307380228,,9,0,1,4,Você também precisou se adaptar ao trabalho re...,2022-05-18


#### Simple Summary

In [16]:
# Daily engagement summary of Alura's own tweets.
# Rows are grouped by the calendar date parsed from `created_at`; for each day
# we report the number of tweets, the average of every engagement counter, and
# how many tweets were replies (non-null `in_reply_to_user_id`).
summary = (
    alura_tweets
    .select(
        "created_at",
        "id",
        "like_count",
        "quote_count",
        "reply_count",
        "retweet_count",
        "in_reply_to_user_id",
    )
    .groupBy(f.to_date("created_at").alias("created_date"))
    .agg(
        f.count("id").alias("total_tweets"),
        f.avg("like_count").alias("avg_likes"),
        f.avg("quote_count").alias("avg_quote"),
        f.avg("reply_count").alias("avg_reply"),
        f.avg("retweet_count").alias("avg_retweet"),
        # 1 for a reply, 0 otherwise, so the sum is the reply count.
        f.sum(
            f.when(f.col("in_reply_to_user_id").isNotNull(), 1).otherwise(0)
        ).alias("total_answer"),
    )
)
summary.toPandas()

Unnamed: 0,created_date,total_tweets,avg_likes,avg_quote,avg_reply,avg_retweet,total_answer
0,2022-05-17,2,12.0,1.0,1.0,2.5,0
1,2022-05-16,2,8.5,0.0,0.5,2.5,0
2,2022-05-19,2,8.0,0.0,1.0,1.5,0
3,2022-05-18,1,9.0,0.0,1.0,4.0,0


In [None]:
# Persist the daily summary as JSON in the gold area. coalesce(1) collapses
# the data to a single partition before writing, and "overwrite" replaces
# whatever a previous run left at this path.
(
    summary
    .coalesce(1)
    .write
    .mode("overwrite")
    .json("/home/mbrugnar/datalake/gold/summary")
)

#### Alura's interaction with other accounts

In [8]:
# (author_id, conversation_id) pairs for every tweet written by account
# 1566580880 -- one row per tweet, so a conversation can appear repeatedly.
alura_author = (
    tweets
    .select("author_id", "conversation_id")
    .where("author_id == '1566580880'")
)
alura_author.toPandas()

Unnamed: 0,author_id,conversation_id
0,1566580880,1526576323572924418
1,1566580880,1526569088373366784
2,1566580880,1526261285247229957
3,1566580880,1526223808650170368
4,1566580880,1527362671162470421
5,1566580880,1527305392472170496
6,1566580880,1527000160307380228


In [15]:
# Per-day activity across all accounts, flagging the tweets tied to Alura.
#
# Every tweet is left-joined to the conversations Alura has tweeted in; the
# join only matches tweets written by a different author. The result is then
# aggregated by the calendar date parsed from `created_at`:
#   n_tweets           - distinct tweet ids
#   n_conversation     - distinct conversation ids
#   alura_conversation - tweets by other authors inside a conversation Alura
#                        has tweeted in
#   alura_reply        - tweets whose in_reply_to_user_id is 1566580880
#   weekday            - abbreviated day name of created_date (e.g. "Thu")
#
# `alura_author` holds one row per Alura tweet. Without de-duplication, a
# conversation containing several Alura tweets would match every other tweet
# in it more than once, inflating both sums (the countDistinct metrics are
# immune, the sums are not). dropDuplicates keeps one row per conversation.
others_authors = (
    tweets.alias("tweet")
    .join(
        alura_author
        .dropDuplicates(["author_id", "conversation_id"])
        .alias("alura"),
        [
            f.col("alura.author_id") != f.col("tweet.author_id"),
            f.col("alura.conversation_id") == f.col("tweet.conversation_id"),
        ],
        "left",
    )
    .withColumn(
        "alura_conversation",
        # A non-null right side means the left join found a match.
        f.when(f.col("alura.conversation_id").isNotNull(), 1).otherwise(0),
    )
    .withColumn(
        "alura_reply",
        f.when(f.col("tweet.in_reply_to_user_id") == "1566580880", 1).otherwise(0),
    )
    .groupBy(
        f.to_date("created_at").alias("created_date")
    )
    .agg(
        f.countDistinct("tweet.id").alias("n_tweets"),
        f.countDistinct("tweet.conversation_id").alias("n_conversation"),
        f.sum("alura_conversation").alias("alura_conversation"),
        f.sum("alura_reply").alias("alura_reply"),
    )
    .withColumn(
        "weekday", f.date_format("created_date", "E")
    )
)
others_authors.toPandas()

Unnamed: 0,created_date,n_tweets,n_conversation,alura_conversation,alura_reply,weekday
0,2022-05-19,20,16,1,1,Thu
1,2022-05-18,13,10,0,1,Wed
2,2022-05-14,12,5,0,0,Sat
3,2022-05-16,20,17,0,2,Mon
4,2022-05-15,6,6,0,0,Sun
5,2022-05-17,33,27,1,4,Tue


In [14]:
# Append a grand-total row to the per-day table for display.
# union() matches columns by position, so the totals row follows the same
# order as `others_authors`: a "Total" label in the created_date slot, the sum
# of each metric, and "Week" in the weekday slot.
# Note: n_conversation is summed over per-day distinct counts, so a
# conversation active on several days is counted once for each of those days.
(
    others_authors
    .union(
        others_authors.select(
            f.lit("Total"),
            *[
                f.sum(metric)
                for metric in (
                    "n_tweets",
                    "n_conversation",
                    "alura_conversation",
                    "alura_reply",
                )
            ],
            f.lit("Week"),
        )
    )
    .toPandas()
)

Unnamed: 0,created_date,n_tweets,n_conversation,alura_conversation,alura_reply,weekday
0,2022-05-19,20,16,1,1,Thu
1,2022-05-18,13,10,0,1,Wed
2,2022-05-14,12,5,0,0,Sat
3,2022-05-16,20,17,0,2,Mon
4,2022-05-15,6,6,0,0,Sun
5,2022-05-17,33,27,1,4,Tue
6,Total,104,81,2,8,Week


In [None]:
# Persist the per-day insight table as JSON in the gold area. coalesce(1)
# collapses the data to a single partition before writing, and "overwrite"
# replaces whatever a previous run left at this path.
(
    others_authors
    .coalesce(1)
    .write
    .mode("overwrite")
    .json("/home/mbrugnar/datalake/gold/twitter_insight_tweet")
)