In [None]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [None]:
# Start (or reuse) the Spark session that every later cell reads data through.
session_builder = SparkSession.builder.appName("Social Media")
spark = session_builder.getOrCreate()

In [None]:
# Load the three CSV datasets. The first row of each file supplies the column
# names, and Spark infers the column types from the data.
users_df = spark.read.csv(
    "../data/users.csv",
    header=True,
    inferSchema=True,
)
posts_df = spark.read.csv(
    "../data/posts.csv",
    header=True,
    inferSchema=True,
)
comments_df = spark.read.csv(
    "../data/comments.csv",
    header=True,
    inferSchema=True,
)

In [None]:
# Preview the first five rows of each dataset, in load order.
for preview_df in (users_df, posts_df, comments_df):
    preview_df.show(5)

### Analysis
- Find the total number of users from each country.
- Find the top 5 users with the highest number of posts.

In [None]:
# Total number of users from each country
# (counts the non-null `user_id` values within each `country` group).
users_per_country_df = users_df.groupBy("country").agg(
    count("user_id").alias("Total Users")
)
users_per_country_df.show()

In [None]:
# Top 5 users with the highest number of posts
# Step 1: count posts per user; step 2: rank descending and keep five rows.
posts_per_user_df = posts_df.groupBy("user_id").agg(
    count("post_id").alias("Posts Number")
)
posts_per_user_df.sort(col("Posts Number").desc()).limit(5).show()

### Join and Filtering:
- Join the posts and users tables to create a new DataFrame (`posts_with_users_df`) containing information about each post along with the details of the user who wrote it.
- Filter `posts_with_users_df` to include only posts with more than 100 likes.

In [None]:
# Attach each post's author details with an inner join on `user_id`.
# Joining on the column *name* (rather than an equality expression between the
# two frames) keeps a single `user_id` column in the result, so later
# references to `user_id` on the joined frame are not ambiguous.
posts_with_users_df = posts_df.join(users_df, on="user_id", how="inner")

In [None]:
# Keep only the joined rows whose `likes` value is strictly greater than 100.
popular_posts_df = posts_with_users_df.where(col("likes") > 100)
popular_posts_df.show()

### Date Manipulation:
- Convert the post_date and comment_date columns in the posts and comments tables to DateType.
- Find the average number of comments per post for each user.

In [None]:
# Parse the month/day/year strings into DateType columns, replacing the
# original columns in place.
date_pattern = "M/d/yyyy"
posts_df = posts_df.withColumn("post_date", to_date("post_date", date_pattern))
comments_df = comments_df.withColumn(
    "comment_date", to_date("comment_date", date_pattern)
)

In [None]:
# Per-user mean of the `comments` column, rounded to two decimals.
# NOTE(review): assumes `comments` holds each post's comment count — confirm.
# `round` and `avg` are the Spark column functions from the star import above.
avg_comments_df = posts_df.groupBy("user_id").agg(
    round(avg("comments"), 2).alias("Comments per Post")
)
avg_comments_df.show()

### Advanced Analysis:
- Use Spark DataFrame operations to calculate the total number of comments for each user.
- Find the user with the highest average likes per post.

In [None]:
# Total number of comments for each user
# (counts the non-null `comment_id` values within each `user_id` group).
comments_per_user_df = comments_df.groupBy("user_id").agg(
    count("comment_id").alias("Number of Comments")
)
comments_per_user_df.show()

In [None]:
# The user with the highest average likes per post
# Step 1: mean likes per user, rounded to two decimals; step 2: keep the top row.
avg_likes_df = posts_df.groupBy("user_id").agg(
    round(avg("likes"), 2).alias("Like Average")
)
avg_likes_df.sort(col("Like Average").desc()).limit(1).show()