In [0]:
%sql
-- Make pinterest_data_9105411ea84a the current database so that unqualified
-- table names in subsequent cells are resolved against it.
USE pinterest_data_9105411ea84a;


In [0]:
# Read the geo, pin and user tables by name into their DataFrames in one step.
df_geo, df_pin, df_user = (
    spark.table(table_name) for table_name in ("df_geo", "df_pin", "df_user")
)


In [0]:
%python
from pyspark.sql.functions import count, col
from pyspark.sql.window import Window
from pyspark.sql.functions import rank

# Pair geolocation rows with pin rows through their shared 'ind' key
df_joined = df_geo.join(df_pin, "ind")

# Number of joined rows for every (country, category) combination
df_category_count = (
    df_joined
    .groupBy("country", "category")
    .agg(count("*").alias("category_count"))
)

# Within each country, order categories from most to least frequent
window = Window.partitionBy("country").orderBy(col("category_count").desc())

# Keep the top-ranked category per country. rank() assigns the same rank to
# equal counts, so a country whose leading categories are tied keeps all of them.
df_most_popular = (
    df_category_count
    .withColumn("rank", rank().over(window))
    .where(col("rank") == 1)
    .drop("rank")
)

# Fix the column order of the final DataFrame
df_final = df_most_popular.select("country", "category", "category_count")

display(df_final)

country,category,category_count
Algeria,beauty,1
American Samoa,quotes,1
Anguilla,tattoos,1
Antarctica (the territory South of 60 deg S),art,1
Argentina,finance,1
Argentina,tattoos,1
Austria,home-decor,1
Bahamas,diy-and-crafts,1
Bahrain,home-decor,1
Belarus,christmas,2


In [0]:
%python
from pyspark.sql.functions import col, year, count

# Combine pin and geolocation rows that share the same 'ind' key
df_joined = df_pin.join(df_geo, 'ind', 'inner')

# Cast 'timestamp' to a timestamp type so that year() can be applied to it
df_joined = df_joined.withColumn("timestamp", col("timestamp").cast("timestamp"))

# Keep only posts made from 2018 through 2022 (both years inclusive)
df_filtered = df_joined.where(year("timestamp").between(2018, 2022))

# Expose the posting year as its own column
df_with_year = df_filtered.withColumn("post_year", year("timestamp"))

# Number of posts for every (post_year, category) combination
df_category_count = (
    df_with_year
    .groupBy("post_year", "category")
    .agg(count("*").alias("category_count"))
)

# Sort by year, then category, purely for readability of the displayed table
df_result = df_category_count.sort("post_year", "category")

display(df_result)

post_year,category,category_count
2018,christmas,2
2018,diy-and-crafts,1
2018,tattoos,1
2019,christmas,1
2019,home-decor,1
2020,beauty,1
2020,finance,1
2021,art,1
2021,home-decor,1
2021,mens-fashion,1


In [0]:
%python
from pyspark.sql.functions import col, row_number
from pyspark.sql.window import Window

# Attach a country to every pin by joining pins to geolocation rows on 'ind'
df_joined = df_pin.join(df_geo, 'ind', 'inner')

# Order the rows of each country from highest to lowest follower_count.
# poster_name is a secondary sort key: when several rows share the top
# follower_count, row_number() would otherwise pick an arbitrary one of them,
# and the displayed poster could change from run to run.
windowSpec = Window.partitionBy("country").orderBy(
    col("follower_count").desc(),
    col("poster_name").asc(),
)

# Number the rows within each country partition according to that ordering
df_ranked = df_joined.withColumn("row_number", row_number().over(windowSpec))

# Keep only the first row per country, i.e. the poster with the most followers
df_top_user_per_country = (
    df_ranked
    .filter(col("row_number") == 1)
    .select("country", "poster_name", "follower_count")
)

display(df_top_user_per_country)

country,poster_name,follower_count
Algeria,Wellness Mama,209000.0
American Samoa,Epic Reads,77000.0
Anguilla,Dicas de Mulher,2000000.0
Antarctica (the territory South of 60 deg S),Messy Ever After,17000.0
Argentina,Next Luxury,800000.0
Austria,My 100 Year Old home,33000.0
Bahamas,The Kitchen Table Classroom,221000.0
Bahrain,Decor Home Ideas,144000.0
Belarus,"Prudent Penny Pincher - Home Decor, Organization, Crafts, Recipes",647000.0
Bhutan,alohaboho | Pampas Grass Decor | Earthy Urban Decor,3000.0


In [0]:
%python
from pyspark.sql.functions import when, col, count, rank
from pyspark.sql import Window

# Pair each pin with the user record that shares its 'ind' key
df_joined = df_pin.join(df_user, 'ind', 'inner')

# Create the age_group column.
# Every bucket has an explicit condition. A catch-all otherwise("50+") would
# also label rows with a missing age, or an age below 18, as "50+". Without an
# otherwise() those rows get a null age_group and are excluded so they cannot
# distort the "50+" counts.
df_with_age_group = df_joined.withColumn(
    "age_group",
    when(col("age").between(18, 24), "18-24")
    .when(col("age").between(25, 35), "25-35")
    .when(col("age").between(36, 50), "36-50")
    .when(col("age") > 50, "50+")
).filter(col("age_group").isNotNull())

# Group by age_group and category and count the occurrences
df_category_count = df_with_age_group.groupBy("age_group", "category").agg(count("*").alias("category_count"))

# Define a window spec partitioned by age_group and ordered by category_count descending
windowSpec = Window.partitionBy("age_group").orderBy(col("category_count").desc())

# Rank each category within its age group. rank() gives equal counts the same
# rank, so tied top categories are all kept by the filter below.
df_ranked = df_category_count.withColumn("rank", rank().over(windowSpec))

# Filter for the top-ranked category (or categories, on ties) within each age group
df_top_category_per_age_group = df_ranked.filter(col("rank") == 1).select("age_group", "category", "category_count")

display(df_top_category_per_age_group)

age_group,category,category_count
18-24,tattoos,1
25-35,finance,1
25-35,diy-and-crafts,1
25-35,art,1
25-35,education,1
25-35,travel,1
36-50,home-decor,1
50+,art,1
50+,christmas,1


In [0]:
%python
from pyspark.sql.functions import col, when, percentile_approx

# Pair each pin with the user record that shares its 'ind' key
df_joined = df_pin.join(df_user, 'ind', 'inner')

# Age groups.
# Every bucket has an explicit condition. A catch-all otherwise("50+") would
# also label rows with a missing age, or an age below 18, as "50+" and skew
# that group's median. Such rows get a null age_group here and are excluded.
df_with_age_group = df_joined.withColumn(
    "age_group",
    when(col("age").between(18, 24), "18-24")
    .when(col("age").between(25, 35), "25-35")
    .when(col("age").between(36, 50), "36-50")
    .when(col("age") > 50, "50+")
).filter(col("age_group").isNotNull())

# Group by age_group and calculate the (approximate) median follower count,
# taken as the 0.5 percentile of follower_count
df_median_follower_count = df_with_age_group.groupBy("age_group")\
                                            .agg(percentile_approx("follower_count", 0.5).alias("median_follower_count"))

# Display the result
display(df_median_follower_count)


age_group,median_follower_count
50+,7000.0
18-24,190000.0
25-35,22000.0
36-50,129000.0


In [0]:
%python
from pyspark.sql.types import DateType
from pyspark.sql.functions import col, year, count

# Cast date_joined to a date type. Note that this rebinds df_user itself, so
# cells that run afterwards see date_joined as a date column.
df_user = df_user.withColumn("date_joined", col("date_joined").cast(DateType()))

# Derive the joining year (stored in a column named post_year)
df_with_year = df_user.withColumn("post_year", year(col("date_joined")))

# Keep only users who joined from 2015 through 2017 (both years inclusive)
df_filtered = df_with_year.where(col("post_year").between(2015, 2017))

# Count how many users joined in each of those years
df_number_users_joined = (
    df_filtered
    .groupBy("post_year")
    .agg(count("*").alias("number_users_joined"))
)

display(df_number_users_joined)

post_year,number_users_joined
2015,139
2016,149
2017,56


In [0]:
%python
from pyspark.sql.functions import col, year, expr

# Restrict users to those whose joining year lies in 2015-2017 (inclusive);
# the joining year is stored in a column named post_year
df_filtered_users = (
    df_user
    .withColumn("date_joined", col("date_joined").cast("timestamp"))
    .withColumn("post_year", year("date_joined"))
    .where(col("post_year").between(2015, 2017))
)

# Bring in the pin rows that share each user's 'ind' key to get follower_count
df_joined = df_filtered_users.join(df_pin, 'ind', 'inner')

# Approximate median (0.5 percentile) of follower_count for each joining year
df_median_follower_count = (
    df_joined
    .groupBy("post_year")
    .agg(expr("percentile_approx(follower_count, 0.5)").alias("median_follower_count"))
)

display(df_median_follower_count)

post_year,median_follower_count
2015,190000.0
2016,28000.0
2017,7000.0


In [0]:
# Import what this cell uses so that it does not depend on names that happen
# to have been imported by earlier cells.
from pyspark.sql.functions import col, expr, when, year

# Filter users who joined between 2015 and 2017 (both years inclusive);
# the joining year is stored in a column named post_year
df_users_filtered = df_user.withColumn("date_joined", col("date_joined").cast("timestamp")) \
                           .withColumn("post_year", year(col("date_joined"))) \
                           .filter((col("post_year") >= 2015) & (col("post_year") <= 2017))

# Create the age_group column.
# Every bucket has an explicit condition. A catch-all otherwise("50+") would
# also label rows with a missing age, or an age below 18, as "50+" and skew
# that group's median. Such rows get a null age_group here and are excluded.
df_users_age_grouped = df_users_filtered.withColumn(
    "age_group",
    when(col("age").between(18, 24), "18-24")
    .when(col("age").between(25, 35), "25-35")
    .when(col("age").between(36, 50), "36-50")
    .when(col("age") > 50, "50+")
).filter(col("age_group").isNotNull())

# Join df_users_age_grouped with df_pin on 'ind' to get follower counts
df_joined = df_users_age_grouped.join(df_pin, 'ind', 'inner')

# Group by age_group and post_year, and calculate the (approximate) median
# follower count as the 0.5 percentile of follower_count
df_median_followers = df_joined.groupBy("age_group", "post_year") \
                               .agg(expr("percentile_approx(follower_count, 0.5)").alias("median_follower_count"))

display(df_median_followers)

age_group,post_year,median_follower_count
50+,2017,7000.0
18-24,2015,190000.0
25-35,2016,22000.0
36-50,2016,129000.0
50+,2016,28000.0
25-35,2015,17000.0
