In [0]:
import pyspark
import multiprocessing

In [0]:
%run /Users/joelcosta94i@gmail.com/data_cleaning

In [0]:
# Fetch the data from the S3 bucket and clean it, then unpack the result into
# the pin, geo and user DataFrames (in that order).
# load_cleaned_data is not defined in this notebook; presumably it is provided
# by the data_cleaning notebook pulled in with %run above.
(df_pin, df_geo, df_user) = load_cleaned_data()

In [0]:
# Create the Spark session used to run the SQL queries below.
cfg = (
    pyspark.SparkConf()
    # Run Spark locally with as many worker threads as there are CPU cores.
    .setMaster(f"local[{multiprocessing.cpu_count()}]")
    # Application name.
    .setAppName("TestApp")
    # Disable event logging. SparkConf.set stores its value as a string, so pass
    # the canonical lowercase string instead of a Python bool (which would be
    # stringified to "False").
    .set("spark.eventLog.enabled", "false")
    # Environment variables handed to the executors.
    .setExecutorEnv(pairs=[("VAR3", "value3"), ("VAR4", "value4")])
    # Executor memory; only applied when spark.executor.memory is not already set.
    .setIfMissing("spark.executor.memory", "1g")
)

# getOrCreate returns an already-running session when there is one, otherwise
# it builds a new session from cfg.
session = pyspark.sql.SparkSession.builder.config(conf=cfg).getOrCreate()

In [0]:
# Register every cleaned DataFrame as a temp view, under the table name that
# the SQL queries in this notebook refer to.
for view_name, frame in (
    ("pin_table", df_pin),
    ("geo_table", df_geo),
    ("user_table", df_user),
):
    frame.createOrReplaceTempView(view_name)

In [0]:
# The most popular category in each country (ties allowed. to disallow ties, use ROW_NUMBER instead of RANK)
#
# How the query works:
#   1. The CTE inner-joins pin_table to geo_table on `ind`, counts the joined rows
#      per (country, category) and ranks the categories within each country by
#      that count, highest first.
#   2. The outer SELECT keeps only the rank-1 rows, so a country appears more than
#      once when several categories tie for first place.
# The ranking orders by COUNT(pin_table.ind) while the reported column is COUNT(*);
# the two are equal here because the inner-join condition already excludes rows
# whose pin_table.ind is NULL.
query_popular_category_in_each_country = """
    WITH category_count_per_country AS
    (
        SELECT
            country,
            category,
            COUNT(*) AS category_count,
            RANK() OVER (
                PARTITION BY country ORDER BY COUNT(pin_table.ind) DESC
            ) AS rank
        FROM
            pin_table
        JOIN
            geo_table ON pin_table.ind = geo_table.ind
        GROUP BY
            country, category
    )
    SELECT
        country,
        category,
        category_count
    FROM
        category_count_per_country
    WHERE
        rank = 1
"""

# Build the result DataFrame; this cell does not display it.
popular_category_in_each_country_df = session.sql(query_popular_category_in_each_country)

In [0]:
# Most popular category each year (ties allowed. to disallow ties, use ROW_NUMBER instead of RANK)
#
# How the query works:
#   1. The CTE inner-joins pin_table to geo_table on `ind`, counts the joined rows
#      per (YEAR(timestamp), category) and ranks the categories within each year
#      by that count, highest first.
#   2. The outer SELECT keeps only the rank-1 rows, so a year appears more than
#      once when several categories tie for first place.
# NOTE(review): `timestamp` is not table-qualified; presumably it comes from
# geo_table (nothing else in the query needs that join) — confirm.
query_popular_category_each_year = """
    WITH category_count_per_year AS (
    SELECT
        YEAR(timestamp) AS post_year,
        category,
        COUNT(*) AS category_count,
        RANK() OVER (
            PARTITION BY
                YEAR(timestamp)
            ORDER BY
                COUNT(pin_table.ind) DESC
        ) AS rank
    FROM
        pin_table
    JOIN
        geo_table ON pin_table.ind = geo_table.ind
    GROUP BY
        YEAR(timestamp), category
    )
    SELECT
        post_year,
        category,
        category_count
    FROM
        category_count_per_year
    WHERE
        rank = 1   
"""

# Build the result DataFrame; this cell does not display it.
popular_category_each_year_df = session.sql(query_popular_category_each_year)

In [0]:
# Most followers in each country (ties allowed. to disallow ties, use ROW_NUMBER instead of RANK)
#
# How the query works:
#   1. The CTE inner-joins pin_table to geo_table on `ind`. The GROUP BY has no
#      aggregate, so it only collapses duplicate
#      (poster_name, country, follower_count) combinations.
#   2. Within each country the remaining rows are ranked by follower_count,
#      highest first.
#   3. The outer SELECT keeps the rank-1 rows, i.e. the poster(s) with the highest
#      follower_count in each country.
query_most_followers_per_country = """
    WITH ranked_poster_country_followers_table AS (
    SELECT
        country,
        poster_name,
        follower_count,
        RANK() OVER (
            PARTITION BY
                country
            ORDER BY
                follower_count DESC
        ) AS rank
    FROM
        pin_table
    JOIN
        geo_table ON pin_table.ind = geo_table.ind
    GROUP BY
        poster_name, country, follower_count
    )
    SELECT
        country,
        poster_name,
        follower_count
    FROM
        ranked_poster_country_followers_table
    WHERE
        rank = 1
"""

# Build the result DataFrame and print it to the cell output.
most_followers_per_country_df = session.sql(query_most_followers_per_country)
most_followers_per_country_df.show()

# Country with most followers.
# The per-country result computed above is registered as a temp view so it can
# be queried again; the query then sorts it by follower_count (highest first)
# and keeps a single row.
# Note: LIMIT 1 returns exactly one row, even when several countries share the
# highest follower_count.
most_followers_per_country_df.createOrReplaceTempView("most_followers_per_country_table")
query_country_with_most_followers = """
    SELECT
        country,
        follower_count
    FROM
        most_followers_per_country_table
    ORDER BY
        follower_count DESC
    LIMIT 1
"""

# Build the result DataFrame; it is not displayed here.
country_with_most_followers_df = session.sql(query_country_with_most_followers)

In [0]:
# Most popular category for different age groups (ties allowed. to disallow ties, use ROW_NUMBER instead of RANK)
#
# How the query works:
#   1. age_group_category_table inner-joins pin_table to user_table on `ind` and
#      labels every joined row with an age group derived from `age`.
#   2. ranked_age_group_category_table counts rows per (age_group, category) and
#      ranks the categories within each age group by that count, highest first.
#   3. The outer SELECT keeps only the rank-1 rows, so an age group appears more
#      than once when several categories tie for first place.
# NOTE(review): the ELSE branch labels every age outside 18-50 as '+50', which
# includes ages below 18 and NULL ages — confirm the cleaned data cannot contain those.
query_popular_category_per_age_group = """
    WITH age_group_category_table AS
    (
        SELECT
            category,
            CASE
                WHEN age BETWEEN 18 AND 24 THEN '18-24'
                WHEN age BETWEEN 25 AND 35 THEN '25-35'
                WHEN age BETWEEN 36 AND 50 THEN '36-50'
                ELSE '+50'
            END AS age_group
        FROM
            pin_table
        JOIN
            user_table ON pin_table.ind = user_table.ind
    ),
    ranked_age_group_category_table AS
    (
        SELECT
            age_group,
            category,
            COUNT(*) AS category_count,
            RANK() OVER (
                PARTITION BY
                    age_group
                ORDER BY
                    COUNT(*) DESC
            ) AS rank
        FROM
            age_group_category_table
        GROUP BY
            age_group, category
    )
    SELECT
        age_group,
        category,
        category_count
    FROM
        ranked_age_group_category_table
    WHERE
        rank = 1
"""

# Build the result DataFrame; this cell does not display it.
popular_category_per_age_group_df = session.sql(query_popular_category_per_age_group)

In [0]:
# Median follower count for different age groups
#
# How the query works:
#   1. The CTE inner-joins pin_table to user_table on `ind` and labels every
#      joined row with an age group derived from `age`.
#   2. The outer SELECT computes the median (PERCENTILE_CONT at 0.5) of
#      follower_count within each age group.
#   3. The ORDER BY maps each label to a sort key (10, 20, 30, 100) so the groups
#      come out as '18-24', '25-35', '36-50', then '+50'.
# NOTE(review): the ELSE branch labels every age outside 18-50 as '+50', which
# includes ages below 18 and NULL ages — confirm the cleaned data cannot contain those.
query_median_follower_count_per_age_group = """
    WITH age_group_follower_count_table AS
    (
        SELECT
            follower_count,
            CASE
                WHEN age BETWEEN 18 AND 24 THEN '18-24'
                WHEN age BETWEEN 25 AND 35 THEN '25-35'
                WHEN age BETWEEN 36 AND 50 THEN '36-50'
                ELSE '+50'
            END AS age_group
        FROM
            pin_table
        JOIN
            user_table ON pin_table.ind = user_table.ind
    )
    SELECT
        age_group,
        PERCENTILE_CONT(0.5) WITHIN GROUP(ORDER BY follower_count) AS median_follower_count
    FROM
        age_group_follower_count_table
    GROUP BY
        age_group
    ORDER BY
        CASE
            WHEN age_group = '18-24' THEN 10
            WHEN age_group = '25-35' THEN 20
            WHEN age_group = '36-50' THEN 30
            ELSE 100
        END
"""

# Build the result DataFrame; this cell does not display it.
median_follower_count_per_age_group_df = session.sql(query_median_follower_count_per_age_group)

In [0]:
# Number of users joined each year between 2015-2020
#
# Counts user_table rows per YEAR(date_joined) and keeps only the years
# 2015 through 2020 (inclusive), sorted by year.
# Note: the output column is aliased `post_year` although it holds the year the
# user joined, not the year of a post.
query_users_joined_per_year = """
    SELECT
        YEAR(date_joined) AS post_year,
        COUNT(*) AS number_users_joined
    FROM
        user_table
    GROUP BY
        YEAR(date_joined)
    HAVING
        YEAR(date_joined) BETWEEN 2015 AND 2020
    ORDER BY
        post_year
"""

# Build the result DataFrame; this cell does not display it.
query_users_joined_per_year_df = session.sql(query_users_joined_per_year)

In [0]:
# Median follower count of users based on their joining year.
#
# Inner-joins user_table (date_joined) to pin_table (follower_count) on `ind`,
# computes the median (PERCENTILE_CONT at 0.5) of follower_count per joining
# year, and keeps only the years 2015 through 2020 (inclusive), sorted by year.
# The join targets pin_table: that is the view registered for the pin data in
# this notebook; no view named pin_data is created here.
query_median_follower_count_per_joining_year = """
    SELECT
        YEAR(date_joined) AS year_joined,
        PERCENTILE_CONT(0.5) WITHIN GROUP(ORDER BY follower_count) AS median_follower_count
    FROM
        user_table
    JOIN
        pin_table ON user_table.ind = pin_table.ind
    GROUP BY
        YEAR(date_joined)
    HAVING
        YEAR(date_joined) BETWEEN 2015 and 2020
    ORDER BY
        year_joined
"""

# Build the result DataFrame; this cell does not display it.
median_follower_count_per_joining_year_df = session.sql(query_median_follower_count_per_joining_year)

In [0]:
# Median follower count of users based on joining year and age group
#
# How the query works:
#   1. The CTE inner-joins pin_table to user_table on `ind` and emits, per joined
#      row, the age group derived from `age`, the year the user joined
#      (YEAR(date_joined)) and the follower_count.
#   2. The outer SELECT computes the median (PERCENTILE_CONT at 0.5) of
#      follower_count per (age_group, year).
#   3. The ORDER BY maps each age-group label to a sort key (10, 20, 30, 100) so
#      the groups come out as '18-24', '25-35', '36-50', then '+50', each sorted
#      by year.
# The year is taken from date_joined so the result matches the "joining year"
# this query is named for. The output column keeps the alias `post_year` so the
# result schema is unchanged for anything that reads it.
# NOTE(review): no join to geo_table is made because no geo column is needed;
# this assumes geo_table has exactly one row per `ind` — if it does not, the
# set of rows counted differs from a query that joins it. Confirm.
# NOTE(review): the ELSE branch labels every age outside 18-50 as '+50', which
# includes ages below 18 and NULL ages — confirm the cleaned data cannot contain those.
query_median_follower_count_per_age_group_and_joining_year = """
    WITH follower_count_age_group_post_year_table AS
    (
        SELECT
            CASE
                WHEN age BETWEEN 18 AND 24 THEN '18-24'
                WHEN age BETWEEN 25 AND 35 THEN '25-35'
                WHEN age BETWEEN 36 AND 50 THEN '36-50'
                ELSE '+50' 
            END AS age_group,
            YEAR(date_joined) AS post_year,
            follower_count
        FROM
            pin_table
        JOIN
            user_table ON pin_table.ind = user_table.ind
    )
    SELECT
        age_group,
        post_year,
        PERCENTILE_CONT(0.5) WITHIN GROUP(ORDER BY follower_count) AS median_follower_count
    FROM
        follower_count_age_group_post_year_table
    GROUP BY
        age_group, post_year
    ORDER BY
        CASE
            WHEN age_group = '18-24' THEN 10
            WHEN age_group = '25-35' THEN 20
            WHEN age_group = '36-50' THEN 30
            ELSE 100
        END,
        post_year
"""

# Build the result DataFrame; this cell does not display it.
median_follower_count_per_age_group_and_joining_year_df = session.sql(query_median_follower_count_per_age_group_and_joining_year)