In [None]:
# List the FileStore upload directory, i.e. the place the AWS credentials file was uploaded to
uploads_dir = "/FileStore/tables"
dbutils.fs.ls(uploads_dir)

In [None]:
from pyspark.sql.functions import *
import urllib

In [None]:
# Reader settings for the credentials file: a comma-delimited CSV whose first row is the header
file_type = "csv"
first_row_is_header = "true"
delimiter = ","

# Load the uploaded credentials CSV into a Spark DataFrame
aws_keys_df = (
    spark.read.format(file_type)
    .option("header", first_row_is_header)
    .option("sep", delimiter)
    .load("/FileStore/tables/authentication_credentials.csv")
)

In [None]:
# `import urllib` on its own does not guarantee that the `parse` submodule is loaded,
# so import it explicitly before calling urllib.parse.quote below.
import urllib.parse

# Fetch the credentials row for the Databricks user with a single Spark job
# (previously the same filter + collect was executed once per key).
credential_rows = (
    aws_keys_df
    .where(col("User name") == "databricks-user")
    .select("Access key ID", "Secret access key")
    .collect()
)

# Fail with an explicit message instead of a bare IndexError when the user is missing
if not credential_rows:
    raise ValueError(
        "No row with 'User name' == 'databricks-user' found in authentication_credentials.csv"
    )

ACCESS_KEY = credential_rows[0]["Access key ID"]
SECRET_KEY = credential_rows[0]["Secret access key"]

# URL-encode the secret key (safe="" also escapes '/') so it can be embedded in the mount URL
ENCODED_SECRET_KEY = urllib.parse.quote(string=SECRET_KEY, safe="")

In [None]:
# Name of the S3 bucket to mount
AWS_S3_BUCKET = "user-0e4c2ab6fb3b-bucket"
# DBFS mount point for the bucket
MOUNT_NAME = "/mnt/pdp_mount"
# Source URL carrying the access key and the URL-encoded secret key.
# NOTE(review): this embeds credentials in the URL; consider Databricks secrets / instance profiles.
SOURCE_URL = "s3n://{0}:{1}@{2}".format(ACCESS_KEY, ENCODED_SECRET_KEY, AWS_S3_BUCKET)

# Mount the bucket only when the mount point is not already in use, so the cell can be
# re-run (e.g. Run All on a cluster where the mount still exists) without raising.
if MOUNT_NAME in [mount.mountPoint for mount in dbutils.fs.mounts()]:
    print("{0} is already mounted; skipping mount".format(MOUNT_NAME))
else:
    dbutils.fs.mount(SOURCE_URL, MOUNT_NAME)

In [None]:
# Check that the S3 bucket was mounted successfully by listing the mount point itself.
# (The previous path ended in "/../..", which points back at the DBFS root rather than
# at the contents of the mounted bucket.)
display(dbutils.fs.ls(MOUNT_NAME))

In [None]:
# Full path of one topic partition inside the mounted bucket; list it to inspect its files
user_topic_dir = "/mnt/pdp_mount/topics/0e4c2ab6fb3b.user/partition=0/"
display(dbutils.fs.ls(user_topic_dir))

In [None]:
# Pin topic location; the * glob matches every .json file in the partition directory
file_location = "/mnt/pdp_mount/topics/0e4c2ab6fb3b.pin/partition=0/*.json"
file_type = "json"
# Ask Spark to infer the schema
infer_schema = "true"

# Load the JSON files from the mounted S3 bucket into a DataFrame
df_pin = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .load(file_location)
)

# Render the DataFrame to inspect what was loaded
display(df_pin)

In [None]:
# Geo topic location; the * glob matches every .json file in the partition directory
file_location = "/mnt/pdp_mount/topics/0e4c2ab6fb3b.geo/partition=0/*.json"
file_type = "json"
# Ask Spark to infer the schema
infer_schema = "true"

# Load the JSON files from the mounted S3 bucket into a DataFrame
df_geo = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .load(file_location)
)

# Render the DataFrame to inspect what was loaded
display(df_geo)

In [None]:
# User topic location; the * glob matches every .json file in the partition directory
file_location = "/mnt/pdp_mount/topics/0e4c2ab6fb3b.user/partition=0/*.json"
file_type = "json"
# Ask Spark to infer the schema
infer_schema = "true"

# Load the JSON files from the mounted S3 bucket into a DataFrame
df_user = (
    spark.read.format(file_type)
    .option("inferSchema", infer_schema)
    .load(file_location)
)

# Render the DataFrame to inspect what was loaded
display(df_user)

1) Clean the DataFrame that contains information about Pinterest posts

In [None]:
# Clean the Pinterest posts DataFrame:
#   1. map "User Info Error" to 0 followers / a missing poster name,
#   2. replace empty strings and known placeholder texts with nulls in every column,
#   3. convert follower_count ("12k", "3M", "150") to an integer,
#   4. strip the "Local save in " prefix from save_location,
#   5. rename index -> ind and reorder the columns.

# Placeholder texts that mean "no data"; any value containing one of them becomes null
PLACEHOLDER_SUBSTRINGS = [
    "N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e",
    "Image src error",
    "No description",
    "Untitled",
    "No Title Data Available",
]


def _is_missing(column_name):
    """Build a boolean Column that is true when the value is "" or contains a placeholder text."""
    condition = col(column_name) == ""
    for placeholder in PLACEHOLDER_SUBSTRINGS:
        condition = condition | col(column_name).contains(placeholder)
    return condition


# "User Info Error" means the poster lookup failed: count it as 0 followers, unknown poster
df_pin = df_pin.withColumn(
    "follower_count",
    when(col("follower_count").contains("User Info Error"), 0).otherwise(col("follower_count")),
)
df_pin = df_pin.withColumn(
    "poster_name",
    when(col("poster_name").contains("User Info Error"), None).otherwise(col("poster_name")),
)

# Single pass over all columns instead of one full select per placeholder
df_pin = df_pin.select(
    [when(_is_missing(c), None).otherwise(col(c)).alias(c) for c in df_pin.columns]
)

# Expand the k / M suffix into zeros and cast to int. The patterns are anchored with `$`
# so that only the trailing suffix is rewritten, never a character elsewhere in the value.
df_pin = df_pin.withColumn(
    "follower_count",
    when(col("follower_count").endswith("k"), regexp_replace(col("follower_count"), "k$", "000"))
    .when(col("follower_count").endswith("M"), regexp_replace(col("follower_count"), "M$", "000000"))
    .otherwise(col("follower_count"))
    .cast("int"),
)

# Keep only the path in save_location. The previous when(...) had no otherwise(), so any
# value that did not start with the prefix was silently turned into null; an anchored
# replace strips the prefix where present and leaves every other value untouched.
df_pin = df_pin.withColumn(
    "save_location", regexp_replace(col("save_location"), "^Local save in ", "")
)

# Rename the index column to ind
df_pin = df_pin.withColumnRenamed("index", "ind")

# Reorder the DataFrame columns
df_pin = df_pin.select(
    "ind",
    "unique_id",
    "title",
    "description",
    "follower_count",
    "poster_name",
    "tag_list",
    "is_image_or_video",
    "image_src",
    "save_location",
    "category",
)

2) Clean the DataFrame that contains information about geolocation

In [None]:
# Combine latitude and longitude into a single array column named coordinates
geo_with_coordinates = df_geo.withColumn(
    "coordinates", array(col("latitude"), col("longitude"))
)

# Selecting the final column order also drops the original latitude / longitude columns
geo_reordered = geo_with_coordinates.select("ind", "country", "coordinates", "timestamp")

# Parse the string timestamp column into a timestamp data type
df_geo = geo_reordered.withColumn("timestamp", to_timestamp(col("timestamp")))

3) Clean the DataFrame that contains information about users

In [None]:
# Clean the users DataFrame:
#   - build user_name from first_name and last_name, separated by a space
#     (previously the two names were glued together with no separator, e.g. "JohnSmith");
#     concat keeps the original null behaviour: a null first or last name gives a null user_name
#   - convert date_joined from a string to a timestamp
#   - keep only the final columns, in order, which drops first_name and last_name
df_user = (
    df_user
    .withColumn("user_name", concat(col("first_name"), lit(" "), col("last_name")))
    .withColumn("date_joined", to_timestamp(col("date_joined")))
    .select("ind", "user_name", "age", "date_joined")
)

4) Find the most popular category in each country

In [None]:
# Register the cleaned DataFrames as temporary SQL views
df_pin.createOrReplaceTempView("PIN")
df_geo.createOrReplaceTempView("GEO")
df_user.createOrReplaceTempView("USER")

# Number of posts per (country, category)
posts_per_country_category = (
    df_pin.join(df_geo, on="ind")
    .groupBy("country", "category")
    .agg(count("*").alias("count"))
)

# max over struct(count, category) keeps the category that belongs to the highest count
top_category_per_country = posts_per_country_category.groupBy("country").agg(
    max(struct("count", "category")).alias("max_count")
)

df_most_popular_cat_by_country = top_category_per_country.select(
    "country", "max_count.category", "max_count.count"
).withColumnRenamed("count", "category_count")

df_most_popular_cat_by_country.show()

5) Find the most popular category in each year

In [None]:
# Posts whose geolocation timestamp falls in the years 2018-2022 (inclusive)
posts_2018_to_2022 = df_pin.join(df_geo, on="ind").filter(
    (year("timestamp") >= 2018) & (year("timestamp") <= 2022)
)

# Number of posts per (post_year, category)
category_counts_by_year = posts_2018_to_2022.groupBy(
    year("timestamp").alias("post_year"), "category"
).agg(count("*").alias("category_count"))

# Pick the category with the highest count in each year.
# The previous version sorted and then used groupBy(...).agg(first): the shuffle performed by
# groupBy does not preserve the sort order, so first() could return an arbitrary category.
# max over struct(category_count, category) is deterministic (ties resolve on category).
df_most_popular_category_by_year = (
    category_counts_by_year
    .groupBy("post_year")
    .agg(max(struct("category_count", "category")).alias("top_category"))
    .select("post_year", "top_category.category", "top_category.category_count")
    .orderBy("post_year")
)

df_most_popular_category_by_year.show()

6) Find the user with the most followers in each country

In [None]:
from pyspark.sql.window import Window

# Highest follower count seen for each poster within each country
followers_per_poster = (
    df_pin.join(df_geo, on="ind")
    .groupBy("country", "poster_name")
    .agg(max("follower_count"))
)

# Rank posters inside each country, largest follower count first
by_country_followers_desc = Window.partitionBy("country").orderBy(desc("max(follower_count)"))
ranked_posters = followers_per_poster.withColumn("rank", rank().over(by_country_followers_desc))

# Keep the top-ranked poster(s) per country; rank() keeps ties
df_most_followers_by_country = (
    ranked_posters.filter("rank = 1")
    .select("country", "poster_name", "max(follower_count)")
    .withColumnRenamed("max(follower_count)", "follower_count")
)

df_most_followers_by_country.show()

# Of those per-country winners, keep the single country with the highest follower count
max_followers_per_country = df_most_followers_by_country.groupBy("country").agg(
    max("follower_count").alias("follower_count")
)
df_country_with_most_followed_user = (
    max_followers_per_country.orderBy(desc("follower_count"))
    .limit(1)
    .select("country", "follower_count")
)

df_country_with_most_followed_user.show()

7) Find the most popular category for different age groups

In [None]:
# Age-group label derived from the user's age; ages matching no branch get a null label
age_group_label = (
    when(df_user.age.between(18, 24), "18-24")
    .when(df_user.age.between(25, 35), "25-35")
    .when(df_user.age.between(36, 50), "36-50")
    .when(df_user.age > 50, "50+")
)

# Posts joined to their users, tagged with the age group
df_user_age_group = df_pin.join(df_user, on="ind").withColumn("age_group", age_group_label)

# Number of posts per (age_group, category)
posts_per_age_category = df_user_age_group.groupBy("age_group", "category").agg(
    count("*").alias("category_count")
)

# max over struct(category_count, category) keeps the category with the highest count per group
df_category_count_by_age = (
    posts_per_age_category.groupBy("age_group")
    .agg(max(struct("category_count", "category")).alias("max_count"))
    .select("age_group", "max_count.category", "max_count.category_count")
)

df_category_count_by_age.show()

8) Find the median follower count for different age groups

In [None]:
# Approximate median (50th percentile) of follower_count; the third argument is the accuracy
median_follower_count = percentile_approx("follower_count", 0.5, lit(1000000)).alias(
    "median_follower_count"
)

# One median per age group
df_median_follower_count_by_age = (
    df_user_age_group.groupBy("age_group")
    .agg(median_follower_count)
    .select("age_group", "median_follower_count")
)

df_median_follower_count_by_age.show()

9) Find how many users have joined each year

In [None]:
# Keep rows whose user joined from 2015-01-01 up to (not including) 2021-01-01
joined_in_window = (df_user.date_joined >= '2015-01-01') & (df_user.date_joined < '2021-01-01')

# NOTE(review): post_year is taken from the geolocation `timestamp`, not from `date_joined`,
# so the counts below are bucketed by that year -- confirm this is the intended grouping.
df_user_geo = (
    df_user.join(df_geo, on="ind")
    .select(year("timestamp").alias("post_year"), "date_joined")
    .where(joined_in_window)
)

df_user_geo.show()

# Row count per post_year, in ascending year order
df_users_joined_by_year = (
    df_user_geo.groupBy("post_year")
    .agg(count("*").alias("number_users_joined"))
    .sort(asc("post_year"))
    .select("post_year", "number_users_joined")
)

df_users_joined_by_year.show()

10) Find the median follower count of users based on their joining year

In [None]:
# Keep rows whose user joined from 2015-01-01 up to (not including) 2021-01-01
joined_in_window = (df_user.date_joined >= '2015-01-01') & (df_user.date_joined < '2021-01-01')

# Users joined to their geolocation and post rows.
# NOTE(review): post_year is taken from the geolocation `timestamp`, not from `date_joined`
# -- confirm this is the intended grouping key.
user_geo_pin = df_user.join(df_geo, on="ind").join(df_pin, on="ind")
df_user_geo_pin = user_geo_pin.select(
    year("timestamp").alias("post_year"), "date_joined", "follower_count"
).where(joined_in_window)

# Approximate median follower count per post_year
median_follower_count = percentile_approx("follower_count", 0.5, lit(1000000)).alias(
    "median_follower_count"
)
df_user_median_follower_count = (
    df_user_geo_pin.groupBy("post_year")
    .agg(median_follower_count)
    .select("post_year", "median_follower_count")
)

df_user_median_follower_count.show()



11) Find the median follower count of users based on their joining year and age group

In [None]:

# Age-group label derived from the user's age; ages matching no branch get a null label
age_group_label = (
    when(df_user.age.between(18, 24), "18-24")
    .when(df_user.age.between(25, 35), "25-35")
    .when(df_user.age.between(36, 50), "36-50")
    .when(df_user.age > 50, "50+")
)

# Keep rows whose user joined from 2015-01-01 up to (not including) 2021-01-01
joined_in_window = (df_user.date_joined >= '2015-01-01') & (df_user.date_joined < '2021-01-01')

# Users joined to their geolocation and post rows, tagged with the age group.
# NOTE(review): post_year is taken from the geolocation `timestamp`, not from `date_joined`
# -- confirm this is the intended grouping key.
df_user_geo_pin = (
    df_user.join(df_geo, on="ind")
    .join(df_pin, on="ind")
    .withColumn("age_group", age_group_label)
    .select("age_group", year("timestamp").alias("post_year"), "date_joined", "follower_count")
    .where(joined_in_window)
)

# Approximate median follower count per (age_group, post_year)
median_follower_count = percentile_approx("follower_count", 0.5, lit(1000000)).alias(
    "median_follower_count"
)
df_median_follower_by_year_joined = (
    df_user_geo_pin.groupBy("age_group", "post_year")
    .agg(median_follower_count)
    .select("age_group", "post_year", "median_follower_count")
)

df_median_follower_by_year_joined.show()

In [None]:
# Unmount the bucket, but only when it is currently mounted, so that re-running this cell
# (or running it after a failed mount) is a no-op rather than an error.
if MOUNT_NAME in [mount.mountPoint for mount in dbutils.fs.mounts()]:
    dbutils.fs.unmount(MOUNT_NAME)