In [0]:
# Read the Delta table for authentication credentials
df_auth = spark.read.format("delta").load("dbfs:/user/hive/warehouse/authentication_credentials")

# Assuming df_auth is your DataFrame containing the credentials
auth_row = df_auth.first()  # Get the first row of the DataFrame
access_key_id = auth_row['Access key ID']
secret_access_key = auth_row['Secret access key']

# Set the credentials in the environment (if needed)
import os
os.environ['AWS_ACCESS_KEY_ID'] = access_key_id
os.environ['AWS_SECRET_ACCESS_KEY'] = secret_access_key

# Mount S3 bucket using the extracted credentials
try:
    dbutils.fs.mount(
        source = "s3a://user-1226d593b7e7-bucket",
        mount_point = "/mnt/pinterest_data",
        extra_configs = {
            "fs.s3a.access.key": access_key_id,
            "fs.s3a.secret.key": secret_access_key
        }
    )
    print("Mount successful!")
except Exception as e:
    print(f"Mount failed: {e}")



In [0]:
# Define the base path for your S3 bucket
base_path = "s3://user-1226d593b7e7-bucket/topics"
base_path = "/mnt/pinterest_data/topics"
# base path needs to be /mnt/pinterest_data/topics

# Reading Pinterest post data
df_pin = spark.read.json(f"{base_path}/1226d593b7e7.pin/partition=0/")
display(df_pin)  # Displaying the DataFrame

# Reading geolocation data
df_geo = spark.read.json(f"{base_path}/1226d593b7e7.geo/partition=0/")
display(df_geo)  # Displaying the DataFrame

# Reading user data
df_user = spark.read.json(f"{base_path}/1226d593b7e7.user/partition=0/")
display(df_user)  # Displaying the DataFrame


category,description,downloaded,follower_count,image_src,index,is_image_or_video,poster_name,save_location,tag_list,title,unique_id
home-decor,"–¢—Ä–∞–¥–∏—Ü–∏–æ–Ω–Ω—ã–µ —à–≤–µ–¥—Å–∫–∏–µ –∫–æ—Ç—Ç–µ–¥–∂–∏, –æ–±—ã—á–Ω–æ —Å –∫—Ä–∞—Å–Ω—ã–º —Ñ–∞—Å–∞–¥–æ–º ‚Äî —ç—Ç–æ –Ω–∞—Å—Ç–æ—è—â–µ–µ –≤–æ–ø–ª–æ—â–µ–Ω–∏–µ–º –∏–¥–µ–∞–ª—å–Ω–æ–≥–æ –∑–∏–º–Ω–µ–≥–æ —É—é—Ç–∞. –û–Ω–∏ –æ–±—ã—á–Ω–æ –æ—Ñ–æ—Ä–º–ª–µ–Ω—ã –æ—á–µ–Ω—å –ø—Ä–æ—Å—Ç–æ –∏ ‚úåPUFIK. Beautiful Interiors. On‚Ä¶¬†",1,136k,https://i.pinimg.com/originals/32/eb/72/32eb72e4fd8654c115a64528bd1f34b4.png,6717,image,PUFIK Interiors & Inspirations,Local save in /data/home-decor,"Scandinavian Cottage,Swedish Cottage,Swedish Home Decor,Swedish Farmhouse,Swedish Style,Swedish Kitchen,Kitchen Black,Swedish House,Cozy Cottage",„Äö –£—é—Ç–Ω—ã–µ —à–≤–µ–¥—Å–∫–∏–µ –∫–æ—Ç—Ç–µ–¥–∂–∏ –æ—Ç Carina Olander „Äõ ‚óæ –§–æ—Ç–æ ‚óæ –ò–¥–µ–∏ ‚óæ –î–∏–∑–∞–π–Ω,bc5ab9ee-505e-44f6-92ba-677fe4fdf3e3
christmas,"Features: Material:Lint Size:48ÔΩò18cm Quantity:1 pc Shape:Santa Claus, snowman. Elk Occasion:Christmas Description: 1. Fashion design, high quality 2. Santa Claus, snowman. Elk C‚Ä¶¬†",1,5k,https://i.pinimg.com/originals/b5/7f/21/b57f219fa89c1165b57525b8eae711da.jpg,1706,image,Wear24-7,Local save in /data/christmas,"Merry Christmas To You,Christmas Toys,Great Christmas Gifts,Christmas Snowman,Christmas Ornaments,Holiday,Christmas Party Decorations,Christmas Themes,Decoration Party",Standing Figurine Toys Xmas Santa Claus Snowman Reindeer Figure Plush Dolls Christmas Decorations Ornaments Home Indoor Table Ornaments Christmas Party Tree Hanging Decor Toys Gifts for Kids Friends‚Ä¶,b5c8a1b5-9e90-4522-9bec-2477b698d5b7
christmas,"Christmas decorating ideas for porches. Beautiful holiday decor ideas for front porches both small and large. Outdoor decorations like sleds, lanterns, Christmas trees, wreaths,‚Ä¶¬†",1,46k,https://i.pinimg.com/videos/thumbnails/originals/40/83/f5/4083f5b4971bf235f89a4784ab87271e.0000001.jpg,2482,video,"Life on Summerhill | Home, Holiday Decor & DIY Website",Local save in /data/christmas,"Diy Christmas Decorations For Home,Farmhouse Christmas Decor,Christmas Home,Christmas Holidays,Christmas Front Porches,How To Decorate For Christmas,Christmas Porch Ideas,Christmas Decorating Ideas,Large Outdoor Christmas Decorations",FORNT PORCH CHRISTMAS DECORATING IDEAS,08604f20-fa17-4b9a-9949-781717eca6cd
christmas,"Christmas decorating ideas for porches. Beautiful holiday decor ideas for front porches both small and large. Outdoor decorations like sleds, lanterns, Christmas trees, wreaths,‚Ä¶¬†",1,46k,https://i.pinimg.com/videos/thumbnails/originals/40/83/f5/4083f5b4971bf235f89a4784ab87271e.0000001.jpg,2482,video,"Life on Summerhill | Home, Holiday Decor & DIY Website",Local save in /data/christmas,"Diy Christmas Decorations For Home,Farmhouse Christmas Decor,Christmas Home,Christmas Holidays,Christmas Front Porches,How To Decorate For Christmas,Christmas Porch Ideas,Christmas Decorating Ideas,Large Outdoor Christmas Decorations",FORNT PORCH CHRISTMAS DECORATING IDEAS,08604f20-fa17-4b9a-9949-781717eca6cd
vehicles,"By David Crane ; defrev (at) gmail (dot) com All photos contained in this article were shot by DefenseReview.com (DR), and are copyrighted. DefenseReview.com owns the copyright‚Ä¶¬†",1,709,https://i.pinimg.com/originals/36/63/12/366312d747da1358397610a86bf21b20.jpg,10538,image,Ricky Lee,Local save in /data/vehicles,"Army Vehicles,Armored Vehicles,Cool Trucks,Cool Cars,Amphibious Vehicle,Offroader,Bug Out Vehicle,Vehicle Wraps,Terrain Vehicle",BC Customs (BCC) Search and Rescue Tactical Vehicle-5 (SRTV-5) Baja Racing-Type All-Terrain Combat Vehicle Armed/Weaponized with 7.62mm NATO Garwood Industries (GI) M134G Minigun/Gatling Gun: SXOR‚Ä¶,5d9fa7e2-2118-4442-99b6-537d60463a6a
art,Marble Wall Art Modern Abstract Canvas Artwork Contemporary Home Decor Canvas Wall Art Ready to Hang Canvas Each canvas is professionally printed and hand-stretched in the USA.‚Ä¶¬†,1,305,https://i.pinimg.com/originals/b2/6e/95/b26e950a283805d09ef9a4a279781217.jpg,527,image,Wall Canvas Mall,Local save in /data/art,"Modern Art Paintings,Modern Artwork,Modern Wall Art,Blue Artwork,Modern Canvas Art,Contemporary Home Decor,Modern Art Prints,Framed Canvas Prints,Wall Art Prints","Blue Gold Marble Canvas , Luxury Wall Art, Abstract Wall Decor, Navy Blue Abstract, Modern Artwork, Oversize Canvas Art, Contemporary Art - 1 Panel 12x9 / Gallery Wrap",ed8af037-ee87-4a80-97ac-99f5b153cf7e
event-planning,"Updated: January 25, 2017 You‚Äôve organized some events for your family, friends or community and you have gained a budding reputation for knowing how put events together. You‚Äôve‚Ä¶¬†",1,4k,https://i.pinimg.com/originals/c3/2b/c6/c32bc6ad263857cb0eea19f9cd12beb9.jpg,4357,image,EventPlanning.com | Learn How To Become An Event Planner,Local save in /data/event-planning,"Event Planning Quotes,Event Planning Checklist,Event Planning Business,Business Events,Business Ideas,Business Names,Business Opportunities,Corporate Events,Wedding Event Planner",First Steps in Launching Your Own Event Business - Learn About Event Planning,ccf116e9-9096-4943-a344-1960ce216445
art,If I could only choose one paint brush it would be the angled brush! I am going to break down four separate Techniques I like to use an Angled Paint Brush with a video painting‚Ä¶¬†,1,20k,https://i.pinimg.com/originals/cc/8e/81/cc8e8190f773d5e3bb7d86890b566da7.png,25,image,The Social Easel Online Paint Studio | Video Painting Tutorials,Local save in /data/art,"Fall Canvas Painting,Basic Painting,Acrylic Painting Flowers,Canvas Painting Tutorials,Autumn Painting,Painting Techniques,Diy Painting,Painting & Drawing,Canvas Art",How to use an Angled Paint Brush! Painting Techniques with The Social Easel Online Paint Studio,f19b91c7-2a58-41ae-a013-3806d248baec
christmas,15 unique Christmas porch ideas that will leave you feeling inspired and help you tackle decorating your own entryway for the holidays! It‚Äôs almost time to start decorating for‚Ä¶¬†,1,19k,https://i.pinimg.com/originals/ff/f8/3b/fff83b02aeb29e2e9341a56fc5e63345.png,1967,image,Ashley - Modern Glam,Local save in /data/christmas,"Exterior Christmas Lights,Front Door Christmas Decorations,Christmas Lights Outside,Christmas House Lights,Decorating With Christmas Lights,Porch Decorating,Christmas Porch Decorations,Front Porch Ideas For Christmas,Christmas Lights Outdoor Trees",15 Fun & Festive Christmas Porch Ideas,0b9d5b95-51a6-465e-ae4a-2cb68ceada29
travel,"This Costa Rica itinerary is the ultimate guide to spending two weeks in Costa Rica. Find out about visiting La Fortuna, Arenal, Monteverde, Naranjo, Corcovado National Park, Or‚Ä¶¬†",1,10k,https://i.pinimg.com/originals/30/93/cb/3093cb01d9de2d125fda8ba5e3e41946.jpg,10138,image,"Wanderlust Chloe ‚úàÔ∏è Travel guides, inspo and adventure travel ‚úàÔ∏è",Local save in /data/travel,"Costa Rica Travel,Rio Celeste Costa Rica,Dream Vacations,Vacation Spots,Vacation Travel,Travel Pictures,Travel Photos,Fortuna Costa Rica,Costa Rica Pictures","14 Amazing Things To Do In Costa Rica | Volcanoes, Waterfalls, Wildlife And More",927c4658-cc3f-4b92-9b5c-70743d0c238d


country,ind,latitude,longitude,timestamp
Antarctica (the territory South of 60 deg S),2418,-88.4642,-171.061,2022-05-27T11:30:59
Antarctica (the territory South of 60 deg S),2418,-88.4642,-171.061,2022-05-27T11:30:59
Antarctica (the territory South of 60 deg S),5162,-71.6607,-149.206,2019-09-27T19:06:43
Antarctica (the territory South of 60 deg S),1335,-77.9931,-175.682,2022-03-19T17:29:42
Antarctica (the territory South of 60 deg S),9185,-10.3764,-22.9809,2019-10-06T18:12:55
Antarctica (the territory South of 60 deg S),9335,-88.4642,-171.061,2020-11-14T23:42:22
Saint Vincent and the Grenadines,2301,13.4683,51.7244,2020-11-14T00:25:28
Holy See (Vatican City State),7180,-22.7118,-167.739,2018-04-22T21:33:50
Svalbard & Jan Mayen Islands,10204,-14.3257,26.9087,2020-04-06T02:56:29
French Southern Territories,6014,-26.6026,155.206,2019-04-30T12:33:13


age,date_joined,first_name,ind,last_name
27,2016-03-08T13:38:37,Christopher,2015,Bradshaw
27,2016-03-08T13:38:37,Christopher,2015,Bradshaw
59,2017-05-12T21:22:17,Alexander,10673,Cervantes
48,2016-02-27T16:57:44,Christopher,1857,Hamilton
45,2016-09-15T06:02:53,Christopher,10020,Hawkins
39,2016-06-29T20:43:59,Christina,6398,Davenport
20,2015-10-23T04:13:23,Alexandria,3599,Alvarado
20,2015-10-23T04:13:23,Alexandria,4256,Alvarado
44,2016-12-18T16:05:39,Michelle,1901,Richardson
20,2015-10-23T04:13:23,Alexandria,3831,Alvarado


In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType

# Define the base path for your S3 bucket (the correct mount)
base_path = "/mnt/pinterest_data/topics"

# Read Pinterest post data
df_pin = spark.read.json(f"{base_path}/1226d593b7e7.pin/partition=0/")
display(df_pin)

# 1. Replace empty entries or irrelevant data with None
# Assuming that empty strings or null values represent irrelevant data
df_pin_clean = df_pin.replace('', None).na.fill('None')

# 2. Ensure follower_count is a number and convert to int
# Remove any non-numeric characters and cast the column to IntegerType
df_pin_clean = df_pin_clean.withColumn(
    "follower_count",
    F.regexp_replace(F.col("follower_count"), r"[^\d]", "").cast(IntegerType())
)

# 3. Ensure numeric columns have numeric types
# For other numeric columns, check their types and convert if necessary
# Assuming `follower_count` is the only numeric column that needs conversion

# 4. Clean the `save_location` column to include only the path
df_pin_clean = df_pin_clean.withColumn(
    "save_location",
    F.expr("regexp_extract(save_location, '([^/]+$)', 1)")  # Extract the last part of the path
)

# 5. Rename the `index` column to `ind`
df_pin_clean = df_pin_clean.withColumnRenamed("index", "ind")

# 6. Reorder the columns as specified
df_pin_clean = df_pin_clean.select(
    "ind",
    "unique_id",
    "title",
    "description",
    "follower_count",
    "poster_name",
    "tag_list",
    "is_image_or_video",
    "image_src",
    "save_location",
    "category"
)

# Show the cleaned DataFrame
display(df_pin_clean)


category,description,downloaded,follower_count,image_src,index,is_image_or_video,poster_name,save_location,tag_list,title,unique_id
home-decor,"–¢—Ä–∞–¥–∏—Ü–∏–æ–Ω–Ω—ã–µ —à–≤–µ–¥—Å–∫–∏–µ –∫–æ—Ç—Ç–µ–¥–∂–∏, –æ–±—ã—á–Ω–æ —Å –∫—Ä–∞—Å–Ω—ã–º —Ñ–∞—Å–∞–¥–æ–º ‚Äî —ç—Ç–æ –Ω–∞—Å—Ç–æ—è—â–µ–µ –≤–æ–ø–ª–æ—â–µ–Ω–∏–µ–º –∏–¥–µ–∞–ª—å–Ω–æ–≥–æ –∑–∏–º–Ω–µ–≥–æ —É—é—Ç–∞. –û–Ω–∏ –æ–±—ã—á–Ω–æ –æ—Ñ–æ—Ä–º–ª–µ–Ω—ã –æ—á–µ–Ω—å –ø—Ä–æ—Å—Ç–æ –∏ ‚úåPUFIK. Beautiful Interiors. On‚Ä¶¬†",1,136k,https://i.pinimg.com/originals/32/eb/72/32eb72e4fd8654c115a64528bd1f34b4.png,6717,image,PUFIK Interiors & Inspirations,Local save in /data/home-decor,"Scandinavian Cottage,Swedish Cottage,Swedish Home Decor,Swedish Farmhouse,Swedish Style,Swedish Kitchen,Kitchen Black,Swedish House,Cozy Cottage",„Äö –£—é—Ç–Ω—ã–µ —à–≤–µ–¥—Å–∫–∏–µ –∫–æ—Ç—Ç–µ–¥–∂–∏ –æ—Ç Carina Olander „Äõ ‚óæ –§–æ—Ç–æ ‚óæ –ò–¥–µ–∏ ‚óæ –î–∏–∑–∞–π–Ω,bc5ab9ee-505e-44f6-92ba-677fe4fdf3e3
christmas,"Features: Material:Lint Size:48ÔΩò18cm Quantity:1 pc Shape:Santa Claus, snowman. Elk Occasion:Christmas Description: 1. Fashion design, high quality 2. Santa Claus, snowman. Elk C‚Ä¶¬†",1,5k,https://i.pinimg.com/originals/b5/7f/21/b57f219fa89c1165b57525b8eae711da.jpg,1706,image,Wear24-7,Local save in /data/christmas,"Merry Christmas To You,Christmas Toys,Great Christmas Gifts,Christmas Snowman,Christmas Ornaments,Holiday,Christmas Party Decorations,Christmas Themes,Decoration Party",Standing Figurine Toys Xmas Santa Claus Snowman Reindeer Figure Plush Dolls Christmas Decorations Ornaments Home Indoor Table Ornaments Christmas Party Tree Hanging Decor Toys Gifts for Kids Friends‚Ä¶,b5c8a1b5-9e90-4522-9bec-2477b698d5b7
christmas,"Christmas decorating ideas for porches. Beautiful holiday decor ideas for front porches both small and large. Outdoor decorations like sleds, lanterns, Christmas trees, wreaths,‚Ä¶¬†",1,46k,https://i.pinimg.com/videos/thumbnails/originals/40/83/f5/4083f5b4971bf235f89a4784ab87271e.0000001.jpg,2482,video,"Life on Summerhill | Home, Holiday Decor & DIY Website",Local save in /data/christmas,"Diy Christmas Decorations For Home,Farmhouse Christmas Decor,Christmas Home,Christmas Holidays,Christmas Front Porches,How To Decorate For Christmas,Christmas Porch Ideas,Christmas Decorating Ideas,Large Outdoor Christmas Decorations",FORNT PORCH CHRISTMAS DECORATING IDEAS,08604f20-fa17-4b9a-9949-781717eca6cd
christmas,"Christmas decorating ideas for porches. Beautiful holiday decor ideas for front porches both small and large. Outdoor decorations like sleds, lanterns, Christmas trees, wreaths,‚Ä¶¬†",1,46k,https://i.pinimg.com/videos/thumbnails/originals/40/83/f5/4083f5b4971bf235f89a4784ab87271e.0000001.jpg,2482,video,"Life on Summerhill | Home, Holiday Decor & DIY Website",Local save in /data/christmas,"Diy Christmas Decorations For Home,Farmhouse Christmas Decor,Christmas Home,Christmas Holidays,Christmas Front Porches,How To Decorate For Christmas,Christmas Porch Ideas,Christmas Decorating Ideas,Large Outdoor Christmas Decorations",FORNT PORCH CHRISTMAS DECORATING IDEAS,08604f20-fa17-4b9a-9949-781717eca6cd
vehicles,"By David Crane ; defrev (at) gmail (dot) com All photos contained in this article were shot by DefenseReview.com (DR), and are copyrighted. DefenseReview.com owns the copyright‚Ä¶¬†",1,709,https://i.pinimg.com/originals/36/63/12/366312d747da1358397610a86bf21b20.jpg,10538,image,Ricky Lee,Local save in /data/vehicles,"Army Vehicles,Armored Vehicles,Cool Trucks,Cool Cars,Amphibious Vehicle,Offroader,Bug Out Vehicle,Vehicle Wraps,Terrain Vehicle",BC Customs (BCC) Search and Rescue Tactical Vehicle-5 (SRTV-5) Baja Racing-Type All-Terrain Combat Vehicle Armed/Weaponized with 7.62mm NATO Garwood Industries (GI) M134G Minigun/Gatling Gun: SXOR‚Ä¶,5d9fa7e2-2118-4442-99b6-537d60463a6a
art,Marble Wall Art Modern Abstract Canvas Artwork Contemporary Home Decor Canvas Wall Art Ready to Hang Canvas Each canvas is professionally printed and hand-stretched in the USA.‚Ä¶¬†,1,305,https://i.pinimg.com/originals/b2/6e/95/b26e950a283805d09ef9a4a279781217.jpg,527,image,Wall Canvas Mall,Local save in /data/art,"Modern Art Paintings,Modern Artwork,Modern Wall Art,Blue Artwork,Modern Canvas Art,Contemporary Home Decor,Modern Art Prints,Framed Canvas Prints,Wall Art Prints","Blue Gold Marble Canvas , Luxury Wall Art, Abstract Wall Decor, Navy Blue Abstract, Modern Artwork, Oversize Canvas Art, Contemporary Art - 1 Panel 12x9 / Gallery Wrap",ed8af037-ee87-4a80-97ac-99f5b153cf7e
event-planning,"Updated: January 25, 2017 You‚Äôve organized some events for your family, friends or community and you have gained a budding reputation for knowing how put events together. You‚Äôve‚Ä¶¬†",1,4k,https://i.pinimg.com/originals/c3/2b/c6/c32bc6ad263857cb0eea19f9cd12beb9.jpg,4357,image,EventPlanning.com | Learn How To Become An Event Planner,Local save in /data/event-planning,"Event Planning Quotes,Event Planning Checklist,Event Planning Business,Business Events,Business Ideas,Business Names,Business Opportunities,Corporate Events,Wedding Event Planner",First Steps in Launching Your Own Event Business - Learn About Event Planning,ccf116e9-9096-4943-a344-1960ce216445
art,If I could only choose one paint brush it would be the angled brush! I am going to break down four separate Techniques I like to use an Angled Paint Brush with a video painting‚Ä¶¬†,1,20k,https://i.pinimg.com/originals/cc/8e/81/cc8e8190f773d5e3bb7d86890b566da7.png,25,image,The Social Easel Online Paint Studio | Video Painting Tutorials,Local save in /data/art,"Fall Canvas Painting,Basic Painting,Acrylic Painting Flowers,Canvas Painting Tutorials,Autumn Painting,Painting Techniques,Diy Painting,Painting & Drawing,Canvas Art",How to use an Angled Paint Brush! Painting Techniques with The Social Easel Online Paint Studio,f19b91c7-2a58-41ae-a013-3806d248baec
christmas,15 unique Christmas porch ideas that will leave you feeling inspired and help you tackle decorating your own entryway for the holidays! It‚Äôs almost time to start decorating for‚Ä¶¬†,1,19k,https://i.pinimg.com/originals/ff/f8/3b/fff83b02aeb29e2e9341a56fc5e63345.png,1967,image,Ashley - Modern Glam,Local save in /data/christmas,"Exterior Christmas Lights,Front Door Christmas Decorations,Christmas Lights Outside,Christmas House Lights,Decorating With Christmas Lights,Porch Decorating,Christmas Porch Decorations,Front Porch Ideas For Christmas,Christmas Lights Outdoor Trees",15 Fun & Festive Christmas Porch Ideas,0b9d5b95-51a6-465e-ae4a-2cb68ceada29
travel,"This Costa Rica itinerary is the ultimate guide to spending two weeks in Costa Rica. Find out about visiting La Fortuna, Arenal, Monteverde, Naranjo, Corcovado National Park, Or‚Ä¶¬†",1,10k,https://i.pinimg.com/originals/30/93/cb/3093cb01d9de2d125fda8ba5e3e41946.jpg,10138,image,"Wanderlust Chloe ‚úàÔ∏è Travel guides, inspo and adventure travel ‚úàÔ∏è",Local save in /data/travel,"Costa Rica Travel,Rio Celeste Costa Rica,Dream Vacations,Vacation Spots,Vacation Travel,Travel Pictures,Travel Photos,Fortuna Costa Rica,Costa Rica Pictures","14 Amazing Things To Do In Costa Rica | Volcanoes, Waterfalls, Wildlife And More",927c4658-cc3f-4b92-9b5c-70743d0c238d


ind,unique_id,title,description,follower_count,poster_name,tag_list,is_image_or_video,image_src,save_location,category
6717,bc5ab9ee-505e-44f6-92ba-677fe4fdf3e3,„Äö –£—é—Ç–Ω—ã–µ —à–≤–µ–¥—Å–∫–∏–µ –∫–æ—Ç—Ç–µ–¥–∂–∏ –æ—Ç Carina Olander „Äõ ‚óæ –§–æ—Ç–æ ‚óæ –ò–¥–µ–∏ ‚óæ –î–∏–∑–∞–π–Ω,"–¢—Ä–∞–¥–∏—Ü–∏–æ–Ω–Ω—ã–µ —à–≤–µ–¥—Å–∫–∏–µ –∫–æ—Ç—Ç–µ–¥–∂–∏, –æ–±—ã—á–Ω–æ —Å –∫—Ä–∞—Å–Ω—ã–º —Ñ–∞—Å–∞–¥–æ–º ‚Äî —ç—Ç–æ –Ω–∞—Å—Ç–æ—è—â–µ–µ –≤–æ–ø–ª–æ—â–µ–Ω–∏–µ–º –∏–¥–µ–∞–ª—å–Ω–æ–≥–æ –∑–∏–º–Ω–µ–≥–æ —É—é—Ç–∞. –û–Ω–∏ –æ–±—ã—á–Ω–æ –æ—Ñ–æ—Ä–º–ª–µ–Ω—ã –æ—á–µ–Ω—å –ø—Ä–æ—Å—Ç–æ –∏ ‚úåPUFIK. Beautiful Interiors. On‚Ä¶¬†",136.0,PUFIK Interiors & Inspirations,"Scandinavian Cottage,Swedish Cottage,Swedish Home Decor,Swedish Farmhouse,Swedish Style,Swedish Kitchen,Kitchen Black,Swedish House,Cozy Cottage",image,https://i.pinimg.com/originals/32/eb/72/32eb72e4fd8654c115a64528bd1f34b4.png,home-decor,home-decor
1706,b5c8a1b5-9e90-4522-9bec-2477b698d5b7,Standing Figurine Toys Xmas Santa Claus Snowman Reindeer Figure Plush Dolls Christmas Decorations Ornaments Home Indoor Table Ornaments Christmas Party Tree Hanging Decor Toys Gifts for Kids Friends‚Ä¶,"Features: Material:Lint Size:48ÔΩò18cm Quantity:1 pc Shape:Santa Claus, snowman. Elk Occasion:Christmas Description: 1. Fashion design, high quality 2. Santa Claus, snowman. Elk C‚Ä¶¬†",5.0,Wear24-7,"Merry Christmas To You,Christmas Toys,Great Christmas Gifts,Christmas Snowman,Christmas Ornaments,Holiday,Christmas Party Decorations,Christmas Themes,Decoration Party",image,https://i.pinimg.com/originals/b5/7f/21/b57f219fa89c1165b57525b8eae711da.jpg,christmas,christmas
2482,08604f20-fa17-4b9a-9949-781717eca6cd,FORNT PORCH CHRISTMAS DECORATING IDEAS,"Christmas decorating ideas for porches. Beautiful holiday decor ideas for front porches both small and large. Outdoor decorations like sleds, lanterns, Christmas trees, wreaths,‚Ä¶¬†",46.0,"Life on Summerhill | Home, Holiday Decor & DIY Website","Diy Christmas Decorations For Home,Farmhouse Christmas Decor,Christmas Home,Christmas Holidays,Christmas Front Porches,How To Decorate For Christmas,Christmas Porch Ideas,Christmas Decorating Ideas,Large Outdoor Christmas Decorations",video,https://i.pinimg.com/videos/thumbnails/originals/40/83/f5/4083f5b4971bf235f89a4784ab87271e.0000001.jpg,christmas,christmas
2482,08604f20-fa17-4b9a-9949-781717eca6cd,FORNT PORCH CHRISTMAS DECORATING IDEAS,"Christmas decorating ideas for porches. Beautiful holiday decor ideas for front porches both small and large. Outdoor decorations like sleds, lanterns, Christmas trees, wreaths,‚Ä¶¬†",46.0,"Life on Summerhill | Home, Holiday Decor & DIY Website","Diy Christmas Decorations For Home,Farmhouse Christmas Decor,Christmas Home,Christmas Holidays,Christmas Front Porches,How To Decorate For Christmas,Christmas Porch Ideas,Christmas Decorating Ideas,Large Outdoor Christmas Decorations",video,https://i.pinimg.com/videos/thumbnails/originals/40/83/f5/4083f5b4971bf235f89a4784ab87271e.0000001.jpg,christmas,christmas
10538,5d9fa7e2-2118-4442-99b6-537d60463a6a,BC Customs (BCC) Search and Rescue Tactical Vehicle-5 (SRTV-5) Baja Racing-Type All-Terrain Combat Vehicle Armed/Weaponized with 7.62mm NATO Garwood Industries (GI) M134G Minigun/Gatling Gun: SXOR‚Ä¶,"By David Crane ; defrev (at) gmail (dot) com All photos contained in this article were shot by DefenseReview.com (DR), and are copyrighted. DefenseReview.com owns the copyright‚Ä¶¬†",709.0,Ricky Lee,"Army Vehicles,Armored Vehicles,Cool Trucks,Cool Cars,Amphibious Vehicle,Offroader,Bug Out Vehicle,Vehicle Wraps,Terrain Vehicle",image,https://i.pinimg.com/originals/36/63/12/366312d747da1358397610a86bf21b20.jpg,vehicles,vehicles
527,ed8af037-ee87-4a80-97ac-99f5b153cf7e,"Blue Gold Marble Canvas , Luxury Wall Art, Abstract Wall Decor, Navy Blue Abstract, Modern Artwork, Oversize Canvas Art, Contemporary Art - 1 Panel 12x9 / Gallery Wrap",Marble Wall Art Modern Abstract Canvas Artwork Contemporary Home Decor Canvas Wall Art Ready to Hang Canvas Each canvas is professionally printed and hand-stretched in the USA.‚Ä¶¬†,305.0,Wall Canvas Mall,"Modern Art Paintings,Modern Artwork,Modern Wall Art,Blue Artwork,Modern Canvas Art,Contemporary Home Decor,Modern Art Prints,Framed Canvas Prints,Wall Art Prints",image,https://i.pinimg.com/originals/b2/6e/95/b26e950a283805d09ef9a4a279781217.jpg,art,art
4357,ccf116e9-9096-4943-a344-1960ce216445,First Steps in Launching Your Own Event Business - Learn About Event Planning,"Updated: January 25, 2017 You‚Äôve organized some events for your family, friends or community and you have gained a budding reputation for knowing how put events together. You‚Äôve‚Ä¶¬†",4.0,EventPlanning.com | Learn How To Become An Event Planner,"Event Planning Quotes,Event Planning Checklist,Event Planning Business,Business Events,Business Ideas,Business Names,Business Opportunities,Corporate Events,Wedding Event Planner",image,https://i.pinimg.com/originals/c3/2b/c6/c32bc6ad263857cb0eea19f9cd12beb9.jpg,event-planning,event-planning
25,f19b91c7-2a58-41ae-a013-3806d248baec,How to use an Angled Paint Brush! Painting Techniques with The Social Easel Online Paint Studio,If I could only choose one paint brush it would be the angled brush! I am going to break down four separate Techniques I like to use an Angled Paint Brush with a video painting‚Ä¶¬†,20.0,The Social Easel Online Paint Studio | Video Painting Tutorials,"Fall Canvas Painting,Basic Painting,Acrylic Painting Flowers,Canvas Painting Tutorials,Autumn Painting,Painting Techniques,Diy Painting,Painting & Drawing,Canvas Art",image,https://i.pinimg.com/originals/cc/8e/81/cc8e8190f773d5e3bb7d86890b566da7.png,art,art
1967,0b9d5b95-51a6-465e-ae4a-2cb68ceada29,15 Fun & Festive Christmas Porch Ideas,15 unique Christmas porch ideas that will leave you feeling inspired and help you tackle decorating your own entryway for the holidays! It‚Äôs almost time to start decorating for‚Ä¶¬†,19.0,Ashley - Modern Glam,"Exterior Christmas Lights,Front Door Christmas Decorations,Christmas Lights Outside,Christmas House Lights,Decorating With Christmas Lights,Porch Decorating,Christmas Porch Decorations,Front Porch Ideas For Christmas,Christmas Lights Outdoor Trees",image,https://i.pinimg.com/originals/ff/f8/3b/fff83b02aeb29e2e9341a56fc5e63345.png,christmas,christmas
10138,927c4658-cc3f-4b92-9b5c-70743d0c238d,"14 Amazing Things To Do In Costa Rica | Volcanoes, Waterfalls, Wildlife And More","This Costa Rica itinerary is the ultimate guide to spending two weeks in Costa Rica. Find out about visiting La Fortuna, Arenal, Monteverde, Naranjo, Corcovado National Park, Or‚Ä¶¬†",10.0,"Wanderlust Chloe ‚úàÔ∏è Travel guides, inspo and adventure travel ‚úàÔ∏è","Costa Rica Travel,Rio Celeste Costa Rica,Dream Vacations,Vacation Spots,Vacation Travel,Travel Pictures,Travel Photos,Fortuna Costa Rica,Costa Rica Pictures",image,https://i.pinimg.com/originals/30/93/cb/3093cb01d9de2d125fda8ba5e3e41946.jpg,travel,travel


In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import TimestampType

# Read geolocation data
df_geo = spark.read.json(f"{base_path}/1226d593b7e7.geo/partition=0/")
display(df_geo)

# 1. Create a new column `coordinates` containing an array of [latitude, longitude]
df_geo_clean = df_geo.withColumn(
    "coordinates", F.array("latitude", "longitude")
)

# 2. Drop the `latitude` and `longitude` columns
df_geo_clean = df_geo_clean.drop("latitude", "longitude")

# 3. Convert the `timestamp` column from string to timestamp type
df_geo_clean = df_geo_clean.withColumn(
    "timestamp", F.col("timestamp").cast(TimestampType())
)

# 4. Reorder the DataFrame columns as specified
df_geo_clean = df_geo_clean.select(
    "ind",
    "country",
    "coordinates",
    "timestamp"
)

# Show the cleaned DataFrame
display(df_geo_clean)


country,ind,latitude,longitude,timestamp
Antarctica (the territory South of 60 deg S),2418,-88.4642,-171.061,2022-05-27T11:30:59
Antarctica (the territory South of 60 deg S),2418,-88.4642,-171.061,2022-05-27T11:30:59
Antarctica (the territory South of 60 deg S),5162,-71.6607,-149.206,2019-09-27T19:06:43
Antarctica (the territory South of 60 deg S),1335,-77.9931,-175.682,2022-03-19T17:29:42
Antarctica (the territory South of 60 deg S),9185,-10.3764,-22.9809,2019-10-06T18:12:55
Antarctica (the territory South of 60 deg S),9335,-88.4642,-171.061,2020-11-14T23:42:22
Saint Vincent and the Grenadines,2301,13.4683,51.7244,2020-11-14T00:25:28
Holy See (Vatican City State),7180,-22.7118,-167.739,2018-04-22T21:33:50
Svalbard & Jan Mayen Islands,10204,-14.3257,26.9087,2020-04-06T02:56:29
French Southern Territories,6014,-26.6026,155.206,2019-04-30T12:33:13


ind,country,coordinates,timestamp
2418,Antarctica (the territory South of 60 deg S),"List(-88.4642, -171.061)",2022-05-27T11:30:59.000+0000
2418,Antarctica (the territory South of 60 deg S),"List(-88.4642, -171.061)",2022-05-27T11:30:59.000+0000
5162,Antarctica (the territory South of 60 deg S),"List(-71.6607, -149.206)",2019-09-27T19:06:43.000+0000
1335,Antarctica (the territory South of 60 deg S),"List(-77.9931, -175.682)",2022-03-19T17:29:42.000+0000
9185,Antarctica (the territory South of 60 deg S),"List(-10.3764, -22.9809)",2019-10-06T18:12:55.000+0000
9335,Antarctica (the territory South of 60 deg S),"List(-88.4642, -171.061)",2020-11-14T23:42:22.000+0000
2301,Saint Vincent and the Grenadines,"List(13.4683, 51.7244)",2020-11-14T00:25:28.000+0000
7180,Holy See (Vatican City State),"List(-22.7118, -167.739)",2018-04-22T21:33:50.000+0000
10204,Svalbard & Jan Mayen Islands,"List(-14.3257, 26.9087)",2020-04-06T02:56:29.000+0000
6014,French Southern Territories,"List(-26.6026, 155.206)",2019-04-30T12:33:13.000+0000


In [0]:
from pyspark.sql import functions as F
from pyspark.sql.types import TimestampType

# Read user data
df_user = spark.read.json(f"{base_path}/1226d593b7e7.user/partition=0/")
display(df_user)

# 1. Create a new column `user_name` by concatenating `first_name` and `last_name`
df_user_clean = df_user.withColumn(
    "user_name", F.concat(F.col("first_name"), F.lit(" "), F.col("last_name"))
)

# 2. Drop the `first_name` and `last_name` columns
df_user_clean = df_user_clean.drop("first_name", "last_name")

# 3. Convert the `date_joined` column from string to timestamp type
df_user_clean = df_user_clean.withColumn(
    "date_joined", F.col("date_joined").cast(TimestampType())
)

# 4. Reorder the DataFrame columns as specified
df_user_clean = df_user_clean.select(
    "ind",
    "user_name",
    "age",
    "date_joined"
)

# Show the cleaned DataFrame
display(df_user_clean)


age,date_joined,first_name,ind,last_name
27,2016-03-08T13:38:37,Christopher,2015,Bradshaw
27,2016-03-08T13:38:37,Christopher,2015,Bradshaw
59,2017-05-12T21:22:17,Alexander,10673,Cervantes
48,2016-02-27T16:57:44,Christopher,1857,Hamilton
45,2016-09-15T06:02:53,Christopher,10020,Hawkins
39,2016-06-29T20:43:59,Christina,6398,Davenport
20,2015-10-23T04:13:23,Alexandria,3599,Alvarado
20,2015-10-23T04:13:23,Alexandria,4256,Alvarado
44,2016-12-18T16:05:39,Michelle,1901,Richardson
20,2015-10-23T04:13:23,Alexandria,3831,Alvarado


ind,user_name,age,date_joined
2015,Christopher Bradshaw,27,2016-03-08T13:38:37.000+0000
2015,Christopher Bradshaw,27,2016-03-08T13:38:37.000+0000
10673,Alexander Cervantes,59,2017-05-12T21:22:17.000+0000
1857,Christopher Hamilton,48,2016-02-27T16:57:44.000+0000
10020,Christopher Hawkins,45,2016-09-15T06:02:53.000+0000
6398,Christina Davenport,39,2016-06-29T20:43:59.000+0000
3599,Alexandria Alvarado,20,2015-10-23T04:13:23.000+0000
4256,Alexandria Alvarado,20,2015-10-23T04:13:23.000+0000
1901,Michelle Richardson,44,2016-12-18T16:05:39.000+0000
3831,Alexandria Alvarado,20,2015-10-23T04:13:23.000+0000


In [0]:
from pyspark.sql import functions as F

# Join df_pin_clean and df_geo_clean on 'ind'
df_joined = df_pin_clean.join(df_geo_clean, on='ind', how='inner')

# Group by country and category, and count the number of posts in each category per country
df_category_popularity = df_joined.groupBy('country', 'category') \
    .agg(F.count('category').alias('category_count')) \
    .orderBy(F.desc('category_count'))

# Display the result
display(df_category_popularity)


country,category,category_count
Armenia,diy-and-crafts,17
Albania,mens-fashion,13
Algeria,quotes,13
Azerbaijan,event-planning,10
Colombia,finance,10
Aruba,tattoos,9
Bulgaria,finance,9
Maldives,beauty,9
Cote d'Ivoire,education,9
Cote d'Ivoire,diy-and-crafts,9


In [0]:
from pyspark.sql import Window
from pyspark.sql import functions as F

# Combine the pin and geo dataframes
popular_category_year_df = df_pin_clean.join(df_geo_clean, df_geo_clean["ind"] == df_pin_clean["ind"], how="inner")

# Define the date range from 2018 to 2022
dates = ("2018-01-01", "2022-12-31")

# Filter for posts between 2018 and 2022, selecting the year from the timestamp and category
popular_category_year_df = popular_category_year_df.filter(
    popular_category_year_df.timestamp.between(*dates)
).select(
    F.year("timestamp").alias("post_year"),
    "category"
)

# Create a window specification
window = Window.partitionBy("post_year", "category")

# Create category count column
popular_category_year_df = popular_category_year_df.withColumn("category_count", F.count("category").over(window))

# Add row number to each unique category for each year
popular_category_year_df = popular_category_year_df.withColumn("row", F.row_number().over(window.orderBy("category_count")))

# Filter to keep only rows with row number 1 (most popular category) and order results
popular_category_year_df = popular_category_year_df.filter(popular_category_year_df.row == 1) \
    .orderBy(["post_year", "category_count", "category"], ascending=[True, False, True]) \
    .drop("row")

# Display the result
display(popular_category_year_df)




post_year,category,category_count
2018,beauty,18
2018,diy-and-crafts,18
2018,travel,18
2018,art,17
2018,quotes,15
2018,christmas,13
2018,mens-fashion,11
2018,education,10
2018,vehicles,10
2018,tattoos,9


In [0]:
from pyspark.sql import functions as F

# Combine pin and geo dataframes
user_followers_country_df = df_pin_clean.join(df_geo_clean, df_geo_clean["ind"] == df_pin_clean["ind"], how="inner")

# Select columns to show, drop null value rows and duplicate rows, and change order
user_followers_country_df = user_followers_country_df.select("country", "poster_name", "follower_count") \
    .na.drop() \
    .dropDuplicates(["country", "poster_name"]) \
    .orderBy(["country", "follower_count"], ascending=[True, False])

# Group by country with the most followers for each country in descending order
country_most_followers_df = user_followers_country_df.groupBy("country") \
    .agg(F.max("follower_count").alias("follower_count")) \
    .orderBy("follower_count", ascending=False)

# Display the DataFrames
display(user_followers_country_df)
display(country_most_followers_df)


country,poster_name,follower_count
Afghanistan,TheUnstitchd,723
Afghanistan,Fiverr,565
Afghanistan,AllPosters,72
Afghanistan,craftberry bush,47
Afghanistan,Miss PlanIt,30
Afghanistan,"It's me, JD | DIY, Crafts, Home & Organization",27
Afghanistan,"Offroadium.com - Offroad, Lifted Trucks & 4x4",25
Afghanistan,Ethnic Earring,14
Afghanistan,9GAG,3
Afghanistan,Walmart,2


country,follower_count
Algeria,942
Sudan,940
Bulgaria,912
Lesotho,908
Congo,908
Armenia,892
Aruba,874
Central African Republic,868
Holy See (Vatican City State),831
Argentina,800


In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Combine pin and user dataframes
popular_category_age_df = df_pin_clean.join(df_user_clean, df_user_clean["ind"] == df_pin_clean["ind"], how="inner")

# Create age group column with conditionals for each age range
popular_category_age_df = popular_category_age_df.withColumn(
    "age_group",
    F.when(popular_category_age_df.age < 18, "Under 18")
     .when(popular_category_age_df.age <= 24, "18-24")
     .when(popular_category_age_df.age <= 35, "25-35")
     .when(popular_category_age_df.age <= 50, "36-50")
     .otherwise("50+")
)

# Create window
window = Window.partitionBy("age_group", "category")

# Create category count column and select columns to show
popular_category_age_df = popular_category_age_df.withColumn("category_count", F.count("category").over(window)) \
    .select("age_group", "category", "category_count")

# Add column assigning row numbers to each unique category for each age group
popular_category_age_df = popular_category_age_df.withColumn("row", F.row_number().over(window.orderBy("category_count")))

# Filter rows so only rows with row number 1 remain and reorder columns, then drop row column
popular_category_age_df = popular_category_age_df.filter(popular_category_age_df.row == 1) \
    .orderBy(["age_group", "category_count"], ascending=[True, False]) \
    .drop("row")

# Display the result
display(popular_category_age_df)


age_group,category,category_count
18-24,mens-fashion,43
18-24,tattoos,40
18-24,christmas,37
18-24,art,31
18-24,diy-and-crafts,29
18-24,quotes,27
18-24,travel,26
18-24,beauty,18
18-24,education,16
18-24,finance,15


In [0]:
from pyspark.sql import functions as F

# Combine pin and user dataframes
median_follower_age_df = df_pin_clean.join(df_user_clean, df_user_clean["ind"] == df_pin_clean["ind"], how="inner")

# Create age group column with conditionals for each age range
median_follower_age_df = median_follower_age_df.withColumn(
    "age_group",
    F.when(median_follower_age_df.age < 18, "Under 18")
     .when(median_follower_age_df.age <= 24, "18-24")
     .when(median_follower_age_df.age <= 35, "25-35")
     .when(median_follower_age_df.age <= 50, "36-50")
     .otherwise("50+")
)

# Create a median follower count column after grouping by age group
median_follower_age_df = median_follower_age_df.groupBy("age_group") \
    .agg(F.expr("percentile_approx(follower_count, 0.5)").alias("median_follower_count")) \
    .orderBy("age_group")

# Display the result
display(median_follower_age_df)


age_group,median_follower_count
18-24,46
25-35,43
36-50,13
50+,20


In [0]:
from pyspark.sql import functions as F
from pyspark.sql import Window

# Step 1: Display a sample of df_user_clean to check the data
display(df_user)

# Step 2: Range of dates from 2015 to 2020 
dates = ("2015-01-01", "2020-12-31")

# Step 3: Convert date_joined to TimestampType and then to DateType
df_user = df_user.withColumn("date_joined", F.to_timestamp("date_joined"))

# Step 4: Filter out date_joined column for dates between 2015 and 2020
users_joining_df = df_user.filter(df_user.date_joined.between(*dates)) \
    .select(F.year("date_joined").alias("post_year"))

# Step 5: Count the number of users joined for each year
users_joining_count_df = users_joining_df.groupBy("post_year").agg(F.count("*").alias("number_users_joined"))

# Step 6: Display the result
display(users_joining_count_df)





age,date_joined,first_name,ind,last_name
27,2016-03-08,Christopher,2015,Bradshaw
27,2016-03-08,Christopher,2015,Bradshaw
59,2017-05-12,Alexander,10673,Cervantes
48,2016-02-27,Christopher,1857,Hamilton
45,2016-09-15,Christopher,10020,Hawkins
39,2016-06-29,Christina,6398,Davenport
20,2015-10-23,Alexandria,3599,Alvarado
20,2015-10-23,Alexandria,4256,Alvarado
44,2016-12-18,Michelle,1901,Richardson
20,2015-10-23,Alexandria,3831,Alvarado


post_year,number_users_joined
2015,191
2016,242
2017,76


In [0]:
## MEDIAN FOLLOWER COUNT FOR DIFFERENT AGE GROUPS

# Combine pin and user dataframes
median_follower_age_df = df_pin_clean.join(df_user_clean, df_user_clean["ind"] == df_pin_clean["ind"], how="inner")

# Creates age group column with conditionals for each age range
median_follower_age_df = median_follower_age_df.withColumn("age_group", 
    when(median_follower_age_df.age < 18, median_follower_age_df.age)
    .when(median_follower_age_df.age <= 24, "18-24")
    .when(median_follower_age_df.age <= 35, "25-35")
    .when(median_follower_age_df.age <= 50, "36-50")
    .otherwise("50+")
)

# Creates a median follower count column after grouping by age group
median_follower_age_df = median_follower_age_df.groupBy("age_group") \
    .agg(expr("percentile_approx(follower_count, 0.5)").alias("median_follower_count")) \
    .orderBy("age_group")

display(median_follower_age_df)




age_group,median_follower_count
18-24,46
25-35,43
36-50,13
50+,20


In [0]:
## NUMBER OF USERS JOINING EACH YEAR (2015 - 2020)

# Range of dates from 2015 to 2020 
dates = ("2015-01-01", "2020-12-31")

# Filter out date joined column for dates between 2015 and 2020, renaming it as post year with just the year value and select columns to show
users_joining_df = df_user.filter(df_user.date_joined.between(*dates)) \
    .select(year("date_joined").alias("post_year"))

# Create number of users joined for each year
users_joining_count_df = users_joining_df.groupBy("post_year") \
    .agg(F.count("*").alias("number_users_joined"))

# Display the result
display(users_joining_count_df)


post_year,number_users_joined
2015,191
2016,242
2017,76


In [0]:
## MEDIAN FOLLOWER COUNT OF USERS BASED ON JOINING YEAR (2015 - 2020)

# Combine pin and user dataframes
median_follower_year_df = df_pin_clean.join(df_user, df_user["ind"] == df_pin_clean["ind"], how="inner")

# Range of dates from 2015 to 2020 
dates = ("2015-01-01", "2020-12-31")

# Filter out date joined column for dates between 2015 and 2020, renaming it as post year with just the year value and select columns to show
median_follower_year_df = median_follower_year_df.filter(median_follower_year_df.date_joined.between(*dates)) \
    .select(year("date_joined").alias("post_year"), "follower_count")

# Creates a median follower count column after grouping by post year
median_follower_year_df = median_follower_year_df.groupBy("post_year") \
    .agg(expr("percentile_approx(follower_count, 0.5)").alias("median_follower_count")) \
    .orderBy("post_year")

display(median_follower_year_df)


post_year,median_follower_count
2015,45
2016,41
2017,15


In [0]:
## MEDIAN FOLLOWER COUNT OF USERS BASED ON AGE GROUP AND JOINING YEAR (2015 - 2020)

# Combine pin and user dataframes
median_follower_age_year_df = df_pin_clean.join(df_user, df_user["ind"] == df_pin_clean["ind"], how="inner")

# Creates age group column with conditionals for each age range
median_follower_age_year_df = median_follower_age_year_df.withColumn("age_group", 
    when(median_follower_age_year_df.age < 18, median_follower_age_year_df.age)
    .when(median_follower_age_year_df.age <= 24, "18-24")
    .when(median_follower_age_year_df.age <= 35, "25-35")
    .when(median_follower_age_year_df.age <= 50, "36-50")
    .otherwise("50+")
)

# Range of dates from 2015 to 2020 
dates = ("2015-01-01", "2020-12-31")

# Filter out date joined column for dates between 2015 and 2020, renaming it as post year with just the year value and select columns to show
median_follower_age_year_df = median_follower_age_year_df.filter(median_follower_age_year_df.date_joined.between(*dates)) \
    .select("age_group", year("date_joined").alias("post_year"), "follower_count")

# Creates a median follower count column after grouping by both age group and post year
median_follower_age_year_df = median_follower_age_year_df.groupBy("age_group", "post_year") \
    .agg(expr("percentile_approx(follower_count, 0.5)").alias("median_follower_count")) \
    .orderBy("age_group", "post_year")

display(median_follower_age_year_df)

age_group,post_year,median_follower_count
18-24,2015,73
18-24,2016,46
18-24,2017,40
25-35,2015,42
25-35,2016,43
25-35,2017,46
36-50,2015,25
36-50,2016,13
36-50,2017,9
50+,2015,196
