In [0]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import *


In [0]:
# clean df_pin
def clean_df_pin(df_pin: DataFrame) -> DataFrame:
    """Clean the raw pin data and return it with a fixed column layout.

    The following steps are applied:

    * known placeholder strings (see ``entries_to_replace``) become null;
    * ``follower_count`` is converted from text with an optional "k" or "M"
      suffix to an integer;
    * ``index`` is cast to int and exposed as ``ind``;
    * the "Local save in " prefix is removed from ``save_location``.

    Args:
        df_pin: Raw pin data. It must provide the columns ``index``,
            ``unique_id``, ``title``, ``description``, ``follower_count``,
            ``poster_name``, ``tag_list``, ``is_image_or_video``,
            ``image_src``, ``save_location`` and ``category``.

    Returns:
        A new DataFrame with the columns ``ind``, ``unique_id``, ``title``,
        ``description``, ``follower_count``, ``poster_name``, ``tag_list``,
        ``is_image_or_video``, ``image_src``, ``save_location`` and
        ``category``, in that order. Any other input column is not returned.
    """
    # Placeholder values that stand in for missing data in the raw input.
    entries_to_replace = ["", "No description available Story format", "No description available", "Image src error.", "User Info Error", "N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e", "No Title Data Available"]

    def cleaned(name):
        # Column `name` with every placeholder entry replaced by null.
        return when(col(name).isin(entries_to_replace), None).otherwise(col(name))

    def scaled(value, suffix, factor):
        # Scale via a decimal so a fractional count such as "1.5k" becomes
        # 1500; a direct int cast of "1.5" cannot represent the fraction.
        number = regexp_replace(value, suffix, "").cast("decimal(18,3)")
        return (number * factor).cast("int")

    followers = cleaned("follower_count")
    follower_count = (
        when(followers.contains("k"), scaled(followers, "k", 1000))
        .when(followers.contains("M"), scaled(followers, "M", 1000 * 1000))
        .otherwise(followers.cast("int"))
    )

    # Strip the fixed prefix so only the path itself is kept.
    save_location = regexp_replace(cleaned("save_location"), "Local save in ", "")

    # Build every output column in a single projection instead of a long
    # chain of withColumn calls; the order here is the output column order.
    return df_pin.select(
        cleaned("index").cast("int").alias("ind"),
        cleaned("unique_id").alias("unique_id"),
        cleaned("title").alias("title"),
        cleaned("description").alias("description"),
        follower_count.alias("follower_count"),
        cleaned("poster_name").alias("poster_name"),
        cleaned("tag_list").alias("tag_list"),
        cleaned("is_image_or_video").alias("is_image_or_video"),
        cleaned("image_src").alias("image_src"),
        save_location.alias("save_location"),
        cleaned("category").alias("category"),
    )


In [0]:
# clean df_geo
def clean_df_geo(df_geo: DataFrame) -> DataFrame:
    """Clean the raw geolocation data.

    ``latitude`` and ``longitude`` are merged into a single array column
    named ``coordinates``, ``timestamp`` is converted with ``to_timestamp``
    and ``ind`` is cast to int.

    Args:
        df_geo: Raw geolocation data providing the columns ``ind``,
            ``country``, ``latitude``, ``longitude`` and ``timestamp``.

    Returns:
        A new DataFrame with the columns ``ind``, ``country``,
        ``coordinates`` and ``timestamp``, in that order.
    """
    # [latitude, longitude] pair; the two source columns are not returned.
    coordinates = array(col("latitude"), col("longitude"))

    # A single projection yields the converted columns in their final order.
    return df_geo.select(
        col("ind").cast("int").alias("ind"),
        col("country"),
        coordinates.alias("coordinates"),
        to_timestamp(col("timestamp")).alias("timestamp"),
    )


In [0]:
# clean df_user
def clean_df_user(df_user: DataFrame) -> DataFrame:
    """Clean the raw user data.

    ``first_name`` and ``last_name`` are joined with a single space into
    ``username``, ``date_joined`` is converted with ``to_timestamp``, and
    ``ind`` and ``age`` are cast to int. Rows whose ``username`` is null are
    removed.

    Args:
        df_user: Raw user data providing the columns ``ind``,
            ``first_name``, ``last_name``, ``age`` and ``date_joined``.

    Returns:
        A new DataFrame with the columns ``ind``, ``username``, ``age`` and
        ``date_joined``, in that order. Any other input column (for example
        ``index``, ``name`` or ``role``) is not returned.
    """
    full_name = concat("first_name", lit(" "), "last_name")

    # Project straight to the four output columns; everything else,
    # including the name parts, is left out of the result.
    users = df_user.select(
        col("ind").cast("int").alias("ind"),
        full_name.alias("username"),
        col("age").cast("int").alias("age"),
        to_timestamp(col("date_joined")).alias("date_joined"),
    )

    # Keep only the rows that ended up with a username.
    return users.filter(col("username").isNotNull())
