#### Cleaning df_pin dataframe function

In [0]:
def clean_df_pin(df_pin):
    """Clean the raw pin DataFrame and return it with a fixed column layout.

    Steps, in order:
      1. Replace known placeholder strings with null in their columns.
      2. Convert ``follower_count`` from abbreviated text ("5k", "1.5M")
         to an integer.
      3. Strip the "Local save in " prefix from ``save_location``.
      4. Cast ``index`` to int and rename it to ``ind``.
      5. Select the output columns in a fixed order.

    Args:
        df_pin: Spark DataFrame that contains at least the columns
            ``index``, ``unique_id``, ``title``, ``description``,
            ``follower_count``, ``poster_name``, ``tag_list``,
            ``is_image_or_video``, ``image_src``, ``save_location`` and
            ``category``.

    Returns:
        A new DataFrame with the columns ``ind``, ``unique_id``, ``title``,
        ``description``, ``follower_count``, ``poster_name``, ``tag_list``,
        ``is_image_or_video``, ``image_src``, ``save_location``,
        ``category`` (in that order). Any other input column, such as
        ``downloaded``, is not part of the result.
    """
    # Imported locally so this function does not depend on which individual
    # pyspark.sql.functions names the surrounding notebook has imported.
    from pyspark.sql import functions as F

    # Placeholder strings that stand for "no data", keyed by the column in
    # which they occur.
    placeholders = {
        "description": "No description available Story format",
        "follower_count": "User Info Error",
        "image_src": "Image src error.",
        "poster_name": "User Info Error",
        "tag_list": "N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e",
        "title": "No Title Data Available",
    }
    for column_name, placeholder in placeholders.items():
        # `subset` restricts the replacement to the one column where this
        # placeholder is known to appear.
        df_pin = df_pin.replace({placeholder: None}, subset=[column_name])

    # Convert follower_count to a number. Splicing zeros in place of the
    # suffix ("k" -> "000") is only right for whole numbers: "1.5k" would
    # become "1.5000" and cast to 1. Parse the numeric part and multiply by
    # the suffix's magnitude instead.
    raw_count = F.trim(F.col("follower_count").cast("string"))
    multiplier = (
        F.when(raw_count.endswith("k"), 1_000)
        .when(raw_count.endswith("M"), 1_000_000)
        .otherwise(1)
    )
    magnitude = F.regexp_replace(raw_count, "[kM]$", "").cast("double")
    # round() before the int cast so floating-point error in the product
    # cannot truncate the result one below the intended value.
    df_pin = df_pin.withColumn(
        "follower_count", F.round(magnitude * multiplier).cast("int")
    )

    df_pin = (
        df_pin
        # Keep only the path in save_location.
        .withColumn(
            "save_location",
            F.regexp_replace("save_location", "Local save in ", ""),
        )
        # index holds numeric data; store it as an int under the name `ind`.
        .withColumn("index", F.col("index").cast("int"))
        .withColumnRenamed("index", "ind")
    )

    # NOTE: `downloaded` is intentionally not cast here; it is not one of the
    # selected output columns, so converting it would be wasted work.
    return df_pin.select(
        [
            "ind",
            "unique_id",
            "title",
            "description",
            "follower_count",
            "poster_name",
            "tag_list",
            "is_image_or_video",
            "image_src",
            "save_location",
            "category",
        ]
    )

#### Cleaning df_geo dataframe function

In [0]:
def clean_df_geo(df_geo):
    """Return the geolocation DataFrame in its cleaned, four-column form.

    The result is built in a single projection:
      * ``ind``         - the input ``ind`` cast to int.
      * ``country``     - passed through unchanged.
      * ``coordinates`` - an array column built from ``latitude`` and
                          ``longitude`` (in that order); the two source
                          columns are not kept.
      * ``timestamp``   - the input ``timestamp`` converted with
                          ``to_timestamp`` (no explicit format given).
    """
    ind_as_int = col("ind").cast("int").alias("ind")
    lat_long_pair = array(col("latitude"), col("longitude")).alias("coordinates")
    parsed_timestamp = to_timestamp("timestamp").alias("timestamp")

    # Selecting only these four expressions both fixes the column order and
    # discards latitude/longitude along with any other input columns.
    return df_geo.select(ind_as_int, col("country"), lat_long_pair, parsed_timestamp)

#### Cleaning df_user dataframe function

In [0]:
def clean_df_user(df_user):
    """Return the user DataFrame in its cleaned, four-column form.

    Output columns, in order:
      * ``ind``         - the input ``ind`` cast to int.
      * ``user_name``   - ``first_name`` and ``last_name`` joined with a
                          single space; the two source columns are not kept.
      * ``age``         - the input ``age`` cast to int.
      * ``date_joined`` - the input string parsed with the pattern
                          "yyyy-MM-dd HH:mm:ss" and stored as a timestamp.
    """
    full_name = concat(col("first_name"), lit(" "), col("last_name"))
    # unix_timestamp parses the string using the given pattern; the cast
    # then turns the parsed value into a timestamp column.
    joined_at = unix_timestamp("date_joined", "yyyy-MM-dd HH:mm:ss").cast("timestamp")

    # A single projection produces the final column order and leaves out
    # first_name/last_name and any other input columns.
    return df_user.select(
        col("ind").cast("int").alias("ind"),
        full_name.alias("user_name"),
        col("age").cast("int").alias("age"),
        joined_at.alias("date_joined"),
    )