# Data Cleaning/ Transformations script

## This file will contain the methods that will be used to clean/ transform the data from the three tables for the Pinterest data pipeline project

In [0]:
# Creating a method that takes a dataframe, a column name and a new column name as input and returns the dataframe with that column renamed:
def change_col_name(df, col_name, col_renamed):
    """Return `df` with the column `col_name` renamed to `col_renamed`.

    Args:
        df: Dataframe exposing a `withColumnRenamed` method.
        col_name: Current name of the column.
        col_renamed: Name the column should have afterwards.

    Returns:
        Whatever `df.withColumnRenamed(col_name, col_renamed)` returns.
    """
    renamed_df = df.withColumnRenamed(col_name, col_renamed)
    return renamed_df

In [0]:
# Creating a method that takes a dataframe and a list of column names and returns a dataframe containing only those columns, in the order provided by the user:
def desired_cols(df, col_names):
    """Return `df` restricted to the columns listed in `col_names`.

    Args:
        df: Dataframe exposing a `select` method.
        col_names: List of column names to keep; passed to `select` as-is.

    Returns:
        Whatever `df.select(col_names)` returns.
    """
    selection = df.select(col_names)
    return selection

In [0]:
# Creating a method which will take in a dataframe and the column name to drop duplicates on that specific column:
def drop_duplicates(df, column):
    """Return `df` de-duplicated on a single column.

    Args:
        df: Dataframe exposing a `dropDuplicates` method.
        column: Name of the column used to detect duplicates.

    Returns:
        Whatever `df.dropDuplicates([column])` returns.
    """
    # dropDuplicates is handed a list, so wrap the single column name.
    subset = [column]
    return df.dropDuplicates(subset)

In [0]:
# Creating a method that takes a dataframe, a column name and a data type as input and returns the dataframe with the specified column cast to the specified data type:
def casting_col(df, column, data_type):
    """Return `df` with `column` replaced by itself cast to `data_type`.

    Args:
        df: Dataframe exposing a `withColumn` method.
        column: Name of the column to cast; the result keeps this name.
        data_type: Target type, passed straight to `cast` (callers in this
            file pass strings such as "integer" and "timestamp").

    Returns:
        Whatever `df.withColumn(column, <cast expression>)` returns.
    """
    cast_expression = col(column).cast(data_type)
    return df.withColumn(column, cast_expression)

In [0]:
# During EDA, the 'follower_count' column in Pinterest Posts data table has values such as '10k, 500, 100k, 2M' so we need to turn all of these into integers:
# Create a custom user defined function (UDF) to remove the "k" and "M" from the 'follower_count' column and return an integer equivalent:

# First, defining a custom UDF to convert 'follower_count' values to integers, expanding any "k" or "M" in a value into the number it stands for:
def convert_follower_count(value):
    """Convert an abbreviated follower count such as '10k' or '2M' to an int.

    Args:
        value: Follower count as text, e.g. '500', '10k', '1.5k' or '2M',
            or None for a missing value.

    Returns:
        The follower count as an int, or None when `value` is None.

    Raises:
        ValueError: If the numeric part of `value` cannot be parsed.
    """
    # A null cell reaches a Python UDF as None; pass it through instead of
    # failing on the `in` checks below.
    if value is None:
        return None

    if 'M' in value:
        digits, scale = value.strip('M'), 1000000
    elif 'k' in value:
        digits, scale = value.strip('k'), 1000
    else:
        return int(value)

    try:
        return int(digits) * scale
    except ValueError:
        # Fractional abbreviations such as '1.5k' are not valid int literals;
        # parse them as a float and round away the floating-point noise.
        return round(float(digits) * scale)

# Wrapping convert_follower_count with udf() so it can be applied to a dataframe
# column (transform_pin applies it to "follower_count").
# NOTE(review): no return type is passed to udf(); if this is
# pyspark.sql.functions.udf the result column presumably defaults to string -
# transform_pin casts the column to integer right after applying it. Confirm.
convert_follower_count_udf = udf(convert_follower_count)

In [0]:
# The "save_location" column should only contain the path of the save location, so the prefix needs to be removed from this column:
# Creating a custom UDF to remove the "Local save in " of each row in the column:

def remove_prefix_save_location(value):
    """Strip the 'Local save in ' text from a save-location string.

    Args:
        value: Save-location text such as 'Local save in /data/art', or None
            for a missing value.

    Returns:
        `value` with every occurrence of 'Local save in ' removed, or None
        when `value` is None.
    """
    # A null cell reaches a Python UDF as None; pass it through instead of
    # failing on the `.replace` call below.
    if value is None:
        return None
    return value.replace('Local save in ', '')
    
# Wrapping remove_prefix_save_location with udf() so it can be applied to a
# dataframe column (transform_pin applies it to "save_location").
# NOTE(review): no return type is passed to udf(); if this is
# pyspark.sql.functions.udf the result column presumably defaults to string. Confirm.
remove_prefix_save_location_udf = udf(remove_prefix_save_location)

In [0]:
# Creating a method which is now unique to the pinterest posts data
# This will take the input as the pinterest posts dataframe and return the fully transformed version:
def transform_pin(df_pin):
    """Clean the Pinterest posts dataframe and return the transformed result.

    Steps, in order:
      1. Rename "index" to "ind" and drop duplicate "ind" rows.
      2. Keep only the required columns, in a fixed order.
      3. Cast "ind" to integer.
      4. Filter out rows whose "follower_count" is "User Info Error", then
         convert "follower_count" to an integer column.
      5. Strip the "Local save in " text from "save_location".
      6. Replace placeholder strings in "image_src", "tag_list",
         "description" and "title" with None.

    Args:
        df_pin: The Pinterest posts dataframe.

    Returns:
        The transformed dataframe.
    """
    def null_out(frame, name, *placeholders):
        # Replace any of `placeholders` found in column `name` with None,
        # leaving every other value untouched.
        target = col(name)
        is_placeholder = target == placeholders[0]
        for extra in placeholders[1:]:
            is_placeholder = is_placeholder | (target == extra)
        return frame.withColumn(name, when(is_placeholder, None).otherwise(target))

    pin_columns = [
        "ind", "unique_id", "title", "description", "follower_count",
        "poster_name", "tag_list", "is_image_or_video", "image_src",
        "save_location", "category",
    ]
    # Placeholder strings that stand for "no data" in each column.
    placeholder_values = (
        ("image_src", ("Image src error.",)),
        ("tag_list", ("N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e",)),
        ("description", ("No description available", "No description available Story format")),
        ("title", ("", "Loading...")),
    )

    posts = change_col_name(df_pin, "index", "ind")
    posts = drop_duplicates(posts, "ind")
    posts = desired_cols(posts, pin_columns)
    posts = casting_col(posts, "ind", "integer")

    # Rows with the error marker are filtered out before the UDF runs on the column.
    follower_count = col("follower_count")
    posts = posts.filter(follower_count != "User Info Error")
    posts = posts.withColumn("follower_count", convert_follower_count_udf(follower_count))
    posts = casting_col(posts, "follower_count", "integer")

    save_location = col("save_location")
    posts = posts.withColumn("save_location", remove_prefix_save_location_udf(save_location))

    for name, placeholders in placeholder_values:
        posts = null_out(posts, name, *placeholders)
    return posts


In [0]:
# Creating a method which is now unique to the geolocation data
# This will take the input as the geolocation dataframe and return the fully transformed version:
def transform_geo(df_geo):
    """Clean the geolocation dataframe and return the transformed result.

    Drops duplicate "ind" rows, builds a "coordinates" array column from
    "latitude" and "longitude", keeps only "ind", "country", "coordinates"
    and "timestamp" (in that order), then casts "timestamp" to timestamp and
    "ind" to integer.

    Args:
        df_geo: The geolocation dataframe.

    Returns:
        The transformed dataframe.
    """
    geo = drop_duplicates(df_geo, "ind")

    # Combine latitude and longitude into one array column named "coordinates".
    lat_long = array(col("latitude"), col("longitude"))
    geo = geo.withColumn("coordinates", lat_long)

    geo = desired_cols(geo, ["ind", "country", "coordinates", "timestamp"])
    for name, data_type in (("timestamp", "timestamp"), ("ind", "integer")):
        geo = casting_col(geo, name, data_type)
    return geo

In [0]:
# Creating a method which is now unique to the user data
# This will take the input as the user dataframe and return the fully transformed version:
def transform_user(df_user):
    """Clean the user dataframe and return the transformed result.

    Drops duplicate "ind" rows, builds "user_name" by joining "first_name"
    and "last_name" with a single space, keeps only "ind", "user_name",
    "age" and "date_joined" (in that order), then casts "ind" and "age" to
    integer.

    NOTE(review): "date_joined" is selected but not cast here, whereas
    transform_geo casts its "timestamp" column - confirm this is intended.

    Args:
        df_user: The user dataframe.

    Returns:
        The transformed dataframe.
    """
    users = drop_duplicates(df_user, "ind")

    full_name = concat(col("first_name"), lit(" "), col("last_name"))
    users = users.withColumn("user_name", full_name)

    users = desired_cols(users, ["ind", "user_name", "age", "date_joined"])
    for name in ("ind", "age"):
        users = casting_col(users, name, "integer")
    return users