In [0]:
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import array
from pyspark.sql.functions import concat_ws
import pyspark

In [0]:
# Clean pin data
def clean_pin_data(pin_df: pyspark.sql.dataframe.DataFrame):
    '''
    Cleans the DataFrame that contains information about Pinterest posts.

    The cleaning steps are: drop exact duplicate rows, turn known placeholder
    strings into nulls, convert 'follower_count' to an integer (expanding the
    'k' / 'M' suffixes), cast 'index' to an integer and rename it to 'ind',
    strip the "Local save in " prefix from 'save_location', and finally keep
    a fixed, ordered subset of the columns.

    Parameters
    ----------
    pin_df : pyspark.sql.dataframe.DataFrame
        The DataFrame to be cleaned.
        The DataFrame must contain the columns: category, description, downloaded, follower_count, image_src, index, is_image_or_video, poster_name, save_location, tag_list, title, unique_id.
    
    Returns
    -------
    df : pyspark.sql.dataframe.DataFrame
        The cleaned Pinterest post data with the columns, in this order:
        ind, unique_id, title, description, follower_count, poster_name,
        tag_list, is_image_or_video, image_src, save_location, category.
        The 'downloaded' column is not part of the result.
    '''
    # Imported locally so that this function does not depend on which
    # individual names the notebook's import cell happens to pull in.
    from pyspark.sql import functions as F

    df = pin_df.dropDuplicates().alias('df')

    # replace several placeholder strings with null in the relevant columns
    df = df  \
        .replace('User Info Error', None, ['follower_count', 'poster_name'])  \
        .replace('N,o, ,T,a,g,s, ,A,v,a,i,l,a,b,l,e', None, 'tag_list') \
        .replace('No description available Story format', None, 'description') \
        .replace('Image src error.', None, 'image_src') \
        .replace('No Title Data Available', None, 'title')

    # Convert 'follower_count' to an integer.
    # A plain textual substitution of 'k' -> '000' and 'M' -> '000000' turns a
    # value such as '1.5k' into '1.5000', which then casts to 1 instead of 1500.
    # Instead, split the value into its numeric part and its optional suffix
    # and multiply. The numeric part is held as a decimal so the product is
    # exact before it is truncated to an integer.
    count_pattern = r'^([+-]?\d+(?:\.\d+)?)([kM]?)$'
    raw_count = F.trim(F.col('follower_count'))
    magnitude = F.regexp_extract(raw_count, count_pattern, 1).cast('decimal(20,6)')
    suffix = F.regexp_extract(raw_count, count_pattern, 2)
    multiplier = F.when(suffix == 'k', F.lit(1000)) \
        .when(suffix == 'M', F.lit(1000000)) \
        .otherwise(F.lit(1))
    # Values that do not match the pattern (including nulls) become null; the
    # `when` has no `otherwise` on purpose.
    df = df.withColumn(
        'follower_count',
        F.when(raw_count.rlike(count_pattern), (magnitude * multiplier).cast('int'))
    )

    # cast 'index' to int
    df = df.withColumn('index', F.col('index').cast('int'))

    # remove the "Local save in " bit in the save_location column so that only the path is given
    df = df.withColumn('save_location', F.regexp_replace('save_location', 'Local save in ', ''))

    # rename 'index' to 'ind'
    df = df.withColumnRenamed('index', 'ind')

    # reorder dataframe columns (this also drops every column not listed here)
    df = df.select('ind','unique_id','title','description','follower_count','poster_name','tag_list','is_image_or_video','image_src','save_location','category')

    return df


In [0]:
# Clean geo data
def clean_geo_data(geo_df: pyspark.sql.dataframe.DataFrame):
    '''
    Cleans the DataFrame that contains information about geolocation.

    Exact duplicate rows are dropped, 'latitude' and 'longitude' are cast to
    'float' and merged into a single array column 'coordinates', 'timestamp'
    is cast to the 'timestamp' type and 'ind' to 'int'.

    Parameters
    ----------
    geo_df : pyspark.sql.dataframe.DataFrame
        The DataFrame to be cleaned.
        The DataFrame must contain the columns: country, ind, latitude, longitude, timestamp.
    
    Returns
    -------
    df : pyspark.sql.dataframe.DataFrame
        The cleaned geolocation data with the columns, in this order:
        ind, country, coordinates, timestamp.
    '''
    deduped = geo_df.dropDuplicates().alias('df')

    # [latitude, longitude] as an array of floats; the two source columns are
    # not selected below, so they do not appear in the result
    coordinates = array(
        deduped['latitude'].cast('float'),
        deduped['longitude'].cast('float'),
    )

    # a single projection performs the casts, builds 'coordinates' and fixes
    # the column order
    return deduped.select(
        deduped['ind'].cast('int').alias('ind'),
        'country',
        coordinates.alias('coordinates'),
        deduped['timestamp'].cast('timestamp').alias('timestamp'),
    )

In [0]:
# Clean user data
def clean_user_data(user_df: pyspark.sql.dataframe.DataFrame):
    '''
    Cleans the DataFrame that contains information about users.

    Exact duplicate rows are dropped, 'first_name' and 'last_name' are joined
    with a single space into 'user_name', 'ind' and 'age' are cast to 'int'
    and 'date_joined' is cast to 'timestamp'.

    Parameters
    ----------
    user_df : pyspark.sql.dataframe.DataFrame
        The DataFrame to be cleaned.
        The DataFrame must contain the columns: age, date_joined, first_name, ind, last_name.
    
    Returns
    -------
    df : pyspark.sql.dataframe.DataFrame
        The cleaned users data with the columns, in this order:
        ind, user_name, age, date_joined.
    '''
    users = user_df.dropDuplicates().alias('df')

    # full name built from the two name columns; those columns themselves are
    # not selected below, so they do not appear in the result
    full_name = concat_ws(' ', users['first_name'], users['last_name'])

    # a single projection performs the casts, adds 'user_name' and fixes the
    # column order
    return users.select(
        users['ind'].cast('int').alias('ind'),
        full_name.alias('user_name'),
        users['age'].cast('int').alias('age'),
        users['date_joined'].cast('timestamp').alias('date_joined'),
    )

In [0]:
%run /Users/joelcosta94i@gmail.com/data_extraction

In [0]:
# Load cleaned data
# Load the data, then clean it, and return the cleaned data
def load_cleaned_data(topic_prefix: str = "0a6a638f5991"):
    '''
    Loads the pin, geo and user data, cleans each one, then returns them as a triple of dataframes.

    The raw data is obtained through `create_df`, which is not defined in this
    notebook (presumably it is provided by the `data_extraction` notebook that
    is run before this cell - confirm). It is called with the names
    "<topic_prefix>.pin", "<topic_prefix>.geo" and "<topic_prefix>.user".

    Parameters
    ----------
    topic_prefix : str, optional
        The prefix shared by the three dataset names passed to `create_df`.
        Defaults to "0a6a638f5991", the value this function previously had
        hard-coded, so existing calls without arguments behave as before.

    Returns
    -------
    df_pin : pyspark.sql.dataframe.DataFrame
        The dataframe containing the cleaned pin data.
    df_geo : pyspark.sql.dataframe.DataFrame
        The dataframe containing the cleaned geo data.
    df_user : pyspark.sql.dataframe.DataFrame
        The dataframe containing the cleaned user data.
    '''
    # load the three raw dataframes first, in the same order as before
    raw_pin = create_df(f"{topic_prefix}.pin")
    raw_geo = create_df(f"{topic_prefix}.geo")
    raw_user = create_df(f"{topic_prefix}.user")

    # then clean each one with its dedicated cleaning function
    df_pin = clean_pin_data(raw_pin)
    df_geo = clean_geo_data(raw_geo)
    df_user = clean_user_data(raw_user)
    return df_pin, df_geo, df_user