In [0]:
# Create single DataFrame
def create_df(topic: str):
    """
    Load the JSON files stored for one topic into a single Spark dataframe.

    Only files matching ``/mnt/jc_bucket/topics/<topic>/partition=0/*.json``
    are read. The function relies on a ``spark`` session object being
    available in the enclosing scope; it is not created here.

    Parameters
    ----------
    topic : str
        Name of the Kafka topic, which is also the name of the topic's
        directory under ``/mnt/jc_bucket/topics``.

    Returns
    -------
    pyspark.sql.dataframe.DataFrame
        The dataframe produced by reading the matching JSON files.
    """
    # Glob pattern covering every JSON file in partition 0 of the topic.
    json_glob = "".join(("/mnt/jc_bucket/topics/", topic, "/partition=0/*.json"))
    # No explicit schema is supplied; the reader is asked to infer it.
    json_reader = spark.read.format("json").option("inferSchema", "true")
    return json_reader.load(json_glob)