In [1]:
import pandas as pd
import json

# Create a function that builds a dataframe from the JSON response
- Grab the channel ID from JSON data
- Create a dictionary that stores all of the video data from the multiple JSON files and also stores the most recent channel statistics (essentially, we take the JSON response data, which is spread across multiple files, and store all of its information together in one dictionary instead of in multiple files and dictionaries)
- Store the video data (video ID is key and value is a dictionary with viewCount, commentCount etc.) in a variable.
- Store the video IDs in a list
- Store the video data values (dictionary with all their data) in a list
- Create a list of video data dictionaries
- Add the video id to the video data dictionaries
- Use `pd.json_normalize` to create a dataframe from the video data dictionaries

In [2]:
# Combined JSON response data (all of the individual response files merged into one file)
file_path = "../data/processed/all_json_data.json"

# Decode as UTF-8 explicitly: the video titles contain non-ASCII characters (e.g. emoji),
# and the platform default encoding used by open() is not UTF-8 everywhere.
with open(file_path, 'r', encoding='utf-8') as f:
    json_response_data = json.load(f)

In [3]:
def _merge_video_data(json_response_data, channel_id):
    """Merge the per-file ``video_data`` mappings into one ``{video_id: video_dict}`` mapping."""
    merged = {}
    for file_data in json_response_data:
        # entries from later files overwrite earlier ones that share a video ID
        merged.update(file_data[channel_id]["video_data"])
    return merged


def _latest_channel_statistics(json_response_data, channel_id):
    """Return a copy of the channel statistics with the highest ``videoCount`` (stored as an int)."""
    # start from a video count of 0 so that any file with at least one video replaces it
    latest = {"videoCount": 0}
    for file_data in json_response_data:
        # work on a shallow copy so the int conversion does not rewrite the caller's data
        stats = dict(file_data[channel_id]["channel_statistics"])
        stats["videoCount"] = int(stats["videoCount"])

        # NOTE: videoCount might need to be changed to viewCount as the logic could be
        # affected if videos are deleted from the channel
        if stats["videoCount"] > latest["videoCount"]:
            latest = stats
    return latest


def json_response_to_dataframe(json_response_data):
    """Takes JSON response data, performs some processing operations and outputs a dataframe.


    Processing Operations
    -----------------------
    - Grabs the channel ID from the first entry of the JSON response (its first key).
    - Merges the "video_data" of every entry into a single dictionary keyed by video ID.
    - Selects the most up to date channel statistics (the entry with the highest videoCount).
    - Adds the video ID to a copy of each video dictionary under the key 'videoID' and
        flattens the result into a dataframe with pd.json_normalize().


    Parameters
    ----------
    json_response_data (list of dict):
        the loaded contents of the combined JSON file, i.e. all of the individual JSON
        responses in one list. Each entry is expected to look like
        {channel_id: {"channel_statistics": {...}, "video_data": {video_id: {...}}}}.
        The input is not modified.

    Returns
    -------
    df_processed (dataframe):
        pandas dataframe of the JSON response data, one row per video. Columns and column
        names are untouched apart from the added 'videoID' column. An empty dataframe is
        returned when json_response_data is empty.

    Example Use
    -----------
        df = json_response_to_dataframe(json_response_data=json_response_data)

    Notes
    -----
    The channel statistics are collected into the combined dictionary alongside the video
    data, but only the video data ends up in the returned dataframe.

    """
    # nothing to process: return an empty frame instead of failing on json_response_data[0]
    if not json_response_data:
        return pd.DataFrame()

    # the key of each entry is the channel ID, so take it from the first entry
    channel_id = list(json_response_data[0])[0]

    # combine the information of all entries in one dictionary
    video_data = _merge_video_data(json_response_data, channel_id)
    channel_statistics = _latest_channel_statistics(json_response_data, channel_id)
    json_data = {
        channel_id: {
            "channel_statistics": channel_statistics,
            "video_data": video_data,
        }
    }

    ##### Create DataFrame #####
    # pd.json_normalize() needs a list of records, so turn {video_id: video_dict} into a list
    # of dictionaries that carry their own ID. A new dictionary is built for every video so
    # the caller's dictionaries do not gain a 'videoID' key.
    video_data_dictionaries = [
        {**video_dict, "videoID": video_id}
        for video_id, video_dict in json_data[channel_id]["video_data"].items()
    ]

    return pd.json_normalize(video_data_dictionaries)

    


In [4]:
# Flatten the combined JSON response into one row per video, then preview the first rows
df = json_response_to_dataframe(json_response_data)
df.head(n=5)

Unnamed: 0,videoID,snippet.publishedAt,snippet.channelId,snippet.title,snippet.description,snippet.thumbnails.default.url,snippet.thumbnails.default.width,snippet.thumbnails.default.height,snippet.thumbnails.medium.url,snippet.thumbnails.medium.width,...,statistics.favoriteCount,statistics.commentCount,contentDetails.duration,contentDetails.dimension,contentDetails.definition,contentDetails.caption,contentDetails.licensedContent,contentDetails.projection,contentDetails.regionRestriction.blocked,contentDetails.contentRating.ytRating
0,uGtc9Bu9Txk,2020-12-31T23:00:10Z,UCArk93C2pbOvkv6jWz-3kAg,🍾 FORD KIERNAN HOGMANAY SPECIAL! | Open Goal,SUBSCRIBE to Open Goal - https://bit.ly/2QGY26...,https://i.ytimg.com/vi/uGtc9Bu9Txk/default.jpg,120,90,https://i.ytimg.com/vi/uGtc9Bu9Txk/mqdefault.jpg,320,...,0,112,PT1H21M55S,2d,hd,False,True,rectangular,,
1,vAGmV-mRRT0,2020-12-24T07:00:08Z,UCArk93C2pbOvkv6jWz-3kAg,CHRISTMAS EVE SPECIAL | Right in the Coupon w/...,SUBSCRIBE to Open Goal - https://bit.ly/2QGY26...,https://i.ytimg.com/vi/vAGmV-mRRT0/default.jpg,120,90,https://i.ytimg.com/vi/vAGmV-mRRT0/mqdefault.jpg,320,...,0,30,PT44M31S,2d,hd,False,True,rectangular,,
2,m54LPZSlStQ,2020-12-22T17:00:20Z,UCArk93C2pbOvkv6jWz-3kAg,EPIC CUP FINAL REVIEW | Keeping the Ball on th...,SUBSCRIBE to Open Goal - https://bit.ly/2QGY26...,https://i.ytimg.com/vi/m54LPZSlStQ/default.jpg,120,90,https://i.ytimg.com/vi/m54LPZSlStQ/mqdefault.jpg,320,...,0,63,PT1H6M12S,2d,hd,False,True,rectangular,,
3,1RcFj1Sfs58,2020-12-21T17:00:03Z,UCArk93C2pbOvkv6jWz-3kAg,JAMIE CARRAGHER | Open Goal Meets...,SUBSCRIBE to Open Goal - https://bit.ly/2QGY26...,https://i.ytimg.com/vi/1RcFj1Sfs58/default.jpg,120,90,https://i.ytimg.com/vi/1RcFj1Sfs58/mqdefault.jpg,320,...,0,661,PT2H22M14S,2d,hd,False,True,rectangular,,
4,MaJhyEr6csQ,2020-12-17T17:25:28Z,UCArk93C2pbOvkv6jWz-3kAg,CELTIC v HEARTS & ST MIRREN END RANGERS UNBEAT...,SUBSCRIBE to Open Goal - https://bit.ly/2QGY26...,https://i.ytimg.com/vi/MaJhyEr6csQ/default.jpg,120,90,https://i.ytimg.com/vi/MaJhyEr6csQ/mqdefault.jpg,320,...,0,49,PT45M40S,2d,hd,False,True,rectangular,,


# Export processed dataframe

In [5]:
# Write the flattened video table to the processed-data folder; the row index is not written
path_to_save = "../data/processed/json_data_df.csv"
df.to_csv(path_or_buf=path_to_save, index=False)