In [1]:
import os
# os.chdir('..') # this resolves ImportError: attempted relative import with no known parent package
import pandas as pd
import json
from src.processing.process_json_data import combine_all_json_files, json_response_to_dataframe, drop_columns, remove_prefixes
from src.processing.chop_dataframe import single_metric_dataframes, split_into_video_series
from src.processing.feature_engineering import create_year_month_columns, create_video_counts_columns, duration_to_hhmmss
from src.visualisation.visualisations import prep_df_for_visualisation, viz_line_chart, viz_video_counts

# Load and process JSON data
1. Due to limitations on API requests the data is contained in many JSON files - first we will combine all of these files into one JSON file and save it in a folder.
2. Load the file that contains all of the JSON data and store it in a variable.
3. Turn the JSON file into a dataframe


In [2]:
# Folder that holds the individual JSON files we want to combine into one.
json_folder_path = "data/raw/JSON_response"

# Combine all the individual JSON files into a single 'all_json_data.json' file
# (presumably the file loaded below -- confirm against combine_all_json_files).
combine_all_json_files(json_folder_path)

# Load the combined JSON data. The encoding is passed explicitly because the
# video titles contain non-ASCII characters (e.g. emoji) and the platform
# default encoding used by open() is not UTF-8 on every system.
file_path = "data/processed/all_json_data.json"
with open(file_path, "r", encoding="utf-8") as f:
    json_response_data = json.load(f)

# turn json response into a dataframe
df = json_response_to_dataframe(json_response_data)

df.head(2)

Unnamed: 0,videoID,snippet.publishedAt,snippet.channelId,snippet.title,snippet.description,snippet.thumbnails.default.url,snippet.thumbnails.default.width,snippet.thumbnails.default.height,snippet.thumbnails.medium.url,snippet.thumbnails.medium.width,...,statistics.favoriteCount,statistics.commentCount,contentDetails.duration,contentDetails.dimension,contentDetails.definition,contentDetails.caption,contentDetails.licensedContent,contentDetails.projection,contentDetails.regionRestriction.blocked,contentDetails.contentRating.ytRating
0,uGtc9Bu9Txk,2020-12-31T23:00:10Z,UCArk93C2pbOvkv6jWz-3kAg,🍾 FORD KIERNAN HOGMANAY SPECIAL! | Open Goal,SUBSCRIBE to Open Goal - https://bit.ly/2QGY26...,https://i.ytimg.com/vi/uGtc9Bu9Txk/default.jpg,120,90,https://i.ytimg.com/vi/uGtc9Bu9Txk/mqdefault.jpg,320,...,0,112,PT1H21M55S,2d,hd,False,True,rectangular,,
1,vAGmV-mRRT0,2020-12-24T07:00:08Z,UCArk93C2pbOvkv6jWz-3kAg,CHRISTMAS EVE SPECIAL | Right in the Coupon w/...,SUBSCRIBE to Open Goal - https://bit.ly/2QGY26...,https://i.ytimg.com/vi/vAGmV-mRRT0/default.jpg,120,90,https://i.ytimg.com/vi/vAGmV-mRRT0/mqdefault.jpg,320,...,0,30,PT44M31S,2d,hd,False,True,rectangular,,


# Clean dataframe
- Remove prefixes from column names
- Drop columns we don't need/want
- Drop duplicate video IDs
- Change count data (views, comments, likes) to integer data type

In [3]:
# Inspect the raw column names before cleaning; they still carry the
# 'snippet.', 'statistics.' and 'contentDetails.' prefixes at this point.
df.columns

Index(['videoID', 'snippet.publishedAt', 'snippet.channelId', 'snippet.title',
       'snippet.description', 'snippet.thumbnails.default.url',
       'snippet.thumbnails.default.width', 'snippet.thumbnails.default.height',
       'snippet.thumbnails.medium.url', 'snippet.thumbnails.medium.width',
       'snippet.thumbnails.medium.height', 'snippet.thumbnails.high.url',
       'snippet.thumbnails.high.width', 'snippet.thumbnails.high.height',
       'snippet.thumbnails.standard.url', 'snippet.thumbnails.standard.width',
       'snippet.thumbnails.standard.height', 'snippet.thumbnails.maxres.url',
       'snippet.thumbnails.maxres.width', 'snippet.thumbnails.maxres.height',
       'snippet.channelTitle', 'snippet.tags', 'snippet.categoryId',
       'snippet.liveBroadcastContent', 'snippet.localized.title',
       'snippet.localized.description', 'snippet.defaultAudioLanguage',
       'statistics.viewCount', 'statistics.likeCount',
       'statistics.favoriteCount', 'statistics.commentCou

In [4]:
# Remove prefixes from column names. `rename` returns a new frame instead of
# mutating the one that an earlier cell already displayed.
df = df.rename(columns=remove_prefixes)

# drop columns that are not needed
df = drop_columns(df)

# Drop duplicate video IDs. Reassigning (rather than `inplace=True`) avoids
# mutating whatever object `drop_columns` handed back.
df = df.drop_duplicates(subset=["videoID"])

# convert viewCount etc. to integer data type
count_columns = ["viewCount", "likeCount", "commentCount", "favoriteCount"]
df[count_columns] = df[count_columns].astype(int)

# Transform duration into usable formats. Each value is parsed once; the two
# elements of the parsed result are then split into their own columns.
parsed_durations = df["duration"].map(duration_to_hhmmss)
df["duration_timedelta"] = parsed_durations.map(lambda parts: parts[0])
df["duration_string"] = parsed_durations.map(lambda parts: parts[1])

# Add features
For easy visualisation we are going to:
- Create year, month and year-month columns
- Create video count columns (how many videos released per month)

In [5]:
# Apply the two feature-engineering steps in order: first the year/month
# columns, then the video count columns.
df = (
    df
    .pipe(create_year_month_columns)
    .pipe(create_video_counts_columns)
)
df.head(2)

Dataframe created without these columns.
Dataframe created without these columns.


Unnamed: 0,videoID,publishedAt,channelId,title,description,channelTitle,tags,viewCount,likeCount,favoriteCount,commentCount,duration,duration_seconds,duration_string,publishedAtYear,publishedAtMonth,publishedAtYearMonth,videoCountMonth,videoCountYear
0,wA7v2RtVdSA,2017-05-26 12:49:13+00:00,UCArk93C2pbOvkv6jWz-3kAg,Open Goal Trailer,,Open Goal,"[open goal, kevin thomson, darren o'dea, charl...",8709,37,0,2,PT48S,0 days 00:00:48,00:00:48,2017,5,2017-05,1,57
1,Fkmm0E_qgm8,2017-06-02 08:49:46+00:00,UCArk93C2pbOvkv6jWz-3kAg,Si Ferry Meets...Kevin Thomson Episode 1 - Exc...,"In Part 1 of Si Ferry Meets… Kevin Thomson, th...",Open Goal,"[Kevin Thomson, Rangers, Rangers Fc, Hibernian...",47975,152,0,7,PT6M45S,0 days 00:06:45,00:06:45,2017,6,2017-06,21,57


# Prepare dataframe for visualisation
We want to visualise metrics over time. To do this we will group the dataframe by month, giving us the volume of views, likes, comments etc. at a monthly level.

In [6]:
# Build the monthly-level frame used by the charts and preview its first rows.
df_group_month = df.pipe(prep_df_for_visualisation)
df_group_month.head(2)

Unnamed: 0,publishedAtYearMonth,publishedAtYear,publishedAtMonth,viewCount,likeCount,commentCount,videoCountMonth,videoCountYear
0,2017-05,2017,5,8709,37,2,1,57
1,2017-06,2017,6,767417,2577,87,21,57


# Visualise data

# Visualise podcast series - Keeping the ball on ground

In [None]:
# prep dataframes - split into series