# Popularity of trending videos for YouTube: Late 2017 - Mid 2018 

## A [BME Visual Analysis of Measurement Data homework](https://www.mit.bme.hu/oktatas/targyak/vimiav16) - Ádám Sike in collaboration with Tong Gao

This homework compares different video categories across 10 countries around the world.
The data covers trending videos between the end of 2017 and the summer of 2018.

Unzip data_fixed.zip into the ```data``` subfolder.

Src: [Trending YouTube Video Statistics (Kaggle)](https://www.kaggle.com/datasets/datasnaek/youtube-new?datasetId=4549) and [2017 Population of the Earth (geoba.se)](http://www.geoba.se/population.php?pc=world&type=028&year=2017&page=1)

In [None]:
%pip install jupyter-dash
%pip install pandas

In [None]:
# Read data from all of the files provided
import pandas as pd
import bamboolib as bam
import json

def _two_decimals(value):
    """Format a numeric value with exactly two decimals for display."""
    return '%.2f' % value


# Print floats with two decimals and let DataFrame.plot render through plotly
pd.options.display.float_format = _two_decimals
pd.set_option('plotting.backend', "plotly")

# COLS = ['video_id':str,
# 'trending_date':str,
# 'title':str,
# 'channel_title':str,
# 'category_id':int,
# 'publish_time':str,
# 'tags':str,
# 'views':int,
# 'likes':int,
# 'dislikes':int,
# 'comment_count':int,
# 'thumbnail_link':str,
# 'comments_disabled':bool,
# 'ratings_disabled':bool,
# 'video_error_or_removed':bool,
# 'description':str,
# ]


# Two-letter codes used in the data file names. TODO fix Korea if we can
countries = [
    "CA", "DE", "FR", "GB", "IN",
    "JP", "KR", "RU", "MX", "US",
]

# Matching ISO alpha-3 codes (same order as `countries`), needed for the maps
countries_iso_a3 = [
    "CAN", "DEU", "FRA", "GBR", "IND",
    "JPN", "KOR", "RUS", "MEX", "USA",
]

# Lookup table: ISO alpha-3 country code -> region shorthand (NA / EU / AS)
c = {
    'country': list(countries_iso_a3),
    'region': ["NA", "EU", "EU", "EU", "AS", "AS", "AS", "EU", "NA", "NA"],
}
country_to_continent = pd.DataFrame(c)

# Population for each country from http://www.geoba.se/population.php?pc=world&type=028&year=2017&page=1
# Used as the denominator of the per-capita measurements.
POPULATIONS = {
    "CA": 35_623_680,
    "DE": 80_594_016,
    "FR": 66_822_008,
    "GB": 64_769_452,  # GB/UK
    "IN": 1_281_935_872,
    "JP": 126_451_400,
    "KR": 49_237_468,
    "MX": 121_116_960,
    "RU": 134_547_680,
    "US": 328_131_072,
}

# Per-country containers, all indexed in the same order as `countries`
video_lists = []      # raw video frames read from the CSV files
raw_categories = []   # flattened "items" of each category JSON file
categories = []       # filled later: id -> category_title frames
joined_data = []      # filled later: videos joined with their categories

for country_code in countries:
    # Malformed CSV rows are skipped instead of aborting the whole read
    video_lists.append(pd.read_csv(f"data/{country_code}videos.csv", on_bad_lines='skip'))
    # JSON is UTF-8; being explicit avoids depending on the platform's default encoding
    with open(f"data/{country_code}_category_id.json", encoding="utf-8") as data_file:
        data = json.load(data_file)
    # One row per entry of the top-level "items" list, nested keys flattened with dots
    row_category_normalized = pd.json_normalize(data, ["items"])
    raw_categories.append(row_category_normalized)
    print("Read files for country: " + country_code)



In [None]:
# Make categories usable - keep only the id and the title of each flattened category.
# (The lookup is built inline: naming it `dict` would rebind the builtin for every later cell.)
for country_code, raw_category in zip(countries, raw_categories):
    print("Make categories usable for: " + country_code)
    category = pd.DataFrame({
        # ids are cast to str so they can be joined against the str 'category_id' of the videos
        "id": raw_category["id"].astype(str),
        "category_title": raw_category["snippet.title"],
    })
    categories.append(category)

In [None]:
# Fix bad data
# Some files had to be converted, Russia had to be manually edited as well
for i, country_code in enumerate(countries):
    print("Fixing bad data for country: " + country_code)
    video_list = video_lists[i]

    # We had errors with RU views: unparsable values become NaN and those rows are dropped.
    # .copy() yields an independent frame, so the casts below are applied for real
    # (and do not trigger SettingWithCopyWarning).
    video_list['views'] = pd.to_numeric(video_list['views'], errors='coerce')
    video_list = video_list.dropna(subset=['views']).copy()

    # Force column dtypes... they are not reliably inferred when reading the files.
    # Every column is cast from ITSELF (the flags/link/description used to be
    # overwritten with casts of 'video_id').
    for column in ['video_id', 'trending_date', 'title', 'channel_title',
                   'category_id', 'publish_time', 'tags', 'thumbnail_link']:
        video_list[column] = video_list[column].astype(str)
    for column in ['views', 'likes', 'dislikes', 'comment_count']:
        video_list[column] = video_list[column].astype(int)
    # NOTE(review): assumes pandas already parsed these flags as booleans; a column of
    # "True"/"False" strings would come out all-True here - confirm against the data.
    for column in ['comments_disabled', 'ratings_disabled', 'video_error_or_removed']:
        video_list[column] = video_list[column].astype(bool)
    # Missing descriptions become empty strings rather than the literal text "nan"
    video_list['description'] = video_list['description'].fillna('').astype(str)

    # dropna() returned a new frame: store it back so the later cells see the cleaned data
    video_lists[i] = video_list


In [None]:

# Tags are malformed, let's fix them, so we can count them properly
for country_code, video_list in zip(countries, video_lists):
    print("Fixing tags: " + country_code)
    fixed_tags = []
    # Iterate over the values themselves: every row gets ITS OWN tags, and it
    # works even when the frame's index has gaps.
    for row_raw_tags in video_list['tags']:
        # Tags are '|' separated and individually wrapped in quotation marks
        row_tags = [tag.strip('\"') for tag in row_raw_tags.split('|')]
        # A lone empty string or the "[none]" placeholder means "no tags"
        if len(row_tags) == 1 and (not row_tags[0] or row_tags[0] == "[none]"):
            row_tags = []
        fixed_tags.append(row_tags)

    # Build an object Series explicitly so each cell holds one list of tags
    video_list['tags'] = pd.Series(fixed_tags, index=video_list.index, dtype=object)

    # Fix trending dates (from string to date); the files use year.day.month
    video_list['trending_date'] = pd.to_datetime(video_list['trending_date'],format='%y.%d.%m')

    # Fix publish time (from string to date)
    video_list['publish_time'] = pd.to_datetime(video_list['publish_time'])

In [None]:
# Insert more data: derived columns used by the later plots and aggregations
for country_code, iso_a3, video_list in zip(countries, countries_iso_a3, video_lists):
    print("Insert more data into: " + country_code)

    population = POPULATIONS[country_code]

    # Insert count of tags
    video_list["tag_count"] = [len(tags) for tags in video_list['tags']]

    # Insert length of description. NOTE: I've had to replace descriptions for the RU country
    # We don't want NaN values, as they are considered float and have no len()
    video_list['description'] = video_list['description'].fillna('')
    video_list["description_length"] = [len(text) for text in video_list['description']]

    # Add country category for each set (ISO alpha-3, as needed by the maps)
    video_list["country"] = iso_a3

    # Add per capita data
    for measure in ["views","likes","dislikes","comment_count"]:
        video_list[f"{measure}_per_capita"] = video_list[measure].div(population)

    # Add likes/dislikes ratio. Rows with zero dislikes get NaN instead of inf:
    # a single inf would turn every later mean/var/skew of the group into inf/NaN.
    dislikes = video_list["dislikes"]
    video_list["likes_per_dislikes"] = video_list["likes"].div(dislikes.where(dislikes != 0))

In [None]:
# Attach the category title to every video, country by country
for video_list, category in zip(video_lists, categories):
    titles_by_id = category.set_index("id")
    joined_data.append(video_list.join(titles_by_id, on="category_id"))

# Stack the per-country frames into one BIG set
full_data = pd.concat(joined_data)


In [None]:
# Store the two label columns as pandas categoricals
for label_column in ('category_title', 'country'):
    full_data[label_column] = full_data[label_column].astype('category')

In [None]:
# Add the 'region' column by looking up each row's country code
region_by_country = country_to_continent.set_index("country")
full_data = full_data.join(region_by_country, on="country")

In [None]:
# Display the country -> region lookup table as the cell output to check the
# mapping by eye (shown in bamboolib when it is active)
country_to_continent

In [None]:
# Display the merged data set as the cell output (shown in bamboolib when it is active)
full_data

In [None]:
# Get the sorted list of all category titles, so we can use them later.
# NOTE(review): this used to read from `full_filtered`, which no visible cell defines
# (NameError on a fresh run). `full_data` has the same 'category_title' column -
# confirm that no row-filtering step was lost.
category_titles = full_data['category_title'].unique()
# Videos whose category id had no match carry a missing title; drop that entry
category_titles = category_titles.sort_values().dropna()
print(category_titles)

In [None]:
# Total views, likes, dislikes and comments per capita boxplots
from plotly.subplots import make_subplots
import plotly.express as px

# Plot a reproducible random sample to keep the figures light. The sample is drawn
# once (it is the same for every measurement) and capped at the number of rows
# available, because sample() raises when asked for more rows than exist.
boxplot_sample = full_data.sample(n=min(50000, len(full_data)), replace=False, random_state=123).sort_index()

for data in ["views","likes","dislikes","comment_count"]:
    subfig = px.box(boxplot_sample, x='country', y=f'{data}_per_capita', color='region', title=f'Trending on YouTube - {data}')
    # Per-capita values span several orders of magnitude, hence the log axis
    subfig.update_yaxes(type='log', tickformat='.1e')
    subfig.update_yaxes(title_text=f'Video {data} per capita')
    subfig.update_xaxes(title_text='Countries')
    subfig.show()
    print("")

In [None]:
# Like to dislike ratio boxplots
# Should show how different countries like to rate videos?
from plotly.subplots import make_subplots
import plotly.express as px

data = "likes_per_dislikes"

# Reproducible random sample, capped at the number of rows available
# (sample() raises when asked for more rows than exist)
ratio_sample = full_data.sample(n=min(50000, len(full_data)), replace=False, random_state=123).sort_index()

subfig = px.box(ratio_sample, x='country', y=data, color='region', title='Trending on YouTube - Likes to Dislikes')
subfig.update_yaxes(type='log', tickformat='.1e')
subfig.update_yaxes(title_text='Video likes to dislikes ratios')
subfig.update_xaxes(title_text='Countries')
subfig.show()
print("")

In [None]:
import pandas as pd; import numpy as np
# Step: Group by and aggregate for all categories - mean and median of every
# per-capita measurement for each (region, country) pair.
# NOTE(review): this used to read from `full_filtered`, which no visible cell defines;
# `full_data` holds all the columns used here - confirm no filtering step was lost.
# observed=True keeps only the region/country pairs that occur in the data when
# 'country' is a categorical column.
all_cat_vldc_per_country = full_data.groupby(['region','country'], observed=True).agg({col: ['mean', 'median'] for col in ['views_per_capita', 'likes_per_capita', 'dislikes_per_capita', 'comment_count_per_capita']})
# Flatten the (column, aggregate) MultiIndex into "column_aggregate" names
all_cat_vldc_per_country.columns = ['_'.join(multi_index) for multi_index in all_cat_vldc_per_country.columns.to_flat_index()]
all_cat_vldc_per_country = all_cat_vldc_per_country.reset_index()

all_cat_vldc_per_country

In [None]:
# needed, replacement for plotly.plotly
%pip install chart_studio

In [None]:
# Set up stuff for stacked bar charts

import pandas as pd; import numpy as np
import plotly.express as px

# Step: Drop the columns that are not needed for the aggregation.
# NOTE(review): this used to start from `full_filtered`, which no visible cell defines;
# `full_data` contains every column referenced below - confirm no filtering step was lost.
full_filtered_plus = full_data.drop(columns=['trending_date', 'title', 'channel_title', 'publish_time', 'comments_disabled', 'ratings_disabled', 'video_error_or_removed', 'tag_count', 'description_length'])

# Step: Drop rows where category_title is missing
full_filtered_plus = full_filtered_plus.loc[~(full_filtered_plus['category_title'].isna())]

# Step: Group by and aggregate every measurement per (country, category)
vldc_full = full_filtered_plus.groupby(['country', 'category_title']).agg({col: ['sum','mean', 'median', 'var', 'skew'] for col in ['views', 'likes', 'dislikes', 'comment_count', 'views_per_capita', 'likes_per_capita', 'dislikes_per_capita', 'comment_count_per_capita',"likes_per_dislikes"]})
# Flatten the (column, aggregate) MultiIndex into "column_aggregate" names
vldc_full.columns = ['_'.join(multi_index) for multi_index in vldc_full.columns.to_flat_index()]
vldc_full = vldc_full.reset_index()


# Display names and background colors used by the charts below
measurement_names ={"views":"Views","likes":"Likes","dislikes":"Dislikes","comment_count":"Comment Count","likes_per_dislikes":"Likes to Dislikes ratio"}
aggr_names ={"mean":"Mean","median":"Median","var":"Variaton of","skew":"Skew of"}

measurement_colors ={"views":"#ced2fd","likes":"#ccffda","dislikes":"#fdcece","comment_count":"#eeccff","likes_per_dislikes":"gray"}


In [None]:
# Bar charts of the raw (not per-capita) aggregates: one chart per
# measurement/aggregate pair, bars colored by country
raw_chart_pairs = [
    (meas, aggr)
    for meas in ["views","likes","dislikes","comment_count","likes_per_dislikes"]
    for aggr in ["mean","median"]
]

for meas, aggr in raw_chart_pairs:
    y_val = f"{meas}_{aggr}"
    y_name = f"{aggr_names[aggr]} {measurement_names[meas]}"
    axis_labels = {
        'country':'Country',
        'category_title':'Video Category',
        y_val : y_name,
    }
    plot = px.bar(
        vldc_full,
        x='category_title',
        y=y_val,
        color='country',
        labels=axis_labels,
        color_discrete_sequence=px.colors.qualitative.G10,
        title=f'2017 trending Youtube videos - {y_name} by Categories',
    )
    # Each measurement gets its own plot background color
    backgrounds = {
        'plot_bgcolor': measurement_colors[meas],
        'paper_bgcolor': "lightgray",
    }
    plot.update_layout(backgrounds)
    plot.show()
    print("")

In [None]:
# Bar charts of the per-capita aggregates: one chart per
# measurement/aggregate pair, bars colored by country
per_capita_chart_pairs = [
    (meas, aggr)
    for meas in ["views","likes","dislikes","comment_count"]
    for aggr in ["mean","median"]
]

for meas, aggr in per_capita_chart_pairs:
    y_val = f"{meas}_per_capita_{aggr}"
    y_name = f"{aggr_names[aggr]} {measurement_names[meas]}"
    axis_labels = {
        'country':'Country',
        'category_title':'Video Category',
        y_val : f"{y_name} per Capita",
    }
    plot = px.bar(
        vldc_full,
        x='category_title',
        y=y_val,
        color='country',
        labels=axis_labels,
        color_discrete_sequence=px.colors.qualitative.G10,
        title=f'2017 trending Youtube videos - {y_name} per Capita by Categories',
    )
    # Each measurement gets its own plot background color
    backgrounds = {
        'plot_bgcolor': measurement_colors[meas],
        'paper_bgcolor': "lightgray",
    }
    plot.update_layout(backgrounds)
    plot.show()
    print("")

In [None]:
# Break up data for the map
import pandas as pd; import numpy as np

# Choices offered by the dashboard dropdowns
measurements = ["views","likes","dislikes","comment_count","likes_per_dislikes"]
aggrs = ["sum","mean","median","var","skew"]

# One frame per category title, holding that category's rows of vldc_full
# without the (now constant) 'category_title' column
vldc_by_cats = {}
for cat in category_titles:
    in_category = vldc_full['category_title'] == cat
    vldc_by_cats[cat] = vldc_full.loc[in_category].drop(columns=['category_title'])

# Show one category as a sanity check
test = vldc_by_cats["Comedy"]
test

In [None]:
# imports for dash
from jupyter_dash import JupyterDash
import dash
from dash import dcc
from dash import html
import pandas as pd
import plotly.express as px

In [None]:
# Create a map that updates
import plotly.graph_objects as go
from dash.dependencies import Input, Output, State

# Download a GeoJSON file from the plotly datasets repository and parse it.
# NOTE(review): the URL points at "geojson-counties-fips.json" (counties, not
# countries), and `counties` is not referenced by any code visible in this
# notebook - confirm whether this download is still needed.
from urllib.request import urlopen
import json
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)

# Dropdown region -> plotly geo scope name
scopes = {
    "World": "world",
    "NA": "north america",
    "EU": "europe",
    "AS": "asia",
}

# Dropdown region -> visible latitude/longitude window ([min, max]), projection
# scale and map center. The EU longitude window used to be [-25, -55] (max below
# min); it now ends at 55 degrees east.
zoom = {
    "World": {"lat": [-90, 90], "lon": [-170, 190], "zoom": 1, "center": {"lat": 55, "lon": 6}},
    "NA": {"lat": [0, 90], "lon": [-170, -10], "zoom": 1, "center": {"lat": 50, "lon": -105}},
    "EU": {"lat": [30, 90], "lon": [-25, 55], "zoom": 3, "center": {"lat": 46, "lon": 18}},
    "AS": {"lat": [0, 90], "lon": [25, 190], "zoom": 1.5, "center": {"lat": 30, "lon": 90}},
}

# Measurement -> two-color scale [low, high] used to shade the countries
color_scales = {
    "views": ["darkgray", "blue"],
    "likes": ["darkgray", "green"],
    "dislikes": ["darkgray", "red"],
    "comment_count": ["darkgray", "purple"],
    "likes_per_dislikes": ["darkgreen", "lime"],
}


def get_map(region: str, cat: str, meas: str, aggr: str, per_capita: bool, log10: bool):
    """Build the choropleth map for one category / measurement / aggregate.

    Args:
        region: Key of `scopes` and `zoom` ("World", "NA", "EU" or "AS").
        cat: Category title, a key of `vldc_by_cats`.
        meas: Measurement name, a key of `color_scales` (e.g. "views").
        aggr: Aggregate suffix of the column to show (e.g. "sum", "mean").
        per_capita: Show the per-capita column when one exists for `meas`.
            Measurements without a per-capita column (such as
            "likes_per_dislikes") fall back to the plain column instead of
            raising KeyError.
        log10: Color the countries by log10 of the value; the hover text
            always shows the untransformed value.

    Returns:
        A plotly `go.Figure` holding a single choropleth trace.

    Raises:
        KeyError: If `region`, `cat`, `meas` or the resulting column is unknown.
    """
    frame = vldc_by_cats[cat]

    per_capita_name = f"{meas}_per_capita_{aggr}"
    if per_capita and per_capita_name in frame.columns:
        z_name = per_capita_name
    else:
        z_name = f"{meas}_{aggr}"

    values = frame[z_name]
    # log10 is undefined for zero/negative values: mask them to NaN so those
    # countries are simply left uncolored instead of producing -inf
    z = np.log10(values.where(values > 0)) if log10 else values
    colorspace = color_scales[meas]

    fig = go.Figure(go.Choropleth(
        locations=frame["country"],
        z=z,
        text=values,
        colorscale=colorspace))
    fig.update_geos(projection_type="natural earth",
        lataxis_range=zoom[region]["lat"],
        lonaxis_range=zoom[region]["lon"],
        showcountries=True,
        scope=scopes[region])
    # Zoom and center are set through the layout's geo entry rather than update_geos
    fig.update_layout(height=450, margin={"r":0,"t":0,"l":0,"b":0},geo = {
            'projection_scale':zoom[region]["zoom"],
            'center':zoom[region]["center"]
        })
    return fig

# Smoke test: build one map (world view, Comedy, per-capita sum of views, log10 coloring)
test_fig = get_map('World','Comedy','views','sum',True,True)


In [None]:
%pip install dash-mantine-components

In [None]:
from dash import Dash, dcc, html, Input, Output
import dash_mantine_components as dmc
# Constructing the Dash app and its callbacks
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']

# CSS kept for reference - it lived in checkbox_reset, because the checkboxes were broken...
# '''input[type="checkbox"] {
#      -webkit-appearance: checkbox !important;
#      -moz-appearance: checkbox !important;
#      -ms-appearance: checkbox !important;
#      -o-appearance: checkbox !important;
#      appearance: checkbox !important;
# }'''

app = JupyterDash(__name__, external_stylesheets=external_stylesheets)

# Expose the underlying Flask server object for use with gunicorn
server = app.server

# Selections shown when the dashboard is first opened
region, cat = "World", "Comedy"
measurement, aggr = "views", "sum"
per_capita = log10 = False

# Initial figure, built from the defaults above
fig = get_map(region, cat, measurement, aggr, per_capita, log10)

# Page layout: title, subtitle, four dropdowns, two checkboxes and the map.
# The dropdowns are not clearable, so the callback never receives an empty
# selection, and both checkboxes start explicitly unchecked.
app.layout = html.Div(style={'backgroundColor':'#fffae1'},children=[
    html.H1(style={'fontWeight':'bold'},
    children='Analysis of YouTube Trending Videos 2017'),

    html.H5(children='''
        An interactve map created using Pandas, Bamboolib and Dash and more!
    '''),

    dcc.Dropdown(['World', 'EU', 'NA', 'AS'], 'World', id='region-dropdown', clearable=False),

    dcc.Dropdown(measurements, 'views', id='measurement-dropdown', clearable=False),

    dcc.Dropdown(aggrs,  "sum", id='aggr-dropdown', clearable=False),

    dcc.Dropdown(category_titles, 'Comedy', id='cat-dropdown', clearable=False),


    dmc.Checkbox(
            id="checkbox-per-capita",
            label="Per Capita",
            checked=False,
    ),
    dmc.Checkbox(
            id="checkbox-logaritmic",
            label="Log10 coloring",
            checked=False,
    ),
    # Container whose children are replaced by the callback on every change
    html.Div(id='dd-map-output',children=[
        dcc.Graph(
        id='map',
        figure=fig
    )]
    )


])

@app.callback(
    Output('dd-map-output', 'children'),
    Input('region-dropdown', 'value'),
    Input('cat-dropdown', 'value'),
    Input('measurement-dropdown', 'value'),
    Input('aggr-dropdown', 'value'),
    Input('checkbox-per-capita', 'checked'),
    Input('checkbox-logaritmic', 'checked'),
)
def update_output(reg,cat,meas,aggr,per_c,log10):
    """Rebuild the map whenever a dropdown or checkbox changes.

    Args:
        reg, cat, meas, aggr: Current values of the four dropdowns.
        per_c, log10: Current state of the two checkboxes.

    Returns:
        The new children of the 'dd-map-output' container: the map graph,
        preceded by a warning when the likes/dislikes ratio is shown.
        `dash.no_update` when a dropdown has no selection.
    """
    global fig  # the module-level figure is kept in sync with what is displayed

    # A cleared dropdown reports None; keep the current map instead of failing
    if any(choice is None for choice in (reg, cat, meas, aggr)):
        return dash.no_update

    fig = get_map(reg,cat,meas,aggr,per_c,log10)
    children = [dcc.Graph(id='map', figure=fig)]

    if meas == "likes_per_dislikes":
        warning = dcc.Markdown('''
                ### *Warning!*
                *Measurments for likes/dislikes ratio might be broken (missing data)*
            ''')
        children.insert(0, warning)

    return children

    


In [None]:
# Run the app. Two alternatives:
#   - displayed inside the notebook (disabled):
# app.run_server(mode="inline",debug=True)
#   - served as a web page, with debug mode switched on (active):
app.run_server(debug=True)