# Holidays 2023 EDA

## Table of Contents

* [1. Imports and Initializations](#imports-and-initializations)
* [2. Data Preparation](#data-preparation)
* [3. Visualizations](#visualizations)

## 1. Imports and Initializations <a class="anchor" id="imports-and-initializations"></a>

In [1]:
#!pip install lets_plot -U

In [2]:
import colorcet as cc

from lets_plot import *
from lets_plot.mapping import as_discrete
from lets_plot.geo_data import *

The geodata is provided by © OpenStreetMap contributors and is made available here under the Open Database License (ODbL).


In [3]:
# Initialise Lets-Plot for HTML (notebook) output before any plot is built.
LetsPlot.setup_html()

In [4]:
def generate_month_colors(colormap):
    """Map each English month name to a color sampled from ``colormap``.

    Twelve positions are taken at evenly spaced indices running from the
    first to the last element of ``colormap`` (fractional positions are
    truncated to integers) and paired with January..December in calendar
    order.

    Parameters
    ----------
    colormap : sequence
        Indexable collection of colors that supports ``len()``.

    Returns
    -------
    dict
        Month name -> sampled color, keyed in calendar order.
    """
    import numpy as np

    month_names = (
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December',
    )
    last_index = len(colormap) - 1
    sample_positions = np.linspace(0, last_index, len(month_names), dtype=int)
    return {
        name: colormap[position]
        for name, position in zip(month_names, sample_positions)
    }

# Number of rows kept by get_top_df and quoted in the "Top N" plot title.
top_size = 10
# Outline color shared by the single-color bar/dot layers.
const_color = "black"
# Fill color shared by the single-color bar/dot layers.
const_fill = "#3b528b"
# Month name -> color, sampled from the colorcet "bmy" colormap;
# used with scale_fill_manual in the month-colored plots.
month_color = generate_month_colors(cc.bmy)
# Religion name -> color. Keys match the alternatives of the regex that
# get_data uses to extract the "religion" column.
# NOTE(review): not referenced by any plot visible in this part of the notebook.
religion_color = {
    "Orthodox": "#440154",
    "Christian": "#3b528b",
    "Hebrew": "#21918c",
    "Muslim": "#5ec962",
    "Hinduism": "#fde725",
}

## 2. Data Preparation <a class="anchor" id="data-preparation"></a>

In [5]:
def read_data(data_dir="Data/countries"):
    """Read every regular file in ``data_dir`` as CSV and stack the results.

    Files are read in sorted name order. ``os.listdir`` returns entries in
    arbitrary order, and later steps de-duplicate with "keep first", so an
    unsorted listing makes the resulting tables depend on the file system.
    Sub-directories are skipped.

    Parameters
    ----------
    data_dir : str, optional
        Directory that holds the per-country CSV files.
        Defaults to ``"Data/countries"``.

    Returns
    -------
    pandas.DataFrame
        All files concatenated row-wise, with a fresh ``0..n-1`` index so
        row labels are unique across files.

    Raises
    ------
    ValueError
        If ``data_dir`` contains no regular files.
    """
    import os
    import pandas as pd

    candidate_paths = (
        os.path.join(data_dir, name) for name in sorted(os.listdir(data_dir))
    )
    file_paths = [path for path in candidate_paths if os.path.isfile(path)]
    if not file_paths:
        # pd.concat would raise a ValueError too, but without naming the directory.
        raise ValueError(f"No data files found in {data_dir!r}")
    return pd.concat([pd.read_csv(path) for path in file_paths], ignore_index=True)

def get_data():
    """Build the enriched holidays table used throughout the notebook.

    Starting from ``read_data()``, the five input columns are renamed, the
    date is parsed as timezone-aware UTC, and calendar fields are derived
    from it (month, ISO week, day of year/month/week and their names).
    Holiday names are normalised through a small alias table, and three
    attributes are derived from the raw ``type`` text: ``is_national``,
    ``religion``/``is_religious`` and ``holiday_type``. The raw ``type``
    and ``country_code`` columns are then dropped, and rows repeating the
    same (country, holiday name, day of year) are removed.

    Returns
    -------
    pandas.DataFrame
        One row per (country_name, holiday_name, day_of_year).
    """
    import pandas as pd

    holiday_aliases = {
        "New Year": "New Year's Day",
    }
    # Raw type strings that are not holidays proper; anything else becomes "Holiday".
    type_labels = {
        "['Season']": "Season",
        "['Clock change/Daylight Saving Time']": "DST",
        "['Sporting event']": "Sport",
    }
    yes_no = {True: "yes", False: "no"}

    df = read_data()
    df.columns = ["date", "holiday_name", "type", "country_name", "country_code"]

    # Calendar fields, all derived from the UTC timestamp.
    df["date"] = pd.to_datetime(df["date"], format='mixed', utc=True)
    when = df["date"].dt
    df["month"] = when.month
    df["month_name"] = when.month_name()
    df["week"] = when.isocalendar().week
    df["day_of_year"] = when.dayofyear
    df["day_of_month"] = when.day
    df["day_of_week"] = when.dayofweek
    df["day_of_week_name"] = when.day_name()
    day_text = df["day_of_month"].astype(str)
    df["date_name"] = df["month_name"].str.cat(day_text, sep=", ")

    df["holiday_name"] = df["holiday_name"].replace(holiday_aliases)

    # Attributes parsed out of the raw type description.
    raw_type = df["type"]
    df["is_national"] = raw_type.str.contains("'National holiday'").map(yes_no)
    df["religion"] = raw_type.str.extract(r"(Orthodox|Christian|Hebrew|Muslim|Hinduism)")
    df["is_religious"] = df["religion"].notna().map(yes_no)
    df["holiday_type"] = raw_type.map(type_labels).fillna("Holiday").astype(str)

    df = df.drop(columns=["type", "country_code"])
    return df.drop_duplicates(subset=["country_name", "holiday_name", "day_of_year"])

def filter_df(df, col, value):
    """Return the rows of ``df`` where ``col`` equals ``value``, without ``col``.

    The input frame is left untouched; the result has a fresh 0..n-1 index.
    """
    matching_rows = df.loc[df[col] == value]
    return matching_rows.drop(columns=[col]).reset_index(drop=True)

def get_distinct_holidays_df(df):
    """Keep the first row for each (holiday_name, day_of_year) pair.

    Returns a new frame with a fresh 0..n-1 index; ``df`` is not modified.
    """
    distinct_key = ["holiday_name", "day_of_year"]
    first_occurrences = df.drop_duplicates(subset=distinct_key)
    return first_occurrences.reset_index(drop=True)

def get_countries_gdf(df, country_col):
    """Geocode the distinct values of ``df[country_col]`` and return their boundaries.

    The geocoder is configured with ``ignore_not_found()`` and ``inc_res()``
    before ``get_boundaries()`` is called.
    """
    country_names = df[country_col].unique()
    geocoder = geocode_countries(country_names)
    geocoder = geocoder.ignore_not_found()
    geocoder = geocoder.inc_res()
    return geocoder.get_boundaries()

def get_top_df(df, col):
    """Return the ``top_size`` most frequent values of ``df[col]`` with their counts.

    The result is a frame whose rows follow ``value_counts()`` order
    (most frequent first), truncated to the module-level ``top_size``.
    """
    frequencies = df[col].value_counts()
    frequency_table = frequencies.to_frame().reset_index()
    return frequency_table.iloc[:top_size]

In [6]:
# Load and enrich the raw holiday records, then preview the shape and first rows.
full_df = get_data()
print(full_df.shape)
full_df.head()

(7098, 15)


Unnamed: 0,date,holiday_name,country_name,month,month_name,week,day_of_year,day_of_month,day_of_week,day_of_week_name,date_name,is_national,religion,is_religious,holiday_type
0,2023-01-01 00:00:00+00:00,New Year's Day,United Arab Emirates,1,January,52,1,1,6,Sunday,"January, 1",yes,,no,Holiday
1,2023-02-18 00:00:00+00:00,Leilat al-Meiraj (The Prophet's Ascension),United Arab Emirates,2,February,7,49,18,5,Saturday,"February, 18",no,,no,Holiday
2,2023-03-20 21:24:20+00:00,March Equinox,United Arab Emirates,3,March,12,79,20,0,Monday,"March, 20",no,,no,Season
3,2023-03-23 00:00:00+00:00,Ramadan Start,United Arab Emirates,3,March,12,82,23,3,Thursday,"March, 23",no,,no,Holiday
4,2023-04-20 00:00:00+00:00,Eid al-Fitr Holiday,United Arab Emirates,4,April,16,110,20,3,Thursday,"April, 20",yes,,no,Holiday


In [7]:
# Geocode every distinct country name in the data into boundaries for the maps.
countries_gdf = get_countries_gdf(full_df, "country_name")
print(countries_gdf.shape)
countries_gdf.head()

(207, 3)


Unnamed: 0,country,found name,geometry
0,United Arab Emirates,United Arab Emirates,"MULTIPOLYGON (((53.62326 24.16635, 53.75808 24..."
1,South Sudan,South Sudan,"MULTIPOLYGON (((27.44883 5.01951, 27.20060 5.7..."
2,Liechtenstein,Liechtenstein,"MULTIPOLYGON (((9.60705 47.06077, 9.47167 47.0..."
3,Nauru,Nauru,"MULTIPOLYGON (((166.93052 -0.50625, 166.95887 ..."
4,Fiji,Fiji,"MULTIPOLYGON (((-180.00000 -16.96671, -179.881..."


In [8]:
# Keep only rows whose holiday_type is "Holiday", i.e. drop the
# Season / DST / Sport entries labelled in get_data.
holidays_df = filter_df(full_df, "holiday_type", "Holiday")
print(holidays_df.shape)
holidays_df.head()

(6089, 14)


Unnamed: 0,date,holiday_name,country_name,month,month_name,week,day_of_year,day_of_month,day_of_week,day_of_week_name,date_name,is_national,religion,is_religious
0,2023-01-01 00:00:00+00:00,New Year's Day,United Arab Emirates,1,January,52,1,1,6,Sunday,"January, 1",yes,,no
1,2023-02-18 00:00:00+00:00,Leilat al-Meiraj (The Prophet's Ascension),United Arab Emirates,2,February,7,49,18,5,Saturday,"February, 18",no,,no
2,2023-03-23 00:00:00+00:00,Ramadan Start,United Arab Emirates,3,March,12,82,23,3,Thursday,"March, 23",no,,no
3,2023-04-20 00:00:00+00:00,Eid al-Fitr Holiday,United Arab Emirates,4,April,16,110,20,3,Thursday,"April, 20",yes,,no
4,2023-04-21 00:00:00+00:00,Eid al-Fitr,United Arab Emirates,4,April,16,111,21,4,Friday,"April, 21",yes,,no


In [9]:
# One row per distinct (holiday_name, day_of_year) pair across all countries.
distinct_holidays_df = get_distinct_holidays_df(holidays_df)
print(distinct_holidays_df.shape)
distinct_holidays_df.head()

(3177, 14)


Unnamed: 0,date,holiday_name,country_name,month,month_name,week,day_of_year,day_of_month,day_of_week,day_of_week_name,date_name,is_national,religion,is_religious
0,2023-01-01 00:00:00+00:00,New Year's Day,United Arab Emirates,1,January,52,1,1,6,Sunday,"January, 1",yes,,no
1,2023-02-18 00:00:00+00:00,Leilat al-Meiraj (The Prophet's Ascension),United Arab Emirates,2,February,7,49,18,5,Saturday,"February, 18",no,,no
2,2023-03-23 00:00:00+00:00,Ramadan Start,United Arab Emirates,3,March,12,82,23,3,Thursday,"March, 23",no,,no
3,2023-04-20 00:00:00+00:00,Eid al-Fitr Holiday,United Arab Emirates,4,April,16,110,20,3,Thursday,"April, 20",yes,,no
4,2023-04-21 00:00:00+00:00,Eid al-Fitr,United Arab Emirates,4,April,16,111,21,4,Friday,"April, 21",yes,,no


In [10]:
# Rows flagged is_national == "yes". This filters full_df, so it is not
# restricted to holiday_type == "Holiday" the way holidays_df is.
national_holidays_df = filter_df(full_df, "is_national", 'yes')
print(national_holidays_df.shape)
national_holidays_df.head()

(3365, 14)


Unnamed: 0,date,holiday_name,country_name,month,month_name,week,day_of_year,day_of_month,day_of_week,day_of_week_name,date_name,religion,is_religious,holiday_type
0,2023-01-01 00:00:00+00:00,New Year's Day,United Arab Emirates,1,January,52,1,1,6,Sunday,"January, 1",,no,Holiday
1,2023-04-20 00:00:00+00:00,Eid al-Fitr Holiday,United Arab Emirates,4,April,16,110,20,3,Thursday,"April, 20",,no,Holiday
2,2023-04-21 00:00:00+00:00,Eid al-Fitr,United Arab Emirates,4,April,16,111,21,4,Friday,"April, 21",,no,Holiday
3,2023-04-22 00:00:00+00:00,Eid al-Fitr Holiday,United Arab Emirates,4,April,16,112,22,5,Saturday,"April, 22",,no,Holiday
4,2023-04-23 00:00:00+00:00,Eid al-Fitr Holiday,United Arab Emirates,4,April,16,113,23,6,Sunday,"April, 23",,no,Holiday


In [11]:
# One row per distinct (holiday_name, day_of_year) pair among national holidays.
distinct_national_holidays_df = get_distinct_holidays_df(national_holidays_df)
print(distinct_national_holidays_df.shape)
distinct_national_holidays_df.head()

(1670, 14)


Unnamed: 0,date,holiday_name,country_name,month,month_name,week,day_of_year,day_of_month,day_of_week,day_of_week_name,date_name,religion,is_religious,holiday_type
0,2023-01-01 00:00:00+00:00,New Year's Day,United Arab Emirates,1,January,52,1,1,6,Sunday,"January, 1",,no,Holiday
1,2023-04-20 00:00:00+00:00,Eid al-Fitr Holiday,United Arab Emirates,4,April,16,110,20,3,Thursday,"April, 20",,no,Holiday
2,2023-04-21 00:00:00+00:00,Eid al-Fitr,United Arab Emirates,4,April,16,111,21,4,Friday,"April, 21",,no,Holiday
3,2023-04-22 00:00:00+00:00,Eid al-Fitr Holiday,United Arab Emirates,4,April,16,112,22,5,Saturday,"April, 22",,no,Holiday
4,2023-04-23 00:00:00+00:00,Eid al-Fitr Holiday,United Arab Emirates,4,April,16,113,23,6,Sunday,"April, 23",,no,Holiday


## 3. Visualizations <a class="anchor" id="visualizations"></a>

In [12]:
def get_map_plot(data, title, trans=None):
    """Build a choropleth world map of per-country holiday counts.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``country_name`` (joined against the ``country`` column
        of the module-level ``countries_gdf``) and ``count`` (mapped to fill).
    title : str
        Plot title.
    trans : str, optional
        Transformation passed to the viridis fill scale (e.g. ``'sqrt'``);
        ``None`` leaves the scale untransformed.

    Returns
    -------
    Lets-Plot plot specification.
    """
    count_tooltips = (
        layer_tooltips()
        .title("@country_name")
        .format("@count", 'd')
        .line("holidays count|@count")
    )
    return (
        ggplot()
        + geom_map(
            aes(fill="count"), data=data, show_legend=False,
            map=countries_gdf, map_join=["country_name", "country"],
            color='white', size=.5,
            tooltips=count_tooltips,
        )
        + scale_fill_viridis(begin=.5, trans=trans)
        + ggtitle(title)
        + theme_void()
        # Theme backgrounds take a theme element (element_rect), not a
        # geometry layer (geom_rect).
        + theme(plot_background=element_rect(fill="#deebf7"))
    )

# Three choropleths in a two-column grid: all holidays (sqrt-scaled fill),
# national holidays only, and holidays whose name contains "New Year"
# (case-insensitive). Each map is fed the per-country row counts.
gggrid([
    get_map_plot(holidays_df["country_name"].value_counts().to_frame().reset_index(), title="All holidays", trans='sqrt'),
    get_map_plot(national_holidays_df["country_name"].value_counts().to_frame().reset_index(), title="Only national holidays"),
    get_map_plot(holidays_df[
                     holidays_df["holiday_name"].str.contains("New Year", case=False)
                 ]["country_name"].value_counts().to_frame().reset_index(), title="New Year holidays"),
], ncol=2) + ggsize(800, 640)

The first plot shows that, if you count all holidays (not just non-working days), the US has a suspiciously high number of holidays (more than the number of days in a year).

The second plot shows how many national holidays (usually non-working days) there are in each country.

The third graph illustrates the number of New Year holidays by country.

In [13]:
# Dot plot of US holidays binned by week number (bin width 1), colored by month.
# Note: `week` is the ISO week computed in get_data, so 1 January 2023 is
# counted in week 52 (see the full_df preview above).
us_holidays = holidays_df[holidays_df["country_name"] == "United States"].reset_index(drop=True)

ggplot(us_holidays) + \
    geom_dotplot(aes("week", fill="month_name"), method='histodot', binwidth=1, boundary=1e-3, \
                 color=const_color, stroke=.5, show_legend=False, \
                 tooltips=layer_tooltips().title("@month_name")\
                          .format("@week", 'd').line("week number|@week")\
                          .format("@..count..", 'd').line("week holidays count|@..count..")) + \
    scale_fill_manual(values=month_color) + \
    xlab("week number") + ylab("count") + \
    ggtitle("US holidays") + \
    ggsize(600, 450)

Let's take a closer look at the US. Here are all of its holidays by week.

In [14]:
# Donut chart of distinct national holidays per month. Slices are ordered by
# the numeric month and colored with the month palette; labels show the month
# name, tooltips show the count and the proportion.
ggplot(distinct_national_holidays_df) + \
    geom_pie(aes(fill=as_discrete("month_name", order_by="month", order=1)), 
             hole=.65, size=40, stroke=2, show_legend=False, 
             tooltips=layer_tooltips().line("count|@..count..").line("proportion|@..proppct.."), 
             labels=layer_labels().line("@month_name").size(10)) + \
    scale_fill_manual(values=month_color) + \
    ggtitle("Proportion of distinct holidays count per month") + \
    theme_void() + ggsize(600, 450)

The least festive month is December and the most festive month is April, if we take into account the number of distinct holidays.

In [15]:
# The `top_size` days of the year with the most national-holiday rows
# (counted over all countries, not de-duplicated), re-sorted by day of year
# and joined with the readable "Month, day" label used on the bars.
# NOTE(review): fill is a constant in geom_bar, so scale_fill_viridis appears
# to have no visible effect here; confirm before removing.
top_of_days_with_holiday_df = get_top_df(national_holidays_df, "day_of_year").sort_values(by="day_of_year").merge(
    national_holidays_df[["day_of_year", "date_name"]].drop_duplicates(), on="day_of_year"
)

ggplot(top_of_days_with_holiday_df) + \
    geom_bar(aes("count", as_discrete("day_of_year")), \
             stat='identity', orientation='y', \
             color=const_color, fill=const_fill, \
             labels=layer_labels().line("@date_name").size(16)) + \
    scale_fill_viridis(option='inferno') + \
    ggtitle("Number of all holidays in this day") + \
    theme(axis_title_y='blank', axis_text_y='blank') + \
    ggsize(600, 450)

Which days of the year have the most national holidays (of all countries)? January 1 is the most festive day. December 25 and May 1 are not far behind.

In [16]:
# Share of distinct national holidays falling on each day of the week.
# The frame is pre-sorted by the numeric day_of_week, presumably so the
# discrete x axis follows the weekday order rather than row order.
# The title names the actual grouping (day of week, not week of year).
# NOTE(review): fill is a constant in geom_bar, so scale_fill_viridis appears
# to have no visible effect here; confirm before removing.
ggplot(distinct_national_holidays_df.sort_values(by="day_of_week")) + \
    geom_bar(aes("day_of_week_name"), \
             color=const_color, fill=const_fill, \
             labels=layer_labels().line("@..sumpct..").size(16)) + \
    scale_fill_viridis() + \
    ggtitle("Proportion of distinct holidays count per day of week") + \
    theme(axis_title_x='blank') + ggsize(600, 450)

Looking at the distribution of national holidays by day of the week (relevant only for the year 2023), most holidays fall on Monday and the fewest on Saturday.

In [17]:
# National-holiday rows restricted to the `top_size` most frequent holiday
# names, ordered by day of year. The main layer draws one circle per
# (holiday name, date) sized by the number of rows (countries); the right-hand
# marginal bar shows the total per holiday name.
top_of_holidays_df = national_holidays_df[national_holidays_df["holiday_name"].isin(
    get_top_df(national_holidays_df, "holiday_name")["holiday_name"].to_list()
)].sort_values(by="day_of_year").reset_index(drop=True)

ggplot(top_of_holidays_df) + \
    geom_count(aes("day_of_year", "holiday_name", group="date_name"), \
               alpha=.5, shape=21, color=const_color, fill=const_fill, show_legend=False, \
               tooltips=layer_tooltips().title("@holiday_name")\
                        .line("date|@date_name")\
                        .line("number of countries|@..n..")) + \
    ggmarginal('r', size=.2, layer=geom_bar(aes(y="holiday_name"), \
                                            orientation='y', width=.4, \
                                            color=const_color, fill=const_fill, \
                                            labels=layer_labels().line("@..count.."))) + \
    scale_size(range=[2, 20]) + \
    xlab("day of year") + ylab("holiday name") + \
    ggtitle("Top {0} of common national holidays".format(top_size), \
            "How many holidays with the same name exists in the different countries?") + \
    ggsize(800, 320)

The most common holiday is New Year. Some holidays may have different dates depending on the country. The most dispersed holiday is Independence Day - of course, it falls on a different date in each country.

This plot should be treated with caution - I can only distinguish holidays by name, but in the source data it happens that the same holiday is called slightly differently in different countries.

In [18]:
def get_religious_comparison_plot(df, title):
    """Bar chart of religious vs. non-religious holidays in ``df``.

    One bar per value of the ``is_religious`` column, each labelled with its
    percentage share; tooltips are disabled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``is_religious`` column.
    title : str
        Plot title.
    """
    share_labels = layer_labels().line("@..sumpct..").size(16)
    bars = geom_bar(
        aes("is_religious"),
        color=const_color,
        fill=const_fill,
        tooltips='none',
        labels=share_labels,
    )
    return ggplot(df) + bars + xlab("is religious") + ggtitle(title)

# Religious vs. secular split: all distinct holidays next to distinct national holidays.
gggrid([
    get_religious_comparison_plot(distinct_holidays_df, "Overall distinct holidays"),
    get_religious_comparison_plot(distinct_national_holidays_df, "Distinct national holidays"),
])

Most holidays are secular. Especially if we consider only national holidays.

In [19]:
def get_religions_df():
    """Count distinct religious holidays per religion for two subsets.

    Returns a single frame stacking the per-religion counts of
    ``distinct_holidays_df`` (``subset == "overall"``) and
    ``distinct_national_holidays_df`` (``subset == "national only"``),
    with a fresh 0..n-1 index.
    """
    import pandas as pd

    def count_religions(frame, label):
        # Only rows flagged as religious carry a religion value.
        religious_rows = frame[frame["is_religious"] == "yes"]
        counts = religious_rows["religion"].value_counts().to_frame().reset_index()
        return counts.assign(subset=label)

    labelled_frames = [
        (distinct_holidays_df, "overall"),
        (distinct_national_holidays_df, "national only"),
    ]
    stacked = pd.concat(
        [count_religions(frame, label) for frame, label in labelled_frames]
    )
    return stacked.reset_index(drop=True)

# Bars of distinct religious holidays per religion, one facet per subset
# ("overall" / "national only"). The precomputed `count` column is used as
# the bar weight; tooltips show the count and the proportion.
ggplot(get_religions_df()) + \
    geom_bar(aes("religion", weight="count", fill="religion"), \
             tooltips=layer_tooltips().title("@religion")\
                                      .line("count|@..count..")\
                                      .line("proportion|@..proppct..")) + \
    facet_grid(x="subset") + \
    ggtitle("Number of distinct religious holidays") + \
    scale_fill_viridis() + \
    theme(axis_title_x='blank')

If we consider only religious holidays, then Hebrew has the most of them. However, if we look only at national holidays, then the greatest diversity is among the Orthodox.