# Data Narrative Critique
## IS 457 - Data Storytelling
### Group Members:
* Moritz Staudinger
* Faiz Mohammed Faiz
* Sunethra Kannan
* Aditya Madduluri

In this analysis, we reproduced the findings of "Some People Are Too Superstitious To Have A Baby On Friday The 13th" (https://fivethirtyeight.com/features/some-people-are-too-superstitious-to-have-a-baby-on-friday-the-13th/)

In [84]:
import pandas as pd
import plotly as plt
import plotly.express as px
import plotly.graph_objects as go

The datasets are available at: 
https://github.com/fivethirtyeight/data/tree/master/births

In [85]:
df_ssa = pd.read_csv('US_births_2000-2014_SSA.csv')  
df_nchs = pd.read_csv('US_births_1994-2003_CDC_NCHS.csv') 

We can see that both files have the same columns and columnnames, this is important for combining them later

In [86]:
df_ssa.head()

Unnamed: 0,year,month,date_of_month,day_of_week,births
0,2000,1,1,6,9083
1,2000,1,2,7,8006
2,2000,1,3,1,11363
3,2000,1,4,2,13032
4,2000,1,5,3,12558


In [87]:
df_nchs.head()

Unnamed: 0,year,month,date_of_month,day_of_week,births
0,1994,1,1,6,8096
1,1994,1,2,7,7772
2,1994,1,3,1,10142
3,1994,1,4,2,11248
4,1994,1,5,3,11053


Generating the average births per day of the year, without depending on the day of the week.
Further calculating the average births as in the original report only for the 6th and 20th day of the month

In [88]:
# Only Execute this block if holidays should be excluded -> Will also affect the 2 plot, so be careful
#Excluding Holidays (Martin Luther King Day, Presidents Day, Labor Day, Columbus Day)
df_ssa=df_ssa[~((df_ssa["month"]==1) & (df_ssa["date_of_month"]==20) & (df_ssa["day_of_week"]==1))] 
df_ssa=df_ssa[~((df_ssa["month"]==2) & (df_ssa["date_of_month"]==20) & (df_ssa["day_of_week"]==1))] 
df_ssa=df_ssa[~((df_ssa["month"]==9) & (df_ssa["date_of_month"]==6) & (df_ssa["day_of_week"]==1))] 
df_ssa=df_ssa[~((df_ssa["month"]==10) & (df_ssa["date_of_month"]==13) & (df_ssa["day_of_week"]==1))] 
df_ssa # 9 rows to delete

Unnamed: 0,year,month,date_of_month,day_of_week,births
0,2000,1,1,6,9083
1,2000,1,2,7,8006
2,2000,1,3,1,11363
3,2000,1,4,2,13032
4,2000,1,5,3,12558
...,...,...,...,...,...
5474,2014,12,27,6,8656
5475,2014,12,28,7,7724
5476,2014,12,29,1,12811
5477,2014,12,30,2,13634


In [89]:
df_ssa_holidays = df_ssa[(df_ssa["month"]==1) & (df_ssa["date_of_month"]==20) & (df_ssa["day_of_week"]==1)] 


births_per_day_and_weekday_ssa = df_ssa[["date_of_month", "day_of_week", "births"]].groupby(by=['date_of_month', 'day_of_week']).sum().reset_index()
births_per_day_and_weekday_ssa["births"] = births_per_day_and_weekday_ssa["births"]/(df_ssa["year"].unique().size)
average_births_ssa = births_per_day_and_weekday_ssa["births"].mean()
average_births_6_20_ssa = births_per_day_and_weekday_ssa[(births_per_day_and_weekday_ssa['date_of_month']==6) | (births_per_day_and_weekday_ssa['date_of_month']==20)]["births"].mean()

In [90]:
print(births_per_day_and_weekday_ssa[births_per_day_and_weekday_ssa["date_of_month"]==13])
print(average_births_ssa)
print(average_births_6_20_ssa)

    date_of_month  day_of_week        births
84             13            1  16601.400000
85             13            2  22299.333333
86             13            3  21226.733333
87             13            4  23452.266667
88             13            5  19916.600000
89             13            6  14251.266667
90             13            7  13302.533333
19075.95821812596
19335.75238095238


Repeating this process for the other dataset as well

In [91]:
births_per_day_and_weekday_nchs = df_nchs[["date_of_month", "day_of_week", "births"]].groupby(by=['date_of_month', 'day_of_week']).sum().reset_index()
births_per_day_and_weekday_nchs["births"] = births_per_day_and_weekday_nchs["births"]/(df_nchs["year"].unique().size)
average_births_nchs = births_per_day_and_weekday_nchs["births"].mean()
average_births_6_20_nchs = births_per_day_and_weekday_nchs[(births_per_day_and_weekday_nchs['date_of_month']==6) | (births_per_day_and_weekday_nchs['date_of_month']==20)]["births"].mean()

In [92]:
print(births_per_day_and_weekday_nchs[births_per_day_and_weekday_nchs["date_of_month"]==13])
print(average_births_nchs)
print(average_births_6_20_nchs)

    date_of_month  day_of_week   births
84             13            1  18381.2
85             13            2  19469.5
86             13            3  20216.3
87             13            4  22539.5
88             13            5  18247.7
89             13            6  15602.3
90             13            7  13311.9
18305.132258064514
18780.678571428576


Combining the two datasets and calculating the average births.
Using the days of the week as further parameter and do not use the total value (as the authors seem to have used the same). The values seem to be better when adapting it on a per weekday basis

In [93]:
births_per_day_and_weekday = births_per_day_and_weekday_nchs[["date_of_month", 'day_of_week']]
births_per_day_and_weekday["births"] = (births_per_day_and_weekday_nchs["births"]+births_per_day_and_weekday_ssa["births"])/2
average_births = births_per_day_and_weekday["births"].mean()
average_births_by_day = births_per_day_and_weekday[["births", "day_of_week"]].groupby(by=["day_of_week"]).mean()
average_births_6_20 = births_per_day_and_weekday[(births_per_day_and_weekday['date_of_month']==6) | (births_per_day_and_weekday['date_of_month']==20)]["births"].mean()
average_births_6_20_by_day = births_per_day_and_weekday[(births_per_day_and_weekday['date_of_month']==6) | (births_per_day_and_weekday['date_of_month']==20)][["births", "day_of_week"]].groupby(by=["day_of_week"]).mean()

In [94]:
average_births_6_20_by_day

Unnamed: 0_level_0,births
day_of_week,Unnamed: 1_level_1
1,18060.1
2,21309.591667
3,21368.125
4,23791.891667
5,20427.991667
6,15024.991667
7,13424.816667


In [95]:
births_per_day_and_weekday["average_births_6_20"] = average_births_6_20
births_per_day_and_weekday["average_births"] = average_births
# per weekday
births_per_day_and_weekday = births_per_day_and_weekday.join(average_births_6_20_by_day, on=["day_of_week"], rsuffix="_avg_per_day_6_20")
births_per_day_and_weekday = births_per_day_and_weekday.join(average_births_by_day, on=["day_of_week"], rsuffix="_avg_per_day")

In [96]:
#Seems better!
births_per_day_and_weekday[births_per_day_and_weekday["date_of_month"]==13][["births", "average_births_6_20","average_births","births_avg_per_day_6_20","births_avg_per_day"]]

Unnamed: 0,births,average_births_6_20,average_births,births_avg_per_day_6_20,births_avg_per_day
84,17491.3,19058.215476,18690.545238,18060.1,19252.655914
85,20884.416667,19058.215476,18690.545238,21309.591667,21445.341935
86,20721.516667,19058.215476,18690.545238,21368.125,21068.185484
87,22995.883333,19058.215476,18690.545238,23791.891667,20944.171505
88,19082.15,19058.215476,18690.545238,20427.991667,20645.897312
89,14926.783333,19058.215476,18690.545238,15024.991667,14567.377957
90,13307.216667,19058.215476,18690.545238,13424.816667,12910.186559


Calculating the percentage change, to the average births (for all variants)

In [97]:
births_per_day_and_weekday["percentage_6_20"] = ((births_per_day_and_weekday["births"]-births_per_day_and_weekday["average_births_6_20"])/births_per_day_and_weekday["average_births_6_20"])*100
births_per_day_and_weekday["percentage_total"] = ((births_per_day_and_weekday["births"]-births_per_day_and_weekday["average_births"])/births_per_day_and_weekday["average_births"])*100
#per weekday
births_per_day_and_weekday["percentage_6_20_day_of_week"] = ((births_per_day_and_weekday["births"]-births_per_day_and_weekday["births_avg_per_day_6_20"])/births_per_day_and_weekday["births_avg_per_day_6_20"])*100
births_per_day_and_weekday["percentage_total_day_of_week"] = ((births_per_day_and_weekday["births"]-births_per_day_and_weekday["births_avg_per_day"])/births_per_day_and_weekday["births_avg_per_day"])*100

Mapping the day of the week to its name, as it is better readable in the plot then

In [98]:
dayOfWeek={1:'Monday', 2:'Tuesday', 3:'Wednesday', 4:'Thursday', 5:'Friday', 6:'Saturday', 7:'Sunday'}
births_per_day_and_weekday['weekday'] = births_per_day_and_weekday['day_of_week'].map(dayOfWeek)

In [99]:
#Seems better!
births_per_day_and_weekday[births_per_day_and_weekday["date_of_month"]==13][["percentage_6_20","percentage_total", "percentage_6_20_day_of_week", "percentage_total_day_of_week" ]]

Unnamed: 0,percentage_6_20,percentage_total,percentage_6_20_day_of_week,percentage_total_day_of_week
84,-8.221732,-6.41632,-3.149484,-9.148639
85,9.582226,11.737867,-1.995228,-2.615604
86,8.727476,10.866304,-3.026042,-1.645461
87,20.661262,23.034845,-3.345713,9.796099
88,0.125586,2.095202,-6.588223,-7.574131
89,-21.677959,-20.13725,-0.653633,2.467193
90,-30.175956,-28.802416,-0.87599,3.075324


Recreating the original plot, with only taking the births on the 6th and 20th into account. Looks fairly similar, but differs on Monday. This could be, because we did not exclude holidays in our visualization, as the authors just linked a list of holidays and used this arbitrary dates.

In [100]:
fig = px.bar(births_per_day_and_weekday[births_per_day_and_weekday["date_of_month"]==13], 
             x='weekday', 
             y='percentage_6_20_day_of_week', 
             title="The Friday the 13th effect<br> <sub>Difference in the share of U.S. births on the 13th of each month <br> from the average of births per weekday on the 6th and the 20th, 1994-2014</sub>",
                labels=dict(weekday=" ", percentage_6_20_day_of_week="Difference in percent to avg 6th and 20th weekday"),
             color_discrete_sequence=["violet"],)
fig.update_layout(title_x=0.5, paper_bgcolor='rgba(240,240,240,240)',
    plot_bgcolor='rgba(0,0,0,0)')
fig.update_xaxes(showgrid=True, side="top", title_standoff=100, gridcolor="#D2D2D2")
fig.update_yaxes(showgrid=True, gridcolor="#D2D2D2")
fig.add_annotation(x=0, y=-6.5,
            text="Excluding holidays",
            showarrow=False,
            yshift=10)
fig.show()

Therefore, we created a plot, that takes all weekdays into account, and not just the 6th and 20th and we can see that the values differ significantly

In [101]:
fig = px.bar(births_per_day_and_weekday[births_per_day_and_weekday["date_of_month"]==13], 
             x='weekday', 
             y='percentage_total_day_of_week', 
             title="The Friday the 13th effect<br> <sub>Difference in the share of U.S. births on the 13th of each month <br> from the average of births per weekday, 1994-2014</sub>",
             labels=dict(weekday=" ", percentage_total_day_of_week="Difference in percent to avg weekday"),
            color_discrete_sequence=["violet"],)
fig.update_layout(title_x=0.5, paper_bgcolor='rgba(240,240,240,240)',
    plot_bgcolor='rgba(0,0,0,0)')
fig.update_layout(title_x=0.5)
fig.update_xaxes(showgrid=True, side="top", title_standoff=100, gridcolor="#D2D2D2")
fig.update_yaxes(showgrid=True, gridcolor="#D2D2D2")
fig.update_layout(title_x=0.5)
fig.show()

## Comparing for all days of the year
Here we are lookin into the second part of the article, as in the article, only with the SSA dataset from 2000-2014

and do a similar preprocessing as before, but taking the month into account

In [102]:
df_ssa = pd.read_csv('US_births_2000-2014_SSA.csv')  
births_per_day_and_weekday_ssa = df_ssa[["month","date_of_month", "day_of_week", "births"]].groupby(by=['month','date_of_month']).mean().reset_index()

In [103]:
births_per_day_and_weekday_ssa

Unnamed: 0,month,date_of_month,day_of_week,births
0,1,1,3.933333,7735.333333
1,1,2,4.000000,9605.533333
2,1,3,3.600000,11341.000000
3,1,4,4.133333,11444.200000
4,1,5,4.200000,11112.133333
...,...,...,...,...
361,12,27,3.933333,12189.266667
362,12,28,4.466667,12192.533333
363,12,29,4.066667,12068.066667
364,12,30,4.133333,11917.933333


In [104]:
average_births_by_day = births_per_day_and_weekday_ssa[["births", "day_of_week","month"]].groupby(by=["day_of_week","month"]).mean()

average_births_6_20_by_day = births_per_day_and_weekday_ssa[(births_per_day_and_weekday_ssa['date_of_month']==6) | (births_per_day_and_weekday_ssa['date_of_month']==20)][["births", "day_of_week","month"]].groupby(by=["day_of_week","month"]).mean()

In [105]:
births_per_day_and_weekday_ssa

Unnamed: 0,month,date_of_month,day_of_week,births
0,1,1,3.933333,7735.333333
1,1,2,4.000000,9605.533333
2,1,3,3.600000,11341.000000
3,1,4,4.133333,11444.200000
4,1,5,4.200000,11112.133333
...,...,...,...,...
361,12,27,3.933333,12189.266667
362,12,28,4.466667,12192.533333
363,12,29,4.066667,12068.066667
364,12,30,4.133333,11917.933333


In [106]:
births_per_day_and_weekday_ssa = births_per_day_and_weekday_ssa.join(average_births_6_20_by_day, on=["day_of_week","month"], rsuffix="_avg_per_day_6_20")
births_per_day_and_weekday_ssa = births_per_day_and_weekday_ssa.join(average_births_by_day, on=["day_of_week","month"], rsuffix="_avg_per_day")

births_per_day_and_weekday_ssa["percentage_6_20_day_of_week"] = ((births_per_day_and_weekday_ssa["births"]-births_per_day_and_weekday_ssa["births_avg_per_day_6_20"])/births_per_day_and_weekday_ssa["births_avg_per_day_6_20"])*100
births_per_day_and_weekday_ssa["percentage_total_day_of_week"] = ((births_per_day_and_weekday_ssa["births"]-births_per_day_and_weekday_ssa["births_avg_per_day"])/births_per_day_and_weekday_ssa["births_avg_per_day"])*100

births_per_day_and_weekday_ssa["date"]=pd.to_datetime("2020" + "/" + births_per_day_and_weekday_ssa["month"].apply(str) + "/" + births_per_day_and_weekday_ssa["date_of_month"].apply(str))

In [107]:
births_per_day_and_weekday_ssa[births_per_day_and_weekday_ssa["date_of_month"]==1]

Unnamed: 0,month,date_of_month,day_of_week,births,births_avg_per_day_6_20,births_avg_per_day,percentage_6_20_day_of_week,percentage_total_day_of_week,date
0,1,1,3.933333,7735.333333,,10194.013333,,-24.118862,2020-01-01
31,2,1,4.133333,11296.266667,,11408.7,,-0.985505,2020-02-01
60,3,1,3.933333,11446.533333,,11271.466667,,1.553184,2020-03-01
91,4,1,4.133333,10315.8,,10725.986667,,-3.824232,2020-04-01
121,5,1,3.8,11295.866667,,11310.373333,,-0.12826,2020-05-01
152,6,1,4.466667,11423.2,,11369.56,,0.471786,2020-06-01
182,7,1,4.133333,11854.666667,,11754.106667,,0.855531,2020-07-01
213,8,1,3.866667,12216.933333,,12318.413333,,-0.823807,2020-08-01
244,9,1,4.066667,10931.933333,,11767.466667,,-7.100367,2020-09-01
274,10,1,3.733333,11955.6,,11527.56,,3.713188,2020-10-01


We are then recreating the plot, by using 4 different subplots and merging them (Feb 14th needed an individual plot to be readable).
Our results are again fairly similar, but the differences for Feb 14th and 29th are not that significant (without performing a significance test)

In [108]:
# A line graph with the births per day
fig1 = px.line(births_per_day_and_weekday_ssa, 
             x='date', 
             y='percentage_total_day_of_week', color_discrete_sequence=["black"])
# Scatter of all 13th
fig2 = go.Figure()
fig2.add_trace(go.Scatter(
    mode="markers",
    marker=dict(color='black'),
    x=births_per_day_and_weekday_ssa[(births_per_day_and_weekday_ssa["date_of_month"]==13)]["date"],
    y=births_per_day_and_weekday_ssa[(births_per_day_and_weekday_ssa["date_of_month"]==13)]["percentage_total_day_of_week"],
    name="Friday the 13th",
    showlegend=True
))

#All Important dates
important_days = [pd.to_datetime("2020/01/01"),pd.to_datetime("2020/04/01"),pd.to_datetime("2020/07/04"),pd.to_datetime("2020/10/31"),pd.to_datetime("2020/12/25"),pd.to_datetime("2020/12/31") ]
important_days_text = ["Jan 1", "April 1", "July 4", "Oct 31", "Dec 25", "Dec 31"]
fig3 = go.Figure()
fig3.add_trace(go.Scatter(
    mode="markers+text",
    marker=dict(color='red'),
    x=births_per_day_and_weekday_ssa[(births_per_day_and_weekday_ssa["date"].isin(important_days) )]["date"],
    y=births_per_day_and_weekday_ssa[(births_per_day_and_weekday_ssa["date"].isin(important_days) )]["percentage_total_day_of_week"],
    text = important_days_text,
    textposition="bottom center",
    showlegend=False
))
#Feb 14
fig4 = go.Figure()
fig4.add_trace(go.Scatter(
    mode="markers+text",
    marker=dict(color='red'),
    x=births_per_day_and_weekday_ssa[(births_per_day_and_weekday_ssa["date"].isin([pd.to_datetime("2020/02/14"),pd.to_datetime("2020/02/29")] ))]["date"],
    y=births_per_day_and_weekday_ssa[(births_per_day_and_weekday_ssa["date"].isin([pd.to_datetime("2020/02/14"),pd.to_datetime("2020/02/29")] ))]["percentage_total_day_of_week"],
    text = ["Feb 14", "Feb 29"],
    textposition="top center",
    showlegend=False
))


fig = go.Figure(data=fig1.data+fig2.data+fig3.data+fig4.data)
fig.update_xaxes(
    showgrid=True, 
    side="bottom", 
    title_standoff=100, 
    gridcolor="#D2D2D2",
    ticktext=["Jan", "April", "July", "Oct", "Jan"],
    tickvals=["2020-01-01", "2020-04-01", "2020-07-01","2020-10-01","2020-12-31" ],
)
fig.update_layout(
    title="Fewer babies are born on Fridays the 13th of each month <br> <sub>US births relative to average adjusted to month and day of the week</sub>",
    yaxis_title="Percentage change from the average births per day",
    xaxis_title="Day of the Year",
    title_x=0.5, 
    paper_bgcolor='rgba(240,240,240,240)',
    plot_bgcolor='rgba(0,0,0,0)')
fig.update_yaxes(showgrid=True, gridcolor="#D2D2D2")
fig.update_layout(title_x=0.5)
fig.add_annotation(
            x=pd.to_datetime("2020/11/27"), y=-10,
            text="Thanksgiving",
            showarrow=False,
            yshift=-5,
            xshift=2)
fig.add_annotation(
            x=pd.to_datetime("2020/09/03"), y=-8,
            text="Labor Day",
            showarrow=False,
            yshift=-5,
            xshift=2)
fig.add_annotation(
            x=pd.to_datetime("2020/05/26"), y=-5,
            text="Memorial Day",
            showarrow=False,
            yshift=-5,
            xshift=2)
fig.show()