In [1]:
import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio
# Set Plotly's default figure template to "plotly_white" for the figures built below.
pio.templates.default = "plotly_white"

In [2]:
# Load the dataset from a CSV file referenced by a relative path.
df = pd.read_csv("netflix_content_2023.csv")
# Preview the first three rows.
df.head(3)

Unnamed: 0,Title,Available Globally?,Release Date,Hours Viewed,Language Indicator,Content Type
0,The Night Agent: Season 1,Yes,2023-03-23,812100000,English,Show
1,Ginny & Georgia: Season 2,Yes,2023-01-05,665100000,English,Show
2,The Glory: Season 1 // 더 글로리: 시즌 1,Yes,2022-12-30,622800000,Korean,Show


In [3]:
# Inspect column dtypes and non-null counts before any cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24812 entries, 0 to 24811
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Title                24812 non-null  object
 1   Available Globally?  24812 non-null  object
 2   Release Date         8166 non-null   object
 3   Hours Viewed         24812 non-null  object
 4   Language Indicator   24812 non-null  object
 5   Content Type         24812 non-null  object
dtypes: object(6)
memory usage: 1.1+ MB


In [4]:
# 'Hours Viewed' is loaded as text (object dtype): drop the commas, then cast to float.
hours_without_commas = df["Hours Viewed"].replace(to_replace=',', value='', regex=True)
df["Hours Viewed"] = hours_without_commas.astype(float)

# Convert 'Release Date' to datetime64.
df['Release Date'] = pd.to_datetime(df['Release Date'])

# Re-inspect the frame to confirm the new dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24812 entries, 0 to 24811
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Title                24812 non-null  object        
 1   Available Globally?  24812 non-null  object        
 2   Release Date         8166 non-null   datetime64[ns]
 3   Hours Viewed         24812 non-null  float64       
 4   Language Indicator   24812 non-null  object        
 5   Content Type         24812 non-null  object        
dtypes: datetime64[ns](1), float64(1), object(4)
memory usage: 1.1+ MB


In [5]:
# (number of rows, number of columns) of the frame.
df.shape

(24812, 6)

In [6]:
# Summary statistics for the datetime and numeric columns.
df.describe()

Unnamed: 0,Release Date,Hours Viewed
count,8166,24812.0
mean,2020-05-31 06:57:55.679647488,6384084.0
min,2010-04-01 00:00:00,100000.0
25%,2018-11-30 00:00:00,300000.0
50%,2020-08-15 00:00:00,1100000.0
75%,2022-03-18 00:00:00,4700000.0
max,2023-12-31 00:00:00,812100000.0
std,,20670860.0


In [7]:
# Total hours viewed for each content type.
content_type_viewership = df.groupby('Content Type')['Hours Viewed'].sum()

# One bar per content type; the colour list is applied positionally.
fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=content_type_viewership.index,
        y=content_type_viewership.values,
        marker=dict(color=['skyblue', 'salmon']),
    )
)

fig.update_layout(
    title='Total Viewership Hours by Content Type (2023)',
    xaxis=dict(title='Content Type', tickangle=0),
    yaxis=dict(title='Total Hours Viewed (in billions)'),
    height=500,
    width=800,
)

fig.show()

In [8]:
# Total hours viewed per language, largest first so the bars read in rank order.
Language_type_viewership = (
    df.groupby("Language Indicator")["Hours Viewed"]
    .sum()
    .sort_values(ascending=False)
)

fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=Language_type_viewership.index,
        y=Language_type_viewership.values,
        marker=dict(color='lightcoral'),
    )
)

fig.update_layout(
    title='Total viewership Hours by Language(2023)',
    xaxis=dict(title='Language', tickangle=0),
    yaxis=dict(title='Total Hours Viewed (in billions)'),
    height=600,
    width=1000,
)
fig.show()

In [9]:
# Calendar month (1-12) of each release; rows without a release date get NaN.
df['Release Month'] = df['Release Date'].dt.month

# Total hours viewed per release month. groupby drops the NaN key by default,
# so only titles with a known release date contribute.
monthly_viewership = df.groupby('Release Month')['Hours Viewed'].sum()

fig = go.Figure(data=[
    go.Scatter(
        x=monthly_viewership.index,
        y=monthly_viewership.values,
        mode='lines+markers',
        marker=dict(color='red'),
        line=dict(color='blue')
    )
])

fig.update_layout(
    # Title spelling fixed (was 'vieweships') and worded like the other charts.
    title='Total Viewership Hours by Release Month (2023)',
    xaxis_title='Month',
    yaxis_title='Total Hours Viewed (in billions)',
    # Show month abbreviations instead of the numeric month index.
    xaxis=dict(
        tickmode='array',
        tickvals=list(range(1, 13)),
        ticktext=['jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    ),
    height=600,
    width=1000
)
fig.show()

In [10]:
# The five rows with the greatest 'Hours Viewed'.
top_5_titles = df.nlargest(n=5, columns='Hours Viewed')

# Display only the columns relevant to the ranking.
summary_columns = ['Title', 'Hours Viewed', 'Content Type', 'Release Month']
top_5_titles[summary_columns]

Unnamed: 0,Title,Hours Viewed,Content Type,Release Month
0,The Night Agent: Season 1,812100000.0,Show,3.0
1,Ginny & Georgia: Season 2,665100000.0,Show,1.0
18227,King the Land: Limited Series // 킹더랜드: 리미티드 시리즈,630200000.0,Movie,6.0
2,The Glory: Season 1 // 더 글로리: 시즌 1,622800000.0,Show,12.0
18214,ONE PIECE: Season 1,541900000.0,Show,8.0


In [11]:
# Rows: release month, columns: content type, cells: summed hours viewed.
monthly_viewership_by_type = df.pivot_table(
    values='Hours Viewed',
    index='Release Month',
    columns='Content Type',
    aggfunc='sum',
)

fig = go.Figure()

# Add one line per content type (one per pivot column).
for content_type, hours_by_month in monthly_viewership_by_type.items():
    fig.add_trace(
        go.Scatter(
            x=monthly_viewership_by_type.index,
            y=hours_by_month,
            mode='lines+markers',
            name=content_type,
        )
    )

fig.update_layout(
    title='Viewership trends by Contents Type and Release Month (2023)',
    # Show month abbreviations instead of the numeric month index.
    xaxis=dict(
        title='Month',
        tickmode='array',
        tickvals=list(range(1, 13)),
        ticktext=['jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
    ),
    yaxis=dict(title='Total Hours Viewed (in billions)'),
    legend_title='content Type',
    height=600,
    width=1000,
)
fig.show()

In [12]:
def get_season(month):
    """Map a calendar month number to its meteorological season.

    Parameters
    ----------
    month : int or float
        Month number 1-12. Float values such as ``3.0`` are accepted because
        a month column containing missing values is stored as float.

    Returns
    -------
    str or None
        'Winter' (12, 1, 2), 'Spring' (3-5), 'Summer' (6-8) or 'Fall' (9-11).
        ``None`` for anything else, including NaN. Previously every
        unmatched value fell through to 'Fall', so titles with no release
        date were silently counted as Fall releases.
    """
    if month in (12, 1, 2):
        return 'Winter'
    if month in (3, 4, 5):
        return 'Spring'
    if month in (6, 7, 8):
        return 'Summer'
    if month in (9, 10, 11):
        return 'Fall'
    # NaN never compares equal to anything, so missing months end up here.
    return None
    
# Order in which the seasons should appear on the x-axis.
season_order = ['Winter', 'Spring', 'Summer','Fall']

# Label every row with the season returned by get_season for its release month.
df['Release season'] = df['Release Month'].apply(get_season)

# Total hours viewed per season, arranged in display order.
season_viewership = (
    df.groupby('Release season')['Hours Viewed']
    .sum()
    .reindex(season_order)
)

fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=season_viewership.index,
        y=season_viewership.values,
        marker=dict(color='orange'),
    )
)

fig.update_layout(
    title='Total Viewership Hours by Release Season (2023)',
    # Pin the category order so the bars follow season_order.
    xaxis=dict(
        title='season',
        tickangle=0,
        categoryorder='array',
        categoryarray=season_order,
    ),
    yaxis=dict(title='Total Hours Viewed (in billions)'),
    height=500,
    width=800,
)
fig.show()

In [13]:
# Number of titles per release month, ordered by month number.
monthly_release = df['Release Month'].value_counts().sort_index()

# Total hours viewed per release month.
monthly_viewership = df.groupby('Release Month')['Hours Viewed'].sum()

# Bars: release counts, plotted against the left-hand y-axis.
releases_bar = go.Bar(
    x=monthly_release.index,
    y=monthly_release.values,
    name='Number of releases',
    marker=dict(color='goldenrod'),
    opacity=0.7,
    yaxis='y1',
)

# Line: viewership hours, plotted against the right-hand y-axis.
viewership_line = go.Scatter(
    x=monthly_viewership.index,
    y=monthly_viewership.values,
    name='viewership Hours',
    mode='lines+markers',
    marker=dict(color='red'),
    line=dict(color='red'),
    yaxis='y2',
)

fig = go.Figure(data=[releases_bar, viewership_line])

fig.update_layout(
    title='Monthly release Patterns and Viewership Hours (2023)',
    # Show month abbreviations instead of the numeric month index.
    xaxis=dict(
        title='Month',
        tickmode='array',
        tickvals=list(range(1, 13)),
        ticktext=['jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
    ),
    yaxis=dict(title='number of releases', side='left'),
    # Secondary axis is overlaid on the primary one and drawn on the right.
    yaxis2=dict(
        title='Total Hours viewed (in billions)',
        overlaying='y',
        side='right',
    ),
    # Place the legend just outside the plot area on the right.
    legend=dict(x=1.05, y=1, orientation='v', xanchor='left'),
    height=600,
    width=1000,
)
fig.show()

In [14]:
# Weekday name of each release; rows without a release date get NaN.
df['Releases Day'] = df['Release Date'].dt.day_name()

# Monday-first ordering, used for both reindexing and the x-axis categories.
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Number of titles released on each weekday.
weekday_releases = df['Releases Day'].value_counts().reindex(days)

# Total hours viewed for titles released on each weekday.
weekly_viewership = df.groupby('Releases Day')['Hours Viewed'].sum().reindex(days)

fig = go.Figure()

# Bars: release counts, plotted against the left-hand y-axis.
fig.add_trace(
    go.Bar(
        x=weekday_releases.index,
        y=weekday_releases.values,
        name='Number of Releases',
        marker_color='blue',
        opacity=0.6,
        yaxis='y1'
    )
)

# Line: viewership hours, plotted against the right-hand y-axis.
fig.add_trace(
    go.Scatter(
        x=weekly_viewership.index,
        y=weekly_viewership.values,
        name='viewership Hours',
        mode='lines+markers',
        marker=dict(color='red'),
        line=dict(color='red'),
        yaxis='y2'
    )
)

fig.update_layout(
    title='weekly releases Pattern and viewership Hours (2023)',
    xaxis=dict(
        # Spelling fixed (was 'Day of the weeky').
        title='Day of the week',
        categoryorder='array',
        # Reuse `days` so the axis order cannot drift from the reindex order.
        categoryarray=days
    ),
    yaxis=dict(
        title='Number of releases',
        showgrid=False,
        side='left'
    ),
    yaxis2=dict(
        # Spelling fixed (was 'billons').
        title='viewership hours (in billions)',
        overlaying='y',
        showgrid=False,
        side='right'
    ),
    # Place the legend just outside the plot area on the right.
    legend=dict(
        x=1.05,
        y=1,
        orientation='v',
        xanchor='left'
    ),
    height=600,
    width=1000
)

fig.show()
