In [133]:
import pandas as pd
from matplotlib import pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.templates.default = "plotly_white"

In [134]:
#get the dataset from csv file
df = pd.read_csv(r"/content/sample_data/netflix_content_2023.csv")
df.head()

Unnamed: 0,Title,Available Globally?,Release Date,Hours Viewed,Language Indicator,Content Type
0,The Night Agent: Season 1,Yes,2023-03-23,812100000,English,Show
1,Ginny & Georgia: Season 2,Yes,2023-01-05,665100000,English,Show
2,The Glory: Season 1 // 더 글로리: 시즌 1,Yes,2022-12-30,622800000,Korean,Show
3,Wednesday: Season 1,Yes,2022-11-23,507700000,English,Show
4,Queen Charlotte: A Bridgerton Story,Yes,2023-05-04,503000000,English,Movie


In [135]:
df.shape

(24812, 6)

In [136]:
df.isnull().sum()

Unnamed: 0,0
Title,0
Available Globally?,0
Release Date,16646
Hours Viewed,0
Language Indicator,0
Content Type,0


In [137]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24812 entries, 0 to 24811
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Title                24812 non-null  object
 1   Available Globally?  24812 non-null  object
 2   Release Date         8166 non-null   object
 3   Hours Viewed         24812 non-null  object
 4   Language Indicator   24812 non-null  object
 5   Content Type         24812 non-null  object
dtypes: object(6)
memory usage: 1.1+ MB


In [138]:
df['Release Date'] = df['Release Date'].fillna('2023-03-23')

In [139]:
df['Release Date']

Unnamed: 0,Release Date
0,2023-03-23
1,2023-01-05
2,2022-12-30
3,2022-11-23
4,2023-05-04
...,...
24807,2023-03-23
24808,2019-07-30
24809,2022-07-26
24810,2020-09-28


In [140]:
# Let me start with cleaning and preprocessing the “Hours Viewed” column to prepare it for analysis:
df['Hours Viewed'] = df['Hours Viewed'].replace(',', '', regex = True).astype('float')
df[['Title','Hours Viewed']].head()
df['Hours Viewed'].dtypes

dtype('float64')

In [141]:
Content_viewership_Type = df.groupby(['Content Type'])['Hours Viewed'].sum()

fig = go.Figure(data = [
    go.Bar(
        x= Content_viewership_Type.index,
        y = Content_viewership_Type.values,
        marker_color = ['skyblue', 'purple'],

    )
])

fig.update_layout(
        title = 'Total viewership hours by content type',
        xaxis_title = 'Content Type',
        yaxis_title = 'Total Hours viewed',
        height = 800,
        width = 400
)

fig.show()


In [142]:
df['Language Indicator'].value_counts()

Unnamed: 0_level_0,count
Language Indicator,Unnamed: 1_level_1
English,17268
Non-English,3252
Japanese,2297
Korean,1582
Hindi,374
Russian,39


In [143]:
# Next, let’s analyze the distribution of viewership across different languages to understand which languages are contributing the most to Netflix’s content consumption:
viewership_language=df.groupby(['Language Indicator'])['Hours Viewed'].max().sort_values(ascending = False)
fig = go.Figure(data =[
  go.Bar(
      x = viewership_language.index,
      y= viewership_language.values
  )

])
fig.update_layout(
    xaxis_title = 'Language Indicator',
    yaxis_title = 'Hours Viewed',
    title = 'viewership accross diffrent language',
    height = 600,
    width = 1000
)
fig.show()

In [144]:
df

Unnamed: 0,Title,Available Globally?,Release Date,Hours Viewed,Language Indicator,Content Type
0,The Night Agent: Season 1,Yes,2023-03-23,812100000.0,English,Show
1,Ginny & Georgia: Season 2,Yes,2023-01-05,665100000.0,English,Show
2,The Glory: Season 1 // 더 글로리: 시즌 1,Yes,2022-12-30,622800000.0,Korean,Show
3,Wednesday: Season 1,Yes,2022-11-23,507700000.0,English,Show
4,Queen Charlotte: A Bridgerton Story,Yes,2023-05-04,503000000.0,English,Movie
...,...,...,...,...,...,...
24807,We Are Black and British: Season 1,No,2023-03-23,100000.0,English,Show
24808,Whitney Cummings: Can I Touch It?,Yes,2019-07-30,100000.0,English,Movie
24809,Whitney Cummings: Jokes,No,2022-07-26,100000.0,English,Movie
24810,"Whose Vote Counts, Explained: Limited Series",Yes,2020-09-28,100000.0,English,Movie


In [145]:
# convert the "Release Date" to a datetime format and extract the month
import pandas as pd
import plotly.graph_objects as go
df['Release Date'] = pd.to_datetime(df['Release Date'])
df['Release Month'] = df['Release Date'].dt.month
# df
#aggregate viewership hours by release month
monthly_viewership= df.groupby(['Release Month'])['Hours Viewed'].sum()

fig = go.Figure(data =[
    go.Scatter(
        x = monthly_viewership.index,
        y = monthly_viewership.values,
        marker = dict(color ='blue'),
        mode = 'lines+markers',
        line = dict(color='blue')
    )
])
fig.update_layout(
    xaxis_title ='Relase Month',
    yaxis_title ='Hours Viewed',
    title = 'viewership hours by release month',
    width = 1000,
    height = 600,
    xaxis = dict(
      tickmode = 'array',
      tickvals = list(range(1, 13)),
      ticktext = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun','Jul',
                  'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    )
)
fig.show()

In [146]:
fig = go.Figure(data = [
    go.Bar(
        x = monthly_viewership.index,
        y = monthly_viewership.values,
        marker_color = 'blue'
    )

])
fig.show()

In [147]:
# convert the "Release Date" to a datetime format and extract the day of week
df['Release Date'] = pd.to_datetime(df['Release Date'])
df['Release Day'] = df['Release Date'].dt.day_of_week
print(df['Release Day'].unique())
# aggregate viewership hours by release Week
monthly_viewership= df.groupby(['Release Day'])['Hours Viewed'].sum()

fig = go.Figure(data =[
    go.Scatter(
        x = monthly_viewership.index,
        y = monthly_viewership.values,
        marker = dict(color ='blue'),
        mode = 'lines+markers',
        line = dict(color='blue')
    )
])
fig.update_layout(
    xaxis_title ='Release Day',
    yaxis_title ='Hours Viewed',
    title = 'viewership hours by release Week',
    width = 1000,
    height = 600,
    xaxis = dict(
      tickmode = 'array',
      tickvals = list(range(0, 7)),
      ticktext = ['Sun', 'Mon', 'Tue', 'Wed', 'Thus', 'Fri','Sat']
    )
)
fig.show()

[3 4 2 6 1 5 0]


In [148]:
# extract the top 5 titles based on viewership hours

data = df.nlargest(5, 'Hours Viewed')


In [149]:
df.groupby('Title')['Hours Viewed'].max().sort_values(ascending = False).head()

Unnamed: 0_level_0,Hours Viewed
Title,Unnamed: 1_level_1
The Night Agent: Season 1,812100000.0
Ginny & Georgia: Season 2,665100000.0
King the Land: Limited Series // 킹더랜드: 리미티드 시리즈,630200000.0
The Glory: Season 1 // 더 글로리: 시즌 1,622800000.0
ONE PIECE: Season 1,541900000.0


In [150]:
df

Unnamed: 0,Title,Available Globally?,Release Date,Hours Viewed,Language Indicator,Content Type,Release Month,Release Day
0,The Night Agent: Season 1,Yes,2023-03-23,812100000.0,English,Show,3,3
1,Ginny & Georgia: Season 2,Yes,2023-01-05,665100000.0,English,Show,1,3
2,The Glory: Season 1 // 더 글로리: 시즌 1,Yes,2022-12-30,622800000.0,Korean,Show,12,4
3,Wednesday: Season 1,Yes,2022-11-23,507700000.0,English,Show,11,2
4,Queen Charlotte: A Bridgerton Story,Yes,2023-05-04,503000000.0,English,Movie,5,3
...,...,...,...,...,...,...,...,...
24807,We Are Black and British: Season 1,No,2023-03-23,100000.0,English,Show,3,3
24808,Whitney Cummings: Can I Touch It?,Yes,2019-07-30,100000.0,English,Movie,7,1
24809,Whitney Cummings: Jokes,No,2022-07-26,100000.0,English,Movie,7,1
24810,"Whose Vote Counts, Explained: Limited Series",Yes,2020-09-28,100000.0,English,Movie,9,0


In [163]:
# aggregate viewership hours by content type and release month
monthly_viewership = df.pivot_table(index = 'Release Month',
                                    columns = 'Content Type',
                                    values = 'Hours Viewed',
                                    aggfunc='sum')
print(monthly_viewership.index)
fig= go.Figure()
for i in monthly_viewership.columns:
  fig.add_trace(
# add_trace method: This is particularly useful when you want to overlay multiple datasets on the same graph.
      go.Scatter(
          x = monthly_viewership.index,
          y = monthly_viewership[i],
          mode = 'markers+lines',
          name = i
      )
  )
fig.update_layout(
    title='Viewership Trends by Content Type and Release Month (2023)',
    xaxis_title='Month',
    yaxis_title='Total Hours Viewed (in billions)',
    xaxis=dict(
        tickmode='array',
        tickvals=list(range(1, 13)),
        ticktext=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    ),
    height=600,
    width=1000,
    legend_title='Content Type'
)

fig.show()

Index([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype='int32', name='Release Month')


In [152]:
# aggregate viewership hours by content type and release month
monthly_viewership = df.pivot_table(index = 'Release Month',
                                    columns = 'Content Type',
                                    values = 'Hours Viewed',
                                    aggfunc='sum')
fig= go.Figure()
for i in monthly_viewership.columns:
  fig.add_trace(
# add_trace method: This is particularly useful when you want to overlay multiple datasets on the same graph.
      go.Bar(
          x = monthly_viewership.index,
          y = monthly_viewership[i],
          # mode = 'lines+markers',
          name = i
      )
  )
fig.update_layout(
    title='Viewership Trends by Content Type and Release Month (2023)',
    xaxis_title='Month',
    yaxis_title='Total Hours Viewed (in billions)',
    xaxis=dict(
        tickmode='array',
        tickvals=list(range(1, 13)),
        ticktext=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    ),
    height=700,
    width=1000,
    legend_title='Content Type',
    barmode = 'group'
)

fig.show()

In [153]:
# define seasons based on release months
def get_season(month):
  if month in [11,12,1]:
    return 'Winter'
  elif month in [2,3,4]:
    return 'Summer'
  elif month in [5,6,7]:
    return 'Spring'
  else:
    return 'Rainy'



In [154]:
df

Unnamed: 0,Title,Available Globally?,Release Date,Hours Viewed,Language Indicator,Content Type,Release Month,Release Day
0,The Night Agent: Season 1,Yes,2023-03-23,812100000.0,English,Show,3,3
1,Ginny & Georgia: Season 2,Yes,2023-01-05,665100000.0,English,Show,1,3
2,The Glory: Season 1 // 더 글로리: 시즌 1,Yes,2022-12-30,622800000.0,Korean,Show,12,4
3,Wednesday: Season 1,Yes,2022-11-23,507700000.0,English,Show,11,2
4,Queen Charlotte: A Bridgerton Story,Yes,2023-05-04,503000000.0,English,Movie,5,3
...,...,...,...,...,...,...,...,...
24807,We Are Black and British: Season 1,No,2023-03-23,100000.0,English,Show,3,3
24808,Whitney Cummings: Can I Touch It?,Yes,2019-07-30,100000.0,English,Movie,7,1
24809,Whitney Cummings: Jokes,No,2022-07-26,100000.0,English,Movie,7,1
24810,"Whose Vote Counts, Explained: Limited Series",Yes,2020-09-28,100000.0,English,Movie,9,0


In [155]:
# apply the season categorization to the dataset
df['Season_category'] = df['Release Month'].apply(get_season)
df.head(60)

Unnamed: 0,Title,Available Globally?,Release Date,Hours Viewed,Language Indicator,Content Type,Release Month,Release Day,Season_category
0,The Night Agent: Season 1,Yes,2023-03-23,812100000.0,English,Show,3,3,Summer
1,Ginny & Georgia: Season 2,Yes,2023-01-05,665100000.0,English,Show,1,3,Winter
2,The Glory: Season 1 // 더 글로리: 시즌 1,Yes,2022-12-30,622800000.0,Korean,Show,12,4,Winter
3,Wednesday: Season 1,Yes,2022-11-23,507700000.0,English,Show,11,2,Winter
4,Queen Charlotte: A Bridgerton Story,Yes,2023-05-04,503000000.0,English,Movie,5,3,Spring
5,You: Season 4,Yes,2023-02-09,440600000.0,English,Show,2,3,Summer
6,La Reina del Sur: Season 3,No,2022-12-30,429600000.0,English,Show,12,4,Winter
7,Outer Banks: Season 3,Yes,2023-02-23,402500000.0,English,Show,2,3,Summer
8,Ginny & Georgia: Season 1,Yes,2021-02-24,302100000.0,English,Show,2,2,Summer
9,FUBAR: Season 1,Yes,2023-05-25,266200000.0,English,Show,5,3,Spring


In [156]:
# aggregate viewership hours by release season
seasonal_viewership = df.groupby(['Season_category'])['Hours Viewed'].max()
seasonal_viewership

Unnamed: 0_level_0,Hours Viewed
Season_category,Unnamed: 1_level_1
Rainy,541900000.0
Spring,630200000.0
Summer,812100000.0
Winter,665100000.0


In [157]:
# order the seasons as 'Winter','Summer', 'Rainy', 'Spring'
seasonal_list = ['Winter',  'Summer', 'Rainy', 'Spring']
# The reindex method in pandas is used to conform a DataFrame to a new index, with optional filling logic for missing values. This can be useful when you need to align data to a specific index or when you want to reorder the rows or columns.
seasonal_viewership = seasonal_viewership.reindex(seasonal_list)
fig = go.Figure(data = [
    go.Bar(
    x = seasonal_viewership.index,
    y = seasonal_viewership.values,
    marker_color = 'pink'
    )
])
fig.update_layout(
    xaxis_title = 'seasonal_list',
    yaxis_title = 'Hours Viewed',
    title  = 'Hourly Views on based on seasonal',
    height = 700,
    width = 800,
    xaxis=dict(
      categoryorder='array',
      categoryarray=seasonal_list
    )
)
fig.show()


In [158]:
df.columns

Index(['Title', 'Available Globally?', 'Release Date', 'Hours Viewed',
       'Language Indicator', 'Content Type', 'Release Month', 'Release Day',
       'Season_category'],
      dtype='object')

In [159]:
# Now, let’s analyze the number of content releases and their viewership hours across months

month_releases = df['Release Month'].value_counts().sort_index()

monthly_viewership = df.groupby('Release Month')['Hours Viewed'].sum()

fig = go.Figure()

fig.add_trace(
    go.Bar(
        x = month_releases.index,
        y =month_releases.values,
        name = 'Number of Releases',
        marker_color = 'gold',
        yaxis='y1',
        opacity=0.7

    )
)
fig.add_trace(
    go.Scatter(
      x =monthly_viewership.index,
      y = monthly_viewership.values,
      name='Viewership Hours',
      mode = 'lines+markers',
      marker = dict(color = 'red'),
      line = dict(color = 'red'),
      yaxis='y2'
    )
)

fig.update_layout(
    title = 'Relationship between month and hours viewed',
    xaxis = dict(
        title = 'Month',
        tickmode = 'array',
        tickvals = list(range(1, 13)),
        ticktext=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

    ),
    yaxis = dict(
        title = 'Number of Releases',
        showgrid = True,
        side = 'left'
    ),
     yaxis2=dict(
        title='Total Hours Viewed (in billions)',
        overlaying='y',
        side='right',
        showgrid=False
    ),
    legend=dict(
        x=1.15,
        y=1,
        orientation='v',
        xanchor='left'
    ),
    height=600,
    width=900
)

fig.show()

In [160]:
# Now, let’s analyze the number of content releases and their viewership hours across weekdays
# df['Release Date'] = pd.to_datetime(df['Release Date'])
df['Release Day'] = df['Release Date'].dt.day_name()


Weekly_releases = df['Release Day'].value_counts().reindex(
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'
])
Weekly_Viewership = df.groupby('Release Day')['Hours Viewed'].sum().reindex(
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'
])
fig = go.Figure()

fig.add_trace(
    go.Bar(
        x=Weekly_releases.index,
        y=Weekly_releases.values,
        name='Number of Releases',
        marker_color='blue',
        opacity=0.6,
        yaxis='y1'
    )
)

fig.add_trace(
    go.Scatter(
        x=Weekly_Viewership.index,
        y=Weekly_Viewership.values,
        name='Viewership Hours',
        mode='lines+markers',
        marker=dict(color='red'),
        line=dict(color='red'),
        yaxis='y2'
    )
)

fig.update_layout(
    title='Weekly Release Patterns and Viewership Hours (2023)',
    xaxis=dict(
        title='Day of the Week',
        categoryorder='array',
        categoryarray=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    ),
    yaxis=dict(
        title='Number of Releases',
        showgrid=False,
        side='left'
    ),
    yaxis2=dict(
        title='Total Hours Viewed (in billions)',
        overlaying='y',
        side='right',
        showgrid=False
    ),
    legend=dict(
        x=1.05,
        y=1,
        orientation='v',
        xanchor='left'
    ),
    height=600,
    width=1000
)

fig.show()



In [161]:
df

Unnamed: 0,Title,Available Globally?,Release Date,Hours Viewed,Language Indicator,Content Type,Release Month,Release Day,Season_category
0,The Night Agent: Season 1,Yes,2023-03-23,812100000.0,English,Show,3,Thursday,Summer
1,Ginny & Georgia: Season 2,Yes,2023-01-05,665100000.0,English,Show,1,Thursday,Winter
2,The Glory: Season 1 // 더 글로리: 시즌 1,Yes,2022-12-30,622800000.0,Korean,Show,12,Friday,Winter
3,Wednesday: Season 1,Yes,2022-11-23,507700000.0,English,Show,11,Wednesday,Winter
4,Queen Charlotte: A Bridgerton Story,Yes,2023-05-04,503000000.0,English,Movie,5,Thursday,Spring
...,...,...,...,...,...,...,...,...,...
24807,We Are Black and British: Season 1,No,2023-03-23,100000.0,English,Show,3,Thursday,Summer
24808,Whitney Cummings: Can I Touch It?,Yes,2019-07-30,100000.0,English,Movie,7,Tuesday,Spring
24809,Whitney Cummings: Jokes,No,2022-07-26,100000.0,English,Movie,7,Tuesday,Spring
24810,"Whose Vote Counts, Explained: Limited Series",Yes,2020-09-28,100000.0,English,Movie,9,Monday,Rainy


In [162]:
# define significant holidays and events in 2023
important_dates = [
    '2023-01-01',  # new year's day
    '2023-02-14',  # valentine's ay
    '2023-07-04',  # independence day (US)
    '2023-10-31',  # halloween
    '2023-12-25'   # christmas day
]

# convert to datetime
important_dates = pd.to_datetime(important_dates)

# check for content releases close to these significant holidays (within a 3-day window)
holiday_releases = df[df['Release Date'].apply(
    lambda x: any((x - date).days in range(-3, 4) for date in important_dates)
)]

# aggregate viewership hours for releases near significant holidays
holiday_viewership = holiday_releases.groupby('Release Date')['Hours Viewed'].sum()

holiday_releases[['Title', 'Release Date', 'Hours Viewed']]

Unnamed: 0,Title,Release Date,Hours Viewed
2,The Glory: Season 1 // 더 글로리: 시즌 1,2022-12-30,622800000.0
6,La Reina del Sur: Season 3,2022-12-30,429600000.0
11,Kaleidoscope: Limited Series,2023-01-01,252500000.0
29,Perfect Match: Season 1,2023-02-14,176800000.0
124,Lady Voyeur: Limited Series // Olhar Indiscret...,2022-12-31,86000000.0
...,...,...,...
22324,The Romantics: Limited Series,2023-02-14,1000000.0
22327,Aggretsuko: Season 5 // アグレッシブ烈子: シーズン5,2023-02-16,900000.0
22966,The Lying Life of Adults: Limited Series // La...,2023-01-04,900000.0
22985,Community Squad: Season 1 // División Palermo:...,2023-02-17,800000.0
