<a href="https://colab.research.google.com/github/Mathimalar07/Netflix_Data_Analysis-2021-2024-/blob/main/Netflix_data_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.templates.default = "plotly_white"

netflix_data = pd.read_csv("/content/20241227globaldata.csv")
netflix_data.head()

Unnamed: 0,week,category,weekly_rank,show_title,season_title,weekly_hours_viewed,runtime,weekly_views,cumulative_weeks_in_top_10,is_staggered_launch,runtime_override_flag,episode_launch_dtls
0,2024-12-22,Films (English),1,Carry-On,,110200000,2.0,55100000.0,2,False,"[{is_staggered_launch=0, is_live_title=0}]",
1,2024-12-22,Films (English),2,The Six Triple Eight,,30900000,2.1667,14300000.0,1,False,"[{is_staggered_launch=0, is_live_title=0}]",
2,2024-12-22,Films (English),3,Dr. Seuss' The Grinch,,16300000,1.4333,11400000.0,10,False,"[{is_staggered_launch=0, is_live_title=0}]",
3,2024-12-22,Films (English),4,That Christmas,,17800000,1.6,11100000.0,3,False,"[{is_staggered_launch=0, is_live_title=0}]",
4,2024-12-22,Films (English),5,Disaster Holiday,,11600000,1.55,7500000.0,2,False,"[{is_staggered_launch=0, is_live_title=0}]",


In [4]:
netflix_data['weekly_hours_viewed'] = netflix_data['weekly_hours_viewed'].replace(',', '', regex=True).astype(float)

netflix_data[['show_title', 'weekly_hours_viewed']].head()

Unnamed: 0,show_title,weekly_hours_viewed
0,Carry-On,110200000.0
1,The Six Triple Eight,30900000.0
2,Dr. Seuss' The Grinch,16300000.0
3,That Christmas,17800000.0
4,Disaster Holiday,11600000.0


In [11]:
category_viewership = netflix_data.groupby('category')['weekly_hours_viewed'].sum()

In [29]:
fig = go.Figure(data=[
    go.Bar(
        x=category_viewership.index,
        y=category_viewership.values,
        marker_color=['skyblue', 'salmon','maroon','purple']
    )
])
fig.update_layout(title='Total Viewership Hours by Category (2021-2024)',
    xaxis_title='Category',
    yaxis_title='Total Hours Viewed (in crores)',
    xaxis_tickangle=0,
    height=500,
    width=800)
fig.show()

In [30]:
netflix_data['Release Week'] = pd.to_datetime(netflix_data['week'])
netflix_data['Release Month'] = netflix_data['Release Week'].dt.month

# aggregate viewership hours by release month
monthly_viewership = netflix_data.groupby('Release Month')['weekly_hours_viewed'].sum()

fig = go.Figure(data=[
    go.Scatter(
        x=monthly_viewership.index,
        y=monthly_viewership.values,
        mode='lines+markers',
        marker=dict(color='blue'),
        line=dict(color='blue')
    )
])

fig.update_layout(
    title='Total Viewership Hours by Release Month (2021-2024)',
    xaxis_title='Month',
    yaxis_title='Total Hours Viewed (in crores)',
    xaxis=dict(
        tickmode='array',
        tickvals=list(range(1, 13)),
        ticktext=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    ),
    height=600,
    width=1000
)

fig.show()

In [16]:
# extract the top 5 titles based on viewership hours
top_5_titles = netflix_data.nlargest(5, 'weekly_hours_viewed')

top_5_titles[['show_title', 'weekly_hours_viewed','category','week']]

Unnamed: 0,show_title,weekly_hours_viewed,category,week
6750,Squid Game,571760000.0,TV (Non-English),2021-10-03
6790,Squid Game,448730000.0,TV (Non-English),2021-09-26
6710,Squid Game,412940000.0,TV (Non-English),2021-10-10
4300,Wednesday,411290000.0,TV (English),2022-12-04
4340,Wednesday,341230000.0,TV (English),2022-11-27


In [35]:
# aggregate viewership hours by content type and release month
monthly_viewership_by_type = netflix_data.pivot_table(index='Release Month',
                                                      columns='category',
                                                      values='weekly_hours_viewed',
                                                      aggfunc='sum')

fig = go.Figure()

for content_type in monthly_viewership_by_type.columns:
    fig.add_trace(
        go.Scatter(
            x=monthly_viewership_by_type.index,
            y=monthly_viewership_by_type[content_type],
            mode='lines+markers',
            name=content_type
        )
    )

fig.update_layout(
    title='Viewership Trends by category and Release Month (2021-2024)',
    xaxis_title='Month',
    yaxis_title='Total Hours Viewed (in crores)',
    xaxis=dict(
        tickmode='array',
        tickvals=list(range(1, 13)),
        ticktext=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    ),
    height=600,
    width=1000,
    legend_title='Category'
)

fig.show()

In [32]:
# define seasons based on release months
def get_season(month):
    if month in [12, 1, 2]:
        return 'Winter'
    elif month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    else:
        return 'Fall'

# apply the season categorization to the dataset
netflix_data['Release Season'] = netflix_data['Release Month'].apply(get_season)

# aggregate viewership hours by release season
seasonal_viewership = netflix_data.groupby('Release Season')['weekly_hours_viewed'].sum()

# order the seasons as 'Winter', 'Spring', 'Summer', 'Fall'
seasons_order = ['Winter', 'Spring', 'Summer', 'Fall']
seasonal_viewership = seasonal_viewership.reindex(seasons_order)

fig = go.Figure(data=[
    go.Bar(
        x=seasonal_viewership.index,
        y=seasonal_viewership.values,
        marker_color='orange'
    )
])

fig.update_layout(
    title='Total Viewership Hours by Release Season (2021-2024)',
    xaxis_title='Season',
    yaxis_title='Total Hours Viewed (in crores)',
    xaxis_tickangle=0,
    height=500,
    width=800,
    xaxis=dict(
        categoryorder='array',
        categoryarray=seasons_order
    )
)

fig.show()

In [33]:
monthly_releases = netflix_data['Release Month'].value_counts().sort_index()

monthly_viewership = netflix_data.groupby('Release Month')['weekly_hours_viewed'].sum()

fig = go.Figure()

fig.add_trace(
    go.Bar(
        x=monthly_releases.index,
        y=monthly_releases.values,
        name='Number of Releases',
        marker_color='goldenrod',
        opacity=0.7,
        yaxis='y1'
    )
)

fig.add_trace(
    go.Scatter(
        x=monthly_viewership.index,
        y=monthly_viewership.values,
        name='Viewership Hours',
        mode='lines+markers',
        marker=dict(color='red'),
        line=dict(color='red'),
        yaxis='y2'
    )
)

fig.update_layout(
    title='Monthly Release Patterns and Viewership Hours (2021-2024)',
    xaxis=dict(
        title='Month',
        tickmode='array',
        tickvals=list(range(1, 13)),
        ticktext=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    ),
    yaxis=dict(
        title='Number of Releases',
        showgrid=False,
        side='left'
    ),
    yaxis2=dict(
        title='Total Hours Viewed (in crores)',
        overlaying='y',
        side='right',
        showgrid=False
    ),
    legend=dict(
        x=1.05,
        y=1,
        orientation='v',
        xanchor='left'
    ),
    height=600,
    width=1000
)

fig.show()

In [26]:
netflix_data['week'] = pd.to_datetime(netflix_data['week'], errors='coerce')

# Now extract the day name
netflix_data['Release Day'] = netflix_data['week'].dt.day_name()


In [27]:
print(netflix_data['week'].dtype)  # Should print datetime64[ns]
print(netflix_data[['week', 'Release Day']].head())  # Check the values


datetime64[ns]
        week Release Day
0 2024-12-22      Sunday
1 2024-12-22      Sunday
2 2024-12-22      Sunday
3 2024-12-22      Sunday
4 2024-12-22      Sunday


In [34]:
netflix_data['Release Day'] = netflix_data['week'].dt.day_name()

weekday_releases = netflix_data['Release Day'].value_counts().reindex(
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
)

# aggregate viewership hours by day of the week
weekday_viewership = netflix_data.groupby('Release Day')['weekly_hours_viewed'].sum().reindex(
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
)

fig = go.Figure()

fig.add_trace(
    go.Bar(
        x=weekday_releases.index,
        y=weekday_releases.values,
        name='Number of Releases',
        marker_color='blue',
        opacity=0.6,
        yaxis='y1'
    )
)

fig.add_trace(
    go.Scatter(
        x=weekday_viewership.index,
        y=weekday_viewership.values,
        name='Viewership Hours',
        mode='lines+markers',
        marker=dict(color='red'),
        line=dict(color='red'),
        yaxis='y2'
    )
)

fig.update_layout(
    title='Weekly Release Patterns and Viewership Hours (2021-2024)',
    xaxis=dict(
        title='Day of the Week',
        categoryorder='array',
        categoryarray=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    ),
    yaxis=dict(
        title='Number of Releases',
        showgrid=False,
        side='left'
    ),
    yaxis2=dict(
        title='Total Hours Viewed (in crores)',
        overlaying='y',
        side='right',
        showgrid=False
    ),
    legend=dict(
        x=1.05,
        y=1,
        orientation='v',
        xanchor='left'
    ),
    height=600,
    width=1000
)

fig.show()

In [36]:
important_dates = [
    '2023-01-01', '2024-01-01', '2022-01-01', '2021-01-01',  # new year's day
    '2023-02-14', '2024-02-14', '2022-02-14', '2021-02-14',  # valentine's day
    '2023-07-15', '2024-07-15', '2022-07-15', '2021-07-15',  # independence day (India)
    '2023-11-12', '2024-10-31', '2022-10-24', '2021-11-04',  # diwali day
    '2023-12-25', '2024-12-25', '2022-12-25', '2021-12-25'  # christmas day
]

# convert to datetime
important_dates = pd.to_datetime(important_dates)

# check for content releases close to these significant holidays (within a 3-day window)
holiday_releases = netflix_data[netflix_data['week'].apply(
    lambda x: any((x - date).days in range(-3, 4) for date in important_dates)
)]

# aggregate viewership hours for releases near significant holidays
holiday_viewership = holiday_releases.groupby('week')['weekly_hours_viewed'].sum()

holiday_releases[['show_title', 'week', 'weekly_hours_viewed']]

Unnamed: 0,show_title,week,weekly_hours_viewed
0,Carry-On,2024-12-22,110200000.0
1,The Six Triple Eight,2024-12-22,30900000.0
2,Dr. Seuss' The Grinch,2024-12-22,16300000.0
3,That Christmas,2024-12-22,17800000.0
4,Disaster Holiday,2024-12-22,11600000.0
...,...,...,...
7195,Carrossel,2021-07-18,11620000.0
7196,Lupin,2021-07-18,9280000.0
7197,Hospital Playlist,2021-07-18,8530000.0
7198,"Nevertheless,",2021-07-18,8130000.0
