In [3]:
# Libraries in use
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import streamlit as st
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

In [69]:
# Path of the CSV file analysed in this notebook
dataset = 'netflix_movies.csv'

# Read the CSV into the working DataFrame used by every later cell
df = pd.read_csv(filepath_or_buffer=dataset)

#### **Data pre-processing**

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes)
# to spot columns with missing values or unexpected types before cleaning.
df.info()

In [6]:
# Names of the columns stored with the generic `object` dtype, skipping
# 'date_added' so that it is not turned into a categorical
object_columns = [
    column
    for column, dtype in df.dtypes.items()
    if dtype == object and column != 'date_added'
]

# Re-encode the selected columns as pandas categoricals
df[object_columns] = df[object_columns].astype('category')

In [71]:
# Parse 'date_added' into datetime values; anything unparseable becomes NaT.
# String values are stripped first: a stray leading/trailing space (for example
# ' August 4, 2017') can fail the inferred date format and would then be
# silently coerced to NaT, dropping the row from every date-based chart.
# The dtype check keeps the cell safe to re-run once the column is already datetime.
date_added_raw = df['date_added']
if pd.api.types.is_string_dtype(date_added_raw):
    date_added_raw = date_added_raw.str.strip()
df['date_added'] = pd.to_datetime(date_added_raw, errors='coerce')

In [None]:
# Count the rows that are exact duplicates of an earlier row (all columns equal);
# the first occurrence of each duplicated row is not counted.
df.duplicated().sum()

In [None]:
# Summarise missing data: one row per column with its null count and null share
missing_count = df.isna().sum()  # number of null values in each column

df_missing = pd.DataFrame(
    {
        'missing_count': missing_count,
        # share of rows that are null in each column
        'missing_pct': missing_count / df.shape[0],
    }
).sort_values('missing_count', ascending=False)  # columns with most nulls first

# Display the share as a percentage with two decimals
df_missing.style.format('{:.2%}', subset=['missing_pct'])

In [96]:
# Rows that contain some null values are kept on purpose: deleting them would
# discard values that are still useful for the analysis. Only rows that are
# entirely null would be worth removing.
# To drop every row that contains at least one null value instead, run:
# df.dropna(inplace=True)

Show all Movies & TV Shows released by month -> Graphs

In [72]:
# Count how many titles were added in each calendar month.
# Rows whose 'date_added' is NaT are ignored by value_counts. The result is
# re-indexed to months 1..12 (zero-filled, in calendar order) so that it always
# has exactly 12 entries: the charts below label bars positionally as Jan..Dec
# and would mislabel or fail if a month were absent from the data.
month = (
    df['date_added'].dt.month
    .value_counts()
    .reindex(range(1, 13), fill_value=0)
)

In [None]:
# Plot bar charts
"""
    Show the all Movies & TV Shows released by month
"""
%matplotlib inline
sns.set_theme(style='ticks')
plt.figure(figsize=(12, 6))
month.plot(kind='bar', color='teal', layout='constrained')
plt.title('Movie and Series Releases by Month')
plt.xlabel('Month')
plt.ylabel('Number of Launches')
plt.xticks(ticks=range(12), labels=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'], rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Plot bar charts
# Show all Movies & TV Shows released by month, with the count above each bar

# Use Seaborn's "ticks" theme
sns.set_theme(style="ticks")

month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# Align the counts to the 12 calendar months (zero for absent months) so that
# bar i always corresponds to month_labels[i]
month_counts = month.reindex(range(1, 13), fill_value=0)

# Figure setup
plt.figure(figsize=(12, 6))

# Create bar plot
ax = month_counts.plot(kind='bar', color='teal')

# Title and labels
plt.title('Movie and Series Releases by Month', fontsize=16, fontweight='bold')
plt.xlabel('Month', fontsize=12)
plt.ylabel('Number of Launches', fontsize=12)

# Customizing x-ticks
plt.xticks(ticks=range(12), labels=month_labels, rotation=45)

# Add data labels, centred over each bar and lifted by a fixed offset in points
# (a multiplicative offset on x does not centre the text and shifts the wrong
# way for bars whose left edge is at a negative x)
for p in ax.patches:
    ax.annotate(
        f'{int(p.get_height())}',
        (p.get_x() + p.get_width() / 2, p.get_height()),
        ha='center',
        va='bottom',
        xytext=(0, 3),
        textcoords='offset points',
        fontsize=10,
        color='black',
    )

# Apply tight layout for better spacing
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
# Plot bar charts
# Show all Movies & TV Shows released by month, on a custom background

month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# Align the counts to the 12 calendar months (zero for absent months).
# ax.bar needs as many heights as positions, so a month missing from the data
# would otherwise raise a shape-mismatch error.
month_counts = month.reindex(range(1, 13), fill_value=0)
positions = range(len(month_labels))

# Create the figure and its single axes
fig, ax = plt.subplots(figsize=(12, 6))

# Set background color of the plot area (axes)
ax.set_facecolor('lightgray')  # Light gray background for the plot

# Set background color for the figure area
fig.patch.set_facecolor('whitesmoke')  # Soft white background for the figure

# Plot the bar chart in a single color
ax.bar(positions, month_counts, color='teal')

# Set title and labels
ax.set_title('Movie and Series Releases by Month', fontsize=16, fontweight='bold')
ax.set_xlabel('Month', fontsize=12)
ax.set_ylabel('Number of Launches', fontsize=12)

# Customizing x-ticks
ax.set_xticks(positions)
ax.set_xticklabels(month_labels, rotation=45)

# Add data labels slightly above each bar (offset of 2 in data units)
for i, value in enumerate(month_counts):
    ax.text(i, value + 2, f'{value}', ha='center', fontsize=10)

# Adjust layout for proper spacing
plt.tight_layout()

# Show the plot
plt.show()

Count all unique rating types and find which rating occurs most often -> Graphs

In [100]:
# Frequency table of the 'rating' column: one row per rating value,
# with columns 'Rating' and 'Count', most frequent first
values_rating = (
    df['rating']
    .value_counts()
    .rename_axis('Rating')
    .reset_index(name='Count')
)

In [None]:
# Plot bar
# Vertical bar chart of how often each rating value occurs
plt.figure(figsize=(10, 6))

# One bar per rating, drawn in the row order of `values_rating`
ax = sns.barplot(
    data=values_rating,
    x='Rating',
    y='Count',
    hue='Rating',
    order=values_rating['Rating'],
    palette='deep',
    legend=False,
)

# Title and axis labels
ax.set_title('Unique Rating Values and Their Frequency', fontsize=16, fontweight='bold')
ax.set_xlabel('Rating', fontsize=12)
ax.set_ylabel('Number of Ratings', fontsize=12)

# Write each bar's count 5 points above its top edge, centred on the bar
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(
        f'{int(bar_top)}',
        (bar_center, bar_top),
        xytext=(0, 5),
        textcoords='offset points',
        ha='center',
        va='baseline',
        fontsize=11,
        color='black',
    )

# Tidy the spacing and render the figure
plt.tight_layout()
plt.show()

In [None]:
# Plot horizontal bar
# Horizontal bar chart of how often each rating value occurs
plt.figure(figsize=(10, 6))

# One horizontal bar per rating, drawn in the row order of `values_rating`
ax = sns.barplot(
    data=values_rating,
    x='Count',
    y='Rating',
    hue='Rating',
    order=values_rating['Rating'],
    palette='deep',
    legend=False,
)

# Title and axis labels
ax.set_title('Unique Rating Values and Their Frequency (Horizontal)', fontsize=16, fontweight='bold')
ax.set_xlabel('Number of Ratings', fontsize=12)
ax.set_ylabel('Rating', fontsize=12)

# Write each bar's count just right of its end, vertically centred on the bar
for bar in ax.patches:
    bar_length = bar.get_width()
    bar_middle = bar.get_y() + bar.get_height() / 2
    ax.annotate(
        f'{int(bar_length)}',
        (bar_length + 1, bar_middle),
        va='center',
        fontsize=11,
        color='black',
    )

# Tidy the spacing and render the figure
plt.tight_layout()
plt.show()


Number of movies featuring Salman Khan, Shah Rukh Khan, and Akshay Kumar -> Graph

In [None]:
# Plot barplot
# Number of titles featuring Salman Khan, Shah Rukh Khan and Akshay Kumar

actors = ['Salman Khan', 'Shah Rukh Khan', 'Akshay Kumar']

# Rows whose cast mentions at least one of the actors (missing casts never match)
mask = df['cast'].str.contains('|'.join(actors), na=False)
matching_titles = df[mask]

# Build one row per (title, actor) pair. A title featuring several of the
# actors must count once for EACH of them; extracting only the first regex
# match would credit such a title to a single actor and undercount the others.
# `assign` returns new frames, so nothing is written onto a slice of `df`.
# NOTE(review): matching is by plain substring, so a longer name containing one
# of these names would also match - confirm this is acceptable for the data.
filtered_df = pd.concat(
    [
        matching_titles[
            matching_titles['cast'].str.contains(actor, na=False, regex=False)
        ].assign(actor=actor)
        for actor in actors
    ]
)

# Count titles per actor (actors with no match do not appear)
movies_count = filtered_df.groupby('actor').size().reset_index(name='Count')

# Plot the data on a bar chart
plt.figure(figsize=(8, 5))
ax = sns.barplot(data=movies_count, x='actor', y='Count', hue='actor', palette='deep', legend=False)

# Add title and labels
ax.set_title('Number of Films by Actors', fontsize=16, fontweight='bold')
ax.set_xlabel('Actors', fontsize=12)
ax.set_ylabel('Number of films', fontsize=12)

# Add the values to the tops of the bars
for p in ax.patches:
    ax.annotate(f'{int(p.get_height())}', (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='baseline', fontsize=11, color='black', xytext=(0, 5), textcoords='offset points')

plt.tight_layout()
plt.show()

Year-on-year shows added to Netflix by type -> Graph

In [None]:
# TV shows only, with two derived columns:
#   year_added - year taken from the date the title was added
#   genres     - list of genres ('listed_in' holds several values separated by ', ')
# `assign` builds a new frame, so no column is written onto a slice of `df`
# (which would raise SettingWithCopyWarning).
tv_show = df[df['type'] == 'TV Show'].assign(
    year_added=lambda shows: shows['date_added'].dt.year,
    genres=lambda shows: shows['listed_in'].str.split(', '),
)

# Explode the genres column to have one row per (series, genre) pair
tv_show_exploded = tv_show.explode('genres')

# Count the number of series per year and genre.
# Rows without a year or genre are dropped BEFORE grouping (groupby would skip
# them anyway, so dropping afterwards has no effect), which also lets the year
# be stored as an integer so the axis shows 2016 rather than 2016.0.
tv_show_grouped = (
    tv_show_exploded
    .dropna(subset=['year_added', 'genres'])
    .astype({'year_added': 'int64'})
    .groupby(['year_added', 'genres'])
    .size()
    .reset_index(name='count')
)

# Generate a color palette with one color per unique genre
num_genres = tv_show_grouped['genres'].nunique()
palette = sns.color_palette("hsv", num_genres).as_hex()

# Create a dictionary to map the genres to colors
unique_genres = tv_show_grouped['genres'].unique()
color_map = dict(zip(unique_genres, palette))

# Create the interactive chart using Plotly Express
fig = px.line(tv_show_grouped,
                x='year_added',
                y='count',
                color='genres',
                title='Series Added to Netflix by Genre (Year on Year)',
                labels={'year_added': 'Year', 'count': 'Number of Tv Shows', 'genres': 'Genres'},
                color_discrete_map=color_map,
                markers=True)

# Display the chart
fig.show()

Which director has made the most TV Shows

In [None]:
# Keep only the TV shows that have a director and store the name as a string.
# NOTE(review): presumably the string cast is there because 'director' is a
# categorical at this point, and value_counts on a categorical also reports
# categories that do not occur in this subset - confirm.
tv_show = (
    tv_show
    .dropna(subset=['director'])
    .assign(director=lambda shows: shows['director'].astype('string'))
)

# Number of TV shows per director, most productive first
directors_count = (
    tv_show['director']
    .value_counts()
    .rename_axis('director')
    .reset_index(name='tv_show_count')
)

# Keep the ten directors with the most TV shows
top_directors = directors_count.iloc[:10]

# One palette color per director shown in the chart
num_directors = top_directors['director'].nunique()
palette = sns.color_palette("hsv", num_directors).as_hex()

# Map each director to a color
unique_directors = top_directors['director'].unique()
color_map = dict(zip(unique_directors, palette))

# Interactive bar chart with the count written on each bar
fig = px.bar(
    data_frame=top_directors,
    x='director',
    y='tv_show_count',
    color='director',
    color_discrete_map=color_map,
    text='tv_show_count',
    title='Directors with the most TV show productions',
    labels={'tv_show_count': 'Number of Tv Shows', 'director': 'Directors'},
)

# Rotate the x-axis labels so long names stay readable
fig.update_layout(xaxis_tickangle=-45)
fig.show()