# Netflix TV and Movies EDA

In [1]:
!pip install lets_plot -U

Collecting lets_plot
  Downloading lets_plot-4.6.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting pypng (from lets_plot)
  Downloading pypng-0.20220715.0-py3-none-any.whl.metadata (13 kB)
Collecting palettable (from lets_plot)
  Downloading palettable-3.3.3-py2.py3-none-any.whl.metadata (3.3 kB)
Downloading lets_plot-4.6.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m59.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading palettable-3.3.3-py2.py3-none-any.whl (332 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m332.3/332.3 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pypng-0.20220715.0-py3-none-any.whl (58 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.1/58.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypng, palettable, lets_plot
Succe

In [2]:
import pandas as pd

from lets_plot import *
from lets_plot.geo_data import *

The geodata is provided by © OpenStreetMap contributors and is made available here under the Open Database License (ODbL).


In [3]:
# Set up lets-plot's HTML rendering for this notebook; run once before any plot cell.
LetsPlot.setup_html()

In [4]:
# Read the catalogue, then keep only titles whose release_year is 2000 or later.
df = pd.read_csv("/kaggle/input/netflix-tv-shows-and-movies/NetFlix.csv")
released_since_2000 = df["release_year"].ge(2000)
df = df.loc[released_since_2000]
print(df.shape)
df.head()

(7338, 12)


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,genres,description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,14-Aug-20,2020,TV-MA,4,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s10,Movie,1920,Vikram Bhatt,"Rajneesh Duggal, Adah Sharma, Indraneil Sengup...",India,15-Dec-17,2008,TV-MA,143,"Horror Movies, International Movies, Thrillers",An architect and his wife move into a castle t...
2,s100,Movie,3 Heroines,Iman Brotoseno,"Reza Rahadian, Bunga Citra Lestari, Tara Basro...",Indonesia,05-Jan-19,2016,TV-PG,124,"Dramas, International Movies, Sports Movies",Three Indonesian women break records by becomi...
3,s1000,Movie,Blue Mountain State: The Rise of Thadland,Lev L. Spiro,"Alan Ritchson, Darin Brooks, James Cade, Rob R...",United States,01-Mar-16,2016,R,90,Comedies,New NFL star Thad buys his old teammates' belo...
4,s1001,TV Show,Blue Planet II,,David Attenborough,United Kingdom,03-Dec-18,2017,TV-G,1,"British TV Shows, Docuseries, Science & Nature TV",This sequel to the award-winning nature series...


In [5]:
# Pie chart of the `type` column: each slice is the row count for one type,
# labelled with its share of the total formatted to one decimal place.
type_share_labels = layer_labels(['..proppct..']).format('..proppct..', '{.1f}%')
type_count_tooltips = layer_tooltips().line('Total projects count|@..count..')

(
    ggplot(df)
    + geom_pie(
        aes(slice='..count..', fill='type'),
        size=20,
        labels=type_share_labels,
        tooltips=type_count_tooltips,
    )
    + ggsize(600, 400)
    + ggtitle("Ratio of movies to TV-shows")
    + theme(axis='blank', panel_grid='blank')
)

In [6]:
# Rating labels in the order they should appear on the x axis of each chart.
movie_ratins_ordered = ["NR", "UR", "G", "PG", "PG-13", "R", "NC-17"]
tv_ratins_ordered = ["NR", "UR", "TV-Y", "TV-Y7", "TV-Y7-FV", "TV-G", "TV-PG", "TV-14", "TV-MA"]


def _rating_count_plot(titles_df, ratings_ordered, legend_name, title):
    """Build a bar chart of the number of titles per rating.

    Both the movie and the TV-show chart share this spec so that they use the
    same y scale and the same colour scale and stay directly comparable.

    Parameters
    ----------
    titles_df : DataFrame with a "rating" column; one row per title.
    ratings_ordered : list of rating labels used as x-axis breaks, in display order.
    legend_name : title of the fill (count) legend.
    title : plot title.

    Returns
    -------
    The lets-plot plot specification (not yet rendered).
    """
    # NOTE(review): the lower limit 0 has no finite log10 value; kept as in the
    # original spec -- confirm how lets-plot treats it on a log10 axis.
    count_limits = [0, 1100]
    return (
        ggplot(titles_df, aes(x="rating", fill='..count..'))
        + geom_bar()
        + scale_x_discrete(breaks=ratings_ordered)
        + scale_y_log10(limits=count_limits)
        + scale_fill_viridis(name=legend_name, limits=count_limits, option='plasma')
        + ggtitle(title)
        + theme(axis_text=element_text(size=8, angle=0.0))
    )


# Keep only rows of the given type whose rating belongs to that type's rating list.
movies_df = df[(df.type == "Movie") & (df.rating.isin(movie_ratins_ordered))]
movies_plot = _rating_count_plot(
    movies_df, movie_ratins_ordered, "Movies count", "Movies count by rating"
)

tv_df = df[(df.type == "TV Show") & (df.rating.isin(tv_ratins_ordered))]
tv_plot = _rating_count_plot(
    tv_df, tv_ratins_ordered, "TV-shows count", "TV-shows count by rating"
)

# Show the two charts side by side.
gggrid([movies_plot, tv_plot])

In [7]:
# Number of most frequent (country, name) pairs to keep.
top_size = 100

# Build one row per (country, cast member) credit, count identical pairs and
# keep the `top_size` most frequent ones.
cast_top_df = (
    pd.concat(
        [df["country"].to_frame(), df["cast"].str.split(",", expand=True)],
        axis='columns',
    )
    .melt(id_vars=["country"], value_name="name")[["country", "name"]]
    .dropna()
    # Splitting on "," leaves a leading space on every name but the first;
    # strip it so the same person is not counted under two spellings.
    .assign(name=lambda credits: credits["name"].str.strip())
    .value_counts()
    .to_frame("projects_count")
    .reset_index()
    .head(top_size)
    # Total credits per country within the top list; drives the pie size below.
    .assign(
        country_projects_count=lambda top: top.groupby("country")["projects_count"].transform('sum')
    )
)

# One pie per country on the x axis: slices are cast members weighted by their
# credit count, and the pie size reflects the country's total within the top list.
(
    ggplot(cast_top_df, aes(x="country"))
    + geom_pie(
        aes(fill=as_discrete("name", order_by='..count..'),
            weight="projects_count", size="country_projects_count"),
        show_legend=False,
        tooltips=layer_tooltips().title("@name")
                                 .format("@..count..", 'd')
                                 .line("projects count|@..count.."),
    )
    + scale_fill_viridis(option='plasma')
    + scale_size(range=[5, 20])
    + ggsize(800, 300)
    + ggtitle("Top {0} most involved actors and actresses".format(top_size))
    + theme(axis_line_x='blank', axis_ticks_x='blank', axis_title_x='blank',
            axis_title_y='blank', axis_text_y='blank', panel_grid='blank', axis_tooltip='blank')
)

In [8]:
# Split the multi-country strings on ", ", flatten them to one value per entry,
# and count how often each country value occurs.
country_values = df["country"].str.split(", ", expand=True).stack(future_stack=True)
country_counts = country_values.value_counts().to_frame("count").reset_index()
by_country_df = country_counts.rename(columns={"index": "country"})

# Drop values that still contain a comma after the split.
still_has_comma = by_country_df["country"].str.contains(",")
by_country_df = by_country_df[~still_has_comma]

# Geocode the country names to boundaries; names that fail to geocode are ignored.
country_geocoder = geocode_countries(by_country_df["country"]).ignore_all_errors().inc_res(4)
gdf = country_geocoder.get_boundaries()

# Choropleth over a live map; the fill uses a log10-transformed gradient of the counts.
country_tooltips = (
    layer_tooltips()
    .title("@country")
    .format("@count", 'd')
    .line("projects count|@count")
)

(
    ggplot()
    + geom_livemap()
    + geom_map(
        aes(fill="count"),
        data=by_country_df,
        map=gdf,
        map_join="country",
        size=0,
        alpha=.75,
        show_legend=False,
        tooltips=country_tooltips,
    )
    + scale_fill_gradient(low="#edf8fb", high="#006d2c", trans='log10')
    + ggtitle("Number of projects by country")
    + ggsize(1100, 720)
)

In [9]:
# Movies only, excluding rows whose genres value is exactly "Movies".
movies_df = df[(df["type"] == "Movie") & (df["genres"] != "Movies")]

# One column per listed genre. The melt below uses whatever columns the split
# produced instead of assuming exactly three, so it neither fails when fewer
# exist nor silently drops genres when more exist.
genre_columns_df = movies_df["genres"].str.split(", ", expand=True)

# Long format: one row per (genre, duration) pair, without the empty genre slots.
by_genre_df = pd.melt(
    genre_columns_df.assign(duration=movies_df["duration"]),
    id_vars=["duration"],
    value_vars=list(genre_columns_df.columns),
    value_name="genre",
)[["genre", "duration"]].dropna(subset=["genre"])

# Attach each genre's mean duration to its rows and sort by it, longest mean first.
by_genre_df = by_genre_df.assign(
    duration_mean=by_genre_df["genre"].map(by_genre_df.groupby("genre")["duration"].mean())
).sort_values(by="duration_mean", ascending=False)

# Ridge plot of the duration distribution per genre, filled by the genre's mean duration.
# sampling_pick is given the full row count so that no rows are sampled away.
(
    ggplot(by_genre_df, aes("duration", "genre"))
    + geom_area_ridges(
        aes(group="genre", fill="duration_mean"),
        scale=4,
        sampling=sampling_pick(by_genre_df.shape[0]),
        tooltips=layer_tooltips().title("@genre")
                                 .line("duration|@duration"),
    )
    + scale_x_log10()
    + scale_fill_viridis(name="mean duration", option='plasma')
    + ggsize(800, 600)
    + ggtitle("Mean Netflix movie duration")
    + theme(axis_line_x='blank')
)