In [12]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [13]:
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [14]:
# from pandas_profiling import ProfileReport
# prof = ProfileReport(df)
# prof

In [15]:
# Read the Netflix titles CSV into `df` and preview the first rows.
netflix_csv = "/kaggle/input/netflix-shows/netflix_titles.csv"
df = pd.read_csv(netflix_csv)
df.head()

In [16]:
# Summarize each column's dtype and non-null count to spot missing data.
df.info()


In [17]:
# (rows, columns) of the frame as loaded.
df.shape

In [18]:
# Count missing values per column before any cleaning.
df.isna().sum()


In [19]:
# List the distinct values of `rating` (NaN included) to see what needs cleaning.
df['rating'].unique()

In [20]:
# Inspect the rows whose rating is missing.
df[df['rating'].isna()]

In [21]:
# Remove the `director` and `cast` columns, then confirm which columns remain.
df = df.drop(columns=['director', 'cast'])
df.columns

In [22]:
# Inspect the rows that have no `date_added` value.
df[df['date_added'].isna()]

In [23]:
# Keep only the rows that have a `date_added` value.
df = df.dropna(subset=['date_added'])

In [24]:
# Impute missing countries with the most frequent value in the column.
most_common_country = df['country'].mode()[0]
df['country'] = df['country'].fillna(most_common_country)


In [25]:
# Re-check missing values per column after the column/row drops and the country fill.
df.isna().sum()

In [26]:
# Impute missing ratings with the most frequent rating.
most_common_rating = df['rating'].mode()[0]
df['rating'] = df['rating'].fillna(most_common_rating)

In [27]:
# Impute missing durations with the most frequent duration value.
most_common_duration = df['duration'].mode()[0]
df['duration'] = df['duration'].fillna(most_common_duration)

In [28]:
# Re-check missing values per column after filling `rating` and `duration`.
df.isna().sum()

In [29]:
# Preview the first rows after the missing-value handling.
df.head()


In [30]:
# Take the last whitespace-separated token of `date_added` as the year.
# str.split() with no pattern splits on runs of whitespace and ignores
# leading/trailing whitespace, so padded date strings cannot yield an empty
# token (an empty string would later break the numeric conversion).
df['year_added'] = df['date_added'].str.split().str[-1]
df['year_added'].head()

In [31]:
# Take the first whitespace-separated token of `date_added` as the month.
# Splitting on a literal " " returns '' for values that start with a space;
# str.split() with no pattern ignores leading whitespace, so the first token
# is always the first actual word of the date string.
df['month_added'] = df['date_added'].str.split().str[0]
df['month_added'].tail()

In [32]:
# Rating labels grouped by the audience they are mapped to.
_AGE_GROUP_RATINGS = {
    'Kids': ('TV-Y', 'TV-G', 'G'),
    'Older Kids': ('TV-PG', 'TV-Y7-FV', 'TV-Y7', 'PG'),
    'Teens': ('TV-14', 'PG-13'),
    'Adults': ('TV-MA', 'R', 'NR', 'UR', 'NC-17'),
}

# Flat lookup: rating label -> target age group.
ratings_ages = {
    rating: age_group
    for age_group, ratings in _AGE_GROUP_RATINGS.items()
    for rating in ratings
}

In [33]:
# Map each rating label to its target age group.
# replace() leaves values without a key in `ratings_ages` unchanged, so
# unique() below also exposes any unexpected strings in `rating`.
df['target_ages'] = df['rating'].replace(ratings_ages)
df['target_ages'].unique()

In [34]:
# Remove the rows whose `rating` holds one of these duration-like strings.
# A single boolean mask replaces dropping rows from `df` in place while
# iterating over its own column, and avoids rescanning the frame per match.
# .copy() gives later column assignments an independent frame to write to.
misplaced_durations = ["74 min", "84 min", "66 min"]
df = df[~df["rating"].isin(misplaced_durations)].copy()
        
    

In [35]:
# Recompute the age-group column from `rating` and list its distinct values
# to check which groups (or unmapped strings) are present at this point.
df['target_ages'] = df['rating'].replace(ratings_ages)
df['target_ages'].unique()

In [36]:
# Use the first country listed in the comma-separated `country` string.
# Leading/trailing commas and spaces are stripped first so that a value such
# as ", France, Algeria" yields "France" instead of an empty string, and the
# chosen name never carries surrounding whitespace.
df['principal_country'] = (
    df['country']
    .str.strip(', ')
    .str.split(',')
    .str[0]
    .str.strip()
)
df['principal_country'].head()

In [37]:
# Column dtypes before the explicit type conversions.
df.dtypes


In [38]:
# `type` becomes a categorical column with categories inferred from the data.
df['type'] = df['type'].astype('category')

# `target_ages` is categorical with four explicitly listed classes.
age_group_dtype = pd.CategoricalDtype(categories=['Kids', 'Older Kids', 'Teens', 'Adults'])
df['target_ages'] = df['target_ages'].astype(age_group_dtype)

# `year_added` is converted to a numeric dtype so it can be compared with other year values.
df['year_added'] = pd.to_numeric(df['year_added'])

In [39]:
# Confirm the dtypes after the conversions.
df.dtypes

In [40]:
# Bar chart of how many titles fall under each value of `type`.
sns.set(style="darkgrid")
# NOTE(review): recent seaborn releases appear to warn when `palette` is passed without `hue` — confirm against the installed version.
ax = sns.countplot(x="type", data=df, palette="Set2")

In [41]:
# Split the frame by content type for the per-type charts.
is_movie = df['type'].eq('Movie')
is_show = df['type'].eq('TV Show')
movie_df = df.loc[is_movie]
show_df = df.loc[is_show]

In [42]:
# Display the cleaned frame (the notebook renders a truncated preview by default).
df

In [43]:
def generate_rating_df(df):
    """Count titles per (rating, target age group) pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with `show_id`, `rating` and `target_ages` columns.

    Returns
    -------
    pandas.DataFrame
        Columns `rating`, `target_ages` and `counts`, holding only the pairs
        with a non-zero count of non-null `show_id` values, sorted by
        `target_ages`.
    """
    # observed=True restricts the result to category combinations that occur
    # in the data; without it a categorical `target_ages` expands the result
    # to every rating x category pair and recent pandas emits a FutureWarning.
    rating_df = (
        df.groupby(['rating', 'target_ages'], observed=True)['show_id']
        .count()
        .reset_index(name='counts')
    )
    # Groups whose `show_id` values are all null still count as zero; drop them.
    rating_df = rating_df[rating_df['counts'] != 0]
    # A stable sort keeps ratings in a deterministic order within each age group.
    return rating_df.sort_values('target_ages', kind='stable')

In [44]:
# Bar chart of title counts per rating, colored by target age group.
rating_df = generate_rating_df(df)
fig = px.bar(rating_df, x='rating', y='counts', color='target_ages')
fig.show()

In [45]:

# Rating counts for each content type, in the order the donuts are drawn.
movie_rating_df = generate_rating_df(movie_df)
show_rating_df = generate_rating_df(show_df)

fig = make_subplots(rows=1, cols=2, specs=[[{"type": "pie"}, {"type": "pie"}]])

# One donut per content type: movies in column 1, TV shows in column 2.
for column, type_rating_df in enumerate((movie_rating_df, show_rating_df), start=1):
    donut = go.Pie(labels=type_rating_df['target_ages'], values=type_rating_df['counts'])
    fig.add_trace(donut, row=1, col=column)

fig.update_traces(textposition='inside', hole=.4, hoverinfo="label+percent+name")

# Captions placed at the center of each donut hole.
center_labels = [
    dict(text='Movies', x=0.20, y=0.5, font_size=12, showarrow=False),
    dict(text='TV Shows', x=0.82, y=0.5, font_size=12, showarrow=False),
]
fig.update_layout(
    title_text="Rating distribution by Type of content",
    annotations=center_labels,
)
fig.show()

In [46]:
# Title counts per principal country, with explicit column names.
# The default column names produced by value_counts().reset_index() differ
# between pandas versions ('index'/'principal_country' before 2.0,
# 'principal_country'/'count' from 2.0 on), so name both columns here.
country_df = (
    df['principal_country']
    .value_counts()
    .rename_axis('country')
    .reset_index(name='counts')
)

# Keep only the countries that account for more than 1% of all titles.
min_share = 0.01
country_df = country_df[country_df['counts'] / country_df['counts'].sum() > min_share]

fig = px.pie(country_df, values='counts', names='country')
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()