In [2]:
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
# Suppress every warning for the rest of the session. Note that this also hides
# deprecation notices raised by the libraries used in the cells below.
warnings.filterwarnings(action='ignore')

In [3]:
# Read the titles CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='../input/netflix-shows/netflix_titles.csv')
data.head(n=5)

In [4]:
# Print the column names, non-null counts and dtypes of the loaded frame.
data.info()

In [5]:
# Movies vs TV shows: absolute counts (bar chart, left) and share (pie chart, right).
sns.set(style='darkgrid')

# Count once and take both the labels and the heights from the same Series.
# `unique()` returns labels in order of first appearance while `value_counts()`
# sorts by frequency, so pairing the two can attach a count to the wrong label.
type_counts = data['type'].value_counts()
type_labels = type_counts.index.tolist()

fig, axes = plt.subplots(1, 2, figsize=(14, 5), dpi=200)
axes[0].bar(type_labels, type_counts.values, linewidth=0.7, alpha=0.7,
            color=['#d72631', '#077b8a'], width=0.55)

# Write each count just above its bar.
for label, count in type_counts.items():
    axes[0].annotate(f"{count}", (label, count + 200),
                     ha='center', va='center', family='serif')

axes[0].set_ylim([0, 7000])
# Pin the tick positions before assigning label text, so the labels cannot
# drift away from the values they are supposed to describe.
# Categorical bars are placed at integer positions 0..n-1.
axes[0].set_xticks(range(len(type_labels)))
axes[0].set_xticklabels(type_labels, family='serif', size=12)
y_ticks = np.arange(0, 7001, 1000)
axes[0].set_yticks(y_ticks)
axes[0].set_yticklabels(y_ticks, family='serif', size=12)

# `explode` and `colors` assume exactly two content types.
axes[1].pie(type_counts.values, explode=[0, 0.15], labels=type_labels,
            colors=['#e3b448', '#3a6b35'], autopct='%1.1f%%', startangle=90,
            wedgeprops={'edgecolor': '#322e2f', 'linestyle': 'solid', 'linewidth': 0.4, 'alpha': 0.8},
            textprops={'family': 'serif', 'size': 12})
fig.text(0.4, 0.95, 'Movies vs TV Show', family='serif', size=20, weight='bold')
plt.show()

In [6]:
# Bar chart of the ten most frequent content ratings, shaded by relative frequency.
fig, ax = plt.subplots(1, 1, figsize=(10, 5), dpi=200)
ratings = data['rating'].value_counts().head(10)

# Scale counts to (0, 1] so the largest bar gets the darkest colormap shade.
data_hight = [x / max(ratings.values) for x in ratings.values]

# `plt.cm.get_cmap` was removed in matplotlib 3.9; `plt.get_cmap` is the
# equivalent lookup that is available in old and new releases alike.
my_cmap = plt.get_cmap('YlOrRd')
colors = my_cmap(data_hight)

ax.bar(ratings.index, ratings.values, width=0.55, alpha=0.9, color=colors)

# Write each count just above its bar.
for rating, count in ratings.items():
    ax.annotate(f'{count}', (rating, count + 200), ha='center', va='center', size=10,
                weight='light', family='serif', color='#4a4a4a')

for side in ['top', 'right', 'left']:
    ax.spines[side].set_visible(False)
ax.set_ylim([0, 4000])
# Fix tick positions explicitly before assigning label text; otherwise the
# labels are applied to whatever ticks the auto-locator happened to choose.
# Categorical bars are placed at integer positions 0..n-1.
ax.set_xticks(range(len(ratings)))
ax.set_xticklabels(ratings.index, family='serif', size=10)
y_ticks = np.arange(0, 4001, 500)
ax.set_yticks(y_ticks)
ax.set_yticklabels(y_ticks, family='serif', size=10)
ax.grid(axis='y', linestyle='-', alpha=0.5)
fig.text(0.15, 0.95, 'Netflix Content Ratings', family='serif', size=20, weight='bold')
plt.show()

In [7]:
# Heatmap of how many titles were added in each (month, year) pair.
# `.copy()` so the new columns are written to an independent frame rather than
# to a slice of `data`.
date = data[['date_added']].dropna().copy()

# Strip surrounding whitespace before splitting: a value such as
# ' August 4, 2017' would otherwise yield an empty month and be dropped
# when the month columns are selected below.
added = date['date_added'].astype(str).str.strip()
date['year'] = added.str.split(' ').str[-1]
date['month'] = added.str.split(' ').str[0]
month_list = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
              'August', 'September', 'October', 'November', 'December']

# Rows = months (calendar order), columns = years. `reindex` guarantees all
# twelve months are present (filled with 0) instead of raising KeyError when a
# month never occurs in the data.
netflix_date = (
    date.groupby('year')['month'].value_counts()
        .unstack()
        .fillna(0)
        .reindex(columns=month_list, fill_value=0)
        .T
)
fig, ax = plt.subplots(1, 1, figsize=(13,5), dpi=200)

# `linewidths` / `linecolor` are heatmap's documented parameters for the cell borders.
sns.heatmap(netflix_date, cmap='RdPu_r', annot=True, ax=ax, linecolor='white', linewidths=2,
            fmt='g', annot_kws={'family': 'serif'})

# Heatmap cells are one unit wide, so tick labels are centred at i + 0.5.
plt.xticks(np.arange(0.5, len(netflix_date.columns), 1), netflix_date.columns, family='serif', size=12)
plt.yticks(np.arange(0.5, len(netflix_date.index), 1), netflix_date.index, family='serif', size=12)
plt.xlabel('')
plt.ylabel('')
fig.text(0.15, 0.95, 'Netflix Contents Each Year', family='serif', size=20, weight='bold')
plt.show()

In [8]:
# Line chart of the per-year counts in `netflix_date`, one line per month.
fig, ax = plt.subplots(1, 1, figsize=(13,5), dpi=200)

# Walk the month rows in calendar order; each row holds one value per year column.
for month_name, yearly_counts in netflix_date.loc[month_list].iterrows():
    ax.plot(yearly_counts.index, yearly_counts.values, label=month_name, linewidth=2)

year_ticks = netflix_date.columns
count_ticks = np.arange(0, 251, 50)
plt.xticks(year_ticks, family='serif', size=12)
plt.yticks(count_ticks, family='serif', size=12)
ax.legend(prop={'family': 'serif', 'size': 10})
fig.text(0.15, 0.95, 'Netflix Contents Each Year(Line Chart)', family='serif', size=20, weight='bold')
plt.show()

In [9]:
# Horizontal bar chart of the ten most common genres.
# A title can list several genres in `listed_in`, separated by ', ', so each
# genre is counted individually.
content_types = data['listed_in'].dropna().str.split(', ')

# Feed every genre of every title into the counter in one pass
# (`Series.iteritems()` no longer exists in pandas 2.x).
content_counter = Counter(genre for genres in content_types for genre in genres)

# most_common() without an argument returns all (genre, count) pairs, largest first.
ranked = content_counter.most_common()
items = [genre for genre, _ in ranked]
popularity = [count for _, count in ranked]

# Never request more bars than there are genres.
top_n = min(10, len(items))

# Scale counts to (0, 1] so the most common genre gets the darkest shade.
popularity_norm = [x / max(popularity) for x in popularity]
# `plt.cm.get_cmap` was removed in matplotlib 3.9; `plt.get_cmap` works on old
# and new releases. Other sequential colormaps ('Blues', 'YlOrRd', 'PuBuGn', ...)
# can be substituted here.
popularity_cmap = plt.get_cmap('YlGnBu')
colors = popularity_cmap(popularity_norm)

fig, ax = plt.subplots(1, 1, figsize=(13, 7), dpi=200)
ax.barh(items[:top_n], popularity[:top_n], color=colors[:top_n], alpha=0.9)

# Write each count just to the right of its bar.
for i in range(top_n):
    ax.annotate(f'{popularity[i]}', (popularity[i] + 70, items[i]), ha='center', va='center',
                family='serif', size=12, color='#4a4a4a')
ax.set_xlim([0, 3000])
# Pin tick positions before assigning label text so labels always match the scale.
x_ticks = np.arange(0, 3001, 500)
ax.set_xticks(x_ticks)
ax.set_xticklabels(x_ticks, family='serif', size=12)
# Categorical bars are placed at integer positions 0..n-1.
ax.set_yticks(range(top_n))
ax.set_yticklabels(items[:top_n], family='serif', size=12)
fig.text(0.25, 0.92, 'Top 10 Most Popular Content Genres', weight='bold', family='serif', size=20)
plt.show()

In [10]:
# Build `cast_type`: one row per (cast member, content type) pair.
# `.copy()` so the column assignment below does not write into a slice of `data`.
cast = data[['cast', 'type']].dropna().copy()
# The `cast` column holds a ', '-separated string of names; turn it into a list.
cast['cast'] = cast['cast'].str.split(', ')
# `explode` emits one row per list element in a single vectorised step. It
# replaces the row-by-row `DataFrame.append` loop, which was quadratic and no
# longer exists in pandas 2.x.
cast_type = cast.explode('cast')[['cast', 'type']].reset_index(drop=True)

In [11]:
# Diverging bar chart for the 20 most frequent cast members:
# movie appearances extend to the right, all other appearances to the left.
top20_cast = cast_type['cast'].value_counts()[:20]

in_top20 = cast_type['cast'].isin(top20_cast.index)
is_movie = cast_type['type'] == 'Movie'
# Reindex onto `top20_cast` so that (a) a member without any movie gets 0
# instead of producing NaN in the subtraction below, and (b) all three Series
# share one ordering, which keeps bars, annotations and tick labels in sync.
top20_cast_movie = (
    cast_type.loc[in_top20 & is_movie, 'cast']
    .value_counts()
    .reindex(top20_cast.index, fill_value=0)
)
# Everything that is not a movie appearance is shown on the 'TV Show' side.
top20_cast_tv = top20_cast - top20_cast_movie

fig, ax = plt.subplots(1, 1, figsize=(10, 10), dpi=200)
# Scale counts to [0, 1] for the colormaps; `max(..., 1)` avoids dividing by
# zero when one side has no appearances at all.
# `plt.cm.get_cmap` was removed in matplotlib 3.9; `plt.get_cmap` works on old
# and new releases.
movie_norm = [x / max(top20_cast_movie.max(), 1) for x in top20_cast_movie]
movie_cmap = plt.get_cmap('Purples')
movie_colors = movie_cmap(movie_norm)
tv_norm = [x / max(top20_cast_tv.max(), 1) for x in top20_cast_tv]
tv_cmap = plt.get_cmap('Blues')
tv_colors = tv_cmap(tv_norm)

ax.barh(top20_cast.index, top20_cast_movie.values, label='Movie',
        alpha=0.9, color=movie_colors)
# Negative widths draw the second set of bars to the left of zero.
ax.barh(top20_cast.index, -top20_cast_tv.values, label='TV Show',
        alpha=0.9, color=tv_colors)

# Write each count just beyond the end of its bar.
for name in top20_cast.index:
    movie_count = top20_cast_movie[name]
    tv_count = top20_cast_tv[name]
    ax.annotate(f'{movie_count}', (movie_count + 1, name), ha='center', va='center',
                family='serif', size=9, color='#4a4a4a')
    ax.annotate(f'{tv_count}', (-tv_count - 1, name), ha='center', va='center',
                family='serif', size=9, color='#4a4a4a')

# Categorical bars are placed at integer positions 0..n-1 in the order the
# names were passed to barh, i.e. the order of `top20_cast.index`.
ax.set_yticks(range(len(top20_cast)))
ax.set_yticklabels(top20_cast.index, family='serif', size=10)
ax.set_xticks([])
fig.text(0.2, 0.9, 'Top 20 Cast with Most Content', weight='bold', family='serif', size=20)
plt.legend(prop={'family': 'serif'})
plt.show()

In [237]:
# Histogram of durations (in minutes) for entries whose duration ends in ' min'.
# Missing durations are replaced by '0' so the string test below yields a
# clean boolean mask. NOTE: this updates `data` itself.
data['duration'] = data['duration'].fillna('0')
# `.copy()` so the conversion below writes to an independent frame instead of
# a filtered slice of `data`.
duration_df = data[data['duration'].str.endswith(' min')].copy()
# '90 min' -> 90
duration_df['duration'] = duration_df['duration'].str.split(' ').str[0].astype(int)

fig, ax = plt.subplots(1, 1, figsize=(13, 5), dpi=200)
# 10-minute bins from 0 to 200; durations outside this range are not drawn.
bins = np.arange(0, 201, 10)

num_each_bin, bins, patches =  ax.hist(duration_df['duration'], bins=bins, alpha=0.7)
# Scale bin counts to [0, 1] for the colormap; `max(..., 1)` avoids dividing
# by zero when every bin is empty.
duration_norm = [x / max(num_each_bin.max(), 1) for x in num_each_bin]
# `plt.cm.get_cmap` was removed in matplotlib 3.9; `plt.get_cmap` works on old
# and new releases.
duration_cmap = plt.get_cmap('spring_r')
duration_colors = duration_cmap(duration_norm)

# Colour each bar by its relative height; iterate over the bars that were
# actually drawn rather than a hard-coded count.
for patch, colour in zip(patches, duration_colors):
    patch.set_facecolor(colour)
# Write each bin's count above the centre of its bar.
for i in range(len(patches)):
    ax.annotate(f'{int(num_each_bin[i])}', ((bins[i] + bins[i + 1]) // 2, int(num_each_bin[i]) + 50),
                ha='center', va='center', size=10, family='serif', color='#4a4a4a')

ax.set_ylim([0, 1500])
ax.set_xticks(bins)
ax.set_xticklabels(bins, family='serif')
# Pin the y tick positions before assigning label text so labels match the scale.
y_ticks = np.arange(0, 1401, 200)
ax.set_yticks(y_ticks)
ax.set_yticklabels(y_ticks, family='serif')
fig.text(0.35, 0.9, 'Regular Content Duration', weight='bold', family='serif', size=20)
plt.show()