In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, iplot

# Load the raw Netflix catalogue.
netflix_df = pd.read_csv('/kaggle/input/netflix-shows/netflix_titles.csv')

# Replace missing credits/country with explicit placeholders so later cells can
# filter on them, then drop the rows that lack a date, rating or duration.
# The index is reset afterwards so that index labels equal row positions again:
# the recommender cells further down mix label lookups with positional
# indexing (`.iloc`, rows of the similarity matrix) and need the two to agree.
netflix_df = (
    netflix_df
    .fillna({
        'director': "No Director",
        'cast': "No Cast",
        'country': "Country Unavailable",
    })
    .dropna(subset=['date_added', 'rating', 'duration'])
    .reset_index(drop=True)
)

netflix_df['country'] = netflix_df['country'].str.strip()

# `date_added` is read as text; strip surrounding whitespace before parsing it
# and derive the month/year columns used by the growth plots.
netflix_df['date_added'] = pd.to_datetime(netflix_df['date_added'].str.strip())
netflix_df['month_added'] = netflix_df['date_added'].dt.month
netflix_df['year_added'] = netflix_df['date_added'].dt.year

In [None]:
# A notebook cell only renders its final expression, so each summary is passed
# to display() explicitly (display is provided by the IPython kernel).
display(netflix_df.head())
display(netflix_df.dtypes)
display(netflix_df.isnull().sum())

# Content type on Netflix

In [None]:
# Share of each content type in the catalogue.
type_counts = netflix_df.type.value_counts()

fig, ax = plt.subplots(figsize=(12, 6))
ax.pie(
    type_counts,
    labels=type_counts.index,
    autopct="%.1f%%",
    colors=["black", "red"],
    explode=(0.025, 0.025),
)
ax.set_title("Type of content in Netflix")
ax.legend(loc="upper left", bbox_to_anchor=(1.0, 1.0))
plt.show()

About 70% of the content on Netflix is movies; the remaining 30% is TV shows.

# Growth in content over the years

In [None]:
# Titles added per year: split by content type, and overall.
netflix_df_year = netflix_df.groupby(['year_added', 'type']).size().reset_index(name='count')
all_df_year = netflix_df.groupby('year_added').size().reset_index(name='count')

fig, ax = plt.subplots(figsize=(13, 7))
# Draw both layers on the axes created above instead of relying on pyplot's
# implicit "current axes".
sns.lineplot(data=all_df_year, x='year_added', y='count', color='blue', label='All Content', ax=ax)
sns.lineplot(
    data=netflix_df_year,
    x='year_added',
    y='count',
    hue='type',
    palette={'Movie': 'green', 'TV Show': 'orange'},
    ax=ax,
)
ax.set_title('Total content added across all years')
# The counts are grouped by `year_added` (when a title joined the catalogue),
# so the axes are labelled as additions rather than releases.
ax.set_ylabel('Titles added')
ax.set_xlabel('Year added')
plt.show()

The number of movies on Netflix has grown much faster than the number of TV shows. About 1300 new movies were added in each of 2018 and 2019. The growth in movie content started in 2015.


In [None]:
# Number of titles added in each (month, year) combination.
pivot_table = netflix_df.pivot_table(index='month_added', columns='year_added', aggfunc='size')

fig, ax = plt.subplots(figsize=(13, 7))
# `linewidths` is seaborn's documented keyword for the width of the lines
# separating the cells.
sns.heatmap(pivot_table, linewidths=0.5, cmap='viridis', ax=ax)
ax.set_title('Titles added by month and year')
ax.set_xlabel('Year added')
ax.set_ylabel('Month added')
plt.show()

# Countries by the Amount of Content Produced

In [None]:
# One entry per (title, country) pair. Splitting on the bare comma and then
# stripping each token (the same approach the director/cast cells use) copes
# with inconsistent spacing around commas; tokens left empty by stray or
# trailing commas are discarded together with the placeholder value.
filtered_countries = netflix_df.country.str.split(',').explode().str.strip()
filtered_countries = filtered_countries[
    ~filtered_countries.isin(['', 'Country Unavailable'])
]

plt.figure(figsize=(13, 7))

sns.countplot(y=filtered_countries, order=filtered_countries.value_counts().index[:15])

plt.title('Top 15 Countries Contributor on Netflix')
plt.xlabel('Titles')
plt.ylabel('Country')
plt.show()

# Top directors and actors on Netflix

In [None]:
# One entry per (title, director) credit, indexed by title. Rows carrying the
# "No Director" placeholder are excluded, and blank tokens produced by stray
# commas are dropped so they cannot be counted as a director.
filtered_directors = (
    netflix_df[netflix_df.director != "No Director"]
    .set_index("title")
    .director
    .str.split(",")
    .explode()
    .str.strip()
)
filtered_directors = filtered_directors[filtered_directors != ""]

plt.figure(figsize=(13, 7))
plt.title("Top 15 Director Based on The Number of Titles")

sns.countplot(y=filtered_directors, order=filtered_directors.value_counts().index[:15])

# Explicit axis labels, in line with the country chart.
plt.xlabel('Titles')
plt.ylabel('Director')
plt.show()

In [None]:
# One entry per (title, cast member) credit, indexed by title. Rows carrying
# the "No Cast" placeholder are excluded, and blank tokens produced by stray
# commas are dropped so they cannot be counted as a cast member.
filtered_casts = (
    netflix_df[netflix_df.cast != "No Cast"]
    .set_index("title")
    .cast
    .str.split(",")
    .explode()
    .str.strip()
)
filtered_casts = filtered_casts[filtered_casts != ""]

plt.figure(figsize=(13, 7))
plt.title("Top 15 Cast Based on The Number of Titles")

sns.countplot(y=filtered_casts, order=filtered_casts.value_counts().index[:15])

# Explicit axis labels, in line with the country chart.
plt.xlabel('Titles')
plt.ylabel('Cast')
plt.show()

In [None]:
# The 15 most credited directors and cast members from the two cells above.
top_directors = filtered_directors.value_counts().index[:15]
top_casts = filtered_casts.value_counts().index[:15]

# Expand every title into one row per (cast member, director) combination.
temp_df = netflix_df.set_index("title")
temp_df['cast'] = temp_df['cast'].str.split(',')
temp_df['director'] = temp_df['director'].str.split(',')
temp_df = temp_df.explode('cast').explode('director')
temp_df['cast'] = temp_df['cast'].str.strip()
temp_df['director'] = temp_df['director'].str.strip()

# Keep only the pairings where both names are in the respective top 15.
filtered_cast_director = temp_df[
    temp_df.cast.isin(top_casts) & temp_df.director.isin(top_directors)
]

# Rows: directors, columns: cast members, values: number of matching
# (title, cast, director) rows; pairings that never occur are filled with 0.
pivot_table = filtered_cast_director.pivot_table(
    index='director', columns='cast', aggfunc='size', fill_value=0
)

plt.figure(figsize=(13, 7))
# `linewidths` is seaborn's documented keyword for the cell separator width.
sns.heatmap(pivot_table, annot=True, cmap="coolwarm", linewidths=0.5)
plt.title('Heatmap of Collaborations Between Top Directors and Top Cast Members')
plt.xlabel('Cast')
plt.ylabel('Director')
plt.xticks(rotation=45)
plt.show()

# Duration of movies

In [None]:
# Movie durations are stored as text ending in " min"; build a numeric column.
# .copy() makes netflix_movies an independent frame, so the assignment below
# neither raises SettingWithCopyWarning nor touches netflix_df. Assigning the
# column directly (rather than through .loc[:, ...]) lets it take the integer
# dtype instead of keeping the original object dtype.
netflix_movies = netflix_df[netflix_df['type'] == 'Movie'].copy()
netflix_movies['duration'] = (
    netflix_movies['duration']
    .str.replace(' min', '', regex=False)
    .astype(int)
)

sns.set(style="darkgrid")
# `fill` is the current name of the deprecated `shade` argument.
ax = sns.kdeplot(netflix_movies['duration'], fill=True)
ax.set_title('Distribution of movie durations')
ax.set_xlabel('Duration (minutes)')
plt.show()

# Content Recommendation System

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Vectorise the descriptions with TF-IDF. `stop_words='english'` removes very
# common English words (for example "a", "and", "the", "of", "with") that say
# little about what a title is about.
tfidf = TfidfVectorizer(stop_words='english')
# A missing description would make the vectoriser fail, so treat it as empty text.
tfidf_matrix = tfidf.fit_transform(netflix_df['description'].fillna(''))


In [None]:
from sklearn.metrics.pairwise import linear_kernel

# Pairwise similarity of every description against every other description.
# Called with a single argument, linear_kernel compares the matrix with itself.
cosine_sim = linear_kernel(tfidf_matrix)

In [None]:
# Look-up table: title -> row position in netflix_df. Positions (not index
# labels) are required because get_recommendations uses the value to pick a
# row of cosine_sim and to index with .iloc; labels and positions only
# coincide while the frame has a gap-free 0..n-1 index, which need not hold
# once rows have been dropped. When a title occurs more than once, the first
# occurrence is kept so that a lookup always yields a single integer
# (Series.drop_duplicates() would compare the values, not the titles).
indices = pd.Series(range(len(netflix_df)), index=netflix_df['title'])
indices = indices[~indices.index.duplicated(keep='first')]
indices

In [None]:
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles whose descriptions are most similar to `title`.

    Parameters
    ----------
    title : str
        Title to look up in the module-level `indices` Series. An unknown
        title raises KeyError.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix; row ``i`` holds the scores of the
        ``i``-th row of `netflix_df` against every row. Defaults to the
        matrix computed in the notebook.
    top_n : int, optional
        Number of recommendations to return. Defaults to 10, the size the
        function originally returned.

    Returns
    -------
    pandas.Series
        Up to `top_n` entries of ``netflix_df['title']``, most similar first.
    """
    idx = indices[title]
    # A title that appears several times in `indices` yields a Series rather
    # than a scalar; fall back to its first occurrence.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Leave the queried row out explicitly instead of assuming it is ranked
    # first: another title with an equally high score could otherwise take its
    # place and the query itself would be returned as a recommendation.
    scored = [
        (pos, score)
        for pos, score in enumerate(cosine_sim[idx])
        if pos != idx
    ]
    # list.sort is stable, so ties keep their original row order.
    scored.sort(key=lambda pair: pair[1], reverse=True)

    top_positions = [pos for pos, _ in scored[:top_n]]
    return netflix_df['title'].iloc[top_positions]

In [None]:
# Example query: titles most similar to "Peaky Blinders".
get_recommendations(title='Peaky Blinders')