In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Plot styling: select matplotlib's 'ggplot' style sheet, then seaborn's 'whitegrid' style.
plt.style.use('ggplot')
sns.set_style('whitegrid')

# Load the Netflix titles CSV from the Kaggle input directory.
# NOTE(review): the file is decoded as latin1 - presumably because it is not valid UTF-8; confirm.
df = pd.read_csv('/kaggle/input/netflix-movies-and-tv-shows/netflix_titles.csv', encoding ='latin1')

# First Look

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

**A lot of unnecessary columns**

In [None]:
# Remove the 'Unnamed' placeholder columns.
# They are selected by name rather than by position (previously: everything from
# column 12 onward), so the cell stays correct if the column layout changes and
# never drops a real column by accident.

unnamed_columns = [col for col in df.columns if str(col).startswith('Unnamed')]
df = df.drop(columns=unnamed_columns)

In [None]:
# Summary of the remaining columns (printed by info() itself).
df.info()

**There are a significant number of null values**

In [None]:
# Percentage of null values per column, largest first

missing_counts = df.isna().sum()
missing_pct = missing_counts.div(len(df)).mul(100)
missing_pct.round(2).sort_values(ascending=False)

dropna() is not an option because it will remove at least 30% of the data

# Cleaning and transformation 

In [None]:
# Replace null values in ['director'] with 'Unknown'.
# The result is assigned back to the column: calling fillna(inplace=True) on
# `df['director']` is a chained in-place update, which is deprecated in pandas 2.x
# and does not modify `df` when Copy-on-Write is enabled.

df['director'] = df['director'].fillna('Unknown')

In [None]:
# Share of rows that still contain at least one null, i.e. what dropna() would remove.
result = 1 - (df.dropna().shape[0] / df.shape[0])
rounded_result = round(result, 2)

# `rounded_result` is a fraction in [0, 1]; the '%' format type multiplies it by 100,
# so 0.17 is reported as "17%" instead of the misleading "0.17%".
print(f'Using the dropna() function will delete: {rounded_result:.0%} of the data')

**17%** is quite a big number.

Similarly to ['director'], null values in the ['country'] and ['cast'] columns will be replaced with 'Unknown'

In [None]:
# Fill the remaining high-null text columns with 'Unknown'.
# Assign back instead of fillna(inplace=True) on a selected column: the chained
# in-place form is deprecated and has no effect on `df` under Copy-on-Write.
df['country'] = df['country'].fillna('Unknown')
df['cast'] = df['cast'].fillna('Unknown')

# Re-compute the share of rows that dropna() would now remove.
result = 1 - (df.dropna().shape[0] / df.shape[0])
rounded_result = round(result, 4)

# `rounded_result` is a fraction; format it as a real percentage (0.0019 -> "0.19%").
print(f'Using the dropna() function will delete: {rounded_result:.2%} of the data')

Only about **0.19%** of the rows (a fraction of 0.0019) would now be removed, which is a pretty good value

In [None]:
# Drop every row that still contains a null value.
df = df.dropna()

In [None]:
# Re-inspect the cleaned frame.
display(df.head())

# info() prints its report itself and returns None, so it is called directly;
# wrapping it in display() only rendered an extra "None".
df.info()

In [None]:
# Distinct values of the 'type' column
df['type'].unique()

It is better to divide the table into 2 (by type): 
* **show**
* **movie**

In [None]:
# Split the catalogue by title type. Each subset is an explicit copy, so columns
# can be changed on `show` / `movie` without touching `df`.
is_show = df['type'] == 'TV Show'
is_movie = df['type'] == 'Movie'

show = df[is_show].copy()
movie = df[is_movie].copy()

display(show.head(), movie.head())

# Analyzing TV Shows

In [None]:
# Number of TV shows per release year (index: year, value: number of shows)
shows_per_year = show.groupby('release_year').size()

fig, axes = plt.subplots(3, 1, figsize=(10, 12))

# lineplot: yearly counts
sns.lineplot(x=shows_per_year.index, y=shows_per_year.values, ax=axes[0])
axes[0].set_xlabel('Release year')
axes[0].set_ylabel('Number of shows')
axes[0].set_title('Number of shows released by year', fontweight = 'bold')

# histplot: bin the release year of every show.
# The raw column is used on purpose: `shows_per_year.index` holds each year only
# once, so binning it would count distinct years, not shows.
sns.histplot(x=show['release_year'], bins=20, ax=axes[1])
axes[1].set_xlabel('Release year')
axes[1].set_ylabel('Number of shows')
axes[1].set_title('Number of shows released by year', fontweight = 'bold')

# boxplot: spread of release years across all shows
sns.boxplot(x=show['release_year'], ax=axes[2])
axes[2].set_xlabel('Release year')
axes[2].set_ylabel('')
axes[2].set_title('Number of shows released by year', fontweight = 'bold')


plt.tight_layout()
plt.show()


From the year 1925 to the late 1980s, the number of shows rarely exceeded 2 per year, but then we can see a significant jump. This is especially noticeable from 2015 (159 releases) to 2021 (315 releases). The drop in the graph is explained by the fact that in 2024 there is only 1 release, and there is simply no data for 2022 and 2023

In [None]:
# Inspect the raw ['duration'] values before converting them to numbers
show['duration'].unique()

In [None]:
# Convert the textual duration to an integer season count and rename the column.
# NOTE(review): values are presumably of the form '1 Season' / '2 Seasons' - confirm
# against the unique() output above.
#
# str.rstrip(' Seasons') treats its argument as a set of characters to strip, not
# as a suffix, so it only worked by coincidence; extracting the digits states the
# intent directly. The guard lets the cell be re-run after the column was renamed.
if 'duration' in show.columns:
    show['duration'] = (
        show['duration'].astype(str).str.extract(r'(\d+)', expand=False).astype(int)
    )
    show = show.rename(columns={'duration': 'Number of Seasons'})
show.head()

In [None]:
# Categorical palette, also used by later cells of the notebook
color_pal = sns.color_palette('tab20')

# Number of shows for each season count
number_of_seasons = show['Number of Seasons'].value_counts()

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
bar_ax, pie_ax = axes

# barplot: absolute counts
sns.barplot(x=number_of_seasons.index, y=number_of_seasons, palette='viridis', ax=bar_ax)
bar_ax.set_title('Counting the number of seasons', fontweight='bold')
bar_ax.set_xlabel('Number of Seasons')
bar_ax.set_ylabel('Number of values')

# pie: relative shares
pie_ax.pie(
    number_of_seasons,
    labels=number_of_seasons.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=color_pal,
)
pie_ax.set_title('Distribution of Number of Seasons', fontweight='bold')

plt.tight_layout()
plt.show()


About 67% of all TV shows end in the 1st season, a smaller share of 15.9% in the 2nd season, 7.4% in the 3rd season, and only 9.6% of all TV shows last 4 or more seasons

## Which country is the most active in TV Show production?

In [None]:
# Distinct raw values of the country column for TV shows.
show['country'].unique()

Since some shows were filmed in several countries at once, it would be wiser to split the ['country'] column and calculate each country separately. For example, if a show was filmed in both the United States and the United Kingdom, then these countries will receive +1 each.

In [None]:
# One 0/1 indicator column per comma-separated token of ['country'].
# NOTE(review): the split is on ',' only, so a name that follows ", " keeps its
# leading space as a separate column label at this point.
show_country = show['country'].str.get_dummies(sep=',')
show_country

In [None]:
# Total number of TV shows per country.
# The indicator columns are summed directly; this replaces the earlier
# transpose -> add 'Total' -> drop-all-other-columns sequence, which materialised
# a full (countries x titles) frame only to discard it.
country_totals = show_country.sum(axis=0)

# Some labels differ only by surrounding whitespace (left over from the ','
# split); strip them so that such duplicates are merged into a single row.
country_totals.index = country_totals.index.str.strip()

# Same shape as before: index named 'index', one 'Total' column, largest first.
show_country_T = (
    country_totals.groupby(level=0).sum()
    .rename_axis('index')
    .to_frame('Total')
    .sort_values('Total', ascending=False)
)
show_country_T

In [None]:
# Horizontal bar chart of the first 50 rows of the per-country totals
top_show_countries = show_country_T.head(50)

plt.figure(figsize=(12, 8))
sns.barplot(y=top_show_countries.index,
            x=top_show_countries['Total'],
            palette='crest')

plt.title('Top 50 countries by the number of produced TV Shows', fontweight='bold')
plt.xlabel('Total amount')
plt.ylabel('Country')

# vertical reference line at the mean of ALL rows (not only the 50 plotted ones)
mean_total = show_country_T['Total'].mean()
plt.axvline(mean_total, color='red', ls='--', alpha=0.5, label='Mean number')
plt.legend(loc='lower right')

# write each bar's value just to the right of the bar
for row, total in enumerate(top_show_countries['Total']):
    plt.text(total + 10, row, f'{total}', ha='left', va='center')

plt.tight_layout()
plt.show()

The USA is significantly ahead of all other countries in the production of TV Shows (932 shows). The gap between first place and second place — 'Unknown', i.e. titles whose country was not specified — is more than 2 times. Since 'Unknown' is most likely a mix of several countries, it is reasonable to simply ignore it. The difference between the United States (1st place) and the United Kingdom (3rd place) is then **3.4 times**. The average number of TV shows produced per country rarely exceeds 30, which makes the figures for the United States all the more striking.

It's safe to say that US-produced TV shows are the most common on the Netflix platform, ahead of every other individual country. In addition, although it is not shown in this study, TV Shows produced in the United States account for **30.8%** of the total TV Show market.


## Shows age restrictions 

In [None]:
# Number of TV shows per rating label
rating_count = show['rating'].value_counts()

# Text style for the percentage labels drawn inside the wedges
wedge_text_style = {'fontsize': 10, 'fontweight': 'bold', 'color' : 'white'}

plt.figure(figsize=(8, 8))
plt.pie(rating_count, colors=color_pal, autopct='%1.1f%%', textprops=wedge_text_style)

plt.title('Age restrictions for TV shows', fontweight='bold')
plt.legend(rating_count.index)

plt.tight_layout()
plt.show()

* TV-MA, TV-14 and TV-PG are the most common, which indicates that TV Shows are oriented towards a more adult audience (82% of all content)
* TV-Y7, TV-Y and TV-G are age restrictions that are acceptable for a fairly young audience and account for about 17% of all content.

**It can be argued that the vast majority of TV programs are aimed at an adult audience (82%), but there are also materials for a younger audience (17%)**

# Analyzing Movies

In [None]:
# Number of movies per release year (index: year, value: number of movies)
movies_per_year = movie.groupby('release_year').size()

fig, axes = plt.subplots(3, 1, figsize=(10, 12))

# lineplot: yearly counts
sns.lineplot(x=movies_per_year.index, y=movies_per_year.values, ax=axes[0], color = color_pal[0])
axes[0].set_xlabel('Release year')
axes[0].set_ylabel('Number of movies')
axes[0].set_title('Number of movies released by year', fontweight = 'bold')

# histplot: bin the release year of every movie.
# The raw column is used on purpose: `movies_per_year.index` holds each year only
# once, so binning it would count distinct years, not movies.
sns.histplot(x=movie['release_year'], bins=20, ax=axes[1], color = color_pal[0])
axes[1].set_xlabel('Release year')
axes[1].set_ylabel('Number of movies')
axes[1].set_title('Number of movies released by year', fontweight = 'bold')

# boxplot: spread of release years across all movies.
# Fixed copy-paste error: this panel previously plotted show['release_year'].
sns.boxplot(x=movie['release_year'], ax=axes[2], color = color_pal[0])
axes[2].set_xlabel('Release year')
axes[2].set_ylabel('')
axes[2].set_title('Number of movies released by year', fontweight = 'bold')


plt.tight_layout()
plt.show()


The situation with TV shows is extremely similar, but we see that the film production market has been much more lively, because the maximum number of films produced per year is about 800, while shows are about 300. Here, too, from 1925 until the late 1980s, the number of films produced was extremely low, and since the late 1980s the market has been revived.

In [None]:
# Convert the textual duration to integer minutes.
# str.rstrip(' min') strips a set of characters rather than the suffix ' min', so
# the digits are extracted explicitly instead. Casting to str first keeps the cell
# re-runnable once the column already holds integers.
movie['duration'] = (
    movie['duration'].astype(str).str.extract(r'(\d+)', expand=False).astype(int)
)


plt.figure(figsize = (12,8))
sns.histplot(movie['duration'],
             kde = True,
             color = color_pal[0])

plt.title('Distribution of movie duration', fontweight = 'bold')
plt.ylabel('Count')
plt.xlabel('Duration (min)')


plt.tight_layout()
plt.show()

The length of the films fits the rules of normal distribution quite well, with the vast majority of films ranging from 1 to 2.5 hours

In [None]:
# Classify every movie into exactly one duration bucket.
# The middle bucket uses a strict lower bound (> 60): with the previous '>= 60' a
# movie of exactly 60 minutes was counted both in 'up to 1 hour' and in '1-2 hours'.
minutes = movie['duration']
movie_duration_class = pd.DataFrame({
    'up to 1 hour': minutes <= 60,
    '1-2 hours': (minutes > 60) & (minutes <= 120),
    '2+ hours': minutes > 120,
})

# Number of movies per bucket, largest first, as a one-column ('total') frame.
# Summing the boolean columns directly replaces the transpose-and-row-sum detour.
movie_duration_class_T = (
    movie_duration_class.sum()
    .sort_values(ascending=False)
    .to_frame('total')
)
movie_duration_class_T

In [None]:
# Pie chart of the duration buckets
wedge_text_style = {'fontsize': 10, 'fontweight': 'bold', 'color' : 'white'}

plt.figure(figsize=(6, 6))
plt.pie(movie_duration_class_T['total'],
        colors=color_pal,
        autopct='%1.1f%%',
        textprops=wedge_text_style)

plt.title('Movie duration', fontweight='bold')
plt.legend(movie_duration_class_T.index)

plt.tight_layout()
plt.show()

73% of all movies last from 1 to 2 hours, about 19% last more than 2 hours and only 8% last up to 1 hour

In [None]:
# Summary statistics restricted to the object-dtype columns of the movie subset.
movie.describe(include ='object')

## Which country is the most active in movies production?

In [None]:
# One 0/1 indicator column per comma-separated token of ['country']
movies_country = movie['country'].str.get_dummies(sep=',')

# Total number of movies per country.
# The indicator columns are summed directly; this replaces the earlier
# transpose -> add 'Total' -> drop-all-other-columns sequence, which materialised
# a full (countries x titles) frame only to discard it.
movie_country_totals = movies_country.sum(axis=0)

# Some labels differ only by surrounding whitespace (left over from the ','
# split); strip them so that such duplicates are merged into a single row.
movie_country_totals.index = movie_country_totals.index.str.strip()

# Same shape as before: index named 'index', one 'Total' column, largest first.
movies_country_T = (
    movie_country_totals.groupby(level=0).sum()
    .rename_axis('index')
    .to_frame('Total')
    .sort_values('Total', ascending=False)
)
movies_country_T

In [None]:
# Horizontal bar chart of the first 50 rows of the per-country totals
top_movie_countries = movies_country_T.head(50)

plt.figure(figsize=(12, 8))
sns.barplot(y=top_movie_countries.index,
            x=top_movie_countries['Total'],
            palette='crest')

plt.title('Top 50 countries by the number of produced movies', fontweight='bold')
plt.xlabel('Total amount')
plt.ylabel('Country')

# vertical reference line at the mean of ALL rows (not only the 50 plotted ones)
mean_total = movies_country_T['Total'].mean()
plt.axvline(mean_total, color='red', ls='--', alpha=0.5, label='Mean number')
plt.legend(loc='lower right')

# write each bar's value just to the right of the bar
for row, total in enumerate(top_movie_countries['Total']):
    plt.text(total + 10, row, f'{total}', ha='left', va='center')

plt.tight_layout()
plt.show()

The difference between the US (1st place) and India (2nd place) is more than 2.5 times. In total, 2750 films were produced in the US, 962 in India, and 534 in the UK. Summing up, we can see a certain similarity between films and TV shows across countries - the leaders hardly change. The only difference is in production volumes. The average number of films per country in this case is about 66, while the same figure for TV Shows is 45. Also, about 35% of the films on Netflix were produced in the United States, while the same figure for TV shows was 30%.
It can be seen that the leaders have hardly changed depending on the product, but it is also noticeable that in India, TV shows are produced much more actively than movies. 


## Movies age restrictions 

In [None]:
# Number of movies per rating label
rating_count_m = movie['rating'].value_counts()

# Text style for the percentage labels drawn inside the wedges
wedge_text_style = {'fontsize': 10, 'fontweight': 'bold', 'color' : 'white'}

plt.figure(figsize=(8, 8))
plt.pie(rating_count_m, colors=color_pal, autopct='%1.1f%%', textprops=wedge_text_style)

plt.title('Age restrictions for movies', fontweight='bold')
plt.legend(rating_count_m.index)

plt.tight_layout()
plt.show()

In the case of films, there is not much more diversification, as TV-MA (adult audience), TV-14 (teenage audience) and R are the most common (70%), but now there is a greater variety of products for younger audiences