In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the Netflix titles CSV from the Kaggle input directory into `ds`,
# the DataFrame that the later cells inspect, clean and plot.
ds=pd.read_csv("/kaggle/input/netflix-shows/netflix_titles.csv")

# Initial Dataset

In [None]:
# Preview the first rows of the freshly loaded data.
ds.head()

In [None]:
# Display the (rows, columns) shape together with the column labels.
ds.shape,ds.columns

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() appends a stray "None" line to the output.
ds.info()

# Checking The NULL VALUES

In [None]:
# Count the missing entries in each column.
ds.isna().sum()

In [None]:
# Show the rows that have no director recorded.
missing_director = ds['director'].isna()
ds.loc[missing_director]

# Data Cleaning and Preprocessing
We will clean the dataset by handling missing values and formatting data for analysis:

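Replace missing values in director with 'Unknown', and missing values in cast, country, date_added, rating and duration with each column's mode (most frequent value).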

In [None]:
# Missing directors get the explicit placeholder 'Unknown'; the director
# ranking cells later filter this placeholder out again.
ds['director']=ds['director'].fillna('Unknown')

In [None]:
# Re-check the per-column missing counts after filling `director`.
ds.isna().sum()

# Filling the remaining missing values with each column's mode (most frequent value)

In [None]:
# Fill the missing values in these columns with the column's mode
# (its most frequent value).
mode_filled_columns = ['cast', 'country', 'date_added', 'rating', 'duration']
for column in mode_filled_columns:
    most_frequent = ds[column].mode()[0]
    ds[column] = ds[column].fillna(most_frequent)

In [None]:
# Text summary of the cleaned frame: size first, then one labelled section per
# report (dtypes, remaining missing values, numeric description).
print(f"Rows: {ds.shape[0]}, Columns: {ds.shape[1]}")
summary_sections = (
    ("\nColumn Data Types:\n", ds.dtypes),
    ("\nMissing Values:\n", ds.isna().sum()),
    ("\n Description for the dataset:\n", ds.describe()),
)
for heading, report in summary_sections:
    print(heading, report)

In [None]:
# Count the distinct values in the director column.
ds['director'].nunique()

In [None]:
# List the distinct content types present in the data.
ds['type'].unique()

# Exploratory Data Analysis (EDA)
Let's analyze key aspects of the data like:

* Movies vs TV Shows
* Top ten directors
* Most frequent genres



# Total number of Movies vs TV Shows

In [None]:
# Number of titles per content type, most frequent first.
type_counts = ds['type'].value_counts()
print("Classification:\n", type_counts)

## Bar Chart for the Movies vs TV show count

In [None]:
# Imported here because this is the first cell that plots; without it the cell
# raises NameError on a fresh kernel, since pyplot is otherwise only imported
# by a later cell.
import matplotlib.pyplot as plt

# Number of titles per content type (recomputed so the cell stands on its own).
type_counts = ds['type'].value_counts()

# Vertical bar chart, one bar per content type.
plt.figure(figsize=(6, 4))
type_counts.plot(kind='bar', color=['#000000', '#FF0000'])
plt.title('Content Type Distribution')
plt.xlabel('Type')
plt.ylabel('Count')
plt.xticks(rotation=0)  # keep the type labels horizontal
plt.tight_layout()
plt.show()

In [None]:
# Ten most frequent values of the raw `country` column. A title that lists
# several countries is counted under its full combined string here.
top_countries = ds['country'].value_counts().head(10)
print("Top 10 Countries by Number of Titles:\n", top_countries)

# Top Ten Directors, Shown as a Bar Chart

In [None]:
# Rank directors by number of titles, leaving out the 'Unknown' placeholder.
known_director = ds['director'] != 'Unknown'
top_directors = ds.loc[known_director, 'director'].value_counts().head(10)
print("Top 10 Directors:\n", top_directors)

In [None]:
import matplotlib.pyplot as plt

# Ten most frequent directors, ignoring the 'Unknown' placeholder.
top_directors = ds[ds['director'] != 'Unknown']['director'].value_counts().head(10)

# Horizontal bars: the title counts run along the x-axis (so the x label and
# the x-axis grid describe the right dimension) and the director names sit on
# the y-axis, where long names stay readable.
fig, ax = plt.subplots(figsize=(10, 6))
top_directors.plot(kind='barh', color='#FF0000', ax=ax)
ax.set_title('Top 10 Directors on Netflix')
ax.set_xlabel('Number of Titles')
ax.set_ylabel('Director')
# barh draws the first row at the bottom; flip the axis so the director with
# the most titles appears at the top.
ax.invert_yaxis()
ax.grid(axis='x', linestyle='--', alpha=0.5)
fig.tight_layout()
plt.show()

# The most frequent genre in Netflix

In [None]:

import matplotlib.pyplot as plt
from collections import Counter

# `listed_in` holds each title's genres as one ', '-separated string; split
# every entry and flatten the results into a single list of genre labels.
all_genres = ds['listed_in'].dropna().str.split(', ')
flat_genres = []
for title_genres in all_genres:
    flat_genres.extend(title_genres)

# Tally the labels and keep the ten most common.
genre_counts = Counter(flat_genres)
top_genres = genre_counts.most_common(10)

# Separate the (genre, count) pairs into two parallel sequences for plotting.
genres, counts = zip(*top_genres)

# Horizontal bar chart; both sequences are reversed so the most frequent
# genre is drawn at the top.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(genres[::-1], counts[::-1], color='skyblue')
ax.set_xlabel("Number of Titles")
ax.set_title("Top 10 Most Frequent Genres on Netflix")
fig.tight_layout()
plt.show()

# Top 10 Countries 

In [None]:
# Ten most frequent values of the raw `country` column. Multi-country entries
# are counted as one combined string; the next cell splits them apart.
top_countries = ds['country'].value_counts().head(10)
print("Top 10 Countries by Number of Titles:\n", top_countries)

In [None]:
# Count titles per individual country: a title that lists several countries
# in one comma-separated string is counted once for each of them.
# Splitting on the bare comma and stripping whitespace (rather than splitting
# on the exact ', ' sequence) keeps irregular spacing or a stray leading or
# trailing comma from producing empty or comma-suffixed "country" labels.
country_names = ds['country'].str.split(',').explode().str.strip()
# The variable keeps its original name, although it holds per-country counts.
language_proxy = country_names[country_names != ''].value_counts().head(10)
print("Top 10 Content-Producing Countries:\n",language_proxy)

# Rating Distribution by Type

In [None]:
# One panel per content type, side by side, sharing the y-axis so the bar
# heights can be compared across panels.
fig, axs = plt.subplots(1, 2, figsize=(14, 5), sharey=True)

for i, (ax, t) in enumerate(zip(axs, ['Movie', 'TV Show'])):
    # Ten most frequent ratings among the titles of this type.
    rating_counts = ds.loc[ds['type'] == t, 'rating'].value_counts().head(10)
    ax.bar(rating_counts.index, rating_counts.values, color=plt.cm.Set3(i * 30))
    ax.set_title(f'Top Ratings - {t}s')
    ax.set_xlabel('Rating')
    ax.tick_params(axis='x', rotation=45)

# With a shared y-axis, only the left panel needs the label.
axs[0].set_ylabel('Count')
fig.suptitle('Content Rating Distribution by Type', fontsize=30)
plt.tight_layout()
plt.show()

# Conclusion
This analysis shows that Netflix's content, and the ratings assigned to it, are concentrated on movies, particularly in the drama genres. This insight can help content creators understand market trends, and help viewers discover popular genres, directors and countries.