# Netflix Data Analysis

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load netflix_movies.csv (path relative to the working directory) into the
# DataFrame used by the cells below.
df = pd.read_csv("netflix_movies.csv")

In [None]:
# Render the DataFrame as the cell output for a first visual check.
df

In [None]:
# Print a summary of the DataFrame (columns, non-null counts, dtypes).
df.info()

In [None]:
# (number of rows, number of columns) of the loaded data.
df.shape

In [None]:
# Preview the first five rows (head() defaults to n=5).
df.head()

In [None]:
# List the column labels.
df.columns

In [None]:
# Number of rows per distinct value of the 'type' column.
df['type'].value_counts()

In [None]:
# Five most frequent values of the 'country' column.
# NOTE(review): each distinct string is counted as-is; if a cell can list
# several countries in one string, such rows are not credited to the
# individual countries — confirm this is the intended ranking.
df['country'].value_counts().head(5)

In [None]:
# Five rows with the smallest release_year (ascending sort, then take the top).
df.sort_values('release_year').head()

In [None]:
# Keep only rows whose 'country' value is exactly 'India', then display them.
is_india = df['country'] == 'India'
bollywood = df[is_india]
bollywood

In [None]:
# Most frequent director among the India-only rows.
# NOTE: value_counts() skips missing values by default. If `bollywood` is
# rebuilt after the cell that fills missing directors with 'Unknown', that
# placeholder is counted as a director too.
bollywood['director'].value_counts().head(1)

In [None]:
# Missing-value count per column, before any cleaning.
df.isna().sum()

In [None]:
# Replace missing directors with the placeholder 'Unknown'.
df['director'] = df['director'].fillna(value='Unknown')

In [None]:
# Display the DataFrame again to inspect the result of the previous step.
df

In [None]:
# Re-check the missing-value count per column.
df.isna().sum()

In [None]:
# Trim surrounding whitespace from the raw date strings, then parse them.
stripped_dates = df['date_added'].str.strip()
df['date_added'] = pd.to_datetime(stripped_dates)

# Derive the calendar year in which each row was added.
df['year_added'] = df['date_added'].dt.year

# Preview the first rows to check the new columns.
df.head()

In [None]:
# Number of rows added per year, ordered chronologically by the year index.
movies_by_year = df['year_added'].value_counts().sort_index()

# Draw the yearly counts as a red line with point markers and label the axes.
ax = movies_by_year.plot(kind='line', marker='o', color='red')
ax.set_title('Netflix new Content Pace')
ax.set_xlabel('Year')
ax.set_ylabel('Added Movies/TVShows')

# Save the figure to disk before showing it.
ax.figure.savefig('netflix_content_pace.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Flag rows whose 'rating' text contains 'min'; missing ratings count as False.
errors_rating = df['rating'].str.contains(pat='min', na=False)

# Report how many rows were flagged.
flagged_count = errors_rating.sum()
print(flagged_count)

In [None]:
# Move the values flagged by `errors_rating` (rating text containing 'min')
# from the 'rating' column into 'duration', then blank the rating.
#
# Only rows whose rating STILL contains 'min' are touched. Without this guard,
# re-running the cell on its own would copy the already-blanked ratings over
# the repaired durations and silently destroy them.
still_misplaced = errors_rating & df['rating'].str.contains('min', na=False)

df.loc[still_misplaced, 'duration'] = df.loc[still_misplaced, 'rating']
df.loc[still_misplaced, 'rating'] = None

# Show the originally flagged rows to confirm the repair.
display(df.loc[errors_rating, ['title', 'rating', 'duration']].head())

In [None]:
# Distinct rating values ordered by how often they occur (most frequent first),
# used as the bar order for the ratings count plot.
order_list = df['rating'].value_counts().index
order_list

In [None]:
# Bar chart of how many rows carry each rating, with bars ordered by `order_list`.
plt.figure(figsize=(12, 6))
sns.countplot(data=df, x='rating', order=order_list)
plt.title('Ratings on Netflix')

# Save first, then show — consistent with the other plotting cells, which all
# end with plt.show().
plt.savefig('netflix_ratings.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Ten most frequent raw 'listed_in' strings (each full string counted as-is).
df['listed_in'].value_counts().head(10)

In [None]:
# Split each 'listed_in' string on ', ' into a list of individual entries.
types = df['listed_in'].str.split(pat=', ')

# One row per individual entry instead of one list per original row.
types_lone = types.explode()

# Ten most frequent individual entries.
top_types = types_lone.value_counts().head(n=10)
print(top_types)

In [None]:
# Horizontal bar chart of the counts in `top_types`, one bar per entry.
ax = sns.barplot(x=top_types.values, y=top_types.index)
ax.set_title('Top Netflix Types')
ax.set_xlabel('Quantity')
ax.set_ylabel('Types')

# Save the figure to disk before showing it.
ax.figure.savefig('netflix_top.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Work on an independent copy of the rows whose type is 'Movie'.
is_movie = df['type'] == 'Movie'
films = df[is_movie].copy()

# Drop the ' min' text from the duration strings and convert them to floats.
films['duration'] = films['duration'].str.replace(' min', '').astype(float)

# Histogram of movie durations (30 bins) with a KDE curve overlaid.
ax = sns.histplot(films['duration'], bins=30, kde=True)
ax.set_title('How long are the Movies on Netflix?')
ax.set_xlabel('Duration(mins)')
ax.set_ylabel('Movie count')

# Save the figure to disk before showing it.
ax.figure.savefig('netflix_movie_duration.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Mean of the numeric movie durations, rounded to two decimals.
average_duration = films['duration'].mean().round(2)
print('Average Movie Duration on Netflix: ', average_duration)