## **Import Libraries**

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [2]:
df = pd.read_csv("/kaggle/input/netflix-data-cleaning-analysis-and-visualization/netflix1.csv")
df.head()

Unnamed: 0,show_id,type,title,director,country,date_added,release_year,rating,duration,listed_in
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,United States,9/25/2021,2020,PG-13,90 min,Documentaries
1,s3,TV Show,Ganglands,Julien Leclercq,France,9/24/2021,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act..."
2,s6,TV Show,Midnight Mass,Mike Flanagan,United States,9/24/2021,2021,TV-MA,1 Season,"TV Dramas, TV Horror, TV Mysteries"
3,s14,Movie,Confessions of an Invisible Girl,Bruno Garotti,Brazil,9/22/2021,2021,TV-PG,91 min,"Children & Family Movies, Comedies"
4,s8,Movie,Sankofa,Haile Gerima,United States,9/24/2021,1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies"


In [3]:
df.tail()

Unnamed: 0,show_id,type,title,director,country,date_added,release_year,rating,duration,listed_in
8785,s8797,TV Show,Yunus Emre,Not Given,Turkey,1/17/2017,2016,TV-PG,2 Seasons,"International TV Shows, TV Dramas"
8786,s8798,TV Show,Zak Storm,Not Given,United States,9/13/2018,2016,TV-Y7,3 Seasons,Kids' TV
8787,s8801,TV Show,Zindagi Gulzar Hai,Not Given,Pakistan,12/15/2016,2012,TV-PG,1 Season,"International TV Shows, Romantic TV Shows, TV ..."
8788,s8784,TV Show,Yoko,Not Given,Pakistan,6/23/2018,2016,TV-Y,1 Season,Kids' TV
8789,s8786,TV Show,YOM,Not Given,Pakistan,6/7/2018,2016,TV-Y7,1 Season,Kids' TV


### Show_id doesn't indicate anything so we can drop this column as a part of data cleaning

In [4]:
df.drop(columns=['show_id'] , inplace=True)
df.head()

Unnamed: 0,type,title,director,country,date_added,release_year,rating,duration,listed_in
0,Movie,Dick Johnson Is Dead,Kirsten Johnson,United States,9/25/2021,2020,PG-13,90 min,Documentaries
1,TV Show,Ganglands,Julien Leclercq,France,9/24/2021,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act..."
2,TV Show,Midnight Mass,Mike Flanagan,United States,9/24/2021,2021,TV-MA,1 Season,"TV Dramas, TV Horror, TV Mysteries"
3,Movie,Confessions of an Invisible Girl,Bruno Garotti,Brazil,9/22/2021,2021,TV-PG,91 min,"Children & Family Movies, Comedies"
4,Movie,Sankofa,Haile Gerima,United States,9/24/2021,1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies"


## **EDA**

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8790 entries, 0 to 8789
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   type          8790 non-null   object
 1   title         8790 non-null   object
 2   director      8790 non-null   object
 3   country       8790 non-null   object
 4   date_added    8790 non-null   object
 5   release_year  8790 non-null   int64 
 6   rating        8790 non-null   object
 7   duration      8790 non-null   object
 8   listed_in     8790 non-null   object
dtypes: int64(1), object(8)
memory usage: 618.2+ KB


In [6]:
df.describe()

Unnamed: 0,release_year
count,8790.0
mean,2014.183163
std,8.825466
min,1925.0
25%,2013.0
50%,2017.0
75%,2019.0
max,2021.0


In [7]:
df[df['release_year'] == 1925]

Unnamed: 0,type,title,director,country,date_added,release_year,rating,duration,listed_in
421,TV Show,Pioneers: First Women Filmmakers*,Not Given,Pakistan,12/30/2018,1925,TV-14,1 Season,TV Shows


### Oldest TV show in the netflix dataset

In [8]:
df['type'].value_counts()

type
Movie      6126
TV Show    2664
Name: count, dtype: int64

### Most of the Netflix data are movies and that make sense as casting a movie take short duration while series or a TV show takes a long time to be casted

In [9]:
types_of_shows = df['type'].value_counts()
fig = px.bar(types_of_shows,color = types_of_shows.index,color_discrete_sequence=["#8a12f9", "#FFFFFF"],text_auto= True , template='plotly_dark')
fig.show()

## First the director column

In [10]:
df['director'].value_counts()

director
Not Given                         2588
Rajiv Chilaka                       20
Alastair Fothergill                 18
Raúl Campos, Jan Suter              18
Suhas Kadav                         16
                                  ... 
Matt D'Avella                        1
Parthiban                            1
Scott McAboy                         1
Raymie Muzquiz, Stu Livingston       1
Mozez Singh                          1
Name: count, Length: 4528, dtype: int64

### The is missing values in the director column (Not Given)

In [11]:
directors = df['director'].tolist()
for i in directors:
    print(i)

Kirsten Johnson
Julien Leclercq
Mike Flanagan
Bruno Garotti
Haile Gerima
Andy Devonshire
Theodore Melfi
Suhas Kadav
Christian Schwochow
Suhas Kadav
Suhas Kadav
Suhas Kadav
Not Given
Krysia Plonka, Kristian Mercado
Krysia Plonka, Kristian Mercado
Krysia Plonka, Kristian Mercado
Krysia Plonka, Kristian Mercado
Not Given
Not Given
Mark Thornton, Todd Kauffman
Mark Thornton, Todd Kauffman
Mark Thornton, Todd Kauffman
Mark Thornton, Todd Kauffman
Robert Cullen, José Luis Ucha
Kongkiat Komesiri
Pedro de Echave García, Pablo Azorín Williams
Anirban Majumder
Olivier Megaton
Alex Woo, Stanley Moore
S. Shankar
Dennis Dugan
Scott Stewart
Robert Luketic
Not Given
Not Given
Not Given
Not Given
Not Given
Not Given
Not Given
Not Given
Not Given
Not Given
Not Given
Not Given
Not Given
Not Given
Not Given
Not Given
Not Given
Not Given
Rajiv Menon
Adam Salky
K.S. Ravikumar
Ashwiny Iyer Tiwari, Abhishek Chaubey, Saket Chaudhary
Daniel Sandu
Cédric Jimenez
George Nolfi
Bunmi Ajakaiye
Ben Simms
Steven Spie

In [12]:
all_directors = []
for i in directors:
    if "," in i:
        sub_director = i.split(",")
        for j in sub_director:
            all_directors.append(j.strip())
    else:
        all_directors.append(i)

In [13]:
all_directors

['Kirsten Johnson',
 'Julien Leclercq',
 'Mike Flanagan',
 'Bruno Garotti',
 'Haile Gerima',
 'Andy Devonshire',
 'Theodore Melfi',
 'Suhas Kadav',
 'Christian Schwochow',
 'Suhas Kadav',
 'Suhas Kadav',
 'Suhas Kadav',
 'Not Given',
 'Krysia Plonka',
 'Kristian Mercado',
 'Krysia Plonka',
 'Kristian Mercado',
 'Krysia Plonka',
 'Kristian Mercado',
 'Krysia Plonka',
 'Kristian Mercado',
 'Not Given',
 'Not Given',
 'Mark Thornton',
 'Todd Kauffman',
 'Mark Thornton',
 'Todd Kauffman',
 'Mark Thornton',
 'Todd Kauffman',
 'Mark Thornton',
 'Todd Kauffman',
 'Robert Cullen',
 'José Luis Ucha',
 'Kongkiat Komesiri',
 'Pedro de Echave García',
 'Pablo Azorín Williams',
 'Anirban Majumder',
 'Olivier Megaton',
 'Alex Woo',
 'Stanley Moore',
 'S. Shankar',
 'Dennis Dugan',
 'Scott Stewart',
 'Robert Luketic',
 'Not Given',
 'Not Given',
 'Not Given',
 'Not Given',
 'Not Given',
 'Not Given',
 'Not Given',
 'Not Given',
 'Not Given',
 'Not Given',
 'Not Given',
 'Not Given',
 'Not Given',
 'N

In [14]:
ds = pd.Series(data= all_directors)
ds

0       Kirsten Johnson
1       Julien Leclercq
2         Mike Flanagan
3         Bruno Garotti
4          Haile Gerima
             ...       
9605          Not Given
9606          Not Given
9607          Not Given
9608          Not Given
9609          Not Given
Length: 9610, dtype: object

In [15]:
len(ds)

9610

### This means that there is duplicate directors which act as co-director in another movie

In [16]:
ds.value_counts()

Not Given              2588
Rajiv Chilaka            23
Jan Suter                21
Raúl Campos              19
Alastair Fothergill      19
                       ... 
Matt D'Avella             1
Parthiban                 1
Scott McAboy              1
Raymie Muzquiz            1
Mozez Singh               1
Name: count, Length: 4992, dtype: int64

## Now the country column

In [17]:
df['country'].value_counts()

country
United States     3240
India             1057
United Kingdom     638
Pakistan           421
Not Given          287
                  ... 
Iran                 1
West Germany         1
Greece               1
Zimbabwe             1
Soviet Union         1
Name: count, Length: 86, dtype: int64

### The united states made the highest number of movies, There is also missing values in the country column

### The date added is not very important as it just indicate the company when it put the movie in the platform

## release year column

In [18]:
df['release_year'].value_counts()

release_year
2018    1146
2017    1030
2019    1030
2020     953
2016     901
        ... 
1966       1
1959       1
1925       1
1947       1
1961       1
Name: count, Length: 74, dtype: int64

### Of course the released movies and TV shows nowadays are very big in the old days as the technology and the population increased a lot

In [19]:
fig = px.area(df['release_year'].value_counts(),color_discrete_sequence=["#a8f53d"],template='plotly_dark')
fig.show()

## rating & duration columns

In [20]:
df['rating'].value_counts()

rating
TV-MA       3205
TV-14       2157
TV-PG        861
R            799
PG-13        490
TV-Y7        333
TV-Y         306
PG           287
TV-G         220
NR            79
G             41
TV-Y7-FV       6
NC-17          3
UR             3
Name: count, dtype: int64

In [21]:
fig = px.bar(data_frame= df['rating'].value_counts(),
             template='plotly_dark',
             x= df['rating'].value_counts(),
             y= df['rating'].value_counts().index,
             orientation='h',
             title = 'Most ratings',
             labels = {"x":"frequency"
                       }
            )
fig.show()

In [22]:
df['duration'].value_counts()

duration
1 Season      1791
2 Seasons      421
3 Seasons      198
90 min         152
97 min         146
              ... 
5 min            1
16 min           1
186 min          1
193 min          1
11 Seasons       1
Name: count, Length: 220, dtype: int64

## We change the Not Given value to Nan

In [23]:
df.replace('Not Given', np.NaN , inplace= True)
df.head()

Unnamed: 0,type,title,director,country,date_added,release_year,rating,duration,listed_in
0,Movie,Dick Johnson Is Dead,Kirsten Johnson,United States,9/25/2021,2020,PG-13,90 min,Documentaries
1,TV Show,Ganglands,Julien Leclercq,France,9/24/2021,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act..."
2,TV Show,Midnight Mass,Mike Flanagan,United States,9/24/2021,2021,TV-MA,1 Season,"TV Dramas, TV Horror, TV Mysteries"
3,Movie,Confessions of an Invisible Girl,Bruno Garotti,Brazil,9/22/2021,2021,TV-PG,91 min,"Children & Family Movies, Comedies"
4,Movie,Sankofa,Haile Gerima,United States,9/24/2021,1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies"


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8790 entries, 0 to 8789
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   type          8790 non-null   object
 1   title         8790 non-null   object
 2   director      6202 non-null   object
 3   country       8503 non-null   object
 4   date_added    8790 non-null   object
 5   release_year  8790 non-null   int64 
 6   rating        8790 non-null   object
 7   duration      8790 non-null   object
 8   listed_in     8790 non-null   object
dtypes: int64(1), object(8)
memory usage: 618.2+ KB


## drop missing values

In [25]:
df.dropna(inplace= True)
df.head()

Unnamed: 0,type,title,director,country,date_added,release_year,rating,duration,listed_in
0,Movie,Dick Johnson Is Dead,Kirsten Johnson,United States,9/25/2021,2020,PG-13,90 min,Documentaries
1,TV Show,Ganglands,Julien Leclercq,France,9/24/2021,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act..."
2,TV Show,Midnight Mass,Mike Flanagan,United States,9/24/2021,2021,TV-MA,1 Season,"TV Dramas, TV Horror, TV Mysteries"
3,Movie,Confessions of an Invisible Girl,Bruno Garotti,Brazil,9/22/2021,2021,TV-PG,91 min,"Children & Family Movies, Comedies"
4,Movie,Sankofa,Haile Gerima,United States,9/24/2021,1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies"


## Top 5 directors

In [26]:
fig = px.bar(data_frame=df['director'].value_counts()[0:5],
             x= df['director'].value_counts()[0:5],
             y= df['director'].value_counts()[0:5].index,
             color=df['director'].value_counts()[0:5].index,
             text_auto=True,
             orientation= "h",
             template = 'plotly_dark'
            )
fig.show()