In [None]:
import pandas as pd

# Load the dataset
netflix = pd.read_csv('netflix_titles.csv')

In [None]:
netflix.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [None]:
print("The number of rows and columns from the data is:", netflix.shape)

The number of rows and columns from the data is: (8807, 12)


In [None]:
netflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


## Step 1: Import & Explore the Dataset

**Analysis Questions:**

1. What are the column names in the dataset?
2. How many unique values does the `type` column have?
3. What percentage of the total content are "Movies"?

In [None]:
print(netflix.columns)

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')


In [None]:
print("The unique types of content inside 'type' column is:", netflix['type'].nunique())
print("The percentage of the total content are:", round(netflix['type'].value_counts(normalize=True)['Movie']* 100,2))

The unique types of content inside 'type' column is: 2
The percentage of the total content are: 69.62


## Step 2: Check for Missing Data

**Analysis Questions:**

4. Which three columns have the most missing values?
5. Are there more missing values in `director` or in `cast`?

In [None]:
print(netflix.isnull().sum().sort_values(ascending=False))
print("\n The three columns that have the most missing values are: Director, Cast, and Country.")
print(" There is more missing values in director column.")

director        2634
country          831
cast             825
date_added        10
rating             4
duration           3
show_id            0
type               0
title              0
release_year       0
listed_in          0
description        0
dtype: int64

 The three columns that have the most missing values are: Director, Cast, and Country.
 There is more missing values in director column.


## Step 3: Clean the Dataset

**Analysis Questions:**

6. Replace missing values in `country`, `director`, and `cast` with appropriate default values.
7. Drop rows where `date_added` is missing. How many rows remain after cleaning?
8. After cleaning, are there still any missing values in the dataset?

In [None]:
netflix[['country', 'director', 'cast']] = netflix[['country', 'director', 'cast']].fillna('Not Specified')

In [None]:
netflix[['rating','duration']] = netflix[['rating','duration']].fillna(netflix[['rating','duration']].mode().iloc[0])

In [None]:
netflix['date_added'] = pd.to_datetime(netflix['date_added'], errors='coerce')
netflix = netflix.dropna(subset=['date_added'])
print(netflix.isnull().sum())

show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64


In [None]:
print("The number of rows after cleaning is:", netflix.shape[0])

The number of rows after cleaning is: 8709


## Step 4: Filter the Data with `.loc[]`

**Analysis Questions:**

9. What are the top 5 "TV Shows" released in "Canada"?
10. How many "Movies" have a rating of "PG-13"?
11. List all titles added in 2019 that are "TV Shows" from Brazil.

In [None]:
netflix_canada = netflix[(netflix['country'] == 'Canada') & (netflix['type'] == 'TV Show')].head()
print(netflix_canada)

     show_id     type                   title       director  \
510     s511  TV Show                 Between  Not Specified   
521     s522  TV Show       Kim's Convenience  Not Specified   
535     s536  TV Show  Some Assembly Required  Not Specified   
544     s545  TV Show            Workin' Moms  Not Specified   
1065   s1066  TV Show                 Slasher  Not Specified   

                                                   cast country date_added  \
510   Jennette McCurdy, Jesse Carere, Ryan Allen, Ju...  Canada 2021-07-06   
521   Paul Sun-Hyung Lee, Jean Yoon, Andrea Bang, Si...  Canada 2021-07-06   
535   Kolton Stewart, Harrison Houde, Charlie Storwi...  Canada 2021-07-06   
544   Catherine Reitman, Dani Kind, Juno Rinaldi, Je...  Canada 2021-07-06   
1065  Katie McGrath, Brandon Jay McLaren, Steve Byer...  Canada 2021-04-14   

      release_year rating   duration  \
510           2016  TV-14  2 Seasons   
521           2021  TV-MA  5 Seasons   
535           2015   TV-Y 

In [None]:
netflix_movies = netflix[(netflix['rating'] == 'PG-13') & (netflix['type'] == 'Movie')]
print("The amount of movies that have PG-13 rating is:", len(netflix_movies))

The amount of movies that have PG-13 rating is: 490


In [None]:
netflix_brazil = netflix[(netflix['country'] == 'Brazil') & (netflix['type'] == 'TV Show') & (netflix['date_added'].dt.year == 2019)]
print(netflix_brazil[['title', 'date_added']])

                 title date_added
3174    The Chosen One 2019-12-06
3220   Lugar de Mulher 2019-11-28
3243  Nobody's Looking 2019-11-22
3370       Brotherhood 2019-10-25
3604          Sintonia 2019-08-09
3838     The Mechanism 2019-05-10
3902         Samantha! 2019-04-19
4090                Z4 2019-02-22


## Step 5: Value Counts Analysis

**Analysis Questions:**

12. What are the top 5 countries by number of titles?
13. Which 5 ratings are most common in the dataset?
14. What are the top 5 genres in the `listed_in` column?
15. How many titles fall under the genre "Documentaries"?

In [None]:
netflix['country'].value_counts().head()

Unnamed: 0_level_0,count
country,Unnamed: 1_level_1
United States,2778
India,971
Not Specified,827
United Kingdom,403
Japan,241


In [None]:
netflix['rating'].value_counts().head()

Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
TV-MA,3187
TV-14,2133
TV-PG,838
R,799
PG-13,490


In [None]:
netflix['listed_in'].value_counts().head()

Unnamed: 0_level_0,count
listed_in,Unnamed: 1_level_1
"Dramas, International Movies",362
Documentaries,359
Stand-Up Comedy,334
"Comedies, Dramas, International Movies",274
"Dramas, Independent Movies, International Movies",252


In [None]:
netflix_documentary = netflix[netflix['listed_in'].str.contains('Documentaries', na=False)].shape[0] # str.contains checks if the string "Documentaries" is present in each row of the listed_in column.
print("The number of titles under the genre 'Documentaries' is:", netflix_documentary)

The number of titles under the genre 'Documentaries' is: 869


## Step 6: GroupBy Analysis

**Analysis Questions:**

16. How many titles were released each year between 2010 and 2020?
17. Which year had the highest number of releases?
18. What is the total number of titles per rating?
19. Which country has the most "TV Shows"? (use groupby on `type` and `country`)

In [None]:
netflix_year = netflix[(netflix['release_year'] >= 2010) & (netflix['release_year'] <= 2020)]
print(netflix_year.groupby('release_year').size())

release_year
2010     190
2011     184
2012     229
2013     282
2014     343
2015     549
2016     878
2017    1016
2018    1140
2019    1030
2020     953
dtype: int64


## Step 7: Sorting and Selecting Rows with `.iloc[]`

**Analysis Questions:**

20. Show the 10 most recent titles based on `release_year`.
21. Which is the oldest title in the dataset, and in which year was it released?
22. Use `.iloc[]` to select the 50th row — what is the title and its type?

In [None]:
netflix_recent = netflix.sort_values(by='release_year', ascending=False).head(10)
print(netflix_recent.iloc[:,[2,7]])

                                                title  release_year
1                                       Blood & Water          2021
8437                           The Netflix Afterparty          2021
31                                 Chicago Party Aunt          2021
30                                    Ankahi Kahaniya          2021
25                               Love on the Spectrum          2021
23       Go! Go! Cory Carson: Chrissy Takes the Wheel          2021
20    Monsters Inside: The 24 Faces of Billy Milligan          2021
19                                             Jaguar          2021
1551                                            Hilda          2021
55                                          Nailed It          2021


In [None]:
netflix_oldest = netflix.sort_values(by='release_year', ascending=True).head(1)
print(netflix_oldest.iloc[:,[2,7]])
print("\n The oldest title in the dataset is", netflix_oldest['title'].values[0], "and it was released in the year of", netflix_oldest['release_year'].values[0])

                                  title  release_year
4250  Pioneers: First Women Filmmakers*          1925

 The oldest title in the dataset is Pioneers: First Women Filmmakers* and it was released in the year of 1925


In [None]:
netflix_50th = netflix.iloc[49]
print(netflix_50th[['title', 'type']])

title    Castle and Castle
type               TV Show
Name: 49, dtype: object


## Step 8: Summary of Insights

**Summary Prompts:**

- What is the overall distribution between movies and shows?
- Are there content production trends over the years?
- Which countries and genres dominate Netflix’s catalog?
- Are there any surprising findings in ratings or release patterns?

In [None]:
netflix['type'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
type,Unnamed: 1_level_1
Movie,0.703984
TV Show,0.296016


In [None]:
yearly_counts = netflix.groupby('release_year').size()
print(yearly_counts.sort_values())

release_year
1925       1
1947       1
1961       1
1959       1
1966       1
        ... 
2016     878
2020     953
2017    1016
2019    1030
2018    1140
Length: 74, dtype: int64


In [None]:
trend_by_type = netflix.groupby(['release_year', 'type']).size().unstack() # .unstack() turns part of the row index into columns
print(trend_by_type.sort_index())

type          Movie  TV Show
release_year                
1925            NaN      1.0
1942            2.0      NaN
1943            3.0      NaN
1944            3.0      NaN
1945            3.0      1.0
...             ...      ...
2017          767.0    249.0
2018          767.0    373.0
2019          633.0    397.0
2020          517.0    436.0
2021          277.0    315.0

[74 rows x 2 columns]


In [None]:
top_countries = netflix['country'].value_counts().head(10)
print(top_countries)

country
United States     2778
India              971
Not Specified      827
United Kingdom     403
Japan              241
South Korea        195
Canada             173
Spain              141
France             122
Mexico             110
Name: count, dtype: int64


In [None]:
genre_counts = netflix['listed_in'].str.split(', ').explode().value_counts() # we use .explode() because the 'listed_in' column contains multiple genres separated by commas.
print("Most common genre on Netflix:")
print(genre_counts.head(10))

Most common genre on Netflix:
listed_in
International Movies        2752
Dramas                      2427
Comedies                    1674
International TV Shows      1328
Documentaries                869
Action & Adventure           859
Independent Movies           756
TV Dramas                    739
Children & Family Movies     641
Romantic Movies              616
Name: count, dtype: int64


In [None]:
rating_by_year = netflix.groupby(['release_year', 'rating']).size().unstack(fill_value=0)
print("\nRatings per year (last few years):")
print(rating_by_year.tail(5))


Ratings per year (last few years):
rating        66 min  74 min  84 min  G  NC-17  NR  PG  PG-13   R  TV-14  \
release_year                                                               
2017               0       1       0  1      0   2  15     32  73    247   
2018               0       0       0  2      1   1  31     30  52    266   
2019               0       0       0  1      0   0  12     19  39    252   
2020               0       0       0  1      0   0  15     21  48    174   
2021               0       0       0  0      0   0  11     14  21    151   

rating        TV-G  TV-MA  TV-PG  TV-Y  TV-Y7  TV-Y7-FV  UR  
release_year                                                 
2017            24    447    107    30     37         0   0  
2018            26    548    102    40     40         1   0  
2019            23    500     98    50     36         0   0  
2020            45    469     80    59     41         0   0  
2021            21    270     45    26     33         0   0

In [None]:
netflix.shape

(8709, 12)