## Importing libraries

In [4]:
import pandas as pd

## Datasets

In [5]:
# Load the raw catalogue from disk; `netflix_` is kept as the untouched original
netflix_ = pd.read_csv('Netflix.csv')

# Work on an independent (deep) copy so the cleaning steps never alter `netflix_`
netflix = netflix_.copy(deep=True)

In [6]:
# Exploring the dataset: column names, dtypes and non-null counts
# (the non-null counts reveal which columns contain missing values)
netflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7787 entries, 0 to 7786
Data columns (total 12 columns):
show_id         7787 non-null object
type            7787 non-null object
title           7787 non-null object
director        5398 non-null object
cast            7069 non-null object
country         7280 non-null object
date_added      7777 non-null object
release_year    7787 non-null int64
rating          7780 non-null object
duration        7787 non-null object
listed_in       7787 non-null object
description     7787 non-null object
dtypes: int64(1), object(11)
memory usage: 730.2+ KB


In [7]:
# Dropping the columns that are not of interest for this analysis
netflix = netflix.drop(['description', 'director'], axis='columns')

In [8]:
# Preview the first five rows of the working frame
netflix.head()

Unnamed: 0,show_id,type,title,cast,country,date_added,release_year,rating,duration,listed_in
0,s1,TV Show,3%,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &..."
1,s2,Movie,7:19,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",2016,TV-MA,93 min,"Dramas, International Movies"
2,s3,Movie,23:59,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",2011,R,78 min,"Horror Movies, International Movies"
3,s4,Movie,9,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"November 16, 2017",2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi..."
4,s5,Movie,21,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,"January 1, 2020",2008,PG-13,123 min,Dramas


In [9]:
# Parse the date_added strings into datetimes; values that cannot be
# parsed (and missing values) become NaT because of errors='coerce'
added_on = pd.to_datetime(netflix['date_added'], errors = 'coerce')

# Keep only the year in date_added (NaT rows end up as NaN)
netflix['date_added'] = added_on.dt.year

In [10]:
# Dropping every row that has a NaN in at least one column
netflix = netflix.dropna(axis = 'index', how = 'any')

In [11]:
# Checking that there are no fully duplicated rows:
# build a boolean mask of repeated rows, then show the matching rows
# (an empty DataFrame means there is nothing to remove)
is_duplicate = netflix.duplicated()
duplicatedRows = netflix.loc[is_duplicate]
print(duplicatedRows)

Empty DataFrame
Columns: [show_id, type, title, cast, country, date_added, release_year, rating, duration, listed_in]
Index: []


In [12]:
# Converting the date_added year to a 64-bit integer
# (astype raises on failure by default, e.g. if a NaN were still present)
netflix['date_added'] = netflix['date_added'].astype('int64')

In [13]:
# Preview the first five rows; date_added now holds the year as an integer
netflix.head()

Unnamed: 0,show_id,type,title,cast,country,date_added,release_year,rating,duration,listed_in
0,s1,TV Show,3%,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,2020,2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &..."
1,s2,Movie,7:19,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,2016,2016,TV-MA,93 min,"Dramas, International Movies"
2,s3,Movie,23:59,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,2018,2011,R,78 min,"Horror Movies, International Movies"
3,s4,Movie,9,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,2017,2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi..."
4,s5,Movie,21,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,2020,2008,PG-13,123 min,Dramas


In [14]:
# Re-check dtypes and non-null counts after the cleaning steps
netflix.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6643 entries, 0 to 7785
Data columns (total 10 columns):
show_id         6643 non-null object
type            6643 non-null object
title           6643 non-null object
cast            6643 non-null object
country         6643 non-null object
date_added      6643 non-null int64
release_year    6643 non-null int64
rating          6643 non-null object
duration        6643 non-null object
listed_in       6643 non-null object
dtypes: int64(2), object(8)
memory usage: 570.9+ KB


In [15]:
# "Time" = number of years between a title's release and its addition to Netflix
# (year added minus release year, both stored as integer years)
netflix["Time"] = netflix["date_added"].sub(netflix["release_year"])

In [16]:
# Reassigning the column names: trim surrounding whitespace first, then
# capitalise. The order matters: str.capitalize only upper-cases the very
# first character, so a label with leading whitespace would stay lower-case
# if it were capitalised before being stripped.
netflix.columns = netflix.columns.str.strip().str.capitalize()
netflix.head()

Unnamed: 0,Show_id,Type,Title,Cast,Country,Date_added,Release_year,Rating,Duration,Listed_in,Time
0,s1,TV Show,3%,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,2020,2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",0
1,s2,Movie,7:19,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,2016,2016,TV-MA,93 min,"Dramas, International Movies",0
2,s3,Movie,23:59,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,2018,2011,R,78 min,"Horror Movies, International Movies",7
3,s4,Movie,9,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,2017,2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...",8
4,s5,Movie,21,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,2020,2008,PG-13,123 min,Dramas,12


In [17]:
# Save the cleaned frame. The row index is written as well (index=True is the
# default), so the file starts with an unnamed column holding the original
# row labels.
netflix.to_csv('Clean_Netflix.csv', index = True)