### Optimizing A Data Set for Memory Usage

In [1]:
import pandas as pd

In [2]:
employees_data = '/Users/ypushiev/Learning/PANDAS IN ACTION/Chapter 5 Dataframe filtering/Data/employees.csv'

In [3]:
pd.read_csv(employees_data).isna().sum()

First Name     68
Gender        147
Start Date      2
Salary          2
Mgmt           68
Team           44
dtype: int64

In [4]:
df_employees = pd.read_csv(employees_data, parse_dates = ['Start Date'])

  df_employees = pd.read_csv(employees_data, parse_dates = ['Start Date'])


In [5]:
df_employees['Start Date'] = pd.to_datetime(df_employees['Start Date'], format='%Y-%m-%d')

In [6]:
df_employees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   First Name  933 non-null    object        
 1   Gender      854 non-null    object        
 2   Start Date  999 non-null    datetime64[ns]
 3   Salary      999 non-null    float64       
 4   Mgmt        933 non-null    object        
 5   Team        957 non-null    object        
dtypes: datetime64[ns](1), float64(1), object(4)
memory usage: 47.1+ KB


**memory usage: 47.1+ KB**

In [7]:
df_employees.head()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,,True,Marketing
1,Thomas,Male,1996-03-31,61933.0,True,
2,Maria,Female,NaT,130590.0,False,Finance
3,Jerry,,2005-03-04,138705.0,True,Finance
4,Larry,Male,1998-01-24,101004.0,True,IT


**Convert datetime into the object to cut the time from the data**

In [8]:
df_employees['Start Date'] = df_employees['Start Date'].dt.strftime('%Y-%m-%d')

In [9]:
df_employees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   First Name  933 non-null    object 
 1   Gender      854 non-null    object 
 2   Start Date  999 non-null    object 
 3   Salary      999 non-null    float64
 4   Mgmt        933 non-null    object 
 5   Team        957 non-null    object 
dtypes: float64(1), object(5)
memory usage: 47.1+ KB


**memory usage: 47.1+ KB**

### Converting Data Types with the astype Method

In [10]:
df_employees['Mgmt'].astype('bool').tail()

996     False
997     False
998     False
999      True
1000     True
Name: Mgmt, dtype: bool

**Convert object type into the bool type and see results in the size**

In [11]:
df_employees['Mgmt'] = df_employees['Mgmt'].astype('bool')

In [12]:
df_employees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   First Name  933 non-null    object 
 1   Gender      854 non-null    object 
 2   Start Date  999 non-null    object 
 3   Salary      999 non-null    float64
 4   Mgmt        1001 non-null   bool   
 5   Team        957 non-null    object 
dtypes: bool(1), float64(1), object(4)
memory usage: 40.2+ KB


**memory usage: 40.2+ KB**

### Convert into the integer type

**The Salary column contains NaN values**

In [13]:
df_employees['Salary'].isna().sum()

2

**Replace NaN values by 0**

In [14]:
df_employees['Salary']= df_employees['Salary'].fillna(0)

In [15]:
df_employees['Salary']=df_employees['Salary'].astype('int')

In [16]:
df_employees['Salary'].head()

0         0
1     61933
2    130590
3    138705
4    101004
Name: Salary, dtype: int64

In [17]:
df_employees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   First Name  933 non-null    object
 1   Gender      854 non-null    object
 2   Start Date  999 non-null    object
 3   Salary      1001 non-null   int64 
 4   Mgmt        1001 non-null   bool  
 5   Team        957 non-null    object
dtypes: bool(1), int64(1), object(4)
memory usage: 40.2+ KB


**The memory might be reduced more using INT32**

**Check max value**

In [18]:
df_employees['Salary'].max()

149908

In [19]:
df_employees['Salary']=df_employees['Salary'].astype('int32')

In [20]:
df_employees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   First Name  933 non-null    object
 1   Gender      854 non-null    object
 2   Start Date  999 non-null    object
 3   Salary      1001 non-null   int32 
 4   Mgmt        1001 non-null   bool  
 5   Team        957 non-null    object
dtypes: bool(1), int32(1), object(4)
memory usage: 36.3+ KB


In [21]:
df_employees['Salary'].max()

149908

In [22]:
df_employees['Salary'].head()

0         0
1     61933
2    130590
3    138705
4    101004
Name: Salary, dtype: int32

**memory usage: 36.3+ KB**

### Convert into categorical type

**Check unique values in the Dataframe** 

In [23]:
df_employees.nunique()

First Name    200
Gender          2
Start Date    971
Salary        995
Mgmt            2
Team           10
dtype: int64

**The Gender column contains only 2 unique values** 

In [24]:
df_employees['Gender'].astype('category').head()

0      Male
1      Male
2    Female
3       NaN
4      Male
Name: Gender, dtype: category
Categories (2, object): ['Female', 'Male']

In [25]:
df_employees['Gender'] = df_employees['Gender'].astype('category')

**Convert Team column type**

In [26]:
df_employees['Team'] = df_employees['Team'].astype('category')

In [27]:
df_employees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   First Name  933 non-null    object  
 1   Gender      854 non-null    category
 2   Start Date  999 non-null    object  
 3   Salary      1001 non-null   int32   
 4   Mgmt        1001 non-null   bool    
 5   Team        957 non-null    category
dtypes: bool(1), category(2), int32(1), object(2)
memory usage: 23.1+ KB


**memory usage: 23.1+ KB**

**The Dataframe size has been decreased from 47.1+ KB to 23.1+ KB approximately 50%**

### Filtering by a Single Condition

In [28]:
"Maria" == "Maria"

True

In [29]:
"Maria" == "Taylor"

False

**Check the column First Name**

In [30]:
df_employees["First Name"] == "Maria"

0       False
1       False
2        True
3       False
4       False
        ...  
996     False
997     False
998     False
999     False
1000    False
Name: First Name, Length: 1001, dtype: bool

**Select rows where "First Name" == "Maria"**

In [31]:
df_employees[df_employees["First Name"] == "Maria"]

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
2,Maria,Female,,130590,False,Finance
198,Maria,Female,1990-12-27,36067,True,Product
815,Maria,,1986-01-18,106562,False,HR
844,Maria,,1985-06-19,148857,False,Legal
936,Maria,Female,2003-03-14,96250,False,Business Dev
984,Maria,Female,2011-10-15,43455,False,Engineering


**Create a filter for Dataframe**

In [32]:
marias = df_employees["First Name"] == "Maria"
df_employees[marias]

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
2,Maria,Female,,130590,False,Finance
198,Maria,Female,1990-12-27,36067,True,Product
815,Maria,,1986-01-18,106562,False,HR
844,Maria,,1985-06-19,148857,False,Legal
936,Maria,Female,2003-03-14,96250,False,Business Dev
984,Maria,Female,2011-10-15,43455,False,Engineering


**Create a filter and find Top 5 Salary > 100000**

In [33]:
high_earners = df_employees["Salary"] > 100000
df_employees[high_earners].sort_values(by="Salary", ascending=False).head()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
644,Katherine,Female,1996-08-13,149908,False,Finance
429,Rose,Female,2015-05-28,149903,False,HR
828,Cynthia,Female,2006-07-12,149684,False,Product
186,,Female,2005-02-23,149654,True,Sales
160,Kathy,Female,2000-03-18,149563,True,Finance


### Filtering by Multiple Conditions

#### Condition and - &

**First condition**

In [34]:
is_female = df_employees['Gender'] == 'Female'

**Second condiotion**

In [35]:
in_biz_dev = df_employees['Team'] == "Business Dev"

**Find only women from the Business Development team**

In [36]:
df_employees[is_female & in_biz_dev].head()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
9,Frances,Female,2002-08-08,139852,True,Business Dev
33,Jean,Female,1993-12-18,119082,False,Business Dev
36,Rachel,Female,2009-02-16,142032,False,Business Dev
38,Stephanie,Female,1986-09-13,36844,True,Business Dev
61,Denise,Female,2001-11-06,106862,False,Business Dev


In [37]:
is_manager = df_employees["Mgmt"]

**Add more conditionals**

In [38]:
df_employees[is_female & in_biz_dev & is_manager].head()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
9,Frances,Female,2002-08-08,139852,True,Business Dev
38,Stephanie,Female,1986-09-13,36844,True,Business Dev
66,Nancy,Female,2012-12-15,125250,True,Business Dev
92,Linda,Female,2000-05-25,119009,True,Business Dev
111,Bonnie,Female,1999-12-17,42153,True,Business Dev


#### Condiotional OR - |

**Find employees with a salary lower than 40000 or a start date later than 2015-01-01**

In [39]:
earning_below_40k = df_employees["Salary"] < 40000
started_after_2015 = df_employees["Start Date"] > "2015-01-01"

In [40]:
df_employees[earning_below_40k | started_after_2015].tail()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
958,Gloria,Female,1987-10-24,39833,False,Engineering
964,Bruce,Male,1980-05-07,35802,True,Sales
967,Thomas,Male,2016-03-12,105681,False,Engineering
989,Justin,,1991-02-10,38344,False,Legal
1000,,,,0,True,


### Logical rejection (~)

#### Two same results with different ways of solving

**Salary lower than 100000 - strict conditional**

In [41]:
df_employees[df_employees["Salary"] < 100000].head()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,0,True,Marketing
1,Thomas,Male,1996-03-31,61933,True,
6,Ruby,Female,1987-08-17,65476,True,Product
7,,Female,2015-07-20,45906,True,Finance
8,Angela,Female,2005-11-22,95570,True,Engineering


**Using logical rejection**

In [42]:
df_employees[~(df_employees["Salary"] >= 100000)].head()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,0,True,Marketing
1,Thomas,Male,1996-03-31,61933,True,
6,Ruby,Female,1987-08-17,65476,True,Product
7,,Female,2015-07-20,45906,True,Finance
8,Angela,Female,2005-11-22,95570,True,Engineering


### Methods for boolean results

**Operation ==**

In [43]:
df_employees["Team"].eq("Marketing").head(1)

0    True
Name: Team, dtype: bool

**Operation !=**

In [44]:
df_employees["Team"].ne("Marketing").head(1)

0    False
Name: Team, dtype: bool

**Operation <**

In [45]:
df_employees["Salary"].lt(100000).head(1)

0    True
Name: Salary, dtype: bool

**Operation <=**

In [46]:
df_employees["Salary"].le(100000).head(1)

0    True
Name: Salary, dtype: bool

**Operation >**

In [47]:
df_employees["Salary"].gt(100000).head(1)

0    False
Name: Salary, dtype: bool

**Operation >=**

In [48]:
df_employees["Salary"].ge(100000).head(1)

0    False
Name: Salary, dtype: bool

### Filtering by Condition

#### The isin Method

In [49]:
all_star_teams = ["Sales", "Legal", "Marketing"]
is_in_all_star_teams = df_employees["Team"].isin(all_star_teams)
df_employees[is_in_all_star_teams].head()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,0,True,Marketing
5,Dennis,Male,1987-04-18,115163,False,Legal
11,Julie,Female,1997-10-26,102508,True,Legal
13,Gary,Male,2008-01-27,109831,False,Sales
20,Lois,,1995-04-22,64714,True,Legal


#### The between Method

In [50]:
between_80k_and_90k = df_employees["Salary"].between(80000, 90000)
df_employees[['First Name','Team','Salary']][between_80k_and_90k].sort_values(by="Salary").head()

Unnamed: 0,First Name,Team,Salary
821,,Distribution,80399
865,Karen,Legal,80633
360,Susan,Sales,80688
104,John,Marketing,80740
542,Amanda,Distribution,80803


**Using for datetime type**

In [51]:
eighties_folk = df_employees["Start Date"].between(
left = "1980-01-01",
right = "1990-01-01"
)
df_employees[eighties_folk].head()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
5,Dennis,Male,1987-04-18,115163,False,Legal
6,Ruby,Female,1987-08-17,65476,True,Product
10,Louise,Female,1980-08-12,63241,True,
12,Brandon,Male,1980-12-01,112807,True,HR
17,Shawn,Male,1986-12-07,111737,False,Product


**Using for string type**

In [52]:
name_starts_with_r = df_employees["First Name"].between("R", "S")
df_employees[name_starts_with_r].head()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
6,Ruby,Female,1987-08-17,65476,True,Product
36,Rachel,Female,2009-02-16,142032,False,Business Dev
45,Roger,Male,1980-04-17,88010,True,Sales
67,Rachel,Female,1999-08-16,51178,True,Finance
78,Robin,Female,1983-06-04,114797,True,Sales


### The isna and notna Methods

**isna method finds all NaN values from specific column**

In [53]:
no_team = df_employees["Team"].isna()
df_employees[['First Name','Gender','Team']][no_team].head()

Unnamed: 0,First Name,Gender,Team
1,Thomas,Male,
10,Louise,Female,
23,,Male,
32,,Male,
91,James,,


**notna method returns only True values in the "First Name" column**

In [54]:
has_name = df_employees["First Name"].notna()
df_employees[['First Name','Gender','Team']][has_name].head()

Unnamed: 0,First Name,Gender,Team
0,Douglas,Male,Marketing
1,Thomas,Male,
2,Maria,Female,Finance
3,Jerry,,Finance
4,Larry,Male,IT


**Count only True values**

In [55]:
df_employees.notna().sum().sort_values(ascending=False)

Salary        1001
Mgmt          1001
Start Date     999
Team           957
First Name     933
Gender         854
dtype: int64

**Count only NaN values**

In [56]:
df_employees.isna().sum().sort_values(ascending=False)

Gender        147
First Name     68
Team           44
Start Date      2
Salary          0
Mgmt            0
dtype: int64

### Dealing with Null Values

In [57]:
df_dealna = pd.read_csv(employees_data, parse_dates = ['Start Date'])

  df_dealna = pd.read_csv(employees_data, parse_dates = ['Start Date'])


In [58]:
df_dealna.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   First Name  933 non-null    object        
 1   Gender      854 non-null    object        
 2   Start Date  999 non-null    datetime64[ns]
 3   Salary      999 non-null    float64       
 4   Mgmt        933 non-null    object        
 5   Team        957 non-null    object        
dtypes: datetime64[ns](1), float64(1), object(4)
memory usage: 47.1+ KB


#### dropna method will remove all rows where NaN values exist

In [59]:
df_dealna.dropna().isnull().sum()

First Name    0
Gender        0
Start Date    0
Salary        0
Mgmt          0
Team          0
dtype: int64

**Dropna(how = 'all') removes only rows where all values are NaN**

In [60]:
df_dealna.shape


(1001, 6)

In [61]:
drop_how = df_dealna.dropna(how='all')
drop_how.shape

(1000, 6)

**Drop NaN values in the specific column**

In [62]:
df_dealna.dropna(subset=['Gender']).head()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,,True,Marketing
1,Thomas,Male,1996-03-31,61933.0,True,
2,Maria,Female,NaT,130590.0,False,Finance
4,Larry,Male,1998-01-24,101004.0,True,IT
5,Dennis,Male,1987-04-18,115163.0,False,Legal


**Cleaning a few columns**

In [63]:
df_dealna.dropna(subset=['Team','Salary','Gender']).isnull().sum()

First Name    52
Gender         0
Start Date     1
Salary         0
Mgmt          52
Team           0
dtype: int64

**The thresh parameter allows you to set a minimum number of non-empty values
in line**

In [64]:
# Returns all True values because of thresh=6 and Dataframe has only 6 columns
df_dealna.dropna(thresh=6).head()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
4,Larry,Male,1998-01-24,101004.0,True,IT
5,Dennis,Male,1987-04-18,115163.0,False,Legal
6,Ruby,Female,1987-08-17,65476.0,True,Product
8,Angela,Female,2005-11-22,95570.0,True,Engineering
9,Frances,Female,2002-08-08,139852.0,True,Business Dev


In [65]:
# returns all rows with at least 4 non-NA values
df_dealna.dropna(thresh=4).head()

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,,True,Marketing
1,Thomas,Male,1996-03-31,61933.0,True,
2,Maria,Female,NaT,130590.0,False,Finance
3,Jerry,,2005-03-04,138705.0,True,Finance
4,Larry,Male,1998-01-24,101004.0,True,IT


### Dealing with Duplicates

#### The duplicated Method

In [66]:
df_employees["Team"].head()

0    Marketing
1          NaN
2      Finance
3      Finance
4           IT
Name: Team, dtype: category
Categories (10, object): ['Business Dev', 'Distribution', 'Engineering', 'Finance', ..., 'Legal', 'Marketing', 'Product', 'Sales']

**The duplicated() method compares first value with next value and if they equal returns True**

In [67]:
df_employees["Team"].duplicated().head()

0    False
1    False
2    False
3     True
4    False
Name: Team, dtype: bool

**The same result**

In [68]:
df_employees["Team"].duplicated(keep = "first").head()

0    False
1    False
2    False
3     True
4    False
Name: Team, dtype: bool

**Retrieve only unique value from each Team**

In [69]:
(~df_employees["Team"].duplicated()).head()

0     True
1     True
2     True
3    False
4     True
Name: Team, dtype: bool

In [70]:
first_one_in_team = ~df_employees["Team"].duplicated()
df_employees[first_one_in_team].value_counts('Team')

Team
Business Dev    1
Distribution    1
Engineering     1
Finance         1
HR              1
IT              1
Legal           1
Marketing       1
Product         1
Sales           1
Name: count, dtype: int64

In [71]:
df_employees['Team'].value_counts()

Team
IT              106
Finance         102
Business Dev    101
Marketing        98
Product          95
Sales            94
Engineering      92
HR               91
Distribution     90
Legal            88
Name: count, dtype: int64

#### The drop_duplicates Method

**The parameter subset determines which column will be affected**

In [72]:
df_employees.drop_duplicates(subset=['Team']).head(20)

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,0,True,Marketing
1,Thomas,Male,1996-03-31,61933,True,
2,Maria,Female,,130590,False,Finance
4,Larry,Male,1998-01-24,101004,True,IT
5,Dennis,Male,1987-04-18,115163,False,Legal
6,Ruby,Female,1987-08-17,65476,True,Product
8,Angela,Female,2005-11-22,95570,True,Engineering
9,Frances,Female,2002-08-08,139852,True,Business Dev
12,Brandon,Male,1980-12-01,112807,True,HR
13,Gary,Male,2008-01-27,109831,False,Sales


**Drop duplicates using a pair of columns**

In [73]:
df_employees.drop_duplicates(subset = ["Gender", "Team"]).head(30)

Unnamed: 0,First Name,Gender,Start Date,Salary,Mgmt,Team
0,Douglas,Male,1993-08-06,0,True,Marketing
1,Thomas,Male,1996-03-31,61933,True,
2,Maria,Female,,130590,False,Finance
3,Jerry,,2005-03-04,138705,True,Finance
4,Larry,Male,1998-01-24,101004,True,IT
5,Dennis,Male,1987-04-18,115163,False,Legal
6,Ruby,Female,1987-08-17,65476,True,Product
8,Angela,Female,2005-11-22,95570,True,Engineering
9,Frances,Female,2002-08-08,139852,True,Business Dev
10,Louise,Female,1980-08-12,63241,True,


**Comparing performs using the first row, so the row with Gender = 'Male' and Team = 'Marketing' will be unique and all other rows with the same pair will be removed**

### Tasks

In [74]:
df_netflix = pd.read_csv('/Users/ypushiev/Learning/PANDAS IN ACTION/Chapter 5 Dataframe filtering/Data/netflix.csv',parse_dates = ['date_added'])

  df_netflix = pd.read_csv('/Users/ypushiev/Learning/PANDAS IN ACTION/Chapter 5 Dataframe filtering/Data/netflix.csv',parse_dates = ['date_added'])


**Optimize the Dataframe size** 

In [75]:
df_netflix.head()

Unnamed: 0,title,director,date_added,type
0,Alias Grace,,2017-11-03,TV Show
1,A Patch of Fog,Michael Lennox,2017-04-15,Movie
2,Lunatics,,2019-04-19,TV Show
3,Uriyadi 2,Vijay Kumar,2019-08-02,Movie
4,Shrek the Musical,Jason Moore,2013-12-29,Movie


In [76]:
df_netflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5837 entries, 0 to 5836
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   title       5837 non-null   object        
 1   director    3936 non-null   object        
 2   date_added  5195 non-null   datetime64[ns]
 3   type        5837 non-null   object        
dtypes: datetime64[ns](1), object(3)
memory usage: 182.5+ KB


In [77]:
df_netflix.nunique()

title         5780
director      3024
date_added    1092
type             2
dtype: int64

In [78]:
df_netflix['type'] = df_netflix['type'].astype('category')

In [79]:
df_netflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5837 entries, 0 to 5836
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   title       5837 non-null   object        
 1   director    3936 non-null   object        
 2   date_added  5195 non-null   datetime64[ns]
 3   type        5837 non-null   category      
dtypes: category(1), datetime64[ns](1), object(2)
memory usage: 142.8+ KB


#### Find all rows with Limitless

In [80]:
Limitless = df_netflix[df_netflix["title"] == 'Limitless']
Limitless.head()

Unnamed: 0,title,director,date_added,type
1559,Limitless,Neil Burger,2019-05-16,Movie
2564,Limitless,,2016-07-01,TV Show
4579,Limitless,Vrinda Samartha,2019-10-01,Movie


#### Find the movies of Robert Rodriguez

In [81]:
director_rr = df_netflix['director'] == 'Robert Rodriguez'
movie_types = df_netflix['type'] == 'Movie'
df_netflix[director_rr & movie_types].head()

Unnamed: 0,title,director,date_added,type
1384,Spy Kids: All the Time in the World,Robert Rodriguez,2019-02-19,Movie
1416,Spy Kids 3: Game Over,Robert Rodriguez,2019-04-01,Movie
1460,Spy Kids 2: The Island of Lost Dreams,Robert Rodriguez,2019-03-08,Movie
2890,Sin City,Robert Rodriguez,2019-10-01,Movie
3836,Shorts,Robert Rodriguez,2019-07-01,Movie


#### Find a movie of Robert Altman or date released 2019-07-31

In [82]:
director_ra = df_netflix['director'] == "Robert Altman"
date = df_netflix['date_added'] > '2019-07-31' 
df_netflix[director_ra | date].head()

Unnamed: 0,title,director,date_added,type
3,Uriyadi 2,Vijay Kumar,2019-08-02,Movie
6,We Have Always Lived in the Castle,Stacie Passon,2019-09-14,Movie
10,Hasta los dientes,Alberto Arnaut Estrada,2019-08-13,Movie
14,Ocean's Thirteen,Steven Soderbergh,2019-10-01,Movie
18,The Rugrats Movie,Igor Kovalyov,2019-10-01,Movie


#### Fibd requested directors

In [83]:
directors = ['Orson Welles','Aditya Kripalani','Sam Raimi']
df_netflix[df_netflix['director'].isin(directors)].head()   


Unnamed: 0,title,director,date_added,type
946,The Stranger,Orson Welles,2018-07-19,Movie
1870,The Gift,Sam Raimi,2019-11-20,Movie
3706,Spider-Man 3,Sam Raimi,2019-11-01,Movie
4243,Tikli and Laxmi Bomb,Aditya Kripalani,2018-08-01,Movie
4475,The Other Side of the Wind,Orson Welles,2018-11-02,Movie


#### Find all rows between requested date

In [84]:
between_1may_and_1june = df_netflix["date_added"].between('2019-05-01', '2019-06-01')
df_netflix[between_1may_and_1june].head()

Unnamed: 0,title,director,date_added,type
29,Chopsticks,Sachin Yardi,2019-05-31,Movie
60,Away From Home,,2019-05-08,TV Show
82,III Smoking Barrels,Sanjib Dey,2019-06-01,Movie
108,Jailbirds,,2019-05-10,TV Show
124,Pegasus,Han Han,2019-05-31,Movie


#### Remove duplicates in director column

In [85]:
df_netflix.isna().sum().sort_values(ascending=False)

director      1901
date_added     642
title            0
type             0
dtype: int64

In [86]:
df_netflix.dropna(subset=['director']).head()

Unnamed: 0,title,director,date_added,type
1,A Patch of Fog,Michael Lennox,2017-04-15,Movie
3,Uriyadi 2,Vijay Kumar,2019-08-02,Movie
4,Shrek the Musical,Jason Moore,2013-12-29,Movie
5,Schubert In Love,Lars Büchel,2018-03-01,Movie
6,We Have Always Lived in the Castle,Stacie Passon,2019-09-14,Movie


#### Find unique data with only one title released

In [87]:
df_netflix.drop_duplicates(subset = ["date_added"], keep = False)

Unnamed: 0,title,director,date_added,type
4,Shrek the Musical,Jason Moore,2013-12-29,Movie
12,Without Gorky,Cosima Spender,2017-05-31,Movie
30,Anjelah Johnson: Not Fancy,Jay Karas,2015-10-02,Movie
38,One Last Thing,Tim Rouhana,2019-08-25,Movie
70,Marvel's Iron Man & Hulk: Heroes United,Leo Riley,2014-02-16,Movie
...,...,...,...,...
5748,Menorca,John Barnard,2017-08-27,Movie
5749,Green Room,Jeremy Saulnier,2018-11-12,Movie
5788,Chris Brown: Welcome to My Life,Andrew Sandler,2017-10-07,Movie
5789,A Very Murray Christmas,Sofia Coppola,2015-12-04,Movie
