# DataFrames 2: Filtering Data

In [72]:
import pandas as pd
import datetime as dt

## This Module's Dataset + Memory Optimization
- `pd.to_datetime` method converts a **Series** to hold datetime values.
- The `format` parameter tells pandas the format in which the datetime strings are stored.
- We pass symbols designating the segments of the string. For example, `%m` means "month" and `%d` means "day".
- the `dt` attribute reveals an object with many datetime-related attributes and methods.
- the `dt.time` attribute extracts only the time from each value in a datetime **Series**
- The `parse_dates` parameter of `read_csv` is an alternative way to parse strings as datetimes while the file is being loaded.

In [20]:
# Load the raw CSV with pandas' default type inference; the date and time
# columns arrive as plain strings (e.g. "8/6/1993", "12:42 PM").
employees_df = pd.read_csv("employees.csv")

In [21]:
# Work on a copy so the frame loaded from disk stays untouched while columns are converted.
employees = employees_df.copy()
employees

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,8/6/1993,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,3/31/1996,6:53 AM,61933,4.170,True,
2,Maria,Female,4/23/1993,11:17 AM,130590,11.858,False,Finance
3,Jerry,Male,3/4/2005,1:00 PM,138705,9.340,True,Finance
4,Larry,Male,1/24/1998,4:47 PM,101004,1.389,True,Client Services
...,...,...,...,...,...,...,...,...
995,Henry,,11/23/2014,6:09 AM,132483,16.655,False,Distribution
996,Phillip,Male,1/31/1984,6:30 AM,42392,19.675,False,Finance
997,Russell,Male,5/20/2013,12:39 PM,96914,1.421,False,Product
998,Larry,Male,4/20/2013,4:45 PM,60500,11.985,False,Business Development


In [22]:
# "%m/%d/%Y" = month/day/four-digit year, matching raw values such as 8/6/1993.
employees['Start Date'] = pd.to_datetime(employees['Start Date'], format="%m/%d/%Y")

In [23]:
# "%I:%M %p" = 12-hour clock with an AM/PM marker. The strings carry no date,
# so .dt.time keeps only the time part; the column ends up as object dtype.
employees["Last Login Time"] = pd.to_datetime(employees["Last Login Time"], format="%I:%M %p").dt.time

In [24]:
# Display the frame to confirm the Start Date and Last Login Time conversions.
employees

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,06:53:00,61933,4.170,True,
2,Maria,Female,1993-04-23,11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,16:47:00,101004,1.389,True,Client Services
...,...,...,...,...,...,...,...,...
995,Henry,,2014-11-23,06:09:00,132483,16.655,False,Distribution
996,Phillip,Male,1984-01-31,06:30:00,42392,19.675,False,Finance
997,Russell,Male,2013-05-20,12:39:00,96914,1.421,False,Product
998,Larry,Male,2013-04-20,16:45:00,60500,11.985,False,Business Development


In [25]:
# NOTE(review): astype(bool) converts missing values (NaN) to True. If the raw
# column contains NaN, those rows silently become senior management — confirm
# with employees_df["Senior Management"].isna().sum() on the unconverted data.
employees["Senior Management"] = employees["Senior Management"].astype(bool)

In [26]:
# Gender has only a couple of distinct labels, so the category dtype is a
# compact representation for it.
employees["Gender"] = employees["Gender"].astype("category")

In [27]:
# Display the frame again; the values look the same, only the dtypes changed.
employees

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,06:53:00,61933,4.170,True,
2,Maria,Female,1993-04-23,11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,16:47:00,101004,1.389,True,Client Services
...,...,...,...,...,...,...,...,...
995,Henry,,2014-11-23,06:09:00,132483,16.655,False,Distribution
996,Phillip,Male,1984-01-31,06:30:00,42392,19.675,False,Finance
997,Russell,Male,2013-05-20,12:39:00,96914,1.421,False,Product
998,Larry,Male,2013-04-20,16:45:00,60500,11.985,False,Business Development


In [28]:
# Summarize dtypes, non-null counts and memory usage after the conversions.
employees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   First Name         933 non-null    object        
 1   Gender             855 non-null    category      
 2   Start Date         1000 non-null   datetime64[ns]
 3   Last Login Time    1000 non-null   object        
 4   Salary             1000 non-null   int64         
 5   Bonus %            1000 non-null   float64       
 6   Senior Management  1000 non-null   bool          
 7   Team               957 non-null    object        
dtypes: bool(1), category(1), datetime64[ns](1), float64(1), int64(1), object(3)
memory usage: 49.1+ KB


In [61]:
# Rebuild the optimized frame straight from disk; every later section starts
# from a copy of this frame.
# NOTE(review): astype(bool) turns missing values into True — confirm the raw
# "Senior Management" column has no NaN before relying on it.
employees_df = pd.read_csv(
    "employees.csv",
    parse_dates=["Start Date"],  # dates are parsed while the file is read
    date_format="%m/%d/%Y",
).assign(
    **{
        # 12-hour "H:MM AM/PM" strings -> datetime.time objects
        "Last Login Time": lambda raw: pd.to_datetime(raw["Last Login Time"], format="%I:%M %p").dt.time,
        "Senior Management": lambda raw: raw["Senior Management"].astype(bool),
        "Gender": lambda raw: raw["Gender"].astype("category"),
    }
)

## Filter A DataFrame Based On A Condition
- Pandas needs a **Series** of Booleans to perform a filter
- Pass the Boolean **Series** inside square brackets after the DataFrame

In [62]:
# Start this section from a fresh copy of the optimized frame.
employees = employees_df.copy()

In [56]:
# Element-wise comparison: the result is a boolean Series with one entry per row.
employees["Gender"] == "Male"

0       True
1       True
2      False
3       True
4       True
       ...  
995    False
996     True
997     True
998     True
999     True
Name: Gender, Length: 1000, dtype: bool

In [57]:
# Build the boolean mask first, then pass it inside the frame's square brackets.
is_male = employees["Gender"] == "Male"
employees[is_male]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,06:53:00,61933,4.170,True,
3,Jerry,Male,2005-03-04,13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,01:35:00,115163,10.125,False,Legal
...,...,...,...,...,...,...,...,...
994,George,Male,2013-06-21,17:47:00,98874,4.479,True,Marketing
996,Phillip,Male,1984-01-31,06:30:00,42392,19.675,False,Finance
997,Russell,Male,2013-05-20,12:39:00,96914,1.421,False,Product
998,Larry,Male,2013-04-20,16:45:00,60500,11.985,False,Business Development


In [58]:
# Each condition is its own boolean Series; & keeps rows where both are True.
is_female = employees["Gender"] == "Female"
in_senior_management = employees["Senior Management"]
employees[is_female & in_senior_management]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
6,Ruby,Female,1987-08-17,16:20:00,65476,10.012,True,Product
7,,Female,2015-07-20,10:43:00,45906,11.598,True,Finance
8,Angela,Female,2005-11-22,06:29:00,95570,18.523,True,Engineering
9,Frances,Female,2002-08-08,06:51:00,139852,7.524,True,Business Development
10,Louise,Female,1980-08-12,09:01:00,63241,15.132,True,
...,...,...,...,...,...,...,...,...
982,Rose,Female,1982-04-06,10:43:00,91411,8.639,True,Human Resources
987,Gloria,Female,2014-12-08,05:08:00,136709,10.331,True,Finance
990,Robin,Female,1987-07-24,13:35:00,100765,10.982,True,Client Services
991,Rose,Female,2002-08-25,05:12:00,134505,11.051,True,Marketing


In [67]:
# ~ inverts a boolean Series, selecting rows that are NOT senior management.
high_earner = employees["Salary"] > 140000
not_senior_management = ~employees["Senior Management"]
employees[high_earner & not_senior_management]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
36,Rachel,Female,2009-02-16,20:47:00,142032,12.599,False,Business Development
96,Cynthia,Female,1994-03-21,08:34:00,142321,1.737,False,Finance
110,Shirley,Female,2001-06-20,22:39:00,147113,16.135,False,Legal
132,Carlos,Male,1995-01-04,07:02:00,146670,10.763,False,Human Resources
142,Elizabeth,Female,2003-10-09,17:53:00,146129,5.687,False,Finance
178,Jane,Female,1997-09-03,02:01:00,144474,17.648,False,Product
208,Jonathan,Male,1987-10-12,02:59:00,141069,4.903,False,Human Resources
216,Matthew,Male,2013-07-31,08:04:00,142373,2.462,False,Marketing
232,Marilyn,Female,1997-04-04,21:28:00,147663,10.263,False,Human Resources
244,Clarence,Male,1996-10-08,12:53:00,142561,8.866,False,Client Services


# NOTE(review): this cell has no execution count, so it appears not to have
# been run in the saved session. It re-checks the column dtypes.
employees.info()

In [71]:
# The date string is compared against the datetime64 "Start Date" column.
# Strictly greater-than: a start date of exactly 2010-01-01 is excluded.
employees[employees["Start Date"] > "2010-01-01"]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
7,,Female,2015-07-20,10:43:00,45906,11.598,True,Finance
15,Lillian,Female,2016-06-05,06:09:00,59414,1.256,False,Product
16,Jeremy,Male,2010-09-21,05:56:00,90370,7.369,False,Human Resources
19,Donna,Female,2010-07-22,03:48:00,81014,1.894,False,Product
22,Joshua,,2012-03-08,01:58:00,90816,18.816,True,Client Services
...,...,...,...,...,...,...,...,...
994,George,Male,2013-06-21,17:47:00,98874,4.479,True,Marketing
995,Henry,,2014-11-23,06:09:00,132483,16.655,False,Distribution
997,Russell,Male,2013-05-20,12:39:00,96914,1.421,False,Product
998,Larry,Male,2013-04-20,16:45:00,60500,11.985,False,Business Development


In [75]:
# The same cutoff expressed as an explicit datetime object rather than a string.
employees[employees["Start Date"] > dt.datetime(2010, 1, 1)]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
7,,Female,2015-07-20,10:43:00,45906,11.598,True,Finance
15,Lillian,Female,2016-06-05,06:09:00,59414,1.256,False,Product
16,Jeremy,Male,2010-09-21,05:56:00,90370,7.369,False,Human Resources
19,Donna,Female,2010-07-22,03:48:00,81014,1.894,False,Product
22,Joshua,,2012-03-08,01:58:00,90816,18.816,True,Client Services
...,...,...,...,...,...,...,...,...
994,George,Male,2013-06-21,17:47:00,98874,4.479,True,Marketing
995,Henry,,2014-11-23,06:09:00,132483,16.655,False,Distribution
997,Russell,Male,2013-05-20,12:39:00,96914,1.421,False,Product
998,Larry,Male,2013-04-20,16:45:00,60500,11.985,False,Business Development


In [76]:
# "Last Login Time" holds datetime.time objects, so the comparison value is a
# dt.time as well; dt.time(12, 0) is noon, so this keeps logins before midday.
employees[employees["Last Login Time"] < dt.time(12, 0)]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,06:53:00,61933,4.170,True,
2,Maria,Female,1993-04-23,11:17:00,130590,11.858,False,Finance
5,Dennis,Male,1987-04-18,01:35:00,115163,10.125,False,Legal
7,,Female,2015-07-20,10:43:00,45906,11.598,True,Finance
8,Angela,Female,2005-11-22,06:29:00,95570,18.523,True,Engineering
...,...,...,...,...,...,...,...,...
988,Alice,Female,2004-10-05,09:34:00,47638,11.209,False,Human Resources
991,Rose,Female,2002-08-25,05:12:00,134505,11.051,True,Marketing
992,Anthony,Male,2011-10-16,08:35:00,112769,11.625,True,Finance
995,Henry,,2014-11-23,06:09:00,132483,16.655,False,Distribution


## The isin Method
- The `isin` Series method accepts an iterable object
- The method returns True for each row whose value is found in the collection

In [77]:
# Start this section from a fresh copy of the optimized frame.
employees = employees_df.copy()

In [79]:
# isin accepts any iterable of candidate values (list, set, Series, ...).
teams_of_interest = ["Legal", "Product", "Sales"]
employees[employees["Team"].isin(teams_of_interest)]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
5,Dennis,Male,1987-04-18,01:35:00,115163,10.125,False,Legal
6,Ruby,Female,1987-08-17,16:20:00,65476,10.012,True,Product
11,Julie,Female,1997-10-26,15:19:00,102508,12.637,True,Legal
13,Gary,Male,2008-01-27,23:40:00,109831,5.831,False,Sales
15,Lillian,Female,2016-06-05,06:09:00,59414,1.256,False,Product
...,...,...,...,...,...,...,...,...
981,James,Male,1993-01-15,17:19:00,148985,19.280,False,Legal
985,Stephen,,1983-07-10,20:10:00,85668,1.909,False,Legal
989,Justin,,1991-02-10,16:58:00,38344,3.794,False,Legal
997,Russell,Male,2013-05-20,12:39:00,96914,1.421,False,Product


## The isnull and notnull Methods
- The `isnull` method returns True for `NaN` values in a **Series**
- `notnull` does the opposite: it returns True for values that are present

In [80]:
# isnull() is True where Gender is missing, so this keeps only the rows
# without a recorded gender.
employees[employees["Gender"].isnull()]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
20,Lois,,1995-04-22,19:18:00,64714,4.934,True,Legal
22,Joshua,,2012-03-08,01:58:00,90816,18.816,True,Client Services
27,Scott,,1991-07-11,18:58:00,122367,5.218,False,Legal
31,Joyce,,2005-02-20,14:40:00,88657,12.752,False,Product
41,Christine,,2015-06-28,01:08:00,66582,11.308,True,Business Development
...,...,...,...,...,...,...,...,...
961,Antonio,,1989-06-18,21:37:00,103050,3.050,False,Legal
972,Victor,,2006-07-28,14:49:00,76381,11.159,True,Sales
985,Stephen,,1983-07-10,20:10:00,85668,1.909,False,Legal
989,Justin,,1991-02-10,16:58:00,38344,3.794,False,Legal


In [82]:
# notnull() is True where Gender is present, so rows with a missing gender are dropped.
employees[employees["Gender"].notnull()]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,06:53:00,61933,4.170,True,
2,Maria,Female,1993-04-23,11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,16:47:00,101004,1.389,True,Client Services
...,...,...,...,...,...,...,...,...
994,George,Male,2013-06-21,17:47:00,98874,4.479,True,Marketing
996,Phillip,Male,1984-01-31,06:30:00,42392,19.675,False,Finance
997,Russell,Male,2013-05-20,12:39:00,96914,1.421,False,Product
998,Larry,Male,2013-04-20,16:45:00,60500,11.985,False,Business Development


## The between Method
- The `between` method returns True if a **Series** value falls within the given range (both endpoints are included by default).



In [83]:
# Start this section from a fresh copy of the optimized frame.
employees = employees_df.copy()

In [85]:
# Both endpoints belong to the range; inclusive="both" is the default, spelled out here.
in_salary_band = employees["Salary"].between(100_000, 110_000, inclusive="both")
employees[in_salary_band]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
4,Larry,Male,1998-01-24,16:47:00,101004,1.389,True,Client Services
11,Julie,Female,1997-10-26,15:19:00,102508,12.637,True,Legal
13,Gary,Male,2008-01-27,23:40:00,109831,5.831,False,Sales
21,Matthew,Male,1995-09-05,02:12:00,100612,13.645,False,Marketing
55,Karen,Female,1999-11-30,07:46:00,102488,17.653,True,Product
...,...,...,...,...,...,...,...,...
944,Kenneth,Male,2006-05-10,08:24:00,101914,1.905,True,Distribution
947,,Male,2012-07-30,15:07:00,107351,5.329,True,Marketing
961,Antonio,,1989-06-18,21:37:00,103050,3.050,False,Legal
967,Thomas,Male,2016-03-12,15:10:00,105681,19.572,False,Engineering


## The duplicated Method
- The `duplicated` method returns True if a **Series** value is a duplicate
- Pandas will mark one occurrence of a repeated value as a non-duplicate.
- Use the `keep` parameter to designate whether the first or last occurrence of a repeated value should be considered the "non-duplicate"
- Pass False to the `keep` parameter to mark all occurrences of repeated values as duplicates

In [86]:
# Start this section from a fresh copy of the optimized frame.
employees = employees_df.copy()

In [87]:
# Called on the whole frame with no subset, duplicated() flags a row only when
# every column matches an earlier row; the empty result means no row repeats.
employees[employees.duplicated()]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team


In [93]:
# keep="first" flags every repeat of a name except its first occurrence, so
# negating the mask leaves exactly one row per distinct first name.
# (keep=False would instead flag every row whose name appears more than once.)
repeated_name = employees["First Name"].duplicated(keep="first")
employees[~repeated_name]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,06:53:00,61933,4.170,True,
2,Maria,Female,1993-04-23,11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,16:47:00,101004,1.389,True,Client Services
...,...,...,...,...,...,...,...,...
712,Martin,,2001-02-06,04:17:00,123963,15.745,True,Engineering
749,Janet,,1986-01-25,05:48:00,85789,9.712,False,Legal
832,Keith,Male,2003-02-12,15:02:00,120672,19.467,False,Legal
855,Phillip,,2003-10-20,11:09:00,89700,2.277,True,


## The drop_duplicates Method

In [94]:
# Start this section from a fresh copy of the optimized frame.
employees=employees_df.copy()

In [95]:
# With no arguments, only rows identical in every column to an earlier row are
# removed. The result is a new frame; employees is not modified.
employees.drop_duplicates()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,06:53:00,61933,4.170,True,
2,Maria,Female,1993-04-23,11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,16:47:00,101004,1.389,True,Client Services
...,...,...,...,...,...,...,...,...
995,Henry,,2014-11-23,06:09:00,132483,16.655,False,Distribution
996,Phillip,Male,1984-01-31,06:30:00,42392,19.675,False,Finance
997,Russell,Male,2013-05-20,12:39:00,96914,1.421,False,Product
998,Larry,Male,2013-04-20,16:45:00,60500,11.985,False,Business Development


In [97]:
# Keep the first row seen for each distinct Team value (a missing Team counts
# as one value). Non-mutating: a new frame is returned.
employees.drop_duplicates(subset="Team", keep="first")

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,11:17:00,130590,11.858,False,Finance
4,Larry,Male,1998-01-24,16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,01:35:00,115163,10.125,False,Legal
6,Ruby,Female,1987-08-17,16:20:00,65476,10.012,True,Product
8,Angela,Female,2005-11-22,06:29:00,95570,18.523,True,Engineering
9,Frances,Female,2002-08-08,06:51:00,139852,7.524,True,Business Development
12,Brandon,Male,1980-12-01,01:08:00,112807,17.492,True,Human Resources
13,Gary,Male,2008-01-27,23:40:00,109831,5.831,False,Sales


In [99]:
# keep=False discards every row whose First Name appears more than once,
# leaving only the names that occur exactly once in the data.
employees.drop_duplicates(subset="First Name", keep=False)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
5,Dennis,Male,1987-04-18,01:35:00,115163,10.125,False,Legal
8,Angela,Female,2005-11-22,06:29:00,95570,18.523,True,Engineering
33,Jean,Female,1993-12-18,09:07:00,119082,16.18,False,Business Development
190,Carol,Female,1996-03-19,03:39:00,57783,9.129,False,Finance
291,Tammy,Female,1984-11-11,10:30:00,132839,17.463,True,Client Services
495,Eugene,Male,1984-05-24,10:54:00,81077,2.117,False,Sales
688,Brian,Male,2007-04-07,22:47:00,93901,17.821,True,Legal
832,Keith,Male,2003-02-12,15:02:00,120672,19.467,False,Legal
887,David,Male,2009-12-05,08:48:00,92242,15.407,False,Legal


In [100]:
# One representative row per (Senior Management, Team) combination,
# then ordered alphabetically by team.
one_per_management_and_team = employees.drop_duplicates(subset=["Senior Management", "Team"])
one_per_management_and_team.sort_values(by="Team")

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
33,Jean,Female,1993-12-18,09:07:00,119082,16.18,False,Business Development
9,Frances,Female,2002-08-08,06:51:00,139852,7.524,True,Business Development
4,Larry,Male,1998-01-24,16:47:00,101004,1.389,True,Client Services
18,Diana,Female,1981-10-23,10:27:00,132940,19.082,False,Client Services
60,Paula,,2005-11-23,14:01:00,48866,4.271,False,Distribution
40,Michael,Male,2008-10-10,11:25:00,99283,2.665,True,Distribution
8,Angela,Female,2005-11-22,06:29:00,95570,18.523,True,Engineering
54,Sara,Female,2007-08-15,09:23:00,83677,8.999,False,Engineering
2,Maria,Female,1993-04-23,11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,13:00:00,138705,9.34,True,Finance


## The unique and nunique Methods

In [101]:
# Start this section from a fresh copy of the optimized frame.
employees = employees_df.copy()

In [115]:
# Distinct values of the column; NaN is listed as a value but is not one of the categories.
employees["Gender"].unique()

['Male', 'Female', NaN]
Categories (2, object): ['Female', 'Male']

In [111]:
# nunique() ignores NaN by default, so only the two real labels are counted.
employees["Gender"].nunique()

2

In [107]:
# unique() on a categorical column returns a pandas Categorical, not a NumPy array.
type(employees["Gender"].unique())

pandas.core.arrays.categorical.Categorical

In [109]:
# For an ordinary object column, unique() returns a NumPy ndarray.
type(employees["First Name"].unique())

numpy.ndarray

In [112]:
# dropna=False counts NaN as one more distinct value.
employees["Gender"].nunique(dropna=False)

3

In [117]:
# Called on the frame, nunique() returns the distinct-value count of every
# column (NaN excluded by default).
employees.nunique()

First Name           200
Gender                 2
Start Date           972
Last Login Time      720
Salary               995
Bonus %              971
Senior Management      2
Team                  10
dtype: int64