In [1]:
import pandas as pd
# filtering: extracting one or more rows of data based on some kind of criteria
# ^ a subset of the original data

# DF 2.1 (section 5.75):
## Intro to filtering dataframes:

In [34]:
df = pd.read_csv("employees.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   First Name         933 non-null    object 
 1   Gender             855 non-null    object 
 2   Start Date         1000 non-null   object 
 3   Last Login Time    1000 non-null   object 
 4   Salary             1000 non-null   int64  
 5   Bonus %            1000 non-null   float64
 6   Senior Management  933 non-null    object 
 7   Team               957 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 62.6+ KB


In [28]:
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   First Name         933 non-null    object        
 1   Gender             855 non-null    category      
 2   Start Date         1000 non-null   datetime64[ns]
 3   Last Login Time    1000 non-null   datetime64[ns]
 4   Salary             1000 non-null   int64         
 5   Bonus %            1000 non-null   float64       
 6   Senior Management  1000 non-null   bool          
 7   Team               957 non-null    object        
dtypes: bool(1), category(1), datetime64[ns](2), float64(1), int64(1), object(2)
memory usage: 49.1+ KB


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   First Name         933 non-null    object 
 1   Gender             855 non-null    object 
 2   Start Date         1000 non-null   object 
 3   Last Login Time    1000 non-null   object 
 4   Salary             1000 non-null   int64  
 5   Bonus %            1000 non-null   float64
 6   Senior Management  933 non-null    object 
 7   Team               957 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 62.6+ KB


In [16]:
df["Start Date"] = pd.to_datetime(df["Start Date"])
# converts the values in a series from strings to datetime
# higher level funciton so you have to use pandas
# pandas can recognize a lot of conventional datetime formats
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,6:53 AM,61933,4.17,True,
2,Maria,Female,1993-04-23,11:17 AM,130590,11.858,False,Finance


In [20]:
df["Last Login Time"] = pd.to_datetime(df["Last Login Time"])
df.head(3)
# doesn't look different, but allows you to sort by datetime since they are no longer strings
# needs a date and a time, if it doesn't have a date it will assume it is present day

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2021-10-13 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2021-10-13 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2021-10-13 11:17:00,130590,11.858,False,Finance


In [24]:
# booleans use less memory
df["Senior Management"] = df["Senior Management"].astype("bool")
df.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2021-10-13 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2021-10-13 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2021-10-13 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2021-10-13 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2021-10-13 16:47:00,101004,1.389,True,Client Services


In [32]:
df["Gender"] = df["Gender"].astype("category")
# reduces memory

In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   First Name         933 non-null    object        
 1   Gender             855 non-null    category      
 2   Start Date         1000 non-null   datetime64[ns]
 3   Last Login Time    1000 non-null   datetime64[ns]
 4   Salary             1000 non-null   int64         
 5   Bonus %            1000 non-null   float64       
 6   Senior Management  1000 non-null   bool          
 7   Team               957 non-null    object        
dtypes: bool(1), category(1), datetime64[ns](2), float64(1), int64(1), object(2)
memory usage: 49.1+ KB


In [36]:
df = pd.read_csv("employees.csv", parse_dates = ["Start Date", "Last Login Time"])
# ^ does what the two lines below do ^
# df["Start Date"] = pd.to_datetime(df["Start Date"])
# df["Last Login Time"] = pd.to_datetime(df["Last Login Time"])
df["Senior Management"] = df["Senior Management"].astype("bool")
df["Gender"] = df["Gender"].astype("category")
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2021-10-13 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2021-10-13 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2021-10-13 11:17:00,130590,11.858,False,Finance


# DF 2.2 (section 5.76):
## Filtering a DataFrame based on a condition:

- need a series to be able to focus on a condition

In [2]:
df = pd.read_csv("employees.csv", parse_dates = ["Start Date", "Last Login Time"])
df["Senior Management"] = df["Senior Management"].astype("bool")
df["Gender"] = df["Gender"].astype("category")
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2021-10-14 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2021-10-14 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2021-10-14 11:17:00,130590,11.858,False,Finance


In [4]:
df["Gender"] == "Male"
# returns a brand new boolean series
# if you only put one =, it will overwrite all values

0       True
1       True
2      False
3       True
4       True
       ...  
995    False
996     True
997     True
998     True
999     True
Name: Gender, Length: 1000, dtype: bool

In [6]:
df[df["Gender"] == "Male"].head(3)
# this is how you extract all the rows with a True value

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2021-10-14 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2021-10-14 06:53:00,61933,4.17,True,
3,Jerry,Male,2005-03-04,2021-10-14 13:00:00,138705,9.34,True,Finance


In [10]:
df[df["Team"] == "Finance"].head(3)
# more elegant syntax of the same code below (mask is the common name for this variable):
mask = df["Team"] == "Finance"
df[mask].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2021-10-14 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2021-10-14 13:00:00,138705,9.34,True,Finance
7,,Female,2015-07-20,2021-10-14 10:43:00,45906,11.598,True,Finance


In [13]:
mask = df["Senior Management"]
# can use this code instead of the == if the series is boolean
df[mask].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2021-10-14 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2021-10-14 06:53:00,61933,4.17,True,
3,Jerry,Male,2005-03-04,2021-10-14 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2021-10-14 16:47:00,101004,1.389,True,Client Services
6,Ruby,Female,1987-08-17,2021-10-14 16:20:00,65476,10.012,True,Product


In [14]:
mask = df["Team"] != "Marketing"
df[mask].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2021-10-14 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2021-10-14 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2021-10-14 13:00:00,138705,9.34,True,Finance


In [24]:
mask = df["Salary"] > 110000
df[mask].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2021-10-14 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2021-10-14 13:00:00,138705,9.34,True,Finance
5,Dennis,Male,1987-04-18,2021-10-14 01:35:00,115163,10.125,False,Legal
9,Frances,Female,2002-08-08,2021-10-14 06:51:00,139852,7.524,True,Business Development
12,Brandon,Male,1980-12-01,2021-10-14 01:08:00,112807,17.492,True,Human Resources


In [23]:
df[df["Bonus %"] < 1.5].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
4,Larry,Male,1998-01-24,2021-10-14 16:47:00,101004,1.389,True,Client Services
15,Lillian,Female,2016-06-05,2021-10-14 06:09:00,59414,1.256,False,Product
58,Theresa,Female,2010-04-11,2021-10-14 07:18:00,72670,1.481,True,Engineering
77,Charles,Male,2004-09-14,2021-10-14 20:13:00,107391,1.26,True,Marketing
175,Willie,Male,1998-02-17,2021-10-14 20:20:00,146651,1.451,True,Engineering


In [27]:
# < and > also works with dates, write the date as a string
# with dates, less than means prior to
mask = df["Start Date"] <= "1985-01-01"
df[mask].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
10,Louise,Female,1980-08-12,2021-10-14 09:01:00,63241,15.132,True,
12,Brandon,Male,1980-12-01,2021-10-14 01:08:00,112807,17.492,True,Human Resources
18,Diana,Female,1981-10-23,2021-10-14 10:27:00,132940,19.082,False,Client Services
28,Terry,Male,1981-11-27,2021-10-14 18:30:00,124008,13.464,True,Client Services
37,Linda,Female,1981-10-19,2021-10-14 20:49:00,57427,9.557,True,Client Services


# DF 2.3 (section 5.77):
## Filtering a df based on more than one condition (and, &):

- Boris strongly reccomends storing each condition in their own separate variable

In [28]:
df = pd.read_csv("employees.csv", parse_dates = ["Start Date", "Last Login Time"])
df["Senior Management"] = df["Senior Management"].astype("bool")
df["Gender"] = df["Gender"].astype("category")
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2021-10-14 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2021-10-14 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2021-10-14 11:17:00,130590,11.858,False,Finance


In [36]:
mask1 = df["Gender"] == "Male"

mask2 = df["Team"] == "Marketing"
df[mask1 & mask2].head()
# can't use the word "and"

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2021-10-14 12:42:00,97308,6.945,True,Marketing
21,Matthew,Male,1995-09-05,2021-10-14 02:12:00,100612,13.645,False,Marketing
26,Craig,Male,2000-02-27,2021-10-14 07:45:00,37598,7.757,True,Marketing
74,Thomas,Male,1995-06-04,2021-10-14 14:24:00,62096,17.029,False,Marketing
77,Charles,Male,2004-09-14,2021-10-14 20:13:00,107391,1.26,True,Marketing


# DF 2.4 (section 5.78):
## Filtering a df based on more than one condition (or, |):

- Boris strongly reccomends storing each condition in their own separate variable

In [37]:
df = pd.read_csv("employees.csv", parse_dates = ["Start Date", "Last Login Time"])
df["Senior Management"] = df["Senior Management"].astype("bool")
df["Gender"] = df["Gender"].astype("category")
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2021-10-14 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2021-10-14 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2021-10-14 11:17:00,130590,11.858,False,Finance


In [38]:
mask1 = df["Senior Management"]

mask2 = df["Start Date"] < "1990-01-01"
# with dates, less than means prior to

df[mask1 | mask2]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2021-10-14 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2021-10-14 06:53:00,61933,4.170,True,
3,Jerry,Male,2005-03-04,2021-10-14 13:00:00,138705,9.340,True,Finance
4,Larry,Male,1998-01-24,2021-10-14 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2021-10-14 01:35:00,115163,10.125,False,Legal
...,...,...,...,...,...,...,...,...
992,Anthony,Male,2011-10-16,2021-10-14 08:35:00,112769,11.625,True,Finance
993,Tina,Female,1997-05-15,2021-10-14 15:53:00,56450,19.040,True,Engineering
994,George,Male,2013-06-21,2021-10-14 17:47:00,98874,4.479,True,Marketing
996,Phillip,Male,1984-01-31,2021-10-14 06:30:00,42392,19.675,False,Finance


In [39]:
# mixing & and |

mask1 = df["First Name"] == "Robert"
mask2 = df["Team"] == "Client Services"
mask3 = df["Start Date"] > "2016-06-01"

df[(mask1 & mask2) | mask3]
# pulls out where the name is robert and the team is client services, or the start date is after 2016-06-01
# wrap 'and' conditions in parentheses

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
15,Lillian,Female,2016-06-05,2021-10-14 06:09:00,59414,1.256,False,Product
98,Tina,Female,2016-06-16,2021-10-14 19:47:00,100705,16.961,True,Marketing
387,Robert,Male,1994-10-29,2021-10-14 04:26:00,123294,19.894,False,Client Services
451,Terry,,2016-07-15,2021-10-14 00:29:00,140002,19.49,True,Marketing


# DF 2.5 (section 5.79):
## Check for inclusion with the .isin() method:

- useful for checking for multiple values within a single series
- helpful when you want to filter multiple values and you don't want to create multiple boolean series
- can give .isin() a list, tuple or even another series

In [40]:
df = pd.read_csv("employees.csv", parse_dates = ["Start Date", "Last Login Time"])
df["Senior Management"] = df["Senior Management"].astype("bool")
df["Gender"] = df["Gender"].astype("category")
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2021-10-14 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2021-10-14 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2021-10-14 11:17:00,130590,11.858,False,Finance


In [43]:
mask = df["Team"].isin(["Legal", "Sales", "Product"])
# more adaptable to future change
df[mask]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
5,Dennis,Male,1987-04-18,2021-10-14 01:35:00,115163,10.125,False,Legal
6,Ruby,Female,1987-08-17,2021-10-14 16:20:00,65476,10.012,True,Product
11,Julie,Female,1997-10-26,2021-10-14 15:19:00,102508,12.637,True,Legal
13,Gary,Male,2008-01-27,2021-10-14 23:40:00,109831,5.831,False,Sales
15,Lillian,Female,2016-06-05,2021-10-14 06:09:00,59414,1.256,False,Product
...,...,...,...,...,...,...,...,...
981,James,Male,1993-01-15,2021-10-14 17:19:00,148985,19.280,False,Legal
985,Stephen,,1983-07-10,2021-10-14 20:10:00,85668,1.909,False,Legal
989,Justin,,1991-02-10,2021-10-14 16:58:00,38344,3.794,False,Legal
997,Russell,Male,2013-05-20,2021-10-14 12:39:00,96914,1.421,False,Product


# DF 2.6 (section 5.80):
## the .isnull() and .notnull() methods:

- can be called on a series within the df
- both check the null status and return true or false based on whether that condition is met
- another way of generating a boolean series for the purpose of filtering later

In [44]:
df = pd.read_csv("employees.csv", parse_dates = ["Start Date", "Last Login Time"])
df["Senior Management"] = df["Senior Management"].astype("bool")
df["Gender"] = df["Gender"].astype("category")
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2021-10-14 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2021-10-14 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2021-10-14 11:17:00,130590,11.858,False,Finance


In [47]:
mask = df["Team"].isnull()
# null values return True
df[mask].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2021-10-14 06:53:00,61933,4.17,True,
10,Louise,Female,1980-08-12,2021-10-14 09:01:00,63241,15.132,True,
23,,Male,2012-06-14,2021-10-14 16:19:00,125792,5.042,True,
32,,Male,1998-08-21,2021-10-14 14:27:00,122340,6.417,True,
91,James,,2005-01-26,2021-10-14 23:00:00,128771,8.309,False,


In [49]:
condition = df["Gender"].notnull()
# null values return False
df[condition].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2021-10-14 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2021-10-14 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2021-10-14 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2021-10-14 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2021-10-14 16:47:00,101004,1.389,True,Client Services


# DF 2.7 (section 5.81):
## the between() method:

- called directly on a series, generates a brand new boolean series
- finds values that fall between a range (times, dates, etc)

In [50]:
df = pd.read_csv("employees.csv", parse_dates = ["Start Date", "Last Login Time"])
df["Senior Management"] = df["Senior Management"].astype("bool")
df["Gender"] = df["Gender"].astype("category")
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2021-10-14 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2021-10-14 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2021-10-14 11:17:00,130590,11.858,False,Finance


In [52]:
mask = df["Salary"].between(60000, 70000)
# takes two arguements, first is the lower bound, second is the upper bound
# both values are inclusive (includes exactly 60000 and 70000 in this example)
df[mask].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2021-10-14 06:53:00,61933,4.17,True,
6,Ruby,Female,1987-08-17,2021-10-14 16:20:00,65476,10.012,True,Product
10,Louise,Female,1980-08-12,2021-10-14 09:01:00,63241,15.132,True,
20,Lois,,1995-04-22,2021-10-14 19:18:00,64714,4.934,True,Legal
41,Christine,,2015-06-28,2021-10-14 01:08:00,66582,11.308,True,Business Development


In [54]:
df[df["Bonus %"].between(2.0, 5.0)]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2021-10-14 06:53:00,61933,4.170,True,
20,Lois,,1995-04-22,2021-10-14 19:18:00,64714,4.934,True,Legal
40,Michael,Male,2008-10-10,2021-10-14 11:25:00,99283,2.665,True,Distribution
49,Chris,,1980-01-24,2021-10-14 12:13:00,113590,3.055,False,Sales
60,Paula,,2005-11-23,2021-10-14 14:01:00,48866,4.271,False,Distribution
...,...,...,...,...,...,...,...,...
943,Wayne,Male,2006-09-08,2021-10-14 11:09:00,67471,2.728,False,Engineering
961,Antonio,,1989-06-18,2021-10-14 21:37:00,103050,3.050,False,Legal
976,Denise,Female,1992-10-19,2021-10-14 05:42:00,137954,4.195,True,Legal
989,Justin,,1991-02-10,2021-10-14 16:58:00,38344,3.794,False,Legal


In [56]:
df[df["Start Date"].between("1991-01-01", "1992-01-01")].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
27,Scott,,1991-07-11,2021-10-14 18:58:00,122367,5.218,False,Legal
75,Bonnie,Female,1991-07-02,2021-10-14 01:27:00,104897,5.118,True,Human Resources
88,Donna,Female,1991-11-27,2021-10-14 13:59:00,64088,6.155,True,Legal
116,,Male,1991-06-22,2021-10-14 20:58:00,76189,18.988,True,Legal
148,Patrick,,1991-07-14,2021-10-14 02:24:00,124488,14.837,True,Sales


In [60]:
df[df["Last Login Time"].between("08:30AM", "12:00PM")]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2021-10-14 11:17:00,130590,11.858,False,Finance
7,,Female,2015-07-20,2021-10-14 10:43:00,45906,11.598,True,Finance
10,Louise,Female,1980-08-12,2021-10-14 09:01:00,63241,15.132,True,
18,Diana,Female,1981-10-23,2021-10-14 10:27:00,132940,19.082,False,Client Services
33,Jean,Female,1993-12-18,2021-10-14 09:07:00,119082,16.180,False,Business Development
...,...,...,...,...,...,...,...,...
963,Ann,Female,1994-09-23,2021-10-14 11:15:00,89443,17.940,True,Sales
977,Sarah,Female,1995-12-04,2021-10-14 09:16:00,124566,5.949,False,Product
982,Rose,Female,1982-04-06,2021-10-14 10:43:00,91411,8.639,True,Human Resources
988,Alice,Female,2004-10-05,2021-10-14 09:34:00,47638,11.209,False,Human Resources


# DF 2.8 (section 5.82):
## the .duplicated() method:

- allows us to extract duplacted rows from a df

In [62]:
df = pd.read_csv("employees.csv", parse_dates = ["Start Date", "Last Login Time"])
df["Senior Management"] = df["Senior Management"].astype("bool")
df["Gender"] = df["Gender"].astype("category")
df.sort_values("First Name", inplace = True)
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2021-10-14 10:20:00,61602,11.849,True,Marketing
327,Aaron,Male,1994-01-29,2021-10-14 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2021-10-14 14:53:00,52119,11.343,True,Client Services


In [68]:
df[df["First Name"].duplicated()].head()
# by defualt, marks the first occurance of each duplicate as a non-duplicate

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
327,Aaron,Male,1994-01-29,2021-10-14 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2021-10-14 14:53:00,52119,11.343,True,Client Services
937,Aaron,,1986-01-22,2021-10-14 19:39:00,63126,18.424,False,Client Services
141,Adam,Male,1990-12-24,2021-10-14 20:57:00,110194,14.727,True,Product
302,Adam,Male,2007-07-05,2021-10-14 11:59:00,71276,5.027,True,Human Resources


In [67]:
df[df["First Name"].duplicated(keep = "last")].head()
# marks the last instance of each duplicate as unique

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2021-10-14 10:20:00,61602,11.849,True,Marketing
327,Aaron,Male,1994-01-29,2021-10-14 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2021-10-14 14:53:00,52119,11.343,True,Client Services
137,Adam,Male,2011-05-21,2021-10-14 01:45:00,95327,15.12,False,Distribution
141,Adam,Male,1990-12-24,2021-10-14 20:57:00,110194,14.727,True,Product


In [71]:
df[df["First Name"].duplicated(keep = False)].head()
# marks as duplicated if something occurs more than once

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2021-10-14 10:20:00,61602,11.849,True,Marketing
327,Aaron,Male,1994-01-29,2021-10-14 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2021-10-14 14:53:00,52119,11.343,True,Client Services
937,Aaron,,1986-01-22,2021-10-14 19:39:00,63126,18.424,False,Client Services
137,Adam,Male,2011-05-21,2021-10-14 01:45:00,95327,15.12,False,Distribution


In [75]:
df[~df["First Name"].duplicated(keep = False)].head()
# ~ converts all trues to falses, allows you to remove all duplicates

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
8,Angela,Female,2005-11-22,2021-10-14 06:29:00,95570,18.523,True,Engineering
688,Brian,Male,2007-04-07,2021-10-14 22:47:00,93901,17.821,True,Legal
190,Carol,Female,1996-03-19,2021-10-14 03:39:00,57783,9.129,False,Finance
887,David,Male,2009-12-05,2021-10-14 08:48:00,92242,15.407,False,Legal
5,Dennis,Male,1987-04-18,2021-10-14 01:35:00,115163,10.125,False,Legal


# DF 2.9 (section 5.83):
## the .drp_duplicates() method:

- can be called on a df, allows us to do some filtering operations in less lines of code

In [76]:
df = pd.read_csv("employees.csv", parse_dates = ["Start Date", "Last Login Time"])
df["Senior Management"] = df["Senior Management"].astype("bool")
df["Gender"] = df["Gender"].astype("category")
df.sort_values("First Name", inplace = True)
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2021-10-14 10:20:00,61602,11.849,True,Marketing
327,Aaron,Male,1994-01-29,2021-10-14 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2021-10-14 14:53:00,52119,11.343,True,Client Services


In [77]:
len(df)

1000

In [80]:
len(df.drop_duplicates())
# without more info given to the method, its only going to remove rows that are entirely identical to another row

1000

In [83]:
df.drop_duplicates(subset = ["First Name"], keep = "last")
# drops duplicate names, but keeps the last instance of each

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
937,Aaron,,1986-01-22,2021-10-14 19:39:00,63126,18.424,False,Client Services
538,Adam,Male,2010-10-08,2021-10-14 21:53:00,45181,3.491,False,Human Resources
610,Alan,Male,2012-02-17,2021-10-14 00:26:00,41453,10.084,False,Product
959,Albert,Male,1992-09-19,2021-10-14 02:35:00,45094,5.850,True,Business Development
693,Alice,Female,1995-10-16,2021-10-14 21:19:00,92799,2.782,False,Sales
...,...,...,...,...,...,...,...,...
512,Wanda,Female,1993-04-06,2021-10-14 03:11:00,78883,19.695,False,
637,Wayne,Male,2009-09-02,2021-10-14 01:37:00,126956,18.396,False,Human Resources
127,William,Male,2002-09-29,2021-10-14 16:09:00,66521,5.830,False,Human Resources
652,Willie,Male,2009-12-05,2021-10-14 05:39:00,141932,1.017,True,Engineering


In [85]:
df.drop_duplicates(subset = ["First Name"], keep = False)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
8,Angela,Female,2005-11-22,2021-10-14 06:29:00,95570,18.523,True,Engineering
688,Brian,Male,2007-04-07,2021-10-14 22:47:00,93901,17.821,True,Legal
190,Carol,Female,1996-03-19,2021-10-14 03:39:00,57783,9.129,False,Finance
887,David,Male,2009-12-05,2021-10-14 08:48:00,92242,15.407,False,Legal
5,Dennis,Male,1987-04-18,2021-10-14 01:35:00,115163,10.125,False,Legal
495,Eugene,Male,1984-05-24,2021-10-14 10:54:00,81077,2.117,False,Sales
33,Jean,Female,1993-12-18,2021-10-14 09:07:00,119082,16.18,False,Business Development
832,Keith,Male,2003-02-12,2021-10-14 15:02:00,120672,19.467,False,Legal
291,Tammy,Female,1984-11-11,2021-10-14 10:30:00,132839,17.463,True,Client Services


In [86]:
df.drop_duplicates(["Team"], keep = False)
# there are no teams that only occur once, so it clears the whole df

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team


In [91]:
df.drop_duplicates(["First Name", "Team"], keep = "first", inplace = True)
df.head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2021-10-14 10:20:00,61602,11.849,True,Marketing
440,Aaron,Male,1990-07-22,2021-10-14 14:53:00,52119,11.343,True,Client Services
137,Adam,Male,2011-05-21,2021-10-14 01:45:00,95327,15.12,False,Distribution
141,Adam,Male,1990-12-24,2021-10-14 20:57:00,110194,14.727,True,Product
302,Adam,Male,2007-07-05,2021-10-14 11:59:00,71276,5.027,True,Human Resources


# DF 2.10 (section 5.84):
## the .unique() and .nunique() methods:

In [92]:
df = pd.read_csv("employees.csv", parse_dates = ["Start Date", "Last Login Time"])
df["Senior Management"] = df["Senior Management"].astype("bool")
df["Gender"] = df["Gender"].astype("category")
9df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2021-10-14 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2021-10-14 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2021-10-14 11:17:00,130590,11.858,False,Finance


In [93]:
df["Gender"].unique()

['Male', 'Female', NaN]
Categories (2, object): ['Female', 'Male']

In [95]:
df["Team"].unique()

11

In [97]:
df["Team"].nunique()
# gives number of unique values, doesn't include null values by default

10

In [99]:
df["Team"].nunique(dropna = False)

11