In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("employees.csv")
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,8/6/1993,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,3/31/1996,6:53 AM,61933,4.17,True,
2,Maria,Female,4/23/1993,11:17 AM,130590,11.858,False,Finance


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
First Name           933 non-null object
Gender               855 non-null object
Start Date           1000 non-null object
Last Login Time      1000 non-null object
Salary               1000 non-null int64
Bonus %              1000 non-null float64
Senior Management    933 non-null object
Team                 957 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 62.6+ KB


In [4]:
#Let's optimise the storage
df["Start Date"] = pd.to_datetime(df["Start Date"])
df["Last Login Time"] = pd.to_datetime(df["Last Login Time"])
df["Gender"] = df["Gender"].astype("category")
df["Senior Management"] = df["Senior Management"].astype("bool")

In [5]:
#We Reduced the space from 63K to 49K
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
First Name           933 non-null object
Gender               855 non-null category
Start Date           1000 non-null datetime64[ns]
Last Login Time      1000 non-null datetime64[ns]
Salary               1000 non-null int64
Bonus %              1000 non-null float64
Senior Management    1000 non-null bool
Team                 957 non-null object
dtypes: bool(1), category(1), datetime64[ns](2), float64(1), int64(1), object(2)
memory usage: 49.0+ KB


In [6]:
#Shorter way to parse columns as dates
df = pd.read_csv("employees.csv", parse_dates=["Start Date", "Last Login Time"])
df["Gender"] = df["Gender"].astype("category")
df["Senior Management"] = df["Senior Management"].astype("bool")
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2019-11-03 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2019-11-03 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2019-11-03 11:17:00,130590,11.858,False,Finance


## Filter data based on condition

In [7]:
#Let's filter data where gender = Male
mask = df["Gender"] == "Male"
mask

0       True
1       True
2      False
3       True
4       True
5       True
6      False
7      False
8      False
9      False
10     False
11     False
12      True
13      True
14     False
15     False
16      True
17      True
18     False
19     False
20     False
21      True
22     False
23      True
24      True
25      True
26      True
27     False
28      True
29      True
       ...  
970    False
971     True
972    False
973     True
974     True
975    False
976    False
977    False
978     True
979     True
980    False
981     True
982    False
983     True
984    False
985    False
986    False
987    False
988    False
989    False
990    False
991    False
992     True
993    False
994     True
995    False
996     True
997     True
998     True
999     True
Name: Gender, Length: 1000, dtype: bool

In [8]:
df[mask].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2019-11-03 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2019-11-03 06:53:00,61933,4.17,True,
3,Jerry,Male,2005-03-04,2019-11-03 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2019-11-03 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2019-11-03 01:35:00,115163,10.125,False,Legal


In [9]:
#Let's filter data where Team is not 'Marketing'
mask = df["Team"] != "Marketing"
df[mask].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2019-11-03 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2019-11-03 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2019-11-03 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2019-11-03 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2019-11-03 01:35:00,115163,10.125,False,Legal


In [10]:
#Let's filter data where Start Date is greater than 2015-07-14
mask = df["Start Date"] > "2015-07-14"
df[mask].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
7,,Female,2015-07-20,2019-11-03 10:43:00,45906,11.598,True,Finance
15,Lillian,Female,2016-06-05,2019-11-03 06:09:00,59414,1.256,False,Product
39,,Male,2016-01-29,2019-11-03 02:33:00,122173,7.797,True,Client Services
89,Janice,Female,2016-03-12,2019-11-03 00:40:00,51082,11.955,False,Legal
98,Tina,Female,2016-06-16,2019-11-03 19:47:00,100705,16.961,True,Marketing


In [11]:
mask = df["Senior Management"] #Since this column is already boolean
df[mask].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2019-11-03 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2019-11-03 06:53:00,61933,4.17,True,
3,Jerry,Male,2005-03-04,2019-11-03 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2019-11-03 16:47:00,101004,1.389,True,Client Services
6,Ruby,Female,1987-08-17,2019-11-03 16:20:00,65476,10.012,True,Product


## Filter data based on multiple conditions (AND - &, OR - |)

In [12]:
#Let's filter all the Males having Salary greater than 10,000
mask1 = df["Gender"] == "Male"
mask2 = df["Salary"] > 10000
df[mask1 & mask2].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2019-11-03 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2019-11-03 06:53:00,61933,4.17,True,
3,Jerry,Male,2005-03-04,2019-11-03 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2019-11-03 16:47:00,101004,1.389,True,Client Services
5,Dennis,Male,1987-04-18,2019-11-03 01:35:00,115163,10.125,False,Legal


In [13]:
#Let's filter all the employess who either are Senior Management or are a part of Marketing team
mask1 = df["Senior Management"]
mask2 = df["Team"] == "Marketing"

df[mask1 | mask2].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2019-11-03 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2019-11-03 06:53:00,61933,4.17,True,
3,Jerry,Male,2005-03-04,2019-11-03 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2019-11-03 16:47:00,101004,1.389,True,Client Services
6,Ruby,Female,1987-08-17,2019-11-03 16:20:00,65476,10.012,True,Product


In [14]:
#Let's filter data where Males having salary > 1000000 or anyone having Bonus % > 19.8
mask1 = df["Gender"] == "Male"
mask2 = df["Salary"] > 1000000
mask3 = df["Bonus %"] > 19.8

df[(mask1 & mask2) | mask3]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
150,Sean,Male,1996-05-04,2019-11-03 20:59:00,135490,19.934,False,Marketing
335,Robert,Male,2014-11-18,2019-11-03 05:00:00,85799,19.93,False,Finance
348,Philip,Male,1989-08-02,2019-11-03 11:21:00,129968,19.897,False,Finance
387,Robert,Male,1994-10-29,2019-11-03 04:26:00,123294,19.894,False,Client Services
488,Robert,Male,2007-03-11,2019-11-03 11:20:00,135882,19.944,False,Legal
797,Rose,Female,1982-12-31,2019-11-03 18:50:00,145001,19.85,False,Engineering
830,Michael,Male,2002-08-31,2019-11-03 01:20:00,81206,19.908,True,Distribution


## .isin() Method
- use this method to compare multiple values in the same Series

In [15]:
#Suppose that we want data where 'Marketing' and 'Finance' teams are involved. Simple approach would be to do
#the following
mask1 = df["Team"] == "Marketing" 
mask2 =  df["Team"] == "Finance"
df[mask1 | mask2].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2019-11-03 12:42:00,97308,6.945,True,Marketing
2,Maria,Female,1993-04-23,2019-11-03 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2019-11-03 13:00:00,138705,9.34,True,Finance
7,,Female,2015-07-20,2019-11-03 10:43:00,45906,11.598,True,Finance
14,Kimberly,Female,1999-01-14,2019-11-03 07:13:00,41426,14.543,True,Finance


In [16]:
#Let's make it easier
mask = df["Team"].isin(["Marketing", "Finance", "Legal"]) 
df[mask].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2019-11-03 12:42:00,97308,6.945,True,Marketing
2,Maria,Female,1993-04-23,2019-11-03 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2019-11-03 13:00:00,138705,9.34,True,Finance
5,Dennis,Male,1987-04-18,2019-11-03 01:35:00,115163,10.125,False,Legal
7,,Female,2015-07-20,2019-11-03 10:43:00,45906,11.598,True,Finance


In [17]:
mask1 = df["Team"].isin(["Marketing", "Finance", "Legal"]) 
mask2 = df["First Name"].isin(["Douglas", "Maria", "James"])
df[mask1 & mask2].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2019-11-03 12:42:00,97308,6.945,True,Marketing
2,Maria,Female,1993-04-23,2019-11-03 11:17:00,130590,11.858,False,Finance
159,James,,1983-11-22,2019-11-03 22:52:00,68501,14.316,False,Marketing
667,Douglas,,2009-02-04,2019-11-03 02:03:00,104496,14.771,True,Marketing
668,James,Male,2001-12-05,2019-11-03 07:06:00,67789,17.105,True,Legal


In [18]:
#Where People are not in Marketing, Finance, Legal
mask1 = df["Team"].isin(["Marketing", "Finance", "Legal"]) 
mask2 = df["First Name"].isin(["Douglas", "Maria", "James"])
df[~mask1 & mask2].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
91,James,,2005-01-26,2019-11-03 23:00:00,128771,8.309,False,
198,Maria,Female,1990-12-27,2019-11-03 21:57:00,36067,9.64,True,Product
217,Douglas,Male,1999-09-03,2019-11-03 16:00:00,83341,1.015,True,Client Services
301,James,Male,2015-07-11,2019-11-03 20:52:00,72257,13.023,False,Engineering
322,Douglas,Male,2002-01-08,2019-11-03 18:42:00,41428,14.372,False,Product


## .isnull() and .notnull() Methods

In [6]:
#Let's filter data where people don't have their Genders defined
mask = df["Gender"].isnull()
len(df[mask])
df[mask].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
20,Lois,,4/22/1995,7:18 PM,64714,4.934,True,Legal
22,Joshua,,3/8/2012,1:58 AM,90816,18.816,True,Client Services
27,Scott,,7/11/1991,6:58 PM,122367,5.218,False,Legal
31,Joyce,,2/20/2005,2:40 PM,88657,12.752,False,Product
41,Christine,,6/28/2015,1:08 AM,66582,11.308,True,Business Development


In [20]:
#Let's filter data for people having valid Gender
mask = df["Gender"].notnull()
len(df[mask])

855

## .between() Method
- by default values are inclusive

In [21]:
#Let's find the employees having salary between 50000 and 70000
mask = df["Salary"].between(50000, 70000)
df[mask].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2019-11-03 06:53:00,61933,4.17,True,
6,Ruby,Female,1987-08-17,2019-11-03 16:20:00,65476,10.012,True,Product
10,Louise,Female,1980-08-12,2019-11-03 09:01:00,63241,15.132,True,
15,Lillian,Female,2016-06-05,2019-11-03 06:09:00,59414,1.256,False,Product
20,Lois,,1995-04-22,2019-11-03 19:18:00,64714,4.934,True,Legal


In [22]:
#Let's find employees who joined between 1996-01-01 and 1996-02-01
df[ df["Start Date"].between("1996-01-01", "1996-02-01") ]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
572,Raymond,Male,1996-01-13,2019-11-03 16:14:00,114244,16.69,False,Human Resources
916,Marilyn,Female,1996-01-16,2019-11-03 07:18:00,118369,7.696,True,Business Development


In [23]:
#Let's find the employees who logged in between 8:00 Am and 12:00 Pm
df[ df["Last Login Time"].between("8:00AM", "12:00PM") ].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2019-11-03 11:17:00,130590,11.858,False,Finance
7,,Female,2015-07-20,2019-11-03 10:43:00,45906,11.598,True,Finance
10,Louise,Female,1980-08-12,2019-11-03 09:01:00,63241,15.132,True,
18,Diana,Female,1981-10-23,2019-11-03 10:27:00,132940,19.082,False,Client Services
33,Jean,Female,1993-12-18,2019-11-03 09:07:00,119082,16.18,False,Business Development


In [24]:
df[ df["First Name"].between("James", "Maria") ].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2019-11-03 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2019-11-03 13:00:00,138705,9.34,True,Finance
4,Larry,Male,1998-01-24,2019-11-03 16:47:00,101004,1.389,True,Client Services
10,Louise,Female,1980-08-12,2019-11-03 09:01:00,63241,15.132,True,
11,Julie,Female,1997-10-26,2019-11-03 15:19:00,102508,12.637,True,Legal


## .duplicated() method
- depending on the argument 'keep', this function marks, 'first', 'last', or all entries as duplicated
- keep = 'first' will mark first entry as non-duplicated, rest as duplicated
- keep = 'last' will mark last entry as non-duplicated, rest as duplicated
- keep = False will mark all as duplicated

In [11]:
df.sort_values(by = "First Name", inplace = True)

In [26]:
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2019-11-03 10:20:00,61602,11.849,True,Marketing
327,Aaron,Male,1994-01-29,2019-11-03 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2019-11-03 14:53:00,52119,11.343,True,Client Services


In [27]:
df["First Name"].duplicated().head(8)

101    False
327     True
440     True
937     True
137    False
141     True
302     True
538     True
Name: First Name, dtype: bool

In [28]:
df["First Name"].duplicated(keep = 'last').head(8)

101     True
327     True
440     True
937    False
137     True
141     True
302     True
538    False
Name: First Name, dtype: bool

In [29]:
df["First Name"].duplicated(keep = False).head(8)

101    True
327    True
440    True
937    True
137    True
141    True
302    True
538    True
Name: First Name, dtype: bool

In [30]:
#Extract all employees with duplicated names
df[df["First Name"].duplicated(keep = False)].head(10)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2019-11-03 10:20:00,61602,11.849,True,Marketing
327,Aaron,Male,1994-01-29,2019-11-03 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2019-11-03 14:53:00,52119,11.343,True,Client Services
937,Aaron,,1986-01-22,2019-11-03 19:39:00,63126,18.424,False,Client Services
137,Adam,Male,2011-05-21,2019-11-03 01:45:00,95327,15.12,False,Distribution
141,Adam,Male,1990-12-24,2019-11-03 20:57:00,110194,14.727,True,Product
302,Adam,Male,2007-07-05,2019-11-03 11:59:00,71276,5.027,True,Human Resources
538,Adam,Male,2010-10-08,2019-11-03 21:53:00,45181,3.491,False,Human Resources
300,Alan,Male,1988-06-26,2019-11-03 03:54:00,111786,3.592,True,Engineering
53,Alan,,2014-03-03,2019-11-03 13:28:00,40341,17.578,True,Finance


In [31]:
#Filter all unique employees (use of negation)
df[~ df["First Name"].duplicated(keep = False)]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
8,Angela,Female,2005-11-22,2019-11-03 06:29:00,95570,18.523,True,Engineering
688,Brian,Male,2007-04-07,2019-11-03 22:47:00,93901,17.821,True,Legal
190,Carol,Female,1996-03-19,2019-11-03 03:39:00,57783,9.129,False,Finance
887,David,Male,2009-12-05,2019-11-03 08:48:00,92242,15.407,False,Legal
5,Dennis,Male,1987-04-18,2019-11-03 01:35:00,115163,10.125,False,Legal
495,Eugene,Male,1984-05-24,2019-11-03 10:54:00,81077,2.117,False,Sales
33,Jean,Female,1993-12-18,2019-11-03 09:07:00,119082,16.18,False,Business Development
832,Keith,Male,2003-02-12,2019-11-03 15:02:00,120672,19.467,False,Legal
291,Tammy,Female,1984-11-11,2019-11-03 10:30:00,132839,17.463,True,Client Services


In [14]:
#Filter all the first non duplicate employees
df[~df["First Name"].duplicated(keep = 'first')]

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2/17/2012,10:20 AM,61602,11.849,True,Marketing
137,Adam,Male,5/21/2011,1:45 AM,95327,15.120,False,Distribution
300,Alan,Male,6/26/1988,3:54 AM,111786,3.592,True,Engineering
372,Albert,Male,2/1/1997,4:20 PM,67827,19.717,True,Engineering
988,Alice,Female,10/5/2004,9:34 AM,47638,11.209,False,Human Resources
639,Amanda,,8/11/1991,2:15 PM,46665,19.391,True,Client Services
879,Amy,Female,5/20/2009,6:26 AM,75415,19.132,False,Client Services
430,Andrea,Female,10/1/2010,11:54 AM,79123,19.422,False,Distribution
564,Andrew,Male,3/29/1985,6:57 PM,43414,7.563,True,Client Services
8,Angela,Female,11/22/2005,6:29 AM,95570,18.523,True,Engineering


In [32]:
df.drop_duplicates(subset = ["First Name"], keep = False)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
8,Angela,Female,2005-11-22,2019-11-03 06:29:00,95570,18.523,True,Engineering
688,Brian,Male,2007-04-07,2019-11-03 22:47:00,93901,17.821,True,Legal
190,Carol,Female,1996-03-19,2019-11-03 03:39:00,57783,9.129,False,Finance
887,David,Male,2009-12-05,2019-11-03 08:48:00,92242,15.407,False,Legal
5,Dennis,Male,1987-04-18,2019-11-03 01:35:00,115163,10.125,False,Legal
495,Eugene,Male,1984-05-24,2019-11-03 10:54:00,81077,2.117,False,Sales
33,Jean,Female,1993-12-18,2019-11-03 09:07:00,119082,16.18,False,Business Development
832,Keith,Male,2003-02-12,2019-11-03 15:02:00,120672,19.467,False,Legal
291,Tammy,Female,1984-11-11,2019-11-03 10:30:00,132839,17.463,True,Client Services


In [34]:
df["Gender"].unique()

2