## This Module's Dataset + Memory Optimization

In [38]:
## Focus on filtering datasets in this module.
## Load the raw employees CSV and preview its first three rows.
import pandas as pd
df = pd.read_csv('employees.csv')
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,8/6/1993,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,3/31/1996,6:53 AM,61933,4.17,True,
2,Maria,Female,4/23/1993,11:17 AM,130590,11.858,False,Finance


In [39]:
## Reload the raw CSV and convert the columns to better-suited dtypes:
## the two date/time columns become datetimes, Senior Management becomes bool,
## and Gender becomes a category.
df = pd.read_csv('employees.csv')
df = df.assign(**{
    'Start Date': pd.to_datetime(df['Start Date']),
    'Last Login Time': pd.to_datetime(df['Last Login Time']),
}).astype({'Senior Management': 'bool', 'Gender': 'category'})
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2019-07-23 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2019-07-23 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2019-07-23 11:17:00,130590,11.858,False,Finance


In [92]:
df.info()
## Issues in the raw employees dataset (the recorded info() output already shows the converted dtypes):
## 1. Start Date and Last Login Time are read in as strings, but datetime columns are more flexible.
## 2. Gender and Senior Management are both read in as strings, but Gender can be a category and Senior Management can be boolean.


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
First Name           933 non-null object
Gender               855 non-null category
Start Date           1000 non-null datetime64[ns]
Last Login Time      1000 non-null datetime64[ns]
Salary               1000 non-null int64
Bonus %              1000 non-null float64
Senior Management    1000 non-null bool
Team                 957 non-null object
dtypes: bool(1), category(1), datetime64[ns](2), float64(1), int64(1), object(2)
memory usage: 49.0+ KB


In [6]:
## pd.to_datetime() turns a column of strings into datetimes, which support date-aware operations.
df['Start Date'] = df['Start Date'].pipe(pd.to_datetime)
df['Last Login Time'] = df['Last Login Time'].pipe(pd.to_datetime)
## Last Login Time has no date part in the file, so the parsed values carry the date of the day the cell is run.
df.head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2019-07-19 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2019-07-19 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2019-07-19 11:17:00,130590,11.858,False,Finance


In [8]:
## Cast Senior Management to bool and Gender to category, then inspect the
## resulting dtypes and memory usage. (A mid-cell df.head(3) was removed: only
## the last expression of a cell is displayed, so it had no effect.)
## NOTE(review): astype('bool') turns missing values into True - confirm that
## is acceptable for Senior Management before relying on the converted column.
df['Senior Management'] = df['Senior Management'].astype('bool')
df['Gender'] = df['Gender'].astype('category')
df.info()  ## compare dtypes and memory usage after the casts

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
First Name           933 non-null object
Gender               855 non-null category
Start Date           1000 non-null datetime64[ns]
Last Login Time      1000 non-null datetime64[ns]
Salary               1000 non-null int64
Bonus %              1000 non-null float64
Senior Management    1000 non-null bool
Team                 957 non-null object
dtypes: bool(1), category(1), datetime64[ns](2), float64(1), int64(1), object(2)
memory usage: 49.0+ KB


In [40]:
## Alternative to pd.to_datetime(): convert the date columns while loading.
## parse_dates takes a list of column names that read_csv should parse as datetimes.
df = pd.read_csv(
    'employees.csv',
    parse_dates=['Start Date', 'Last Login Time'],
).astype({'Senior Management': 'bool', 'Gender': 'category'})

## Filter a DataFrame Based on A Condition

In [14]:
## Keep only the rows whose Gender equals 'Female' (boolean mask passed to .loc).
df.loc[df['Gender'] == 'Female'].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2019-07-19 11:17:00,130590,11.858,False,Finance
6,Ruby,Female,1987-08-17,2019-07-19 16:20:00,65476,10.012,True,Product
7,,Female,2015-07-20,2019-07-19 10:43:00,45906,11.598,True,Finance


In [24]:
## Rows on the Finance team; a missing Team compares as False, so those rows are excluded.
df.loc[df['Team'] == 'Finance'].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2019-07-19 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2019-07-19 13:00:00,138705,9.34,True,Finance
7,,Female,2015-07-20,2019-07-19 10:43:00,45906,11.598,True,Finance


In [29]:
## Two equivalent filters on a boolean column (only the last expression is displayed):
df.loc[df['Senior Management'] == 1].head(3)  ## explicit comparison; True == 1
df.loc[df['Senior Management']].head(3)  ## the bool column is already a valid mask

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2019-07-19 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2019-07-19 06:53:00,61933,4.17,True,
3,Jerry,Male,2005-03-04,2019-07-19 13:00:00,138705,9.34,True,Finance


In [31]:
## Negative condition: keep every row whose Team is not 'Marketing'.
## Rows with a missing Team satisfy != and are therefore kept.
df.loc[df['Team'] != 'Marketing'].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2019-07-19 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2019-07-19 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2019-07-19 13:00:00,138705,9.34,True,Finance


In [34]:
## Greater-than / less-than comparisons build masks too; the same operators work on datetimes.
## Only the last expression of the cell is displayed.
df.loc[df['Salary'] >= 110000].head(3)
df.loc[df['Bonus %'] < 1.5].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
4,Larry,Male,1998-01-24,2019-07-19 16:47:00,101004,1.389,True,Client Services
15,Lillian,Female,2016-06-05,2019-07-19 06:09:00,59414,1.256,False,Product
58,Theresa,Female,2010-04-11,2019-07-19 07:18:00,72670,1.481,True,Engineering


In [35]:
## Comparison against a date string on a datetime column:
## keep employees whose Start Date is on or before 1985-01-01.
df.loc[df['Start Date'] <= '1985-01-01'].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
10,Louise,Female,1980-08-12,2019-07-19 09:01:00,63241,15.132,True,
12,Brandon,Male,1980-12-01,2019-07-19 01:08:00,112807,17.492,True,Human Resources
18,Diana,Female,1981-10-23,2019-07-19 10:27:00,132940,19.082,False,Client Services


## Filter with More than One Condition (AND - &)

In [52]:
## AND: store each condition in its own variable, then combine the masks with &.
mask1 = df['Gender'] == 'Male'
mask2 = df['Team'] == 'Marketing'
df.loc[mask1 & mask2].head(3)
## Inline equivalent; each comparison needs its own parentheses because & binds tighter than ==.
df.loc[(df['Gender'] == 'Male') & (df['Team'] == 'Marketing')].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2019-07-23 12:42:00,97308,6.945,True,Marketing
21,Matthew,Male,1995-09-05,2019-07-23 02:12:00,100612,13.645,False,Marketing
26,Craig,Male,2000-02-27,2019-07-23 07:45:00,37598,7.757,True,Marketing


## Filter with More than One Condition (OR - |)

In [53]:
## OR: a row is kept when at least one of the two masks is True.
mask1 = df['Senior Management']
mask2 = df['Start Date'] < '1990-01-01'
df.loc[mask1 | mask2]  ## not displayed; only the cell's last expression is rendered
df.loc[(df['Senior Management']) | (df['Start Date'] < '1990-01-01')].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2019-07-23 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2019-07-23 06:53:00,61933,4.17,True,
3,Jerry,Male,2005-03-04,2019-07-23 13:00:00,138705,9.34,True,Finance


In [64]:
## Combine AND and OR: (First Name is Robert AND Team is Client Services) OR Start Date after 2016-06-01.
mask1 = df['First Name'] == 'Robert'
mask2 = df['Team'] == 'Client Services'
mask3 = df['Start Date'] > '2016-06-01'
df[(mask1 & mask2) | mask3].head(3)
## The unparenthesized form below is valid as well: in Python & binds tighter than |,
## so mask1 & mask2 | mask3 is evaluated as (mask1 & mask2) | mask3.
df[mask1 & mask2 | mask3]   ## same filter as above, shown without .head()
## Row filtering with df[...] here is driven by a boolean Series (one True/False per row).

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
15,Lillian,Female,2016-06-05,2019-07-23 06:09:00,59414,1.256,False,Product
98,Tina,Female,2016-06-16,2019-07-23 19:47:00,100705,16.961,True,Marketing
387,Robert,Male,1994-10-29,2019-07-23 04:26:00,123294,19.894,False,Client Services
451,Terry,,2016-07-15,2019-07-23 00:29:00,140002,19.49,True,Marketing


## The .isin() Method

In [83]:
## .isin() tests every value of a Series for membership in a collection of candidates.
df.loc[df['Team'].isin(['Legal', 'Sales', 'Product'])].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2019-07-23 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2019-07-23 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2019-07-23 11:17:00,130590,11.858,False,Finance


In [69]:
## Verbose equivalent of .isin(): one mask per value, combined with |.
mask1 = df['Team'] == 'Legal'
mask2 = df['Team'] == 'Sales'
mask3 = df['Team'] == 'Product'
df.loc[mask1 | mask2 | mask3].head(3)
## .isin() is shorter, and it also accepts a Series as the collection to compare against.

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
5,Dennis,Male,1987-04-18,2019-07-23 01:35:00,115163,10.125,False,Legal
6,Ruby,Female,1987-08-17,2019-07-23 16:20:00,65476,10.012,True,Product
11,Julie,Female,1997-10-26,2019-07-23 15:19:00,102508,12.637,True,Legal


## The .isnull() and .notnull() Methods

In [87]:
## Generate boolean Series with pandas methods.
## .isnull() returns True where a value in the Series is missing (NULL), otherwise False.
## Invert a boolean mask with ~ (boolean NOT); NumPy rejects unary minus on boolean arrays.
df[~df['Team'].isnull()]  ## rows that have a Team; not displayed (not the cell's last expression)
df[df['Team'].isnull()].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2019-07-23 06:53:00,61933,4.17,True,
10,Louise,Female,1980-08-12,2019-07-23 09:01:00,63241,15.132,True,
23,,Male,2012-06-14,2019-07-23 16:19:00,125792,5.042,True,


In [86]:
## .notnull() is the inverse of .isnull(), so inverting .isnull() with ~ (boolean NOT)
## selects exactly the same rows as calling .notnull() directly.
df[~df['Team'].isnull()].head(3)
df[df['Team'].notnull()].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2019-07-23 12:42:00,97308,6.945,True,Marketing
2,Maria,Female,1993-04-23,2019-07-23 11:17:00,130590,11.858,False,Finance
3,Jerry,Male,2005-03-04,2019-07-23 13:00:00,138705,9.34,True,Finance


In [88]:
## Keep the rows where Gender is present.
df.loc[df['Gender'].notnull()].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,1993-08-06,2019-07-23 12:42:00,97308,6.945,True,Marketing
1,Thomas,Male,1996-03-31,2019-07-23 06:53:00,61933,4.17,True,
2,Maria,Female,1993-04-23,2019-07-23 11:17:00,130590,11.858,False,Finance


## The .between() Method

In [96]:
## Series.between(lower_bound, upper_bound) marks the values that fall inside the given range.
df.loc[df['Salary'].between(60000, 70000)].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2019-07-23 06:53:00,61933,4.17,True,
6,Ruby,Female,1987-08-17,2019-07-23 16:20:00,65476,10.012,True,Product
10,Louise,Female,1980-08-12,2019-07-23 09:01:00,63241,15.132,True,


In [99]:
## The same range filter on a float column: Bonus % between 2 and 5.
df.loc[df['Bonus %'].between(2, 5)].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
1,Thomas,Male,1996-03-31,2019-07-23 06:53:00,61933,4.17,True,
20,Lois,,1995-04-22,2019-07-23 19:18:00,64714,4.934,True,Legal
40,Michael,Male,2008-10-10,2019-07-23 11:25:00,99283,2.665,True,Distribution


In [100]:
## .between() also works on datetime columns, which is one payoff of parse_dates / pd.to_datetime().
df.loc[df['Start Date'].between('1991-01-01', '1992-01-01')].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
27,Scott,,1991-07-11,2019-07-23 18:58:00,122367,5.218,False,Legal
75,Bonnie,Female,1991-07-02,2019-07-23 01:27:00,104897,5.118,True,Human Resources
88,Donna,Female,1991-11-27,2019-07-23 13:59:00,64088,6.155,True,Legal


In [105]:
## Last Login Time was parsed from time-only strings, so its date part is simply the
## day the notebook was run. Filtering against a hard-coded date such as 2019-07-23
## matches nothing on any other day, so compare the time of day instead
## (offset from midnight): logins between 08:30 AM and 12:00 PM, bounds included.
df[
    (df['Last Login Time'] - df['Last Login Time'].dt.normalize())
    .between(pd.Timedelta(hours=8, minutes=30), pd.Timedelta(hours=12))
].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
2,Maria,Female,1993-04-23,2019-07-23 11:17:00,130590,11.858,False,Finance
7,,Female,2015-07-20,2019-07-23 10:43:00,45906,11.598,True,Finance
10,Louise,Female,1980-08-12,2019-07-23 09:01:00,63241,15.132,True,


## The .duplicated() Method

In [110]:
## .duplicated() returns a boolean Series marking values that repeat an earlier one;
## passing it to df[...] extracts the duplicate rows and leaves out the rest.
## Sort by First Name so repeated names sit next to each other. Re-assigning df
## (instead of inplace=True) keeps the sort an explicit, visible rebinding.
df = df.sort_values('First Name')
df.head(4)  ## We have duplicates in First Names

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2019-07-23 10:20:00,61602,11.849,True,Marketing
327,Aaron,Male,1994-01-29,2019-07-23 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2019-07-23 14:53:00,52119,11.343,True,Client Services
937,Aaron,,1986-01-22,2019-07-23 19:39:00,63126,18.424,False,Client Services


In [109]:
## keep= argument: the default 'first' treats the very first occurrence of each
## value as a non-duplicate, so only the later occurrences are returned.
df.loc[df['First Name'].duplicated()].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
327,Aaron,Male,1994-01-29,2019-07-23 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2019-07-23 14:53:00,52119,11.343,True,Client Services
937,Aaron,,1986-01-22,2019-07-23 19:39:00,63126,18.424,False,Client Services


In [111]:
## keep='last' treats the very last occurrence of each value as the non-duplicate:
## that last row gets False in the mask and is left out of the result.
df.loc[df['First Name'].duplicated(keep='last')].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2019-07-23 10:20:00,61602,11.849,True,Marketing
327,Aaron,Male,1994-01-29,2019-07-23 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2019-07-23 14:53:00,52119,11.343,True,Client Services


In [114]:
## keep=False marks every row whose value occurs more than once, so all
## occurrences of a repeated First Name are returned.
df.loc[df['First Name'].duplicated(keep=False)].head()  ## These are all the duplicates

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2019-07-23 10:20:00,61602,11.849,True,Marketing
327,Aaron,Male,1994-01-29,2019-07-23 18:48:00,58755,5.097,True,Marketing
440,Aaron,Male,1990-07-22,2019-07-23 14:53:00,52119,11.343,True,Client Services
937,Aaron,,1986-01-22,2019-07-23 19:39:00,63126,18.424,False,Client Services
137,Adam,Male,2011-05-21,2019-07-23 01:45:00,95327,15.12,False,Distribution


In [118]:
## Invert the keep=False mask to get the First Names that never appear twice.
## NOTE(review): unary minus inverted the boolean Series when this output was produced,
## but ~ is the usual operator for this; NumPy rejects unary minus on boolean arrays.
df[-df['First Name'].duplicated(keep = False)]  ## These are unique First Names that never appear twice
## These are all unique observations

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
8,Angela,Female,2005-11-22,2019-07-23 06:29:00,95570,18.523,True,Engineering
688,Brian,Male,2007-04-07,2019-07-23 22:47:00,93901,17.821,True,Legal
190,Carol,Female,1996-03-19,2019-07-23 03:39:00,57783,9.129,False,Finance
887,David,Male,2009-12-05,2019-07-23 08:48:00,92242,15.407,False,Legal
5,Dennis,Male,1987-04-18,2019-07-23 01:35:00,115163,10.125,False,Legal
495,Eugene,Male,1984-05-24,2019-07-23 10:54:00,81077,2.117,False,Sales
33,Jean,Female,1993-12-18,2019-07-23 09:07:00,119082,16.18,False,Business Development
832,Keith,Male,2003-02-12,2019-07-23 15:02:00,120672,19.467,False,Legal
291,Tammy,Female,1984-11-11,2019-07-23 10:30:00,132839,17.463,True,Client Services


In [120]:
## Inverting a boolean Series with the tilde (~) operator:
## keeps the rows whose First Name occurs exactly once.
df.loc[~df['First Name'].duplicated(keep=False)].head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
8,Angela,Female,2005-11-22,2019-07-23 06:29:00,95570,18.523,True,Engineering
688,Brian,Male,2007-04-07,2019-07-23 22:47:00,93901,17.821,True,Legal
190,Carol,Female,1996-03-19,2019-07-23 03:39:00,57783,9.129,False,Finance


In [127]:
## DataFrame.duplicated(subset=...) looks for duplicates on a combination of columns:
## here rows that repeat an earlier (First Name, Team) pair.
df.loc[df.duplicated(subset=['First Name', 'Team'])].head()

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
327,Aaron,Male,1994-01-29,2019-07-23 18:48:00,58755,5.097,True,Marketing
937,Aaron,,1986-01-22,2019-07-23 19:39:00,63126,18.424,False,Client Services
538,Adam,Male,2010-10-08,2019-07-23 21:53:00,45181,3.491,False,Human Resources
680,Albert,Male,1992-06-17,2019-07-23 03:25:00,86818,14.301,True,Engineering
959,Albert,Male,1992-09-19,2019-07-23 02:35:00,45094,5.85,True,Business Development


## The .drop_duplicates() Method

In [129]:
## .drop_duplicates() is called here on the whole DataFrame rather than on a single Series.
## With default settings it only removes rows that are identical across all columns,
## so a value repeated within one column does not make a row a duplicate.
df.shape[0]  ## we have 1000 rows
df.drop_duplicates().shape[0]  ## still 1000 rows: no two rows match on every column

1000

In [135]:
## Unlike .duplicated(), .drop_duplicates() returns the filtered rows directly, not a boolean mask.
## keep='first' (the default) retains the first occurrence of each value even if it has duplicates.
df.drop_duplicates(
    subset=['First Name'],
    keep='first',
).head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2019-07-23 10:20:00,61602,11.849,True,Marketing
137,Adam,Male,2011-05-21,2019-07-23 01:45:00,95327,15.12,False,Distribution
300,Alan,Male,1988-06-26,2019-07-23 03:54:00,111786,3.592,True,Engineering


In [134]:
## keep=False drops every row whose First Name occurs more than once,
## leaving only the names that appear exactly once.
df.drop_duplicates(subset='First Name', keep=False).head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
8,Angela,Female,2005-11-22,2019-07-23 06:29:00,95570,18.523,True,Engineering
688,Brian,Male,2007-04-07,2019-07-23 22:47:00,93901,17.821,True,Legal
190,Carol,Female,1996-03-19,2019-07-23 03:39:00,57783,9.129,False,Finance


In [137]:
## .drop_duplicates() can return an empty DataFrame when no value in the column is unique.
## NULL values are treated like any other value: if NULL appears more than once, those rows are dropped too.
df.drop_duplicates(subset='Team', keep=False)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team


In [141]:
## De-duplicate on the (First Name, Team) pair, keeping the first row of each pair.
## The result is a new DataFrame; passing inplace=True instead would modify df itself.
df.drop_duplicates(
    subset=['First Name', 'Team'],
).head(3)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
101,Aaron,Male,2012-02-17,2019-07-23 10:20:00,61602,11.849,True,Marketing
440,Aaron,Male,1990-07-22,2019-07-23 14:53:00,52119,11.343,True,Client Services
137,Adam,Male,2011-05-21,2019-07-23 01:45:00,95327,15.12,False,Distribution


## The .unique() and .nunique() Methods

In [156]:
## .unique() and .nunique() report the distinct values of a Series.
## Manual equivalent: keep the first occurrence of each Team (not displayed).
df.loc[~df['Team'].duplicated(), 'Team']
## .unique() returns an array of the distinct values present in a single Series.
df['Team'].unique()

array(['Marketing', 'Client Services', 'Distribution', 'Product',
       'Human Resources', 'Engineering', 'Finance',
       'Business Development', 'Sales', nan, 'Legal'], dtype=object)

In [158]:
## Distinct Gender values (not displayed), then how many there are.
df['Gender'].unique()
len(df['Gender'].unique())  ## counts every entry returned by .unique()

3

In [160]:
## len(df['Team'].unique()) gives 11 because the array returned by .unique() includes the NULL entry.
## .nunique() excludes NULL by default, so the two calls below are equivalent and give 10.
df['Team'].nunique()
df['Team'].nunique(dropna = True)

10

In [161]:
## dropna=False counts NULL as a distinct value as well.
df['Team'].nunique(dropna = False)

11