In [1]:
import pandas as pd

In [3]:
# Toy data set stored column-wise: every key is a column name and every list
# holds that column's values, one entry per person.
people = dict(
    first=['Corey', 'Jane', 'John'],
    last=['Schafer', 'Doe', 'Doe'],
    email=['Coreyschafer@gmail.com', 'JaneDoe@email.com', 'JohnDoe@email.com'],
)

In [4]:
# Build the example DataFrame from the dict: each key becomes a column and
# each list supplies that column's values.
ex = pd.DataFrame(data=people)
ex

Unnamed: 0,first,last,email
0,Corey,Schafer,Coreyschafer@gmail.com
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


In [5]:
# Suppose we want to fetch everyone whose last name is Doe.
# Comparing the column to a value does not return the matching rows; it returns
# a boolean Series (True/False per row) that can be used as a filter.

ex['last'] == 'Doe'           # boolean Series: True where the last name is 'Doe'

0    False
1     True
2     True
Name: last, dtype: bool

In [6]:
# Store the boolean mask in a variable so it can be reused.
filt = (ex['last'] == 'Doe')           # True for rows whose last name is 'Doe'

ex[filt]                               # indexing the DataFrame with the mask keeps only the True rows

Unnamed: 0,first,last,email
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


In [7]:
# The same boolean mask can also be passed to .loc; it selects the same rows
# as ex[filt].

ex.loc[filt]

Unnamed: 0,first,last,email
1,Jane,Doe,JaneDoe@email.com
2,John,Doe,JohnDoe@email.com


In [8]:
# .loc also accepts a column label after the row filter, so we can fetch a
# specific column for just the matching rows.

ex.loc[filt,'email']                      # emails of the rows matching the last-name filter

1    JaneDoe@email.com
2    JohnDoe@email.com
Name: email, dtype: object

#### 

#### Combining conditions with the `&` (and) and `|` (or) operators

In [13]:
# AND: `&` combines the two boolean masks element-wise, so a row is kept only
# when both conditions are True. Each comparison is wrapped in parentheses
# because `&` binds more tightly than `==` in Python.
filt = (ex['last'] == 'Doe') & (ex['first'] == 'John')

ex.loc[filt, 'email']          # email of the row where last is 'Doe' and first is 'John'

2    JohnDoe@email.com
Name: email, dtype: object

In [14]:
# OR: `|` keeps a row when at least one of the two conditions is True.
filt = (ex['last'] == 'Schafer') | (ex['first'] == 'John')

ex.loc[filt, 'email']          # emails of rows where last is 'Schafer' or first is 'John'

0    Coreyschafer@gmail.com
2         JohnDoe@email.com
Name: email, dtype: object

In [15]:
ex.loc[~filt, 'email']          # `~` inverts the boolean mask: selects every row the current `filt` excludes

1    JaneDoe@email.com
Name: email, dtype: object

# 

# Checking conditions on a real dataset

In [16]:
# Load the survey responses and the accompanying schema CSV. Both paths are
# relative, so the files must sit in a data/ folder under the notebook's
# working directory.
df = pd.read_csv('data/survey_results_public.csv')
# NOTE(review): presumably describes the survey columns — it is not used in the cells shown here.
schema_df = pd.read_csv('data/survey_results_schema.csv')

In [17]:
# Raise pandas' display limits so that up to 85 columns and 85 rows are
# rendered before the output is truncated.
pd.options.display.max_columns = 85
pd.options.display.max_rows = 85

#### 

#### Getting data for people earning a salary over a certain amount

In [18]:
# Boolean mask: True for respondents whose ConvertedComp is greater than 70000.
# Rows with a missing ConvertedComp compare as False and are therefore excluded.

high_salary = df['ConvertedComp'].gt(70000)

In [19]:
# Show only the columns of interest for the rows selected by `high_salary`.
df.loc[
    high_salary,
    ['Country', 'LanguageWorkedWith', 'ConvertedComp'],
]

Unnamed: 0,Country,LanguageWorkedWith,ConvertedComp
5,Canada,Java;R;SQL,366420.0
8,New Zealand,Bash/Shell/PowerShell;C#;HTML/CSS;JavaScript;P...,95179.0
12,United States,Bash/Shell/PowerShell;HTML/CSS;JavaScript;PHP;...,90000.0
15,United Kingdom,Bash/Shell/PowerShell;C#;HTML/CSS;JavaScript;T...,455352.0
21,United States,Bash/Shell/PowerShell;C++;HTML/CSS;JavaScript;...,103000.0
...,...,...,...
88323,United States,Bash/Shell/PowerShell;C#;HTML/CSS;Java;Python;...,180000.0
88324,United States,Bash/Shell/PowerShell;C;Clojure;HTML/CSS;Java;...,2000000.0
88325,United States,HTML/CSS;JavaScript;Scala;TypeScript,130000.0
88326,Finland,Bash/Shell/PowerShell;C++;Python,82488.0


#### 

#### Getting data for people living in a selected set of countries

In [20]:
# Chaining one `==` comparison per country with `|` quickly becomes unwieldy,
# so list the countries once and test membership with .isin() instead.

countries = [
    'United States',
    'India',
    'United Kingdom',
    'Germany',
    'Canada',
]

# True for every row whose Country is one of the values listed above.
filt = df['Country'].isin(values=countries)

In [22]:
# Show the Country column for just the rows selected by `filt`.
df.loc[filt, 'Country']

0        United Kingdom
3         United States
5                Canada
7                 India
9                 India
              ...      
88859     United States
88863    United Kingdom
88864             India
88877     United States
88878            Canada
Name: Country, Length: 45008, dtype: object

#### 

#### Selecting people who said they work with Python

In [24]:
df['LanguageWorkedWith']              # each entry is one string of languages separated by ';' (or NaN when missing)

0                          HTML/CSS;Java;JavaScript;Python
1                                      C++;HTML/CSS;Python
2                                                 HTML/CSS
3                                      C;C++;C#;Python;SQL
4              C++;HTML/CSS;Java;JavaScript;Python;SQL;VBA
                               ...                        
88878                        HTML/CSS;JavaScript;Other(s):
88879                                                  NaN
88880                                                  NaN
88881                                                  NaN
88882    Bash/Shell/PowerShell;HTML/CSS;Java;JavaScript...
Name: LanguageWorkedWith, Length: 88883, dtype: object

In [26]:
# Each LanguageWorkedWith entry is a single ';'-separated string, so test every
# string for the literal substring 'Python'.
#   regex=False -> treat 'Python' as plain text; by default str.contains
#                  interprets its pattern as a regular expression.
#   na=False    -> rows whose value is missing (NaN) get False in the mask, so
#                  the mask is purely boolean and can be passed to .loc.
filt = df['LanguageWorkedWith'].str.contains('Python', regex=False, na=False)


df.loc[filt, 'LanguageWorkedWith']

0                          HTML/CSS;Java;JavaScript;Python
1                                      C++;HTML/CSS;Python
3                                      C;C++;C#;Python;SQL
4              C++;HTML/CSS;Java;JavaScript;Python;SQL;VBA
7        Bash/Shell/PowerShell;C;C++;HTML/CSS;Java;Java...
                               ...                        
88854    Bash/Shell/PowerShell;C;C++;HTML/CSS;Java;Java...
88860      Bash/Shell/PowerShell;C++;Python;Ruby;Other(s):
88865      Bash/Shell/PowerShell;HTML/CSS;Python;Other(s):
88872             C;C++;HTML/CSS;JavaScript;PHP;Python;SQL
88876                           HTML/CSS;JavaScript;Python
Name: LanguageWorkedWith, Length: 36443, dtype: object