# Filtering and Conditional Selection
- filter rows based on conditions such as high salary or city
- filter based on multiple conditions using AND, OR
- part of:
  - Data cleaning
  - Exploratory analysis
  - Preprocessing for ML pipelines

In [1]:
import pandas as pd

# Toy employee records shared by every filtering example in this notebook.
# Columns are declared one per variable, then assembled into the mapping.
names = ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Frank']
ages = [25, 30, 35, 40, 29, 32]
cities = ['Pune', 'Delhi', 'Mumbai', 'Delhi', 'Pune', 'Delhi']
salaries = [50000, 60000, 70000, 80000, 55000, 62000]

data = {'Name': names, 'Age': ages, 'City': cities, 'Salary': salaries}

# Each key becomes a column; rows receive the default 0..5 integer index.
df = pd.DataFrame(data)
df  # final expression of the cell, so the notebook renders the table

Unnamed: 0,Name,Age,City,Salary
0,Alice,25,Pune,50000
1,Bob,30,Delhi,60000
2,Charlie,35,Mumbai,70000
3,David,40,Delhi,80000
4,Eva,29,Pune,55000
5,Frank,32,Delhi,62000


### Basic Filtering with Boolean Indexing

In [2]:
# Comparing a column with a scalar yields a boolean Series:
# one True/False per row.
older_than_30 = df['Age'] > 30

# Indexing the DataFrame with that mask keeps only the rows marked True.
df[older_than_30]

Unnamed: 0,Name,Age,City,Salary
2,Charlie,35,Mumbai,70000
3,David,40,Delhi,80000
5,Frank,32,Delhi,62000


In [3]:
# Equality against a string builds the row mask; the mask then selects rows.
in_delhi = df['City'] == 'Delhi'
df[in_delhi]

Unnamed: 0,Name,Age,City,Salary
1,Bob,30,Delhi,60000
3,David,40,Delhi,80000
5,Frank,32,Delhi,62000


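### Combining Conditions
- use `&` (and), `|` (or), `~` (not) to combine boolean masks element-wise
- wrap each condition in `()` because `&` and `|` bind more tightly than comparisons such as `==` and `>`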

In [4]:
# Element-wise AND: a row is kept only when both conditions hold.
# The parentheses matter: & binds more tightly than == and >.
delhi_and_high_paid = (df['City'] == 'Delhi') & (df['Salary'] > 60000)
df[delhi_and_high_paid]

Unnamed: 0,Name,Age,City,Salary
3,David,40,Delhi,80000
5,Frank,32,Delhi,62000


In [5]:
# Element-wise OR: a row is kept when at least one condition holds.
# The parentheses matter: | binds more tightly than == and >.
pune_or_top_paid = (df['City'] == 'Pune') | (df['Salary'] > 70000)
df[pune_or_top_paid]

Unnamed: 0,Name,Age,City,Salary
0,Alice,25,Pune,50000
3,David,40,Delhi,80000
4,Eva,29,Pune,55000


In [6]:
# ~ inverts the boolean mask, so every row whose City is NOT 'Delhi' is kept.
not_in_delhi = ~(df['City'] == 'Delhi')
df[not_in_delhi]

Unnamed: 0,Name,Age,City,Salary
0,Alice,25,Pune,50000
2,Charlie,35,Mumbai,70000
4,Eva,29,Pune,55000


### Filtering with Helper Methods (`isin`, `between`, `query`, `.str`, null checks)

In [None]:
# Membership test: keep rows whose City is any one of the listed values.
wanted_cities = ['Pune', 'Mumbai']
city_is_wanted = df['City'].isin(wanted_cities)
df[city_is_wanted]

Unnamed: 0,Name,Age,City,Salary
0,Alice,25,Pune,50000
2,Charlie,35,Mumbai,70000
4,Eva,29,Pune,55000


In [None]:
# Range test: both endpoints are included, so ages 30 and 35 both match.
aged_30_to_35 = df['Age'].between(30, 35)
df[aged_30_to_35]

Unnamed: 0,Name,Age,City,Salary
1,Bob,30,Delhi,60000
2,Charlie,35,Mumbai,70000
5,Frank,32,Delhi,62000


In [None]:
# query() takes the filter as a SQL-like expression string in which columns
# are referenced by bare name; it can be faster in some workflows.
delhi_high_salary = "City == 'Delhi' and Salary > 60000"
df.query(delhi_high_salary)

Unnamed: 0,Name,Age,City,Salary
3,David,40,Delhi,80000
5,Frank,32,Delhi,62000


In [10]:
# A tuple of prefixes matches names that start with any one of them.
initials = ('A', 'D')
starts_with_initial = df['Name'].str.startswith(initials)
df[starts_with_initial]

Unnamed: 0,Name,Age,City,Salary
0,Alice,25,Pune,50000
3,David,40,Delhi,80000


In [11]:
# case=False makes the match case-insensitive, so 'Alice' matches on its
# capital 'A' as well as names containing a lowercase 'a'.
has_letter_a = df['Name'].str.contains('a', case=False)
df[has_letter_a]

Unnamed: 0,Name,Age,City,Salary
0,Alice,25,Pune,50000
2,Charlie,35,Mumbai,70000
3,David,40,Delhi,80000
4,Eva,29,Pune,55000
5,Frank,32,Delhi,62000


In [None]:
# A notebook cell renders only its final expression, so the first result is
# printed explicitly; as a bare expression it would be silently discarded.
has_salary = df['Salary'].notnull()
print(df[has_salary])        # rows where Salary is present

# Final expression: rows where Salary is missing (none in this sample data).
df[df['Salary'].isnull()]

Unnamed: 0,Name,Age,City,Salary


| Task                   | Method                                 |
| ---------------------- | -------------------------------------- |
| Basic filter           | `df[df['col'] > value]`                |
| Multiple conditions    | `&`, `\|`, `~` with parentheses        |
| Membership check       | `.isin([...])`                         |
| Range filtering        | `.between(start, end)`                 |
| SQL-like filtering     | `.query('expression')`                 |
| String-based filtering | `.str.contains()`, `.str.startswith()` |
| Filter NaNs            | `.isnull()`, `.notnull()`              |
