# Selecting, slicing, conditional indexing, loc & iloc

### Initialization

In [1]:
import pandas as pd


# Create/load the datastructures
synth_ser = pd.Series([49, 34, 12])
df = pd.read_csv('data/titanic.csv')
synth_df = pd.DataFrame({'Yes': [1, 5], 'No': [3, 8]}, index=["First", "Second"])

### Selection & Slicing

In [13]:
# Select rows with indexes
synth_ser[1]        # Second element of a Series (they behave similarly to lists)
df.iloc[0]          # First row
df.iloc[[0, 2, 4]]  # Multiple rows
df.iloc[:5, :3]     # Multiple rows and columns in a range

Unnamed: 0,PassengerId,Survived,Pclass
0,1,0,3
1,2,1,1
2,3,1,3
3,4,1,1
4,5,0,3


In [4]:
# Select columns (throws a Series object or a reduced DataFrame)
df['Name']                              # Single column (Series)
df[['Name', 'Sex', 'Survived']].head()  # Multiple columns, in their own order (DataFrame) 

Unnamed: 0,Name,Sex,Survived
0,"Braund, Mr. Owen Harris",male,0
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1
2,"Heikkinen, Miss. Laina",female,1
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,1
4,"Allen, Mr. William Henry",male,0


In [5]:
# Select columns and especific rows
df.loc[[0, 2, 4], ['Name', 'Survived']]

Unnamed: 0,Name,Survived
0,"Braund, Mr. Owen Harris",0
2,"Heikkinen, Miss. Laina",1
4,"Allen, Mr. William Henry",0


### Loc vs. Iloc
[Loc vs. Iloc link](https://www.analyticsvidhya.com/blog/2020/02/loc-iloc-pandas/)<br>
[Difference with Python's native indexing](https://www.kaggle.com/residentmario/indexing-selecting-assigning)

In [6]:
# These are equivalent
synth_df.loc[['First'], ['No']]  # Can use labels (loc is label based for both rows and columns)
synth_df.iloc[[0], [1]]          # Must use numerical indexes (iloc is numerically indexed based for both rows and columns)

Unnamed: 0,No
First,3


### Conditional slicing

In [7]:
# Boolean Series and DataFrames (masks)
df.Age>30  # Returns a Series of boolean values (a mask)
df!=0      # Returns a DataFrame of boolean values

# To deny/invert the boolean values in this structure you need "~"
~(df.Age>30)  # Remember: with Pandas we DON'T use "!" or "not", we use "~"
~(df!=0)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,False,True,False,False,False,False,False,True,False,False,False,False
1,False,False,False,False,False,False,False,True,False,False,False,False
2,False,False,False,False,False,False,True,True,False,False,False,False
3,False,False,False,False,False,False,False,True,False,False,False,False
4,False,True,False,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
886,False,True,False,False,False,False,True,True,False,False,False,False
887,False,False,False,False,False,False,True,True,False,False,False,False
888,False,True,False,False,False,False,False,False,False,False,False,False
889,False,False,False,False,False,False,True,True,False,False,False,False


Note: Only through loc and native Python we can do filtering with boolean vectors (masks), therefore, only loc can do conditional slicing NOT iloc.

In [8]:
# Select all rows with ages older than 30 by using a mask as index
df[df.Age > 30]      # Abbreviated syntax
df.loc[df.Age > 30]  # They both do the exact same

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
11,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.5500,C103,S
...,...,...,...,...,...,...,...,...,...,...,...,...
873,874,0,3,"Vander Cruyssen, Mr. Victor",male,47.0,0,0,345765,9.0000,,S
879,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C50,C
881,882,0,3,"Markun, Mr. Johann",male,33.0,0,0,349257,7.8958,,S
885,886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.1250,,Q


In [9]:
# Select the names with ages older than 30 
df.loc[df['Age']>30, ['Name']]

Unnamed: 0,Name
1,"Cumings, Mrs. John Bradley (Florence Briggs Th..."
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)"
4,"Allen, Mr. William Henry"
6,"McCarthy, Mr. Timothy J"
11,"Bonnell, Miss. Elizabeth"
...,...
873,"Vander Cruyssen, Mr. Victor"
879,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)"
881,"Markun, Mr. Johann"
885,"Rice, Mrs. William (Margaret Norton)"


In [10]:
# Modify multiple values of rows that meet a certain condition
df_dummy = df.copy()
df_dummy.loc[df['Age']>30, ['Name', 'Age']] = ['New Name', '100']
df_dummy

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,New Name,female,100,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,New Name,female,100,1,0,113803,53.1000,C123,S
4,5,0,3,New Name,male,100,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [13]:
# Select rows that comply with multiple conditions
df.loc[(df['Age']>30) & (df['Age']<40), ['Name', 'Sex', 'Age']].head(5)

Unnamed: 0,Name,Sex,Age
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0
4,"Allen, Mr. William Henry",male,35.0
13,"Andersson, Mr. Anders Johan",male,39.0
18,"Vander Planke, Mrs. Julius (Emelia Maria Vande...",female,31.0


In [11]:
# Another (cleaner) way to do relatively the same thing
cols = ['Name', 'Sex', 'Age']  # The columns
mask1 = df['Age'] > 20         # Condition 1 (throws a boolean vector, also known as "mask")
mask2 = df['Age'] < 30         # Condition 2 
mask3 = df['Sex'] == "female"  # Condition 3
final_mask = mask1 & mask2 & mask3

df.loc[final_mask, cols].head()

Unnamed: 0,Name,Sex,Age
2,"Heikkinen, Miss. Laina",female,26.0
8,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0
41,"Turpin, Mrs. William John Robert (Dorothy Ann ...",female,27.0
53,"Faunthorpe, Mrs. Lizzie (Elizabeth Anne Wilkin...",female,29.0
56,"Rugg, Miss. Emily",female,21.0


### Built-in conditional selectors (isin, isnull, notnull & all)

In [12]:
# Display only the rows that have 30, 31, 32, 33 as Age value
df.loc[df.Age.isin([30, 31, 32, 33])].head()

# Display only the rows that have no Cabin info
df.loc[df.Cabin.isnull()].head()

# Display only the rows that have Cabin info
df.loc[df.Cabin.notnull()].head()

# All returns a Series (mask) al all the row|columns where all the values are True
rows_without0_mask = (df!=0).all(axis='columns')  # Can also abbreviate to "(df!=0).all(1)"
df[rows_without0_mask]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
10,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.00,1,1,PP 9549,16.7000,G6,S
25,26,1,3,"Asplund, Mrs. Carl Oscar (Selma Augusta Emilia...",female,38.00,1,5,347077,31.3875,,S
43,44,1,2,"Laroche, Miss. Simonne Marie Anne Andree",female,3.00,1,2,SC/Paris 2123,41.5792,,C
58,59,1,2,"West, Miss. Constance Mirium",female,5.00,1,2,C.A. 34651,27.7500,,S
65,66,1,3,"Moubarek, Master. Gerios",male,,1,1,2661,15.2458,,C
...,...,...,...,...,...,...,...,...,...,...,...,...
831,832,1,2,"Richards, Master. George Sibley",male,0.83,1,1,29106,18.7500,,S
835,836,1,1,"Compton, Miss. Sara Rebecca",female,39.00,1,1,PC 17756,83.1583,E49,C
856,857,1,1,"Wick, Mrs. George Dennick (Mary Hitchcock)",female,45.00,1,1,36928,164.8667,,S
869,870,1,3,"Johnson, Master. Harold Theodor",male,4.00,1,1,347742,11.1333,,S
