# Selection, Indexing, Filtering

In [5]:
import pandas as pd

In [6]:
# Read the CSV hosted on Google Drive straight into a DataFrame.
csv_url = "https://drive.google.com/uc?id=1oE-3rt17bFW7fOzDIjwFSEMPTIV3NvcO"
df = pd.read_csv(csv_url)

## Select Columns

In [10]:
# A single label inside [] selects one column.
df["grade_language_t1"]
# A list of labels selects several columns; head() keeps the first five rows.
df[["sex", "grade_language_t1"]].head(n=5)

Unnamed: 0,sex,grade_language_t1
0,1,2.369783
1,1,7.206984
2,2,8.057449
3,1,7.388008
4,1,6.773626


###  filter()

In [11]:
# filter() is an alternative way to pick columns, with pattern matching on top.
df.filter(["sex", "grade_language_t1"])  # the first positional argument is `items`
df.filter(items=["sex", "grade"])  # select columns by exact name
df.filter(regex="^grade")  # no axis given: a DataFrame is filtered on its columns by default
df.filter(regex="^grade", axis="columns")  # columns whose name starts with "grade"
df.filter(regex="t2$", axis="columns")  # columns whose name ends with "t2"
df.filter(like="math", axis="columns")  # columns whose name contains "math"

Unnamed: 0,sex,grade_language_t1
0,1,2.369783
1,1,7.206984
2,2,8.057449
3,1,7.388008
4,1,6.773626
...,...,...
1505,1,5.306647
1506,1,8.360153
1507,1,8.534791
1508,1,5.071503


## Index Rows
Accessing specific rows or cells, either by label (`loc[]`, `at[]`) or by integer position (`iloc[]`, `iat[]`).

In [18]:
# loc[] looks rows up by their label.
# Fetch the row labelled 1509, then keep its first two entries.
df.loc[1509].iloc[:2]

school_id    946
grade          8
Name: 1509, dtype: object

In [15]:
# iloc[] looks rows up by their integer position.
# Position 1509 happens to be the same row as label 1509 in this DataFrame.
df.iloc[1509].iloc[:2]

school_id    946
grade          8
Name: 1509, dtype: object

In [21]:
# at[] reads a single cell by row label and column name.
df.at[1509, "school_id"]

np.int64(946)

In [22]:
# iat[] reads a single cell by integer position: row 1509, column 3.
# Column position 3 is student_id, which holds an e-mail-style string in this dataset.
df.iat[1509, 3]

'eohgp@cunb.edu'

### Slicing
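Slicing means selecting a range of rows and/or columns. `loc[]` slices by label and includes the end label; `iloc[]` slices by integer position and excludes the end position.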

In [25]:
# Label-based slice: rows labelled 200 through 204, end label included.
df.loc[200:204]

Unnamed: 0,school_id,grade,class,student_id,sex,nationality,grade_math_t1,grade_language_t1,grade_science_t1,grade_math_t2,grade_language_t2,grade_science_t2,treatment,date_of_birth
200,60,7,A,xxp2d@cunb.edu,1,1,4.011534,4.407988,3.057861,2.799263,3.195717,1.84559,0,1996-03-17
201,60,7,A,dswba@cunb.edu,1,2,6.382753,6.884525,7.557701,4.243814,4.745587,5.418762,0,1995-12-21
202,60,7,A,wpuqt@cunb.edu,2,1,4.866325,5.093832,4.764723,6.053016,6.280523,5.951413,0,1996-02-19
203,60,7,A,qmqno@cunb.edu,1,1,5.845239,5.27685,6.764023,6.122213,5.553825,7.040997,0,1996-03-23
204,60,7,A,2sobz@cunb.edu,1,2,6.338925,7.430347,6.053049,6.307165,7.398587,6.021289,0,1996-04-02


In [27]:
# iloc[] slices by integer position; the stop position is excluded,
# so 0:3 returns the first three rows (positions 0, 1 and 2).
df.iloc[0:3]

Unnamed: 0,school_id,grade,class,student_id,sex,nationality,grade_math_t1,grade_language_t1,grade_science_t1,grade_math_t2,grade_language_t2,grade_science_t2,treatment,date_of_birth
0,57,6,A,wvqgd@cunb.edu,1,1,2.046285,2.369783,2.711344,1.175309,1.498807,1.840368,1,1997-07-27
1,57,6,A,j0ihe@cunb.edu,1,2,7.859077,7.206984,6.994455,7.548812,6.896719,6.68419,1,1997-06-24
2,57,6,A,wcjgk@cunb.edu,2,1,7.118976,8.057449,8.57321,6.181019,7.119492,7.635253,1,1997-04-23


In [28]:
# loc[] with a row slice and a list of column names.
# Label slices include the end label, so :2 returns the rows labelled 0, 1 and 2.
df.loc[:2, ["sex", "school_id"]]

Unnamed: 0,sex,school_id
0,1,57
1,1,57
2,2,57


In [31]:
# All rows; columns from "sex" through "grade_language_t1", both ends included.
df.loc[:, "sex":"grade_language_t1"]

Unnamed: 0,sex,nationality,grade_math_t1,grade_language_t1
0,1,1,2.046285,2.369783
1,1,2,7.859077,7.206984
2,2,1,7.118976,8.057449
3,1,1,6.973737,7.388008
4,1,1,6.574877,6.773626
...,...,...,...,...
1505,1,1,4.938375,5.306647
1506,1,1,7.661931,8.360153
1507,1,1,9.248758,8.534791
1508,1,2,5.689228,5.071503


In [32]:
# Rows labelled 1 through 3 and columns "sex" through "grade_language_t1";
# label slices include both ends on either axis.
df.loc[1:3, "sex":"grade_language_t1"]

Unnamed: 0,sex,nationality,grade_math_t1,grade_language_t1
1,1,2,7.859077,7.206984
2,2,1,7.118976,8.057449
3,1,1,6.973737,7.388008


In [33]:
# Rows labelled 1 through 2 (end label included); the bare ":" keeps every column.
df.loc[1:2, :]

Unnamed: 0,school_id,grade,class,student_id,sex,nationality,grade_math_t1,grade_language_t1,grade_science_t1,grade_math_t2,grade_language_t2,grade_science_t2,treatment,date_of_birth
1,57,6,A,j0ihe@cunb.edu,1,2,7.859077,7.206984,6.994455,7.548812,6.896719,6.68419,1,1997-06-24
2,57,6,A,wcjgk@cunb.edu,2,1,7.118976,8.057449,8.57321,6.181019,7.119492,7.635253,1,1997-04-23


In [34]:
# A negative position counts from the end: -1 is the last column (date_of_birth here).
df.iloc[:, -1]

0       1997-07-27
1       1997-06-24
2       1997-04-23
3       1997-02-24
4       1996-09-05
           ...    
1505    1995-07-04
1506    1995-08-23
1507    1994-12-15
1508    1994-09-18
1509    1994-12-19
Name: date_of_birth, Length: 1510, dtype: object

In [None]:
# Slicing rows and columns by integer position with iloc[]; the stop position is excluded.
df.iloc[0:3, :]  # first three rows, all columns
df.iloc[:, 0:3]  # all rows, first three columns
df.iloc[0:2, 1:3]  # rows at positions 0-1, columns at positions 1-2
df.iloc[:2, :2]  # rows at positions 0-1, columns at positions 0-1

Unnamed: 0,a,b
1,1,2.0
2,4,


## Boolean Indexing

In [25]:
# Single-condition form, kept for reference:
# df[df["school_id"] == 57]

# Combine two boolean masks with &; both conditions must hold.
df[df["grade_math_t1"].gt(9) & df["school_id"].eq(57)]

# Keep rows whose school_id is one of the listed values.
df[df["school_id"].isin([57, 141])]

# date_of_birth is stored as strings (object dtype), so this is a text comparison;
# ISO-formatted dates order the same way as text as they do chronologically.
df[df["date_of_birth"].gt("1995-01-01")]

Unnamed: 0,school_id,grade,class,student_id,sex,nationality,grade_math_t1,grade_language_t1,grade_science_t1,grade_math_t2,grade_language_t2,grade_science_t2,treatment,date_of_birth
0,57,6,A,wvqgd@cunb.edu,1,1,2.046285,2.369783,2.711344,1.175309,1.498807,1.840368,1,1997-07-27
1,57,6,A,j0ihe@cunb.edu,1,2,7.859077,7.206984,6.994455,7.548812,6.896719,6.684190,1,1997-06-24
2,57,6,A,wcjgk@cunb.edu,2,1,7.118976,8.057449,8.573210,6.181019,7.119492,7.635253,1,1997-04-23
3,57,6,A,mzmqb@cunb.edu,1,1,6.973737,7.388008,6.628448,7.593102,8.007373,7.247812,1,1997-02-24
4,57,6,A,s6n0y@cunb.edu,1,1,6.574877,6.773626,7.844316,7.611065,7.809814,8.880504,1,1996-09-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1498,946,8,C,4izl6@cunb.edu,1,2,7.917516,6.101212,7.643088,6.466413,4.650109,6.191984,0,1995-02-10
1499,946,8,D,oqyyk@cunb.edu,1,1,4.465381,3.756297,4.901844,6.359582,5.650498,6.796046,0,1995-03-11
1503,946,8,D,2hk9v@cunb.edu,1,1,6.553834,6.540934,6.760266,4.893724,4.880824,5.100157,0,1995-02-05
1505,946,8,D,jj36e@cunb.edu,1,1,4.938375,5.306647,4.985917,6.162418,6.530690,6.209960,0,1995-07-04


### between() and loc[] with boolean indexing

In [None]:
# between() builds a mask for values from 8.6 to 8.7; both bounds are included by default.
df[df["grade_language_t1"].between(left=8.6, right=8.7)]


# loc[] accepts a boolean mask for the rows and a list of names for the columns.
df.loc[df["grade_language_t1"] > 9, ["school_id", "grade"]]

Unnamed: 0,school_id,grade
9,57,6
18,57,6
19,57,6
31,57,6
39,57,6
...,...,...
1439,946,7
1464,946,8
1471,946,8
1482,946,8


### isin() and query() methods

In [51]:
df.head()
df["grade"].unique()
# Store grade as a categorical column, then inspect the resulting dtypes.
df["grade"] = df["grade"].astype("category")
df.dtypes

school_id               int64
grade                category
class                  object
student_id             object
sex                     int64
nationality             int64
grade_math_t1         float64
grade_language_t1     float64
grade_science_t1      float64
grade_math_t2         float64
grade_language_t2     float64
grade_science_t2      float64
treatment               int64
date_of_birth          object
dtype: object

### isin()

In [58]:
# With a list, isin() tests every cell of the DataFrame for membership in that list.
df.isin([57, 0])

# ~ negates the mask: True where the cell value is NOT in the list.
~df.isin([0, 57])

# With a dict, only the named column is tested against its list of values;
# every other column comes back as False.
df.isin({"grade": [6, 7]})

Unnamed: 0,school_id,grade,class,student_id,sex,nationality,grade_math_t1,grade_language_t1,grade_science_t1,grade_math_t2,grade_language_t2,grade_science_t2,treatment,date_of_birth
0,False,True,False,False,False,False,False,False,False,False,False,False,False,False
1,False,True,False,False,False,False,False,False,False,False,False,False,False,False
2,False,True,False,False,False,False,False,False,False,False,False,False,False,False
3,False,True,False,False,False,False,False,False,False,False,False,False,False,False
4,False,True,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1505,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1506,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1507,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1508,False,False,False,False,False,False,False,False,False,False,False,False,False,False


### query()

In [None]:

# Comparing two columns: boolean-mask form, then the equivalent query() form.
# df[df.grade_math_t1 > df.grade_math_t2]
# df.query("grade_math_t1 > grade_math_t2")


# query() can read more cleanly for compound conditions. `class` is a Python keyword,
# so that column name has to be wrapped in backticks inside the query string.
# Boolean-mask equivalent of the query below:
# df[(df.sex == 1) & (df["class"] == "C") & ((df.grade_math_t1 > 9.9) | (df.grade_math_t2 > 9.9))]
df.query("sex == 1 and `class` == 'C' and (grade_math_t1 > 9.9 or grade_math_t2 > 9.9)")

Unnamed: 0,school_id,grade,class,student_id,sex,nationality,grade_math_t1,grade_language_t1,grade_science_t1,grade_math_t2,grade_language_t2,grade_science_t2,treatment,date_of_birth
31,57,6,C,0eiwm@cunb.edu,1,1,9.737252,10.0,8.927517,10.0,10.0,9.194739,1,1997-03-25
35,57,6,C,wbcfx@cunb.edu,1,2,6.940615,6.494307,6.535804,10.0,9.66428,9.705778,1,1996-10-24
81,57,7,C,ysvls@cunb.edu,1,1,9.604527,8.388494,9.063802,10.0,9.936316,10.0,0,1996-07-29
88,57,7,C,7pojk@cunb.edu,1,1,9.267352,8.130984,8.006577,10.0,9.512149,9.387742,0,1996-02-26
137,57,8,C,yxxfv@cunb.edu,1,1,8.63128,8.390752,8.12487,10.0,10.0,10.0,1,1995-06-03
373,141,7,C,6zo9s@cunb.edu,1,2,9.207145,8.833523,8.292497,10.0,10.0,10.0,1,1996-05-05
383,141,7,C,2hbgs@cunb.edu,1,2,7.348327,6.86235,5.577604,10.0,10.0,9.297259,1,1996-01-14
419,141,8,C,144ga@cunb.edu,1,1,9.237515,10.0,8.414176,10.0,10.0,9.433997,0,1994-11-12
421,141,8,C,5xeof@cunb.edu,1,1,9.135011,8.841103,9.407451,10.0,9.86644,10.0,0,1994-12-29
527,262,7,C,nq45z@cunb.edu,1,1,8.823036,8.724362,8.356688,10.0,10.0,9.864914,1,1996-08-15


### xs() 
Return cross-section from a particular level of a MultiIndex.

In [None]:
# Small example table; "class", "animal" and "locomotion" become index levels below.
d = {
    "num_legs": [4, 4, 2, 2],
    "num_wings": [0, 0, 2, 2],
    "class": ["mammal", "mammal", "mammal", "bird"],
    "animal": ["cat", "dog", "bat", "penguin"],
    "locomotion": ["walks", "walks", "flies", "walks"],
}

# Note: this rebinds df, replacing the DataFrame used in the cells above.
# Build the frame and move three columns into a three-level MultiIndex in one chain.
df = pd.DataFrame(data=d).set_index(["class", "animal", "locomotion"])
df


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,num_legs,num_wings
class,animal,locomotion,Unnamed: 3_level_1,Unnamed: 4_level_1
mammal,cat,walks,4,0
mammal,dog,walks,4,0
mammal,bat,flies,2,2
bird,penguin,walks,2,2


In [None]:
# Without level=, the key is matched against the leading index level(s).
df.xs("mammal")  # first level ("class") only: all mammals
df.xs(("mammal", "dog"))  # first two levels: the dog rows
df.xs(("mammal", "dog", "walks"))  # all three levels: one specific entry

# axis=1 takes the cross-section from the columns instead of the index
df.xs("num_wings", axis=1)  # returns this column


# level= states which index level the key refers to, by position or by name
df.xs("mammal", level=0)
df.xs("dog", level="animal")

# several levels at once: one given by position and one by name
df.xs(("bird", "walks"), level=[0, "locomotion"])

class   animal   locomotion
mammal  cat      walks         0
        dog      walks         0
        bat      flies         2
bird    penguin  walks         2
Name: num_wings, dtype: int64