In [3]:
import pandas as pd
import numpy as np

In [14]:
df = pd.read_csv("test.csv", sep = ';', index_col = 0)

In [81]:
df.head(10)

Unnamed: 0,state,color,age,height,score
Jane,NY,blue,19,180,460
Niko,TX,green,20,190,890
Aaron,NL,brown,34,183,340
Penelope,AS,black,21,164,290


In [16]:
index = df.index
columns = df.columns
values = df.values

In [17]:
index

Index(['Jane', 'Niko', 'Aaron', 'Penelope'], dtype='object')

In [18]:
columns

Index(['state ', 'color', 'age', 'height', 'score'], dtype='object')

In [19]:
values

array([['NY', 'blue', 19, 180, '4,60'],
       ['TX', 'green', 20, 190, '8,90'],
       ['NL', 'brown', 34, 183, '3,40'],
       ['AS', 'black', 21, 164, '2,90']], dtype=object)

In [20]:
type(index)

pandas.core.indexes.base.Index

In [21]:
type(columns)

pandas.core.indexes.base.Index

In [22]:
type(values) # n-dimensional array

numpy.ndarray

### Indexing operator

#### Select one column

In [28]:
color = df['color']

In [31]:
type(color) # pandas series (one-dimensional sequence of labeled data)

pandas.core.series.Series

In [32]:
color # just plain text

Jane         blue
Niko        green
Aaron       brown
Penelope    black
Name: color, dtype: object

Two main components of pandas series:

1. index
2. data 
(no columns)



#### Select multiple columns

In [37]:
df[['color']] # use a list to pass required columns

Unnamed: 0,color
Jane,blue
Niko,green
Aaron,brown
Penelope,black


In [33]:
subset = df[['color', 'age']] # pass a list to select multiple rows

In [35]:
type(subset) # subselection of multiple columns results in a dataframe

pandas.core.frame.DataFrame

In [36]:
subset

Unnamed: 0,color,age
Jane,blue,19
Niko,green,20
Aaron,brown,34
Penelope,black,21


The order in which you select the columns doesn't matter. The data frame will be adjusted according your selection.

### Getting started with .loc

Can select columns and rows from the data. Selection occurs by using the labels.

#### Select a single row

In [41]:
jane = df.loc['Jane'] # same as: df.loc['Jane', ]

In [42]:
type(jane)

pandas.core.series.Series

In [43]:
jane

state       NY
color     blue
age         19
height     180
score     4,60
Name: Jane, dtype: object

Results in a series where the old column names are now the indexes of the data.

#### Select multiple rows

In [46]:
multiple = df.loc[['Jane', 'Niko']]

In [47]:
type(multiple)

pandas.core.frame.DataFrame

In [48]:
multiple

Unnamed: 0,state,color,age,height,score
Jane,NY,blue,19,180,460
Niko,TX,green,20,190,890


#### Slice notation to select a range of rows

In [54]:
subset = df.loc['Niko': 'Penelope']

In [50]:
type(subset)

pandas.core.frame.DataFrame

In [51]:
subset

Unnamed: 0,state,color,age,height,score
Niko,TX,green,20,190,890
Aaron,NL,brown,34,183,340
Penelope,AS,black,21,164,290


Just define start and stop row index.

#### Other slices

In [55]:
df.loc[:'Penelope']

Unnamed: 0,state,color,age,height,score
Jane,NY,blue,19,180,460
Niko,TX,green,20,190,890
Aaron,NL,brown,34,183,340
Penelope,AS,black,21,164,290


In [57]:
df.loc['Jane':'Penelope':2] # just select every second index.

Unnamed: 0,state,color,age,height,score
Jane,NY,blue,19,180,460
Aaron,NL,brown,34,183,340


In [59]:
df.loc['Aaron':]

Unnamed: 0,state,color,age,height,score
Aaron,NL,brown,34,183,340
Penelope,AS,black,21,164,290


#### Select rows and columns simultaneously

df.loc[row_selector, column_selector]

In [60]:
df.loc[['Jane', 'Aaron'], ['age', 'score', 'height']]

Unnamed: 0,age,score,height
Jane,19,460,180
Aaron,34,340,183


Selections can be made by:

1. just a single label
2. a list of labels
3. a slice with labels

#### Select a scalar value

In [61]:
df.loc['Jane', 'height']

180

#### Select all rows and just some columns

In [64]:
df.loc[:, ['height', 'age']]

Unnamed: 0,height,age
Jane,180,19
Niko,190,20
Aaron,183,34
Penelope,164,21


#### Select all columns and just some rows

In [66]:
df.loc[['Jane', 'Penelope']]

Unnamed: 0,state,color,age,height,score
Jane,NY,blue,19,180,460
Penelope,AS,black,21,164,290


Same as:

In [67]:
df.loc[['Jane', 'Penelope'], :]

Unnamed: 0,state,color,age,height,score
Jane,NY,blue,19,180,460
Penelope,AS,black,21,164,290


Or:

In [71]:
df.loc[['Jane', 'Penelope'],]

Unnamed: 0,state,color,age,height,score
Jane,NY,blue,19,180,460
Penelope,AS,black,21,164,290


Assign variables to row_selector and column_selctor:

In [79]:
rows = ['Aaron', 'Jane', 'Niko']
columns = ['color', 'age']

In [80]:
df.loc[rows, columns]

Unnamed: 0,color,age
Aaron,brown,34
Jane,blue,19
Niko,green,20


## Getting started with .iloc

iloc = index locator: <br>
.iloc exclusively works with indexes instead of labels. <br>
<br>
df.iloc[row_selector, column_selector]

In [85]:
df.iloc[3]

state        AS
color     black
age          21
height      164
score      2,90
Name: Penelope, dtype: object

In [86]:
df.iloc[:, 3]

Jane        180
Niko        190
Aaron       183
Penelope    164
Name: height, dtype: int64

#### Select muliple rows

In [90]:
df.iloc[[0, 2, 3]]

Unnamed: 0,state,color,age,height,score
Jane,NY,blue,19,180,460
Aaron,NL,brown,34,183,340
Penelope,AS,black,21,164,290


#### Use slicing

In [91]:
df.iloc[0:3]

Unnamed: 0,state,color,age,height,score
Jane,NY,blue,19,180,460
Niko,TX,green,20,190,890
Aaron,NL,brown,34,183,340


.iloc works pretty much the same ways .loc.

### Selecting subsets of series

In [93]:
age = df['age']
age

Jane        19
Niko        20
Aaron       34
Penelope    21
Name: age, dtype: int64

In [94]:
age.loc['Niko']

20

In [95]:
age.iloc[3]

21

In [96]:
age.loc[['Niko', 'Aaron']]

Niko     20
Aaron    34
Name: age, dtype: int64

In [97]:
age.iloc[0:3]

Jane     19
Niko     20
Aaron    34
Name: age, dtype: int64