## The Iris dataset and pandas

In [1]:
# Wes McKinney - 10 minute tour of pandas
# https://www.youtube.com/watch?v=1MGCD8SQp3k

In [2]:
# https://github.com/wesm/pydata-book

In [3]:
# 10 Minutes to pandas
# http://pandas.pydata.org/pandas-docs/stable/10min.html  (older location; the current link is on the next line)
# https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html

## Loading data

In [4]:
# Import Pandas
import pandas as pd

In [5]:
# Load the iris data set straight from a URL.
# The URL is kept in a named constant so the read_csv call stays readable.
IRIS_CSV_URL = "https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7/raw/639388c2cbc2120a14dcf466e85730eb8be498bb/iris.csv"
df = pd.read_csv(IRIS_CSV_URL)

In [6]:
df
# df is a pandas DataFrame; displaying it shows a truncated preview (first and last rows).

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


*** 

## Selecting rows and columns

In [7]:
# Select a single column by its label; the result is a Series.
df['species']

0         setosa
1         setosa
2         setosa
3         setosa
4         setosa
         ...    
145    virginica
146    virginica
147    virginica
148    virginica
149    virginica
Name: species, Length: 150, dtype: object

In [9]:
# Pass a list of column labels to select several columns at once.
df[['petal_length', 'species']]

Unnamed: 0,petal_length,species
0,1.4,setosa
1,1.4,setosa
2,1.3,setosa
3,1.5,setosa
4,1.4,setosa
...,...,...
145,5.2,virginica
146,5.0,virginica
147,5.2,virginica
148,5.4,virginica


In [11]:
df[2:6]
# A plain slice selects rows: rows 2, 3, 4 and 5 (row 6 is NOT included).

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa


In [12]:
# Chain the two selections: pick the columns first, then slice rows 2 to 5 of that result.
df[['petal_length', 'species']][2:6]

Unnamed: 0,petal_length,species
2,1.3,setosa
3,1.5,setosa
4,1.4,setosa
5,1.7,setosa


In [15]:
# The four plain df[...] selections above work, but they are not the preferred way to extract data.
# Prefer .loc (selects by row/column label) or .iloc (selects by integer position, counting from 0).

In [17]:
df.loc[2:6]
# .loc slices by label and keeps both endpoints: rows 2 up to and INCLUDING row 6.

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa


In [20]:
df.loc[:,'species']
# Use this form when you want a whole column: the ':' in the row position means "every row".

0         setosa
1         setosa
2         setosa
3         setosa
4         setosa
         ...    
145    virginica
146    virginica
147    virginica
148    virginica
149    virginica
Name: species, Length: 150, dtype: object

In [26]:
# Every row (':'), restricted to the listed column labels.
df.loc[:, ['sepal_length','species']]

Unnamed: 0,sepal_length,species
0,5.1,setosa
1,4.9,setosa
2,4.7,setosa
3,4.6,setosa
4,5.0,setosa
...,...,...
145,6.7,virginica
146,6.3,virginica
147,6.5,virginica
148,6.2,virginica


In [28]:
df.loc[2:6, ['sepal_length','species']]
# .loc takes the row labels first, then the column labels.

Unnamed: 0,sepal_length,species
2,4.7,setosa
3,4.6,setosa
4,5.0,setosa
5,5.4,setosa
6,4.6,setosa


In [32]:
df.iloc[2]
# The row at position 2, i.e. the 3rd row (positions start at 0). Gives you the value in every column.

sepal_length       4.7
sepal_width        3.2
petal_length       1.3
petal_width        0.2
species         setosa
Name: 2, dtype: object

In [34]:
df.iloc[2:4,1]
# Returns the rows at positions 2 and 3 (the end position 4 is excluded) for the column at position 1.
# Positions start at 0, so position 1 is the 2nd column (sepal_width), not the first.

2    3.2
3    3.1
Name: sepal_width, dtype: float64

In [35]:
# Look up one single value by its row label and column label.
df.at[3, 'species']

'setosa'

In [36]:
# df.at (used in the cell above) is the better option when you only need a single value

In [38]:
df.iloc[1:10:2]
# start:stop:step - every 2nd row starting at position 1, up to but not including position 10
# (rows 1, 3, 5, 7 and 9)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
1,4.9,3.0,1.4,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
7,5.0,3.4,1.5,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa


***

## Boolean selects

In [40]:
# Compare every value in the column with 'setosa': the result is a boolean Series (True/False per row).
df.loc[:, 'species'] == 'setosa'

0       True
1       True
2       True
3       True
4       True
       ...  
145    False
146    False
147    False
148    False
149    False
Name: species, Length: 150, dtype: bool

In [46]:
# Use the boolean Series as a row selector: .loc keeps only the rows where it is True.
df.loc[df.loc[:, 'species'] == 'versicolor']

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
50,7.0,3.2,4.7,1.4,versicolor
51,6.4,3.2,4.5,1.5,versicolor
52,6.9,3.1,4.9,1.5,versicolor
53,5.5,2.3,4.0,1.3,versicolor
54,6.5,2.8,4.6,1.5,versicolor
55,5.7,2.8,4.5,1.3,versicolor
56,6.3,3.3,4.7,1.6,versicolor
57,4.9,2.4,3.3,1.0,versicolor
58,6.6,2.9,4.6,1.3,versicolor
59,5.2,2.7,3.9,1.4,versicolor


In [49]:
# Build the boolean row mask first, then keep only the rows where it is True.
is_versicolor = df.loc[:, 'species'] == 'versicolor'
x = df.loc[is_versicolor]

In [50]:
# .loc looks rows up by label, and x kept the row labels it had in df, so label 51 still works here.
x.loc[51]

sepal_length           6.4
sepal_width            3.2
petal_length           4.5
petal_width            1.5
species         versicolor
Name: 51, dtype: object

In [52]:
x.iloc[1]
# x is a new DataFrame that kept the original row labels. To get the row labelled 51 from it using iloc
# we need to ask for position 1 (the row labelled 50 is at position zero).

sepal_length           6.4
sepal_width            3.2
petal_length           4.5
petal_width            1.5
species         versicolor
Name: 51, dtype: object