<h1 style="color:cadetblue; font-size:2em;">Indexing DataFrames</h1>

In [63]:
import pandas as pd

df = pd.read_csv('datasets/sales.csv', index_col='month')

In [6]:
df

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,221,89.0,72
Apr,77,87.0,20
May,132,,52
Jun,205,60.0,55


In [3]:
df['salt']['Jan']

12.0

In [4]:
df.eggs['Mar']

221

In [7]:
df.loc['May','spam']

52

In [8]:
df.iloc[4,2]

52

In [9]:
df_new = df[['salt', 'eggs']]
df_new

Unnamed: 0_level_0,salt,eggs
month,Unnamed: 1_level_1,Unnamed: 2_level_1
Jan,12.0,47
Feb,50.0,110
Mar,89.0,221
Apr,87.0,77
May,,132
Jun,60.0,205


<h1 style="color:cadetblue; font-size:2em;">Slicing DataFrames</h1>

In [10]:
df

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,221,89.0,72
Apr,77,87.0,20
May,132,,52
Jun,205,60.0,55


In [11]:
df['eggs']

month
Jan     47
Feb    110
Mar    221
Apr     77
May    132
Jun    205
Name: eggs, dtype: int64

In [12]:
type(df['eggs'])

pandas.core.series.Series

In [32]:
# Part of the eggs column
df['eggs'][1:4]

month
Feb    110
Mar    221
Apr     77
Name: eggs, dtype: int64

In [31]:
# The value associated with May
df['eggs'][4]

132

In [30]:
# All rows, some columns
df.loc[:, 'eggs':'salt']

Unnamed: 0_level_0,eggs,salt
month,Unnamed: 1_level_1,Unnamed: 2_level_1
Jan,47,12.0
Feb,110,50.0
Mar,221,89.0
Apr,77,87.0
May,132,
Jun,205,60.0


In [33]:
# Some rows, all columns
df.loc['Jan':'Apr', :]

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,221,89.0,72
Apr,77,87.0,20


In [20]:
df.loc['Mar':'May', 'salt':'spam']

Unnamed: 0_level_0,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1
Mar,89.0,72
Apr,87.0,20
May,,52


In [34]:
# A block from middle of the DataFrame
df.iloc[2:5, 1:]

Unnamed: 0_level_0,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1
Mar,89.0,72
Apr,87.0,20
May,,52


In [22]:
df.loc['Jan':'May', ['eggs', 'spam']]

Unnamed: 0_level_0,eggs,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1
Jan,47,17
Feb,110,31
Mar,221,72
Apr,77,20
May,132,52


In [23]:
df.iloc[[0,4,5],0:2]

Unnamed: 0_level_0,eggs,salt
month,Unnamed: 1_level_1,Unnamed: 2_level_1
Jan,47,12.0
May,132,
Jun,205,60.0


In [24]:
# A series column name
df['eggs']

month
Jan     47
Feb    110
Mar    221
Apr     77
May    132
Jun    205
Name: eggs, dtype: int64

In [26]:
type(df['eggs'])

pandas.core.series.Series

In [27]:
# A DataFrame w/ single column
df[['eggs']]

Unnamed: 0_level_0,eggs
month,Unnamed: 1_level_1
Jan,47
Feb,110
Mar,221
Apr,77
May,132
Jun,205


In [28]:
type(df[['eggs']])

pandas.core.frame.DataFrame

<h1 style="color:cadetblue; font-size:2em;">Filtering DataFrames</h1>

In [35]:
df.salt > 60

month
Jan    False
Feb    False
Mar     True
Apr     True
May    False
Jun    False
Name: salt, dtype: bool

In [36]:
df[df.salt > 60]

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Mar,221,89.0,72
Apr,77,87.0,20


In [37]:
enough_salt_sold = df.salt > 60
df[enough_salt_sold]

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Mar,221,89.0,72
Apr,77,87.0,20


In [38]:
# Both conditions
df[(df.salt >= 50) & (df.eggs < 200)]

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Feb,110,50.0,31
Apr,77,87.0,20


In [39]:
# Either conditions
df[(df.salt >= 50) | (df.eggs < 200)]

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,221,89.0,72
Apr,77,87.0,20
May,132,,52
Jun,205,60.0,55


In [50]:
df2 = df.copy()
df2['bacon'] = [0, 0, 50, 60, 70, 80]
df2['beef'] = [None, 45, 62, None, None, 75]
df2

Unnamed: 0_level_0,eggs,salt,spam,bacon,beef
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Jan,47,12.0,17,0,
Feb,110,50.0,31,0,45.0
Mar,221,89.0,72,50,62.0
Apr,77,87.0,20,60,
May,132,,52,70,
Jun,205,60.0,55,80,75.0


In [51]:
# Select columns with all nonzeros
df2.loc[:, df2.all()]

Unnamed: 0_level_0,eggs,salt,spam,beef
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Jan,47,12.0,17,
Feb,110,50.0,31,45.0
Mar,221,89.0,72,62.0
Apr,77,87.0,20,
May,132,,52,
Jun,205,60.0,55,75.0


In [52]:
# Select columns with any nonzeros
df2.loc[:, df2.any()]

Unnamed: 0_level_0,eggs,salt,spam,bacon,beef
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Jan,47,12.0,17,0,
Feb,110,50.0,31,0,45.0
Mar,221,89.0,72,50,62.0
Apr,77,87.0,20,60,
May,132,,52,70,
Jun,205,60.0,55,80,75.0


In [53]:
# Select columns with any NaNs
df2.loc[:, df2.isnull().any()]

Unnamed: 0_level_0,salt,beef
month,Unnamed: 1_level_1,Unnamed: 2_level_1
Jan,12.0,
Feb,50.0,45.0
Mar,89.0,62.0
Apr,87.0,
May,,
Jun,60.0,75.0


In [54]:
# Select columns without NaNs
df2.loc[:, df2.notnull().all()]

Unnamed: 0_level_0,eggs,spam,bacon
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,17,0
Feb,110,31,0
Mar,221,72,50
Apr,77,20,60
May,132,52,70
Jun,205,55,80


In [57]:
# Drop rows with any NaNs
df2.dropna(how='any')

Unnamed: 0_level_0,eggs,salt,spam,bacon,beef
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Feb,110,50.0,31,0,45.0
Mar,221,89.0,72,50,62.0
Jun,205,60.0,55,80,75.0


In [64]:
# Filtering a column based on another
df.eggs[df.salt > 55]

month
Mar    221
Apr     77
Jun    205
Name: eggs, dtype: int64

In [65]:
df

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,221,89.0,72
Apr,77,87.0,20
May,132,,52
Jun,205,60.0,55


In [66]:
# Modifying a column based on another
df.eggs[df.salt > 55] += 5
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,226,89.0,72
Apr,82,87.0,20
May,132,,52
Jun,210,60.0,55


<h1 style="color:cadetblue; font-size:2em;">Transforming DataFrames</h1>

In [74]:
import numpy as np
df = pd.read_csv('datasets/sales.csv', index_col='month')

In [75]:
# Convert to dozens unit
df.floordiv(12)

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,3,1.0,1
Feb,9,4.0,2
Mar,18,7.0,6
Apr,6,7.0,1
May,11,,4
Jun,17,5.0,4


In [76]:
# Convert to dozens unit
np.floor_divide(df, 12)

  


Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,3.0,1.0,1.0
Feb,9.0,4.0,2.0
Mar,18.0,7.0,6.0
Apr,6.0,7.0,1.0
May,11.0,,4.0
Jun,17.0,5.0,4.0


In [77]:
def dozens(n):
    return n//12
# Convert to dozens unit
df.apply(dozens)

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,3,1.0,1
Feb,9,4.0,2
Mar,18,7.0,6
Apr,6,7.0,1
May,11,,4
Jun,17,5.0,4


In [78]:
df.apply(lambda n: n//12)

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,3,1.0,1
Feb,9,4.0,2
Mar,18,7.0,6
Apr,6,7.0,1
May,11,,4
Jun,17,5.0,4


In [79]:
df['dozens_of_eggs'] = df.eggs.floordiv(12)
df

Unnamed: 0_level_0,eggs,salt,spam,dozens_of_eggs
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Jan,47,12.0,17,3
Feb,110,50.0,31,9
Mar,221,89.0,72,18
Apr,77,87.0,20,6
May,132,,52,11
Jun,205,60.0,55,17


In [80]:
df.index

Index(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun'], dtype='object', name='month')

In [82]:
df.index = df.index.str.upper()
df

Unnamed: 0_level_0,eggs,salt,spam,dozens_of_eggs
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
JAN,47,12.0,17,3
FEB,110,50.0,31,9
MAR,221,89.0,72,18
APR,77,87.0,20,6
MAY,132,,52,11
JUN,205,60.0,55,17


In [83]:
df.index = df.index.map(str.lower)
df

Unnamed: 0_level_0,eggs,salt,spam,dozens_of_eggs
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
jan,47,12.0,17,3
feb,110,50.0,31,9
mar,221,89.0,72,18
apr,77,87.0,20,6
may,132,,52,11
jun,205,60.0,55,17


In [85]:
df['salty_eggs'] = df.salt + df.dozens_of_eggs
df

Unnamed: 0_level_0,eggs,salt,spam,dozens_of_eggs,salty_eggs
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
jan,47,12.0,17,3,15.0
feb,110,50.0,31,9,59.0
mar,221,89.0,72,18,107.0
apr,77,87.0,20,6,93.0
may,132,,52,11,
jun,205,60.0,55,17,77.0
