### Pandas - Series

In [3]:
import numpy as np
import pandas as pd

In [4]:
# Sample inputs for the Series constructor: index labels, a plain list of
# values, the same values as an ndarray, and a label -> value mapping.
labels = ['a', 'b', 'c']
my_data = [10, 20, 30]
arr = np.asarray(my_data)
d = dict(zip(labels, my_data))

In [5]:
labels

['a', 'b', 'c']

In [6]:
my_data

[10, 20, 30]

In [7]:
arr

array([10, 20, 30])

In [8]:
d

{'a': 10, 'b': 20, 'c': 30}

In [9]:
pd.Series(data = my_data)

0    10
1    20
2    30
dtype: int64

In [10]:
pd.Series(data=my_data, index=labels)

a    10
b    20
c    30
dtype: int64

In [11]:
pd.Series(my_data, labels)

a    10
b    20
c    30
dtype: int64

In [12]:
pd.Series(arr, labels)

a    10
b    20
c    30
dtype: int32

In [13]:
pd.Series(d)

a    10
b    20
c    30
dtype: int64

In [14]:
pd.Series(data=[sum,print,len])

0      <built-in function sum>
1    <built-in function print>
2      <built-in function len>
dtype: object

### Pandas - DataFrames

In [15]:
from numpy.random import rand

In [16]:
np.random.seed(101)

In [17]:
# 5x4 frame of random values: rows labelled A-E, columns labelled W-Z.
df = pd.DataFrame(
    data=rand(5, 4),
    index=['A', 'B', 'C', 'D', 'E'],
    columns=['W', 'X', 'Y', 'Z'],
)

In [18]:
df

Unnamed: 0,W,X,Y,Z
A,0.516399,0.570668,0.028474,0.171522
B,0.685277,0.833897,0.306966,0.893613
C,0.721544,0.189939,0.554228,0.352132
D,0.181892,0.785602,0.965483,0.232354
E,0.083561,0.603548,0.728993,0.276239


In [19]:
df['W']

A    0.516399
B    0.685277
C    0.721544
D    0.181892
E    0.083561
Name: W, dtype: float64

In [20]:
type(df) # A DataFrame is a bunch of Series

pandas.core.frame.DataFrame

In [21]:
type(df['W'])

pandas.core.series.Series

In [22]:
df.W # Avoid attribute access: it breaks when a column name collides with a DataFrame attribute/method; prefer df['W']

A    0.516399
B    0.685277
C    0.721544
D    0.181892
E    0.083561
Name: W, dtype: float64

How to access multiple columns

In [24]:
df[['W','X']] # Passing a list of column names returns a DataFrame

Unnamed: 0,W,X
A,0.516399,0.570668
B,0.685277,0.833897
C,0.721544,0.189939
D,0.181892,0.785602
E,0.083561,0.603548


Creating columns

In [27]:
df['new'] = df['W'] + df['Y']
df

Unnamed: 0,W,X,Y,Z,new
A,0.516399,0.570668,0.028474,0.171522,0.544873
B,0.685277,0.833897,0.306966,0.893613,0.992243
C,0.721544,0.189939,0.554228,0.352132,1.275771
D,0.181892,0.785602,0.965483,0.232354,1.147376
E,0.083561,0.603548,0.728993,0.276239,0.812554


Deleting columns

In [29]:
df.drop('new', axis=1) # axis=1 targets columns; this returns a new frame and leaves df unchanged

Unnamed: 0,W,X,Y,Z
A,0.516399,0.570668,0.028474,0.171522
B,0.685277,0.833897,0.306966,0.893613
C,0.721544,0.189939,0.554228,0.352132
D,0.181892,0.785602,0.965483,0.232354
E,0.083561,0.603548,0.728993,0.276239


In [30]:
df

Unnamed: 0,W,X,Y,Z,new
A,0.516399,0.570668,0.028474,0.171522,0.544873
B,0.685277,0.833897,0.306966,0.893613,0.992243
C,0.721544,0.189939,0.554228,0.352132,1.275771
D,0.181892,0.785602,0.965483,0.232354,1.147376
E,0.083561,0.603548,0.728993,0.276239,0.812554


In [32]:
# Persist the removal by rebinding df to the frame returned by drop().
# Same visible result as inplace=True, but without mutating the object in
# place, which pandas discourages and which cannot be chained.
df = df.drop(columns='new')

In [33]:
df

Unnamed: 0,W,X,Y,Z
A,0.516399,0.570668,0.028474,0.171522
B,0.685277,0.833897,0.306966,0.893613
C,0.721544,0.189939,0.554228,0.352132
D,0.181892,0.785602,0.965483,0.232354
E,0.083561,0.603548,0.728993,0.276239


Dropping rows

In [34]:
df.drop('E') # Default axis=0 drops by row label; df itself is not modified

Unnamed: 0,W,X,Y,Z
A,0.516399,0.570668,0.028474,0.171522
B,0.685277,0.833897,0.306966,0.893613
C,0.721544,0.189939,0.554228,0.352132
D,0.181892,0.785602,0.965483,0.232354


In [35]:
df.shape # 5 rows, 4 columns

(5, 4)

Selecting rows

In [37]:
df.loc['A'] # Rows are also series

W    0.516399
X    0.570668
Y    0.028474
Z    0.171522
Name: A, dtype: float64

In [40]:
df.iloc[2] #index location; select by numerical index

W    0.721544
X    0.189939
Y    0.554228
Z    0.352132
Name: C, dtype: float64

In [41]:
df.loc['B', 'Y']

0.3069662196722378

In [43]:
df.loc[['A', 'B'], ['W', 'Y']] # Subset

Unnamed: 0,W,Y
A,0.516399,0.028474
B,0.685277,0.306966


In [46]:
df

Unnamed: 0,W,X,Y,Z
A,0.516399,0.570668,0.028474,0.171522
B,0.685277,0.833897,0.306966,0.893613
C,0.721544,0.189939,0.554228,0.352132
D,0.181892,0.785602,0.965483,0.232354
E,0.083561,0.603548,0.728993,0.276239


In [49]:
bool_df = df > 0.3
bool_df

Unnamed: 0,W,X,Y,Z
A,True,True,False,False
B,True,True,True,True
C,True,False,True,True
D,False,True,True,False
E,False,True,True,False


In [50]:
df[bool_df]

Unnamed: 0,W,X,Y,Z
A,0.516399,0.570668,,
B,0.685277,0.833897,0.306966,0.893613
C,0.721544,,0.554228,0.352132
D,,0.785602,0.965483,
E,,0.603548,0.728993,


In [54]:
df[df>0.2] # Conditional

Unnamed: 0,W,X,Y,Z
A,0.516399,0.570668,,
B,0.685277,0.833897,0.306966,0.893613
C,0.721544,,0.554228,0.352132
D,,0.785602,0.965483,0.232354
E,,0.603548,0.728993,0.276239


In [55]:
df

Unnamed: 0,W,X,Y,Z
A,0.516399,0.570668,0.028474,0.171522
B,0.685277,0.833897,0.306966,0.893613
C,0.721544,0.189939,0.554228,0.352132
D,0.181892,0.785602,0.965483,0.232354
E,0.083561,0.603548,0.728993,0.276239


In [56]:
df['W'] > 0.1

A     True
B     True
C     True
D     True
E    False
Name: W, dtype: bool

In [57]:
df[df['W']>0.1] # Get only the rows that match

Unnamed: 0,W,X,Y,Z
A,0.516399,0.570668,0.028474,0.171522
B,0.685277,0.833897,0.306966,0.893613
C,0.721544,0.189939,0.554228,0.352132
D,0.181892,0.785602,0.965483,0.232354


In [58]:
df[df['Z']>0.2] # Get a df as result

Unnamed: 0,W,X,Y,Z
B,0.685277,0.833897,0.306966,0.893613
C,0.721544,0.189939,0.554228,0.352132
D,0.181892,0.785602,0.965483,0.232354
E,0.083561,0.603548,0.728993,0.276239


In [60]:
df.loc[df['Z'] > 0.2, 'X'] # Filter the rows and pick column X in a single .loc call

B    0.833897
C    0.189939
D    0.785602
E    0.603548
Name: X, dtype: float64

In [63]:
df[df['Z']>0.2][['X','Y']] # Select several columns from the result

Unnamed: 0,X,Y
B,0.833897,0.306966
C,0.189939,0.554228
D,0.785602,0.965483
E,0.603548,0.728993


In [66]:
df[(df['Z']>0.2) & (df['Y']>0.4)] # Combine conditions element-wise
# Use & for "and", | for "or" (Python's `and`/`or` do not work on Series)
# Keep each comparison in parentheses: & and | bind tighter than > and <

Unnamed: 0,W,X,Y,Z
C,0.721544,0.189939,0.554228,0.352132
D,0.181892,0.785602,0.965483,0.232354
E,0.083561,0.603548,0.728993,0.276239


In [67]:
df

Unnamed: 0,W,X,Y,Z
A,0.516399,0.570668,0.028474,0.171522
B,0.685277,0.833897,0.306966,0.893613
C,0.721544,0.189939,0.554228,0.352132
D,0.181892,0.785602,0.965483,0.232354
E,0.083561,0.603548,0.728993,0.276239


In [68]:
df.reset_index() # Returns a new frame with the old index as an 'index' column; reassign or pass inplace=True to keep it

Unnamed: 0,index,W,X,Y,Z
0,A,0.516399,0.570668,0.028474,0.171522
1,B,0.685277,0.833897,0.306966,0.893613
2,C,0.721544,0.189939,0.554228,0.352132
3,D,0.181892,0.785602,0.965483,0.232354
4,E,0.083561,0.603548,0.728993,0.276239


In [70]:
newind = ' CA NY WY OR CO'.split() # split() with no argument splits on whitespace, so the leading space is ignored

In [71]:
df['States'] = newind

In [72]:
df

Unnamed: 0,W,X,Y,Z,States
A,0.516399,0.570668,0.028474,0.171522,CA
B,0.685277,0.833897,0.306966,0.893613,NY
C,0.721544,0.189939,0.554228,0.352132,WY
D,0.181892,0.785602,0.965483,0.232354,OR
E,0.083561,0.603548,0.728993,0.276239,CO


In [74]:
df.set_index('States')

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,0.516399,0.570668,0.028474,0.171522
NY,0.685277,0.833897,0.306966,0.893613
WY,0.721544,0.189939,0.554228,0.352132
OR,0.181892,0.785602,0.965483,0.232354
CO,0.083561,0.603548,0.728993,0.276239


DataFrames Part 3

In [90]:
import pandas as pd
import numpy as np

In [91]:
# Index Levels: the outer level is the group, the inner level numbers the rows within it
outside = ['G1'] * 3 + ['G2'] * 3
inside = [1, 2, 3] * 2
# Pair the two levels up and build the MultiIndex in one step.
hier_index = pd.MultiIndex.from_tuples(list(zip(outside, inside)))

In [92]:
outside

['G1', 'G1', 'G1', 'G2', 'G2', 'G2']

In [93]:
inside

[1, 2, 3, 1, 2, 3]

In [94]:
hier_index

MultiIndex([('G1', 1),
            ('G1', 2),
            ('G1', 3),
            ('G2', 1),
            ('G2', 2),
            ('G2', 3)],
           )

In [124]:
# 6x2 frame of standard-normal draws, indexed by the two-level hier_index.
df = pd.DataFrame(
    data=np.random.randn(6, 2),
    index=hier_index,
    columns=['A', 'B'],
)

In [2]:
df # NOTE(review): the recorded output below is a stale NameError from a fresh kernel (In [2]); re-run after the cell that defines df

NameError: name 'df' is not defined

In [128]:
df.loc['G1']

Unnamed: 0,A,B
1,0.190794,1.978757
2,2.605967,0.683509
3,0.302665,1.693723


In [131]:
df.loc['G1'].loc[1]

A    0.190794
B    1.978757
Name: 1, dtype: float64

In [133]:
df.index.names = ['Groups', 'Num']

In [134]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,0.190794,1.978757
G1,2,2.605967,0.683509
G1,3,0.302665,1.693723
G2,1,-1.706086,-1.159119
G2,2,-0.134841,0.390528
G2,3,0.166905,0.184502


In [137]:
df.loc[('G2', 2), 'B'] # Address the row by its full (Groups, Num) key, then pick column B

0.39052784273374097

In [139]:
df.xs('G1')

Unnamed: 0_level_0,A,B
Num,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.190794,1.978757
2,2.605967,0.683509
3,0.302665,1.693723


In [140]:
df.xs(1, level="Num")

Unnamed: 0_level_0,A,B
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,0.190794,1.978757
G2,-1.706086,-1.159119


### Missing Data

In [147]:
# Columns A, B and C contain one, two and zero missing values respectively.
d = {
    'A': [1, 2, np.nan],
    'B': [5, np.nan, np.nan],
    'C': [1, 2, 3],
}

In [149]:
df = pd.DataFrame(d)
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


Drop missing values (NaN)

In [151]:
df.dropna() # Drop every row that contains at least one NaN

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [152]:
df.dropna(axis=1) # axis=1: drop every column that contains at least one NaN

Unnamed: 0,C
0,1
1,2
2,3


In [153]:
df.dropna(thresh=2) # threshold: keep only rows with at least 2 non-NaN values

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2


In [154]:
df

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2
2,,,3


Replace values

In [155]:
df.fillna(value='FILL VALUE')

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,FILL VALUE,2
2,FILL VALUE,FILL VALUE,3


In [156]:
df['A']

0    1.0
1    2.0
2    NaN
Name: A, dtype: float64

Replace with mean

In [157]:
df['A'].fillna(value=df['A'].mean()) # mean() ignores NaN, so the gap is filled with (1.0 + 2.0) / 2 = 1.5

0    1.0
1    2.0
2    1.5
Name: A, dtype: float64

#### Groupby