## pandas Series

In [1]:
import pandas as pd
import numpy as np

In [2]:
from numpy.random import randn

In [3]:
# Seed NumPy's global random generator so the random draws in this notebook are repeatable.
np.random.seed(0)

In [4]:
# Five uniform samples from [0, 1); this also advances the seeded generator state.
np.random.random(5)

array([0.5488135 , 0.71518937, 0.60276338, 0.54488318, 0.4236548 ])

In [5]:
# Raw values, as a plain list and as a NumPy array.
my_data = [10, 20, 30]
arr = np.asarray(my_data)

# Index labels, plus a label -> value mapping keyed by those labels.
labels = list('abc')
dic_data = dict(zip(labels, (100, 200, 300)))

In [6]:
# A list of strings becomes an object-dtype Series with a default 0..n-1 integer index.
pd.Series(labels)

0    a
1    b
2    c
dtype: object

In [7]:
# A list of ints yields an int64 Series with the default integer index.
pd.Series(my_data)

0    10
1    20
2    30
dtype: int64

In [8]:
# From a dict, the keys become the index labels and the values become the data.
pd.Series(dic_data)

a    100
b    200
c    300
dtype: int64

In [9]:
# Pass data and index explicitly: each label is paired with the value at the same position.
pd.Series(data=my_data, index=labels)

a    10
b    20
c    30
dtype: int64

In [10]:
# Series(data, index): four integer values labelled by country name.
ser1 = pd.Series(
    data=[1, 2, 3, 4],
    index=['USA', 'Japan', 'China', 'Swis'],
)
ser1

USA      1
Japan    2
China    3
Swis     4
dtype: int64

In [11]:
# Second Series; it shares only the 'USA' and 'China' labels with ser1.
ser2 = pd.Series(
    data=[1, 2, 4, 6],
    index=['USA', 'Italy', 'China', 'BD'],
)
ser2

USA      1
Italy    2
China    4
BD       6
dtype: int64

In [12]:
# Addition aligns on index labels: only labels present in both Series (China, USA) get a sum,
# every other label becomes NaN, and the result is float64 so that it can hold NaN.
ser1 + ser2

BD       NaN
China    7.0
Italy    NaN
Japan    NaN
Swis     NaN
USA      2.0
dtype: float64

## pandas DataFrame

In [13]:
# 5x4 frame of standard-normal draws with labelled rows (A-E) and columns (C1-C4).
df = pd.DataFrame(
    data=randn(5, 4),
    index=list('ABCDE'),
    columns=['C1', 'C2', 'C3', 'C4'],
    dtype="float64",
)
df

Unnamed: 0,C1,C2,C3,C4
A,-0.842724,1.969924,1.266119,-0.505877
B,2.545201,1.080812,0.484312,0.57914
C,-0.181583,1.410205,-0.374472,0.275198
D,-0.960755,0.376927,0.033439,0.680567
E,-1.563497,-0.566698,-0.24215,1.514391


In [14]:
# Single-bracket selection with one column name returns that column as a Series.
df['C1']

A   -0.842724
B    2.545201
C   -0.181583
D   -0.960755
E   -1.563497
Name: C1, dtype: float64

In [15]:
# Confirm that a single selected column is a pandas Series.
type(df['C1'])

pandas.core.series.Series

In [16]:
# Passing a list of column names returns a DataFrame containing just those columns.
df[['C1','C3']]

Unnamed: 0,C1,C3
A,-0.842724,1.266119
B,2.545201,0.484312
C,-0.181583,-0.374472
D,-0.960755,0.033439
E,-1.563497,-0.24215


In [17]:
# Assigning to a new column name adds that column to df in place;
# here it holds the row-wise sum of C2 and C4.
df['new'] = df['C2'] + df['C4']
df

Unnamed: 0,C1,C2,C3,C4,new
A,-0.842724,1.969924,1.266119,-0.505877,1.464048
B,2.545201,1.080812,0.484312,0.57914,1.659952
C,-0.181583,1.410205,-0.374472,0.275198,1.685403
D,-0.960755,0.376927,0.033439,0.680567,1.057494
E,-1.563497,-0.566698,-0.24215,1.514391,0.947694


In [18]:
# Returns a new frame without the 'new' column; df itself is left unchanged.
df.drop(columns='new')

Unnamed: 0,C1,C2,C3,C4
A,-0.842724,1.969924,1.266119,-0.505877
B,2.545201,1.080812,0.484312,0.57914
C,-0.181583,1.410205,-0.374472,0.275198
D,-0.960755,0.376927,0.033439,0.680567
E,-1.563497,-0.566698,-0.24215,1.514391


In [19]:
# df still contains 'new': the drop above returned a new frame instead of modifying df.
df

Unnamed: 0,C1,C2,C3,C4,new
A,-0.842724,1.969924,1.266119,-0.505877,1.464048
B,2.545201,1.080812,0.484312,0.57914,1.659952
C,-0.181583,1.410205,-0.374472,0.275198,1.685403
D,-0.960755,0.376927,0.033439,0.680567,1.057494
E,-1.563497,-0.566698,-0.24215,1.514391,0.947694


In [20]:
# Remove the row labelled 'E'; the result is a new frame and df is untouched.
df.drop(index="E")

Unnamed: 0,C1,C2,C3,C4,new
A,-0.842724,1.969924,1.266119,-0.505877,1.464048
B,2.545201,1.080812,0.484312,0.57914,1.659952
C,-0.181583,1.410205,-0.374472,0.275198,1.685403
D,-0.960755,0.376927,0.033439,0.680567,1.057494


In [21]:
# Row 'E' is still present: the drop above was not applied in place.
df

Unnamed: 0,C1,C2,C3,C4,new
A,-0.842724,1.969924,1.266119,-0.505877,1.464048
B,2.545201,1.080812,0.484312,0.57914,1.659952
C,-0.181583,1.410205,-0.374472,0.275198,1.685403
D,-0.960755,0.376927,0.033439,0.680567,1.057494
E,-1.563497,-0.566698,-0.24215,1.514391,0.947694


## Select rows by label (`.loc`)

In [22]:
# A list of labels in .loc keeps the result two-dimensional (a one-row DataFrame).
df.loc[['C']]

Unnamed: 0,C1,C2,C3,C4,new
C,-0.181583,1.410205,-0.374472,0.275198,1.685403


In [23]:
# A single label in .loc returns that row as a Series indexed by column name.
df.loc['C']

C1    -0.181583
C2     1.410205
C3    -0.374472
C4     0.275198
new    1.685403
Name: C, dtype: float64

## Select a row by integer position (`.iloc`)

In [24]:
# .iloc selects by integer position; position 2 is the third row, labelled 'C'.
df.iloc[2]

C1    -0.181583
C2     1.410205
C3    -0.374472
C4     0.275198
new    1.685403
Name: C, dtype: float64

In [25]:
# .loc[row_label, column_label] returns the single scalar value at that cell.
df.loc['C','C3']

-0.3744716909802062

In [26]:
# Lists of row and column labels select a sub-frame (rows B and C, columns C3 and C4).
df.loc[['B','C'], ['C3','C4']]

Unnamed: 0,C3,C4
B,0.484312,0.57914
C,-0.374472,0.275198


In [27]:
# Element-wise comparison gives a same-shaped DataFrame of booleans.
booldf = df>0
booldf

Unnamed: 0,C1,C2,C3,C4,new
A,False,True,True,False,True
B,True,True,True,True,True
C,False,True,False,True,True
D,False,True,True,True,True
E,False,False,False,True,True


In [28]:
# Indexing with a boolean DataFrame keeps the shape; cells where the mask is False become NaN.
df[booldf]

Unnamed: 0,C1,C2,C3,C4,new
A,,1.969924,1.266119,,1.464048
B,2.545201,1.080812,0.484312,0.57914,1.659952
C,,1.410205,,0.275198,1.685403
D,,0.376927,0.033439,0.680567,1.057494
E,,,,1.514391,0.947694


In [29]:
# Same masking as the previous cell, with the condition written inline.
df[df>0]

Unnamed: 0,C1,C2,C3,C4,new
A,,1.969924,1.266119,,1.464048
B,2.545201,1.080812,0.484312,0.57914,1.659952
C,,1.410205,,0.275198,1.685403
D,,0.376927,0.033439,0.680567,1.057494
E,,,,1.514391,0.947694


In [30]:
# Comparing a single column gives a boolean Series with one entry per row.
df['C3']>0

A     True
B     True
C    False
D     True
E    False
Name: C3, dtype: bool

In [31]:
# A boolean Series filters rows: only rows where C3 > 0 (A, B, D) are kept.
df[df['C3']>0]

Unnamed: 0,C1,C2,C3,C4,new
A,-0.842724,1.969924,1.266119,-0.505877,1.464048
B,2.545201,1.080812,0.484312,0.57914,1.659952
D,-0.960755,0.376927,0.033439,0.680567,1.057494


In [32]:
# Filter rows (C3 > 0) and pick columns C1 and C4 in one .loc call.
# A single .loc[rows, columns] avoids chained indexing (df[...][...]), which builds an
# intermediate frame and is the pattern that triggers SettingWithCopyWarning on assignment.
df.loc[df['C3'] > 0, ['C1', 'C4']]

Unnamed: 0,C1,C4
A,-0.842724,-0.505877
B,2.545201,0.57914
D,-0.960755,0.680567


In [33]:
# The same row-filter-then-column-pick selection, broken into named steps.
dfcolumn = ['C1', 'C4']        # columns to keep
dfseries = df['C3'].gt(0)      # boolean mask: one True/False per row
dfdata = df.loc[dfseries]      # rows where the mask is True
dfdata.loc[:, dfcolumn]

Unnamed: 0,C1,C4
A,-0.842724,-0.505877
B,2.545201,0.57914
D,-0.960755,0.680567


## Filter a DataFrame on multiple conditions

In [34]:
# Combine row conditions with & (element-wise AND). Each comparison needs its own
# parentheses because & binds more tightly than the comparison operators.
df[(df['C2']>0) & (df['C4']>.5)]

Unnamed: 0,C1,C2,C3,C4,new
B,2.545201,1.080812,0.484312,0.57914,1.659952
D,-0.960755,0.376927,0.033439,0.680567,1.057494


In [35]:
# Add a string column with one label per row (the split yields five values for five rows).
df['states'] = "DD GG HH UU OI".split()

In [36]:
# Show the frame with the added 'states' column.
df

Unnamed: 0,C1,C2,C3,C4,new,states
A,-0.842724,1.969924,1.266119,-0.505877,1.464048,DD
B,2.545201,1.080812,0.484312,0.57914,1.659952,GG
C,-0.181583,1.410205,-0.374472,0.275198,1.685403,HH
D,-0.960755,0.376927,0.033439,0.680567,1.057494,UU
E,-1.563497,-0.566698,-0.24215,1.514391,0.947694,OI


In [37]:
# Use the 'states' column as the row index. set_index returns a new DataFrame by default,
# so df keeps its A-E index unless the result is assigned back.
df.set_index('states')

Unnamed: 0_level_0,C1,C2,C3,C4,new
states,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
DD,-0.842724,1.969924,1.266119,-0.505877,1.464048
GG,2.545201,1.080812,0.484312,0.57914,1.659952
HH,-0.181583,1.410205,-0.374472,0.275198,1.685403
UU,-0.960755,0.376927,0.033439,0.680567,1.057494
OI,-1.563497,-0.566698,-0.24215,1.514391,0.947694


## Multiple index levels (MultiIndex)

In [38]:
# Build (outer, inner) label pairs for a two-level index: G1/G2 on the outside, 1-3 inside.
outside = ['G1'] * 3 + ['G2'] * 3
inside = [1, 2, 3] * 2
hier_index = [(outer, inner) for outer, inner in zip(outside, inside)]
hier_index

[('G1', 1), ('G1', 2), ('G1', 3), ('G2', 1), ('G2', 2), ('G2', 3)]

In [39]:
# Convert the list of (outer, inner) tuples into a two-level MultiIndex.
# Note: this rebinds hier_index from a list to a MultiIndex.
hier_index = pd.MultiIndex.from_tuples(hier_index)
hier_index

MultiIndex(levels=[['G1', 'G2'], [1, 2, 3]],
           codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])

In [40]:
# 6x2 frame of random values, with rows indexed by the two-level hier_index.
df1 = pd.DataFrame(
    data=randn(6, 2),
    index=hier_index,
    columns=['A', 'B'],
)
df1

Unnamed: 0,Unnamed: 1,A,B
G1,1,-0.333057,0.047365
G1,2,1.46274,1.535029
G1,3,0.56644,0.149265
G2,1,-1.078278,1.395472
G2,2,1.787484,-0.569517
G2,3,0.175387,-0.462506
