# Chapter 5: Getting Started with pandas

![axis.jpg](attachment:axis.jpg)

## 5.1 Introduction to pandas Data Structures

In [1]:
import pandas as pd

In [3]:
# SERIES

In [4]:
obj = pd.Series([4, 7, -5, 3])

In [5]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [6]:
obj.values

array([ 4,  7, -5,  3])

In [7]:
obj.index # like range(4)

RangeIndex(start=0, stop=4, step=1)

In [8]:
obj2 = pd.Series([4, 7, -5, 3], index=['d','b','a','c'])

In [9]:
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [10]:
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [11]:
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [12]:
obj2['a']

-5

In [13]:
obj2['b']

7

In [14]:
obj2[['c','a','d']]

c    3
a   -5
d    4
dtype: int64

In [15]:
# show only the values of obj2 that are greater than 0
obj2[obj2 > 0]

d    4
b    7
c    3
dtype: int64

In [16]:
obj2 * 2

d     8
b    14
a   -10
c     6
dtype: int64

In [17]:
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [2]:
import numpy as np

In [19]:
np.exp(obj2)

d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [20]:
'b' in obj2

True

In [21]:
'b' in obj

False

In [22]:
'e' in obj2

False

In [23]:
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

In [24]:
obj3 = pd.Series(sdata)

In [25]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [29]:
states = ['California','Ohio','Oregon','Texas']

In [30]:
# isi index dengan value yg diambil dari dict
obj4 = pd.Series(sdata, index=states)

In [31]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [32]:
# cek nilai null
pd.isnull(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [33]:
# cek not null
pd.notnull(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [34]:
# alternative cek null
obj4.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [35]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [36]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [37]:
# arithmetic aligns on index labels (union of both indexes): California and Utah exist in only one Series, so they become NaN
obj3 + obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [38]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [39]:
# memberi nama object dan nama index
obj4.name = "Population"
obj4.index.name = "state"

In [40]:
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: Population, dtype: float64

In [41]:
# Altered Series index

In [42]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [43]:
obj.index = ['Bob','Steve','Jeff','Ryan']

In [44]:
obj

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

In [45]:
# DATAFRAME

In [20]:
data = {
    'state': ['Ohio','Ohio','Ohio','Nevada','Nevada','Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]
}

In [48]:
frame = pd.DataFrame(data)

In [49]:
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [50]:
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [51]:
pd.DataFrame(data, columns=['year','state','pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [52]:
data

{'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
 'year': [2000, 2001, 2002, 2001, 2002, 2003],
 'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

In [21]:
frame2 = pd.DataFrame(data, columns=['year','state','pop','debt'], index=['one','two','three','four','five','six'])

In [54]:
# setiap baris memiliki index dan column debt akan berisi
# missing values
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [55]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [56]:
frame2['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [57]:
frame2['year']

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

In [58]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [59]:
frame2.loc['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [22]:
# mengisi seluruh column debt dengan 16.5
frame2['debt'] = 16.5

In [63]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5
six,2003,Nevada,3.2,16.5


In [64]:
np.arange(6.)

array([0., 1., 2., 3., 4., 5.])

In [23]:
frame2['debt'] = np.arange(6.)

In [66]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0
six,2003,Nevada,3.2,5.0


When you are assigning lists or arrays to a column, the value’s length must match the
length of the DataFrame. If you assign a Series, its labels will be realigned exactly to
the DataFrame’s index, inserting missing values in any holes:

In [24]:
val = pd.Series([-1.2, -1.5, -1.7], index=['two','four','five'])

In [25]:
frame2['debt'] = val

In [26]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [80]:
val2 = pd.Series(['Ohio','Ohio','Ohio','Nevada','Nevada','Nevada'], index=['one','two','three','four','five','six'])

In [81]:
frame2['state'] = val2

In [83]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [85]:
frame2['eastern'] = frame2.state == 'Ohio'

In [86]:
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False
six,2003,Nevada,3.2,,False


In [87]:
# delete columns
del frame2['eastern']

In [88]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [89]:
data

{'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
 'year': [2000, 2001, 2002, 2001, 2002, 2003],
 'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

In [6]:
pop = {
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
    'Nevada': {2001: 2.4, 2002: 2.9}}

In [7]:
frame3 = pd.DataFrame(pop)

In [8]:
frame3

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9


In [10]:
frame3.T

Unnamed: 0,2000,2001,2002
Ohio,1.5,1.7,3.6
Nevada,,2.4,2.9


In [11]:
pd.DataFrame(pop, index=[2001, 2002, 2003])

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9
2003,,


In [14]:
# Dict of Series built from frame3's columns, each trimmed by position:
# every Ohio row except the last, and the first two Nevada rows.
pdata = dict(
    Ohio=frame3['Ohio'].iloc[:-1],
    Nevada=frame3['Nevada'].iloc[:2],
)

In [15]:
pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4


In [17]:
# Name both axes of frame3: the row index and the column index.
frame3.index.name = 'year'
frame3.columns.name = 'state'

In [18]:
frame3

state,Ohio,Nevada
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9


In [19]:
frame3.values

array([[1.5, nan],
       [1.7, 2.4],
       [3.6, 2.9]])

In [27]:
frame2.values

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, -1.2],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, -1.5],
       [2002, 'Nevada', 2.9, -1.7],
       [2003, 'Nevada', 3.2, nan]], dtype=object)

In [28]:
# INDEX OBJECT

In [29]:
obj = pd.Series(range(3), index=['a','b','c'])

In [32]:
index = obj.index

In [33]:
index

Index(['a', 'b', 'c'], dtype='object')

In [34]:
index[1:]

Index(['b', 'c'], dtype='object')

In [35]:
# Index objects are immutable and thus can't be modified by the user
# index[1] = 'd' akan error

In [38]:
labels = pd.Index(np.arange(3))

In [39]:
labels

Int64Index([0, 1, 2], dtype='int64')

In [40]:
obj2 = pd.Series([1.5, -2.5, 0], index=labels)

In [41]:
obj2

0    1.5
1   -2.5
2    0.0
dtype: float64

In [42]:
obj2.index is labels

True

In [43]:
frame3

state,Ohio,Nevada
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,1.5,
2001,1.7,2.4
2002,3.6,2.9


In [44]:
frame3.columns

Index(['Ohio', 'Nevada'], dtype='object', name='state')

In [45]:
'Ohio' in frame3.columns

True

In [46]:
2003 in frame3.index

False

In [47]:
2001 in frame3.index

True

In [48]:
dup_labels = pd.Index(['foo','foo','bar','bar'])

In [49]:
dup_labels

Index(['foo', 'foo', 'bar', 'bar'], dtype='object')

## 5.2 Essential Functionality

In [50]:
# REINDEXING

In [51]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d','b','a','c'])

In [52]:
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [53]:
obj2 = obj.reindex(['a','b','c','d','e'])

In [54]:
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [55]:
obj3 = pd.Series(['blue','purple','yellow'], index=[0, 2, 4])

In [56]:
obj3

0      blue
2    purple
4    yellow
dtype: object

For ordered data like time series, it may be desirable to do some interpolation or
filling of values when reindexing. The `method` option allows us to do this, using a
method such as `ffill`, which forward-fills the values:

In [57]:
obj3.reindex(range(7), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
6    yellow
dtype: object

In [58]:
frame = pd.DataFrame(np.arange(9).reshape((3,3)), index = ['a','c','d'], columns=['Ohio','Texas','California'])

In [59]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [60]:
frame2 = frame.reindex(['a','b','c','d'])

In [61]:
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [62]:
states = ['Texas','Utah','California']

In [63]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [64]:
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [72]:
# DROPPING ENTRIES FROM AN AXIS

In [73]:
obj = pd.Series(np.arange(5.), index=['a','b','c','d','e'])

In [74]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [77]:
new_obj = obj.drop('c')

In [78]:
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [79]:
obj.drop(['d','c'])

a    0.0
b    1.0
e    4.0
dtype: float64

In [80]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [81]:
data = pd.DataFrame(np.arange(16).reshape((4,4)), index=['Ohio','Colorado','Utah','New York'], columns=['one','two','three','four'])

In [82]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [83]:
data.drop(['Colorado','Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [84]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


You can drop values from the columns by passing `axis=1` or `axis='columns'`:

In [85]:
data.drop('two', axis=1)

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [86]:
data.drop('two', axis='columns')

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [87]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [88]:
obj.drop('c', inplace=True)

In [89]:
obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [90]:
# INDEXING, SELECTION, AND FILTERING

In [91]:
obj = pd.Series(np.arange(4.0), index=['a','b','c','d'])

In [92]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [93]:
obj['b']

1.0

In [94]:
# Select by position explicitly. Passing an integer to [] on a Series whose
# index holds string labels relies on a positional fallback that newer pandas
# releases deprecate; .iloc returns the same element on every version.
obj.iloc[1]

1.0

In [95]:
obj[2:4]

c    2.0
d    3.0
dtype: float64

In [96]:
obj[['b','a','d']]

b    1.0
a    0.0
d    3.0
dtype: float64

In [97]:
# Positional selection of several elements: use .iloc rather than relying on
# [] to treat a list of integers as positions on a string-labelled index.
obj.iloc[[1, 3]]

b    1.0
d    3.0
dtype: float64

In [98]:
obj[obj < 2]

a    0.0
b    1.0
dtype: float64

In [99]:
obj['b':'c']

b    1.0
c    2.0
dtype: float64

In [100]:
obj['b':'c'] = 5

In [101]:
obj

a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64

In [4]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)), index=['Ohio','Colorado','Utah','New Yourk'], columns=['one','two','three','four'])

In [103]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New Yourk,12,13,14,15


In [104]:
data['two']

Ohio          1
Colorado      5
Utah          9
New Yourk    13
Name: two, dtype: int64

In [105]:
data[['three','one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New Yourk,14,12


In [106]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New Yourk,12,13,14,15


In [107]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [108]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New Yourk,12,13,14,15


In [109]:
data['three']

Ohio          2
Colorado      6
Utah         10
New Yourk    14
Name: three, dtype: int64

In [110]:
data['three'] > 5

Ohio         False
Colorado      True
Utah          True
New Yourk     True
Name: three, dtype: bool

In [111]:
data[data['three'] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New Yourk,12,13,14,15


In [112]:
data < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New Yourk,False,False,False,False


In [5]:
data[data<5] = 0

In [6]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New Yourk,12,13,14,15


In [3]:
# SELECTION WITH loc AND iloc

In [7]:
data.loc['Colorado', ['two','three']]

two      5
three    6
Name: Colorado, dtype: int64

In [9]:
data.iloc[2, [3, 0, 1]]

four    11
one      8
two      9
Name: Utah, dtype: int64

In [10]:
data.iloc[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int64

In [11]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New Yourk,12,13,14,15


In [12]:
data.iloc[[1, 2], [3, 0,1]]

Unnamed: 0,four,one,two
Colorado,7,0,5
Utah,11,8,9


In [13]:
data.loc[:'Utah','two']

Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int64

In [14]:
# Rows whose 'three' value exceeds 5, restricted to the first three columns.
data.loc[data['three'] > 5, data.columns[:3]]

Unnamed: 0,one,two,three
Colorado,0,5,6
Utah,8,9,10
New Yourk,12,13,14


In [15]:
# INTEGER INDEXES

In [16]:
ser = pd.Series(np.arange(3.))

In [17]:
ser

0    0.0
1    1.0
2    2.0
dtype: float64

In [19]:
# ser[-1]
# akan terjadi error

In [20]:
ser2 = pd.Series(np.arange(3.), index=['a','b','c'])

In [21]:
ser2[-1]

2.0

In [22]:
# kode diatas tidak error

In [23]:
ser[:1]

0    0.0
dtype: float64

In [24]:
# jika pakai loc maka :n, nilai n nya akan terbawa
ser.loc[:1]

0    0.0
1    1.0
dtype: float64

In [25]:
ser.iloc[:1]

0    0.0
dtype: float64

In [26]:
# ARITHMETIC AND DATA ALIGNMENT

In [28]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a','c','d','e'])

In [30]:
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [31]:
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a','c','e','f','g'])

In [32]:
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [33]:
# perhatikan
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [34]:
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'), index=['Ohio','Texas','Colorado'])

In [35]:
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'), index=['Utah','Ohio','Texas','Oregon'])

In [36]:
df1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [37]:
df2

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [38]:
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [40]:
df1 = pd.DataFrame({'A': [1, 2]})
df2 = pd.DataFrame({'B': [3, 3]})

In [41]:
df1

Unnamed: 0,A
0,1
1,2


In [42]:
df2

Unnamed: 0,B
0,3
1,3


In [43]:
df1 - df2

Unnamed: 0,A,B
0,,
1,,


In [44]:
# Arithmetic methods with fill values

In [45]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd'))

In [47]:
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))

In [48]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [50]:
df2.loc[1, 'b'] = np.nan

In [51]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [52]:
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [53]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [54]:
df1 + df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [55]:
# fill_value=0 substitutes 0 for a value that is missing in only one of the
# two frames before adding, so labels present in just one frame keep that
# frame's values (e.g. column e and row 3 come from df2) instead of becoming NaN
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [56]:
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [57]:
1 / df1

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


Each of the arithmetic methods (such as `add`, `sub`, and `div`) has a counterpart,
starting with the letter `r`, that has its arguments flipped. So these
two statements are equivalent:

In [58]:
1 / df1

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [59]:
df1.rdiv(1)

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


In [60]:
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [61]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [63]:
# column df1 akan diganti dengan column df2, dan 
# jika tidak ada nilainya akan diisi 0
df1.reindex(columns=df2.columns, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,0
1,4.0,5.0,6.0,7.0,0
2,8.0,9.0,10.0,11.0,0


In [64]:
# OPERATION BETWEEN DATAFRAME AND SERIES

In [65]:
arr = np.arange(12.).reshape((3, 4))

In [66]:
arr

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

In [67]:
arr[0]

array([0., 1., 2., 3.])

In [68]:
# setiap baris akan dikurangi dengan array arr[0]
arr - arr[0]

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

In [69]:
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),
                    columns=list('bde'),
                    index=['Utah','Ohio','Texas','Oregon'])

In [70]:
series = frame.iloc[0]

In [71]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [73]:
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

In [74]:
frame - series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


In [75]:
series2 = pd.Series(range(3), index=['b','e','f'])

In [76]:
series2

b    0
e    1
f    2
dtype: int64

In [78]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [79]:
frame + series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [80]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [81]:
series3 = frame['d']

In [82]:
series3

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

In [83]:
frame.sub(series3, axis='index')

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


In [84]:
# FUNCTION APPLICATION AND MAPPING

In [85]:
# Draw the sample from a locally seeded generator so the values are identical
# on every run; the seed itself is arbitrary. Unseeded np.random.randn
# produced different numbers each time the notebook was executed.
frame = pd.DataFrame(
    np.random.RandomState(12345).randn(4, 3),
    columns=list('bde'),
    index=['Utah','Ohio','Texas','Oregon'],
)

In [86]:
frame

Unnamed: 0,b,d,e
Utah,-3.190421,1.203335,1.308863
Ohio,0.28652,-1.0067,0.521105
Texas,2.231325,1.004235,0.940333
Oregon,-0.474756,-1.707058,1.623855


In [87]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,3.190421,1.203335,1.308863
Ohio,0.28652,1.0067,0.521105
Texas,2.231325,1.004235,0.940333
Oregon,0.474756,1.707058,1.623855


In [88]:
frame

Unnamed: 0,b,d,e
Utah,-3.190421,1.203335,1.308863
Ohio,0.28652,-1.0067,0.521105
Texas,2.231325,1.004235,0.940333
Oregon,-0.474756,-1.707058,1.623855


In [89]:
def f(x):
    """Return the spread of ``x``: its maximum minus its minimum.

    ``x`` is any object exposing ``max()`` and ``min()``. When passed to
    ``DataFrame.apply`` it is called once per column by default, or once
    per row with ``axis='columns'``.
    """
    return x.max() - x.min()

In [90]:
# yang dihitung adalah index / axis 0, maka yang muncul columns
frame.apply(f)

b    5.421747
d    2.910393
e    1.102750
dtype: float64

In [91]:
frame.apply(f, axis='columns')

Utah      4.499284
Ohio      1.527805
Texas     1.290993
Oregon    3.330913
dtype: float64

In [93]:
def f(x):
    """Summarise ``x`` as a two-entry Series holding its minimum and maximum."""
    lowest = x.min()
    highest = x.max()
    return pd.Series([lowest, highest], index=['min','max'])

In [94]:
frame

Unnamed: 0,b,d,e
Utah,-3.190421,1.203335,1.308863
Ohio,0.28652,-1.0067,0.521105
Texas,2.231325,1.004235,0.940333
Oregon,-0.474756,-1.707058,1.623855


In [95]:
frame.apply(f)

Unnamed: 0,b,d,e
min,-3.190421,-1.707058,0.521105
max,2.231325,1.203335,1.623855


In [96]:
def format(x):
    """Render ``x`` as a string with exactly two digits after the decimal point.

    NOTE(review): this name shadows the built-in ``format``. It is kept
    unchanged because later cells pass ``format`` to ``applymap`` and ``map``.
    """
    return '%.2f' % x

In [97]:
frame.applymap(format)

Unnamed: 0,b,d,e
Utah,-3.19,1.2,1.31
Ohio,0.29,-1.01,0.52
Texas,2.23,1.0,0.94
Oregon,-0.47,-1.71,1.62


In [98]:
frame

Unnamed: 0,b,d,e
Utah,-3.190421,1.203335,1.308863
Ohio,0.28652,-1.0067,0.521105
Texas,2.231325,1.004235,0.940333
Oregon,-0.474756,-1.707058,1.623855


In [99]:
frame['e'].map(format)

Utah      1.31
Ohio      0.52
Texas     0.94
Oregon    1.62
Name: e, dtype: object

In [101]:
# SORTING AND RANKING

In [102]:
obj = pd.Series(range(4), index=['d','a','b','c'])

In [103]:
obj.sort_index()

a    1
b    2
c    3
d    0
dtype: int64

In [104]:
obj

d    0
a    1
b    2
c    3
dtype: int64

With a DataFrame, you can sort by index on either axis:

In [105]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
                    index=['three','one'],
                    columns=['d','a','b','c'])

In [106]:
frame

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


In [108]:
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [109]:
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


In [111]:
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [112]:
obj = pd.Series([4, 7, -3, 2])

In [113]:
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

In [114]:
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])

In [115]:
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [118]:
# Small two-column frame used for the sort_values examples.
frame = pd.DataFrame({
    'b': [4, 7, -3, 2],
    'a': [0, 1, 0, 1],
})

In [122]:
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


In [123]:
frame.sort_values(by='b')

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [120]:
# ketika di a ada dua nilai yg sama maka yg column b yg akan jadi pertimbangan berikutnya

In [121]:
frame.sort_values(by=['a','b'])

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


In [124]:
pd.Series([7, -5, 7, 4, 2, 0, 4])

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64

In [125]:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])

In [126]:
obj.rank()

0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64

Ranks can also be assigned according to the order in which they’re observed in the
data:

In [127]:
obj.rank(method='first')

0    6.0
1    1.0
2    7.0
3    4.0
4    3.0
5    2.0
6    5.0
dtype: float64

In [129]:
obj

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64

In [130]:
# rank in descending order: larger values get smaller rank numbers

In [131]:
# nilai terkecil memiliki ranking terbesar
obj.rank(ascending=False, method='max')

0    2.0
1    7.0
2    2.0
3    4.0
4    5.0
5    6.0
6    4.0
dtype: float64

In [132]:
frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0,1], 'c': [-2, 5, 8, 2.5]})

In [133]:
frame

Unnamed: 0,b,a,c
0,4.3,0,-2.0
1,7.0,1,5.0
2,-3.0,0,8.0
3,2.0,1,2.5


In [134]:
frame.rank(axis='columns')

Unnamed: 0,b,a,c
0,3.0,2.0,1.0
1,3.0,1.0,2.0
2,1.0,2.0,3.0
3,2.0,1.0,3.0


In [135]:
# AXIS INDEXES WITH DUPLICATE LABELS

In [136]:
obj = pd.Series(range(5), index=['a','a','b','b','c'])

In [137]:
obj.index.is_unique

False

In [138]:
obj['a']

a    0
a    1
dtype: int64

In [139]:
obj['c']

4

In [140]:
# Seeded local generator keeps the sample identical across runs; the seed is
# arbitrary. The row index deliberately repeats the labels 'a' and 'b'.
df = pd.DataFrame(
    np.random.RandomState(12345).randn(4, 3),
    index=['a','a','b','b'],
)

In [141]:
df

Unnamed: 0,0,1,2
a,0.555429,0.943953,0.003589
a,-0.967379,-0.007484,-1.275216
b,1.021152,0.231632,0.255313
b,-0.674578,-1.514211,-0.248194


In [142]:
df.loc['b']

Unnamed: 0,0,1,2
b,1.021152,0.231632,0.255313
b,-0.674578,-1.514211,-0.248194


## 5.3 Summarizing and Computing Descriptive Statistics

In [144]:
# Four-row frame containing missing values, used for the summary-statistics
# examples.
df = pd.DataFrame(
    [
        [1.4, np.nan],
        [7.1, -4.5],
        [np.nan, np.nan],
        [0.75, -1.3],
    ],
    index=['a','b','c','d'],
    columns=['one','two'],
)

In [145]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [146]:
df.sum()

one    9.25
two   -5.80
dtype: float64

In [147]:
df.sum(axis='columns')

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

In [148]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [149]:
df.mean(axis='columns', skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

In [150]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [151]:
df.idxmax()

one    b
two    d
dtype: object

In [152]:
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


In [153]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [154]:
df.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [155]:
['a','a','b','c'] * 4

['a',
 'a',
 'b',
 'c',
 'a',
 'a',
 'b',
 'c',
 'a',
 'a',
 'b',
 'c',
 'a',
 'a',
 'b',
 'c']

In [156]:
obj = pd.Series(['a','a','b','c'] * 4)

In [157]:
obj

0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object

In [158]:
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

In [3]:
# CORRELATION AND COVARIANCE

In [4]:
import pandas_datareader.data as web

In [5]:
# Fetch Yahoo Finance data for each ticker through pandas-datareader.
# The result is a dict keyed by ticker symbol; each value is a date-indexed
# DataFrame with High / Low / Open / Close / Volume / Adj Close columns.
all_data = {ticker: web.get_data_yahoo(ticker) for ticker in ['AAPL','IBM','MSFT','GOOG']}

In [6]:
type(all_data)

dict

In [7]:
all_data

{'AAPL':                   High         Low        Open       Close       Volume  \
 Date                                                                      
 2015-09-08   28.139999   27.580000   27.937500   28.077499  219374400.0   
 2015-09-09   28.504999   27.442499   28.440001   27.537500  340043200.0   
 2015-09-10   28.320000   27.475000   27.567499   28.142500  251571200.0   
 2015-09-11   28.552500   27.940001   27.947500   28.552500  199662000.0   
 2015-09-14   29.222500   28.715000   29.145000   28.827499  233453600.0   
 ...                ...         ...         ...         ...          ...   
 2020-08-31  131.000000  126.000000  127.580002  129.039993  225702700.0   
 2020-09-01  134.800003  130.529999  132.759995  134.179993  152470100.0   
 2020-09-02  137.979996  127.000000  137.589996  131.399994  200119000.0   
 2020-09-03  128.839996  120.500000  126.910004  120.879997  254723200.0   
 2020-09-04  123.699997  110.889999  120.070000  120.959999  332607163.0   
 
  

In [8]:
all_data['AAPL']

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-09-08,28.139999,27.580000,27.937500,28.077499,219374400.0,25.946814
2015-09-09,28.504999,27.442499,28.440001,27.537500,340043200.0,25.447792
2015-09-10,28.320000,27.475000,27.567499,28.142500,251571200.0,26.006878
2015-09-11,28.552500,27.940001,27.947500,28.552500,199662000.0,26.385771
2015-09-14,29.222500,28.715000,29.145000,28.827499,233453600.0,26.639898
...,...,...,...,...,...,...
2020-08-31,131.000000,126.000000,127.580002,129.039993,225702700.0,129.039993
2020-09-01,134.800003,130.529999,132.759995,134.179993,152470100.0,134.179993
2020-09-02,137.979996,127.000000,137.589996,131.399994,200119000.0,131.399994
2020-09-03,128.839996,120.500000,126.910004,120.879997,254723200.0,120.879997


In [9]:
all_data['IBM']

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-09-08,147.339996,145.660004,145.860001,147.229996,3933300.0,117.901871
2015-09-09,149.039993,144.850006,148.740005,145.050003,3407800.0,116.156136
2015-09-10,147.160004,144.509995,145.850006,146.199997,3461700.0,117.077080
2015-09-11,147.500000,145.669998,145.910004,147.369995,3115100.0,118.013985
2015-09-14,147.369995,145.410004,147.369995,145.649994,3226700.0,116.636604
...,...,...,...,...,...,...
2020-08-31,125.250000,123.029999,125.250000,123.309998,4827900.0,123.309998
2020-09-01,123.949997,122.150002,122.849998,123.400002,3155300.0,123.400002
2020-09-02,128.699997,123.580002,123.720001,128.179993,6592400.0,128.179993
2020-09-03,129.949997,123.650002,128.190002,124.449997,5711000.0,124.449997


In [10]:
{ticker: data['Adj Close'] for ticker, data in all_data.items()}

{'AAPL': Date
 2015-09-08     25.946814
 2015-09-09     25.447792
 2015-09-10     26.006878
 2015-09-11     26.385771
 2015-09-14     26.639898
                  ...    
 2020-08-31    129.039993
 2020-09-01    134.179993
 2020-09-02    131.399994
 2020-09-03    120.879997
 2020-09-04    120.959999
 Name: Adj Close, Length: 1259, dtype: float64,
 'IBM': Date
 2015-09-08    117.901871
 2015-09-09    116.156136
 2015-09-10    117.077080
 2015-09-11    118.013985
 2015-09-14    116.636604
                  ...    
 2020-08-31    123.309998
 2020-09-01    123.400002
 2020-09-02    128.179993
 2020-09-03    124.449997
 2020-09-04    122.300003
 Name: Adj Close, Length: 1259, dtype: float64,
 'MSFT': Date
 2015-09-08     39.887024
 2015-09-09     39.141819
 2015-09-10     39.341743
 2015-09-11     39.514427
 2015-09-14     39.114552
                  ...    
 2020-08-31    225.529999
 2020-09-01    227.270004
 2020-09-02    231.649994
 2020-09-03    217.300003
 2020-09-04    214.250000
 Name

In [11]:
# One column per ticker, each holding that ticker's 'Adj Close' series.
price = pd.DataFrame(
    {symbol: frame['Adj Close'] for symbol, frame in all_data.items()}
)

In [12]:
price

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-09-08,25.946814,117.901871,39.887024,614.659973
2015-09-09,25.447792,116.156136,39.141819,612.719971
2015-09-10,26.006878,117.077080,39.341743,621.349976
2015-09-11,26.385771,118.013985,39.514427,625.770020
2015-09-14,26.639898,116.636604,39.114552,623.239990
...,...,...,...,...
2020-08-31,129.039993,123.309998,225.529999,1634.180054
2020-09-01,134.179993,123.400002,227.270004,1660.709961
2020-09-02,131.399994,128.179993,231.649994,1728.280029
2020-09-03,120.879997,124.449997,217.300003,1641.839966


In [13]:
# One column per ticker, each holding that ticker's 'Volume' series.
volume = pd.DataFrame(
    {symbol: frame['Volume'] for symbol, frame in all_data.items()}
)

In [14]:
volume

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-09-08,219374400.0,3933300.0,32469800.0,2279500
2015-09-09,340043200.0,3407800.0,33469500.0,1702100
2015-09-10,251571200.0,3461700.0,31366600.0,1905300
2015-09-11,199662000.0,3115100.0,27132500.0,1373500
2015-09-14,233453600.0,3226700.0,23656000.0,1702300
...,...,...,...,...
2020-08-31,225702700.0,4827900.0,28774200.0,1823400
2020-09-01,152470100.0,3155300.0,25791200.0,1826700
2020-09-02,200119000.0,6592400.0,34080800.0,2511200
2020-09-03,254723200.0,5711000.0,58148900.0,3100800


In [15]:
# Compute the percent change of each price column;
# this is one example of a time series operation.

In [16]:
price.pct_change()

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-09-08,,,,
2015-09-09,-0.019232,-0.014807,-0.018683,-0.003156
2015-09-10,0.021970,0.007929,0.005108,0.014085
2015-09-11,0.014569,0.008002,0.004389,0.007114
2015-09-14,0.009631,-0.011671,-0.010120,-0.004043
...,...,...,...,...
2020-08-31,0.033912,-0.014072,-0.014766,-0.006221
2020-09-01,0.039833,0.000730,0.007715,0.016234
2020-09-02,-0.020718,0.038736,0.019272,0.040687
2020-09-03,-0.080061,-0.029100,-0.061947,-0.050015


In [17]:
# Row-over-row fractional change of each ticker's adjusted close.
# The first row is NaN because it has no previous row to compare against.
returns = price.pct_change()

In [18]:
returns

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-09-08,,,,
2015-09-09,-0.019232,-0.014807,-0.018683,-0.003156
2015-09-10,0.021970,0.007929,0.005108,0.014085
2015-09-11,0.014569,0.008002,0.004389,0.007114
2015-09-14,0.009631,-0.011671,-0.010120,-0.004043
...,...,...,...,...
2020-08-31,0.033912,-0.014072,-0.014766,-0.006221
2020-09-01,0.039833,0.000730,0.007715,0.016234
2020-09-02,-0.020718,0.038736,0.019272,0.040687
2020-09-03,-0.080061,-0.029100,-0.061947,-0.050015


In [19]:
returns.tail()

Unnamed: 0_level_0,AAPL,IBM,MSFT,GOOG
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-08-31,0.033912,-0.014072,-0.014766,-0.006221
2020-09-01,0.039833,0.00073,0.007715,0.016234
2020-09-02,-0.020718,0.038736,0.019272,0.040687
2020-09-03,-0.080061,-0.0291,-0.061947,-0.050015
2020-09-04,0.000662,-0.017276,-0.014036,-0.030941


In [20]:
# Compute the correlation between the returns (percent change of Adj Close)
# of MSFT and IBM.

In [21]:
returns['MSFT'].corr(returns['IBM'])

0.5803947581676301

In [22]:
# Compute the covariance between the MSFT and IBM returns.

In [23]:
returns['MSFT'].cov(returns['IBM'])

0.00016152264848401965

In [24]:
# Alternative (attribute-access) syntax for computing the same correlation.

In [25]:
returns.MSFT.corr(returns.IBM)

0.5803947581676301

In [26]:
# DataFrame's corr and cov methods return the full pairwise matrix:

In [27]:
returns.corr()

Unnamed: 0,AAPL,IBM,MSFT,GOOG
AAPL,1.0,0.501901,0.698478,0.651976
IBM,0.501901,1.0,0.580395,0.537152
MSFT,0.698478,0.580395,1.0,0.781827
GOOG,0.651976,0.537152,0.781827,1.0


In [28]:
returns.cov()

Unnamed: 0,AAPL,IBM,MSFT,GOOG
AAPL,0.000341,0.000148,0.000224,0.000198
IBM,0.000148,0.000257,0.000162,0.000142
MSFT,0.000224,0.000162,0.000301,0.000223
GOOG,0.000198,0.000142,0.000223,0.000271


In [29]:
# Compute the correlation of one variable with each of the other variables.

In [31]:
# Correlation of every column of `returns` with the IBM column
# (IBM against itself is 1.0).
returns.corrwith(returns.IBM)

AAPL    0.501901
IBM     1.000000
MSFT    0.580395
GOOG    0.537152
dtype: float64

In [34]:
# Passing a DataFrame correlates columns with matching names:
# each ticker's returns against that same ticker's volume.
returns.corrwith(volume)

AAPL   -0.100658
IBM    -0.098690
MSFT   -0.053062
GOOG   -0.150853
dtype: float64

In [35]:
# UNIQUE VALUES, VALUE COUNTS, AND MEMBERSHIP

In [37]:
# Nine single-letter values with repeats, used for unique / value_counts / isin below.
obj = pd.Series(list('cadaabbcc'))

In [38]:
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [39]:
uniques = obj.unique()

In [40]:
uniques

array(['c', 'a', 'd', 'b'], dtype=object)

In [41]:
obj.value_counts()

c    3
a    3
b    2
d    1
dtype: int64

In [42]:
obj

0    c
1    a
2    d
3    a
4    a
5    b
6    b
7    c
8    c
dtype: object

In [43]:
# Boolean mask: True at positions where the value is 'b' or 'c'.
mask = obj.isin(['b','c'])

In [44]:
mask

0     True
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
dtype: bool

In [45]:
obj[mask]

0    c
5    b
6    b
7    c
8    c
dtype: object

In [46]:
# Values (with repeats) to be looked up in unique_vals.
to_match = pd.Series(list('cabbca'))

In [47]:
# The distinct values, in the order that defines their integer positions.
unique_vals = pd.Series(list('cba'))

In [48]:
# For each value in to_match, the integer position of that value
# within unique_vals.
pd.Index(unique_vals).get_indexer(to_match)

array([0, 2, 1, 1, 0, 2])

In [49]:
# Compute a histogram (value counts) for each column of the DataFrame.

In [50]:
# Three integer columns of five values each; the values overlap across columns.
data = pd.DataFrame(
    {
        'Qu1': [1, 3, 4, 3, 4],
        'Qu2': [2, 3, 1, 2, 3],
        'Qu3': [1, 5, 2, 4, 4],
    }
)

In [51]:
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [52]:
pd.value_counts

<function pandas.core.algorithms.value_counts(values, sort: bool = True, ascending: bool = False, normalize: bool = False, bins=None, dropna: bool = True) -> 'Series'>

In [53]:
# Per-column value counts, aligned on the union of values seen in any column.
# A value that never occurs in a given column comes back as NaN and is
# replaced with 0. Series.value_counts is used because the top-level
# pd.value_counts helper is deprecated as of pandas 2.1.
result = data.apply(pd.Series.value_counts).fillna(0)

In [54]:
result

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0
