In [21]:
import pandas as pd
import numpy as np

##### Pandas의 객체 : Series

In [3]:
# Two Series holding the same four floats; only the explicit string
# index labels differ (letters vs. digit characters).
data1 = pd.Series(
    data=[0.25, 0.5, 0.75, 1.0],
    index=list('abcd'),
)
data2 = pd.Series(
    data=[0.25, 0.5, 0.75, 1.0],
    index=list('2537'),
)

In [4]:
data1

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [5]:
data2

2    0.25
5    0.50
3    0.75
7    1.00
dtype: float64

In [6]:
# State name -> population figure; insertion order becomes the Series index order.
population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 123,
    'Florida': 234,
    'Illinois': 1435,
}

In [7]:
population_dict

{'California': 38332521,
 'Texas': 26448193,
 'New York': 123,
 'Florida': 234,
 'Illinois': 1435}

In [8]:
population = pd.Series(population_dict)

In [9]:
population

California    38332521
Texas         26448193
New York           123
Florida            234
Illinois          1435
dtype: int64

##### Pandas의 객체 : DataFrame

In [10]:
# State name -> area figure.
# NOTE(review): these values are identical to population_dict above, so any
# population/area ratio computed from them is exactly 1.0 - looks like
# placeholder data; confirm whether real area figures were intended.
area_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 123,
    'Florida': 234,
    'Illinois': 1435,
}

In [11]:
area_dict

{'California': 38332521,
 'Texas': 26448193,
 'New York': 123,
 'Florida': 234,
 'Illinois': 1435}

In [12]:
area = pd.Series(area_dict)

In [13]:
area

California    38332521
Texas         26448193
New York           123
Florida            234
Illinois          1435
dtype: int64

In [14]:
states = pd.DataFrame({'population':population,'area':area})

In [15]:
states

Unnamed: 0,population,area
California,38332521,38332521
Texas,26448193,26448193
New York,123,123
Florida,234,234
Illinois,1435,1435


In [16]:
pd.DataFrame(population,columns=['population'])

Unnamed: 0,population
California,38332521
Texas,26448193
New York,123
Florida,234
Illinois,1435


In [17]:
# List of record dicts (one per row): column 'a' is i, column 'b' is 2*i.
data = [dict(a=i, b=2 * i) for i in range(3)]

In [18]:
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


In [19]:
pd.DataFrame([{'a':1,'b':2},{'b':3,'c':4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [22]:
# Build a 3x2 frame from a NumPy array with explicit column and row labels.
# The values come from the unseeded global NumPy RNG, so the displayed
# numbers change on every run.
pd.DataFrame(np.random.rand(3,2),columns=['foo','bar'],index=['a','b','c'])

Unnamed: 0,foo,bar
a,0.858696,0.182845
b,0.33686,0.40004
c,0.62118,0.002428


##### Pandas의 객체 : Index

In [24]:
ind = pd.Index([2,3,5,7,11])

In [25]:
ind[1]

3

In [26]:
ind[::2]

Int64Index([2, 5, 11], dtype='int64')

In [27]:
ind[1]=0 ## elements cannot be modified - an Index is immutable, so this raises TypeError

TypeError: Index does not support mutable operations

In [29]:
indA = pd.Index([1,3,5,7,9])
indB = pd.Index([2,3,5,7,11])

In [30]:
# Set intersection of the two indexes (labels present in both).
# Use the explicit method: `&` between Index objects was deprecated as a set
# operation and, from pandas 2.0, is evaluated element-wise instead.
indA.intersection(indB)

Int64Index([3, 5, 7], dtype='int64')

In [31]:
# Set union of the two indexes (labels present in either, sorted).
# Use the explicit method: `|` between Index objects was deprecated as a set
# operation and, from pandas 2.0, is evaluated element-wise instead.
indA.union(indB)

Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

##### Key를 이용한 Series에서 데이터 인덱싱

In [42]:
data = pd.Series([0.25,0.5,0.75,1.0], index=['a','b','c','d'])

In [43]:
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [44]:
data['b']

0.5

In [45]:
'a' in data

True

In [46]:
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [47]:
list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [48]:
data['e']=1.25

In [49]:
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

In [50]:
data['d']=9.9999

In [51]:
data

a    0.2500
b    0.5000
c    0.7500
d    9.9999
e    1.2500
dtype: float64

##### Key를 이용한 Series에서 데이터 슬라이싱

In [52]:
# Label-based slice: with explicit index labels the end label ('c') is included.
data['a':'c']

a    0.25
b    0.50
c    0.75
dtype: float64

In [53]:
# Integer slice on a string-labelled Series is positional: the stop (2) is excluded.
data[0:2]

a    0.25
b    0.50
dtype: float64

In [54]:
data[(data>0.3)&(data<0.8)]

b    0.50
c    0.75
dtype: float64

In [55]:
data[['a','e']]

a    0.25
e    1.25
dtype: float64

In [56]:
# Fancy indexing by position (4th element, then 3rd).
# Use .iloc explicitly: treating integer keys in [] as positions on a
# label-indexed Series is deprecated in recent pandas (they are looked up as labels).
data.iloc[[3,2]]

d    9.9999
c    0.7500
dtype: float64

##### loc 인덱서 (명시적인 인덱스 참조) / iloc 인덱서 (암묵적인 인덱스 참조)

In [57]:
data = pd.Series(['a','b','c'],index=[1,3,5])

In [58]:
data.iloc[1]

'b'

In [63]:
data.iloc[1:3]

3    b
5    c
dtype: object

In [60]:
data.loc[3]

'b'

In [61]:
data.loc[3:5]

3    b
5    c
dtype: object

##### DataFrame에서 데이터 선택

In [64]:
area = pd.Series({'California':38332521,'Texas':26448193,'New York':123,'Florida':234,'Illinois':1435})

In [65]:
area

California    38332521
Texas         26448193
New York           123
Florida            234
Illinois          1435
dtype: int64

In [66]:
pop = pd.Series({'California':38332521,'Texas':26448193,'New York':123,'Florida':234,'Illinois':1435})

In [67]:
pop

California    38332521
Texas         26448193
New York           123
Florida            234
Illinois          1435
dtype: int64

In [68]:
data = pd.DataFrame({'area':area,'pop':pop})

In [69]:
data

Unnamed: 0,area,pop
California,38332521,38332521
Texas,26448193,26448193
New York,123,123
Florida,234,234
Illinois,1435,1435


In [70]:
data['density'] = data['pop']/data['area']

In [71]:
data

Unnamed: 0,area,pop,density
California,38332521,38332521,1.0
Texas,26448193,26448193,1.0
New York,123,123,1.0
Florida,234,234,1.0
Illinois,1435,1435,1.0


##### DataFrame의 인덱싱

In [72]:
data['Florida':'Illinois']

Unnamed: 0,area,pop,density
Florida,234,234,1.0
Illinois,1435,1435,1.0


In [74]:
data['Florida'] # plain indexing refers to columns - 'Florida' is a row label, so this raises KeyError

KeyError: 'Florida'

In [75]:
data['area':'density'] # slicing refers to rows - 'area' is a column name, so this raises KeyError

KeyError: 'area'

In [76]:
data['area']

California    38332521
Texas         26448193
New York           123
Florida            234
Illinois          1435
Name: area, dtype: int64

In [78]:
data[data.density>0]

Unnamed: 0,area,pop,density
California,38332521,38332521,1.0
Texas,26448193,26448193,1.0
New York,123,123,1.0
Florida,234,234,1.0
Illinois,1435,1435,1.0


##### 결측치 처리
- NaN : 표현 불가능한 수치형 값(Not a Number)이라는 뜻이지만, 대부분의 결측값은 NaN으로 처리됨
- None : 함수의 출력 값이 정의되지 않은 경우에 None이 출력됨

##### NaN 연산

In [79]:
vals = np.array([1,np.nan,3,4])

In [80]:
vals

array([ 1., nan,  3.,  4.])

In [81]:
# A plain (NaN-unaware) sum propagates NaN, so the result is nan.
# np.sum is the vectorized counterpart of np.nansum used below; the builtin
# sum() would iterate the array element by element in Python.
np.sum(vals)

nan

In [82]:
1 + np.nan

nan

In [83]:
0 * np.nan

nan

In [84]:
np.nansum(vals)

8.0

In [85]:
np.nanmin(vals)

1.0

##### NaN 과 None

In [86]:
pd.Series([1,np.nan,2,None])

0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64

In [87]:
x = pd.Series([0,1],dtype=int)

In [88]:
x

0    0
1    1
dtype: int32

In [89]:
x.dtype

dtype('int32')

In [91]:
x[0] = None

In [92]:
x

0    NaN
1    1.0
dtype: float64

In [93]:
x.dtype

dtype('float64')

##### Null값 연산

In [94]:
data = pd.Series([1,np.nan,'hello',None])

In [95]:
data

0        1
1      NaN
2    hello
3     None
dtype: object

In [96]:
data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [97]:
data.notnull()

0     True
1    False
2     True
3    False
dtype: bool

In [98]:
data.dropna()

0        1
2    hello
dtype: object

In [99]:
data.fillna(0)

0        1
1        0
2    hello
3        0
dtype: object

In [100]:
# 3x3 frame for the missing-value demos: row 0 is all NaN, row 1 is
# complete, row 2 is missing only its first value.
df = pd.DataFrame(
    [
        [np.nan, np.nan, np.nan],
        [2, 3, 5],
        [np.nan, 4, 6],
    ]
)

In [101]:
df

Unnamed: 0,0,1,2
0,,,
1,2.0,3.0,5.0
2,,4.0,6.0


In [102]:
df.isnull()

Unnamed: 0,0,1,2
0,True,True,True
1,False,False,False
2,True,False,False


In [103]:
df.notnull()

Unnamed: 0,0,1,2
0,False,False,False
1,True,True,True
2,False,True,True


In [104]:
df.dropna(axis=0)

Unnamed: 0,0,1,2
1,2.0,3.0,5.0


In [105]:
df.dropna(axis=1)

0
1
2


In [106]:
df.dropna(axis=0,how='all')

Unnamed: 0,0,1,2
1,2.0,3.0,5.0
2,,4.0,6.0


In [107]:
df.fillna(0)

Unnamed: 0,0,1,2
0,0.0,0.0,0.0
1,2.0,3.0,5.0
2,0.0,4.0,6.0


In [109]:
# Forward fill: each NaN takes the last valid value above it in its column
# (leading NaNs stay NaN). fillna(method=...) is deprecated since pandas 2.1;
# DataFrame.ffill() is the direct equivalent.
df.ffill()

Unnamed: 0,0,1,2
0,,,
1,2.0,3.0,5.0
2,2.0,4.0,6.0


In [110]:
# Backward fill: each NaN takes the next valid value below it in its column
# (trailing NaNs stay NaN). fillna(method=...) is deprecated since pandas 2.1;
# DataFrame.bfill() is the direct equivalent.
df.bfill()

Unnamed: 0,0,1,2
0,2.0,3.0,5.0
1,2.0,3.0,5.0
2,,4.0,6.0


##### pd.concat

In [111]:
# Two string Series with disjoint integer labels (1-3 and 4-6) for the concat demo.
ser1 = pd.Series(
    data=list('ABC'),
    index=[1, 2, 3],
)
ser2 = pd.Series(
    data=list('DEF'),
    index=[4, 5, 6],
)

In [112]:
ser1

1    A
2    B
3    C
dtype: object

In [113]:
ser2

4    D
5    E
6    F
dtype: object

In [114]:
pd.concat([ser1,ser2])

1    A
2    B
3    C
4    D
5    E
6    F
dtype: object

In [115]:
# Two frames with the same columns and the same cell values; only the row
# labels differ (0-1 vs. 2-3).
df1 = pd.DataFrame(
    {
        'A': ['A1', 'A2'],
        'B': ['B1', 'B2'],
    },
    index=[0, 1],
)
df2 = pd.DataFrame(
    {
        'A': ['A1', 'A2'],
        'B': ['B1', 'B2'],
    },
    index=[2, 3],
)

In [116]:
df1

Unnamed: 0,A,B
0,A1,B1
1,A2,B2


In [117]:
df2

Unnamed: 0,A,B
2,A1,B1
3,A2,B2


In [118]:
pd.concat([df1,df2])

Unnamed: 0,A,B
0,A1,B1
1,A2,B2
2,A1,B1
3,A2,B2


In [119]:
df2.index = df1.index

In [120]:
df1

Unnamed: 0,A,B
0,A1,B1
1,A2,B2


In [121]:
df2

Unnamed: 0,A,B
0,A1,B1
1,A2,B2


In [122]:
pd.concat([df1,df2])

Unnamed: 0,A,B
0,A1,B1
1,A2,B2
0,A1,B1
1,A2,B2


In [123]:
pd.concat([df1,df2],ignore_index = True)

Unnamed: 0,A,B
0,A1,B1
1,A2,B2
2,A1,B1
3,A2,B2


In [124]:
pd.concat([df1,df2],keys=['df1','df2'])

Unnamed: 0,Unnamed: 1,A,B
df1,0,A1,B1
df1,1,A2,B2
df2,0,A1,B1
df2,1,A2,B2


##### pd.merge

In [125]:
# Left table for the merge demos: one row per employee with their group.
df1 = pd.DataFrame(
    {
        'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
        'group': ['Accounting', 'Engineering', 'Engineering', 'HR'],
    }
)

In [126]:
df1

Unnamed: 0,employee,group
0,Bob,Accounting
1,Jake,Engineering
2,Lisa,Engineering
3,Sue,HR


In [127]:
# Right table for the merge demos: shares the 'employee' column with df1.
df2 = pd.DataFrame(
    {
        'employee': ['Bob', 'Jake', 'Lisa', 'Sue'],
        'hire_date': [2004, 2008, 2012, 2014],
    }
)

In [128]:
df2

Unnamed: 0,employee,hire_date
0,Bob,2004
1,Jake,2008
2,Lisa,2012
3,Sue,2014


In [129]:
df3 = pd.merge(df1,df2)

In [130]:
df3

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2004
1,Jake,Engineering,2008
2,Lisa,Engineering,2012
3,Sue,HR,2014


In [131]:
pd.merge(df1,df2,on=['employee'])

Unnamed: 0,employee,group,hire_date
0,Bob,Accounting,2004
1,Jake,Engineering,2008
2,Lisa,Engineering,2012
3,Sue,HR,2014


In [136]:
# Table whose key column is called 'name' rather than 'employee', used to
# demonstrate merging on differently named key columns.
df3 = pd.DataFrame(
    {
        'name': ['Bob', 'Jake', 'Lisa', 'Sue'],
        'salary': [2004, 2008, 2012, 2014],
    }
)

In [137]:
df3

Unnamed: 0,name,salary
0,Bob,2004
1,Jake,2008
2,Lisa,2012
3,Sue,2014


In [139]:
pd.merge(df1,df3,left_on='employee',right_on='name')

Unnamed: 0,employee,group,name,salary
0,Bob,Accounting,Bob,2004
1,Jake,Engineering,Jake,2008
2,Lisa,Engineering,Lisa,2012
3,Sue,HR,Sue,2014


In [145]:
pd.merge(df1,df3,left_index=True,right_index=True)

Unnamed: 0,employee,group,name,salary
0,Bob,Accounting,Bob,2004
1,Jake,Engineering,Jake,2008
2,Lisa,Engineering,Lisa,2012
3,Sue,HR,Sue,2014
