In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

class disp(object):
    """Show several objects side by side as floating HTML boxes.

    Each positional argument is a *string* holding the name (or expression)
    of an object to display, e.g. ``disp('df1', 'df2')``.  The string is used
    as the box caption and is evaluated to obtain the object to render.

    Security note: the strings are passed to ``eval``; only pass names or
    expressions you wrote yourself, never untrusted input.
    """
    # {0} is the caption (the expression text), {1} the HTML body of the object.
    template = '<div style="float: left;padding:10px;"> <b>[{0}]</b> {1}</div>'

    def __init__(self, *args):
        """Store the expression strings to be rendered later."""
        self.args = args

    def _repr_html_(self):
        """Return one HTML box per stored expression, joined by newlines.

        Objects that provide ``_repr_html_`` are rendered with it; any other
        object falls back to its escaped ``repr`` inside a ``<pre>`` element
        instead of raising ``AttributeError``.
        """
        import html

        boxes = []
        for expr in self.args:
            # Evaluate against the module globals only, so method locals
            # (such as the loop variable) can never shadow a same-named global.
            obj = eval(expr, globals())
            render = getattr(obj, '_repr_html_', None)
            body = render() if callable(render) else None
            if body is None:
                body = '<pre>{0}</pre>'.format(html.escape(repr(obj)))
            boxes.append(self.template.format(expr, body))
        return '\n'.join(boxes)

def prt(*values):
    """Print every argument on its own line, then leave one blank line."""
    text = '\n'.join(str(value) for value in values)
    print(text, end='\n\n')
    
import os

import numpy as np
import pandas as pd

### [예제1] Series 생성 : copy parameter

* copy하지 않으면 multiple binding 문제 발생
* a = b로 a에 b를 바인딩 한 후, b의 원소를 바꾸면 a의 원소도 바뀜 -> **multiple binding**

In [2]:
# Before modifying the ndarray: build two Series from the same array
a = np.array([3,5])
sr1 = pd.Series(a, index=('A', 'B'))  # built from `a` without requesting a copy
sr2 = pd.Series(a, index=('A', 'B'), copy=True)  # copy=True requests an independent copy of the data

prt(a, sr1, sr2)

[3 5]
A    3
B    5
dtype: int32
A    3
B    5
dtype: int32



In [3]:
# After modifying the ndarray: a Series sharing the array follows the change,
# a copied Series does not.
a = np.array([3,5])
# copy=False is spelled out so that sr1 shares memory with `a` regardless of the
# constructor's default (pandas with Copy-on-Write copies ndarray input by default).
sr1 = pd.Series(a, index=('A','B'), copy=False)
sr2 = pd.Series(a, index=('A','B'), copy=True)  # copied, so no multiple binding with `a`
a[0] = 100

prt(a, sr1, sr2)

[100   5]
A    100
B      5
dtype: int32
A    3
B    5
dtype: int32



### [예제2] Series 생성 : iterable, array_like

In [None]:
# [s1 ~ s6] Series built from iterables, scalars and array-likes

s1 = pd.Series(range(20,23))  # from a range; no index given
s2 = pd.Series(10)  # from a single scalar
s3 = pd.Series(10, index = ['kim','lee'], name='score')  # scalar with a two-label index and a name
s4 = pd.Series('hello')  # a string passed as the data
s5 = pd.Series(['h','e','l'])  # a list of strings
s6 = pd.Series(np.array([90,80]), index=['kim','lee'], dtype=float)  # data type -> float
# s6 = pd.Series(np.array([90,80]), index=['kim','lee'])  # without dtype the data type is chosen automatically (-> int)
s1; s2; s3; s4; s5; s6

In [None]:
# [s7 ~ s9]
# Label alignment: the Series is built around the given index -> where a dict key or
# Series label matches an index entry its value is used, otherwise the entry is NaN.

s7 = pd.Series({10:'kim', 20:'lee', 30:'park'})
s8 = pd.Series({10:'kim', 20:'lee', 30:'park'}, index=[20, 10, 11])  # keys ordered to match the index (key 11 is absent, so its value is NaN)
s9 = pd.Series(s7, index=[20, 10, 11])  # s7 aligned to index=[20, 10, 11]
s7; s8; s9

### [예제3] DataFrame 생성 : 빈 DataFrame 생성

In [4]:
df1 = pd.DataFrame(columns=list('ABC')) # empty DataFrame with columns 'A','B','C' and no rows
df2 = pd.DataFrame(index=list('ABC')) # empty DataFrame with index 'A','B','C' and no columns
df3 = pd.DataFrame(index=np.arange(2), columns=list('ABC')) # 2x3 DataFrame (index 0, 1; columns A, B, C) with no data supplied
df1; df2; df3

Unnamed: 0,A,B,C


A
B
C


Unnamed: 0,A,B,C
0,,,
1,,,


### [예제4] DataFrame 생성 : Scalar, Iterable

In [1]:
# [df2, df3]

# Kept commented out so the cell runs to completion (same convention as the other
# error examples in this notebook).
# df1 = pd.DataFrame(0, index=[1,2])  #error <- scalar data given without columns (scalar data needs BOTH index and columns)
df2 = pd.DataFrame(0, index=[1,2], columns=['A','B'])  # the scalar 0 fills every cell of the 2x2 frame
df3 = pd.DataFrame(50, index=range(10,12), columns=['A','B','C'], dtype=float)  # scalar 50 stored as float
df2; df3

In [6]:
# [df4, df5, df6] DataFrames from a list of row lists

df4 = pd.DataFrame([['kim',90],['lee',50]])  # no labels given: rows and columns are numbered from 0
df5 = pd.DataFrame([['kim',90],['lee',50]], columns=['n','s'], index=[10,11])  # explicit column and row labels
df6 = pd.DataFrame([['kim',90],['lee']], columns=['n','s'], index=['A','B'])  # the shorter second row leaves 's' missing for 'B'
df4; df5; df6

Unnamed: 0,0,1
0,kim,90
1,lee,50


Unnamed: 0,n,s
10,kim,90
11,lee,50


Unnamed: 0,n,s
A,kim,90.0
B,lee,


### [예제5] DataFrame 생성 : dict

In [9]:
# [df2, df3]

# df1 = pd.DataFrame({'n':'kim','a':10})  #error <- scalars have to be broadcast, so the number of rows (an index) must be given
df2 = pd.DataFrame({'n':'kim','a':10}, index=[0])
df3 = pd.DataFrame({'n':['kim'],'a':[10]})  # alternatively, pass lists so the length is already fixed
df2; df3

Unnamed: 0,n,a
0,kim,10


Unnamed: 0,n,a
0,kim,10


In [10]:
# [df4, df5]

df4 = pd.DataFrame({'n':'kim','a':10}, index=[1,2])  # scalar values repeated for both index labels
# One-element lists with a longer index: the recorded output shows the single value
# repeated for every label.
# NOTE(review): this may depend on the pandas version - confirm on the target version.
df5 = pd.DataFrame({'n':['kim'],'a':[10]}, index=[1,2])
df5_1 = pd.DataFrame({'n':['kim'],'a':[10]}, index=[1,2,3])
# df5_2 = pd.DataFrame({'n':['kim','lee'],'a':[10,20]}, index=[1,2,3,4])  #error <- broadcasting does not tile by multiples; only a single value can be repeated
df4; df5; df5_1

Unnamed: 0,n,a
1,kim,10
2,kim,10


Unnamed: 0,n,a
1,kim,10
2,kim,10


Unnamed: 0,n,a
1,kim,10
2,kim,10
3,kim,10


In [8]:
# [df6, df7] dict of equal-length lists: keys become columns

df6 = pd.DataFrame({'n':['kim','lee','park'],'a':[10,20,30]})  # no index given: rows numbered from 0
df7 = pd.DataFrame({'n':['kim','lee','park'],'a':[10,20,30]},index=['x','y','z'])  # explicit row labels
df6; df7

Unnamed: 0,n,a
0,kim,10
1,lee,20
2,park,30


Unnamed: 0,n,a
x,kim,10
y,lee,20
z,park,30


In [11]:
# [df8]

sr = pd.Series(['kim','lee','park'])
# Label alignment: the keys of the 'a' dict are matched against the DataFrame's row
# labels; label 1 has no entry in the dict, so 'a' is missing there.
df8 = pd.DataFrame({'n':sr, 'a':{0:10, 2:20}})
sr; df8

0     kim
1     lee
2    park
dtype: object

Unnamed: 0,n,a
0,kim,10.0
1,lee,
2,park,20.0


In [12]:
# [df9]

# List of dicts: each dict becomes one row and its keys become the column labels.
df9 = pd.DataFrame([{'n':'kim','a':10}, {'n':'lee','a':20}])
df9

Unnamed: 0,n,a
0,kim,10
1,lee,20


### [예제6] DataFrame 생성 : ndarray

* type의 크기 : object > float > int
* 타입이 서로 다를 땐, 모두 포괄할 수 있도록 더 큰 타입으로 생성됨

    ex) float, object로 이뤄짐 -> 모두 object로
    
    ex) int, float로 이뤄짐 -> 모두 float로

In [13]:
arr1 = np.array([[1,2,3],[4,5,6]])
arr2 = np.array([['kim', 'W', 90], ['lee', 'M', 50]]) # an ndarray stores all elements with one common type -> the numbers end up as object columns too
df1 = pd.DataFrame(arr1, index=['X','Y'], columns=['A','B','C'])
df2 = pd.DataFrame(arr2, index=[10,20], columns = ['A','B','C'])

# Show the column dtypes first, then both frames side by side.
prt(df1.dtypes, df2.dtypes)
disp('df1', 'df2')

A    int32
B    int32
C    int32
dtype: object
A    object
B    object
C    object
dtype: object



Unnamed: 0,A,B,C
X,1,2,3
Y,4,5,6

Unnamed: 0,A,B,C
10,kim,W,90
20,lee,M,50


In [14]:
# Convert column 'C' to an integer dtype; this overwrites the column in df2.
df2['C'] = df2['C'].astype(int)

prt(df2.dtypes)

A    object
B    object
C     int32
dtype: object



### [예제7] DataFrame 생성 : DataFrame, Series, 기타

In [15]:
# [1] Building DataFrames from an existing DataFrame or from its columns

df = pd.DataFrame({'id':[11,12,13],'n':['kim','lee','park'],'a':[30,40,50]})
df1 = pd.DataFrame(df, columns=['n','a'], index=[0, 2])  # keep columns n, a and rows 0, 2 of df
df2 = pd.DataFrame({'names':df['n'], 'age':df['a']}, index=[0,2])  # dict of Series under new column names, rows 0 and 2
df3 = pd.DataFrame(df, columns=['n','a','h'], index=[0,2,10])  # labels absent from df (column 'h', row 10) come out empty

disp('df', 'df1', 'df2','df3')

Unnamed: 0,id,n,a
0,11,kim,30
1,12,lee,40
2,13,park,50

Unnamed: 0,n,a
0,kim,30
2,park,50

Unnamed: 0,names,age
0,kim,30
2,park,50

Unnamed: 0,n,a,h
0,kim,30.0,
2,park,50.0,
10,,,


In [None]:
# [2] DataFrame built from a single column selected from another DataFrame

df = pd.DataFrame({'id':[11,12,13],'n':['kim','lee','park'],'a':[30,40,50]})
df4 = pd.DataFrame(df['n'])  # df['n'] is the single column 'n' of df

disp('df', 'df4')

In [None]:
# [3] DataFrame.from_dict with the default orientation and with orient='index'

df5 = pd.DataFrame.from_dict({'n':['kim','lee'],'a':[30,40]})  # default orientation
df6 = pd.DataFrame.from_dict({'n':['kim','lee'],'a':[30,40]}, orient='index')  # orient='index': dict keys are used as row labels

disp('df5', 'df6')

### [예제8] Index 생성

In [None]:
# [1] Named Index objects used as the index of a Series

i1 = pd.Index([11, 12, 13], name='id')
i2 = pd.Index(['A','B','A'], name='class')  # the label 'A' appears twice
sr1 = pd.Series(['kim','lee','park'], index=i1)
sr2 = pd.Series(['kim','lee','park'], index=i2)
i1; i2; sr1; sr2

In [None]:
# [2] Index from a range with an explicit dtype, and a RangeIndex

i3 = pd.Index(range(11, 14), dtype = 'int32')  # labels 11, 12, 13
i4 = pd.RangeIndex(11, 16, 2, name='id')  # start=11, stop=16, step=2
sr3 = pd.Series(['kim','lee','park'], index = i3)
sr4 = pd.Series(sr3, index = i4)  # sr3 passed as data together with i4 as the index
i3; i4; sr3; sr4

### [예제9] DatetimeIndex 객체

In [6]:
# [1] Three ways of obtaining a DatetimeIndex

i1 = pd.DatetimeIndex(['2000-1-2','2000-1-27','2000-2-21','2000-3-17'])  # from explicit date strings
i2 = pd.date_range('1/2/2000', periods=4, freq='25D')  # 4 dates, 25 days apart
# 'min' is the minute alias; the single-letter 'T' alias is deprecated in recent pandas.
i3 = pd.date_range('2000-1-1 12:00:00', periods=2, freq='30min')
prt(type(i1),i1.dtype, type(i2),i2.dtype, type(i3), i3.dtype)
i1; i2; i3

<class 'pandas.core.indexes.datetimes.DatetimeIndex'>
datetime64[ns]
<class 'pandas.core.indexes.datetimes.DatetimeIndex'>
datetime64[ns]
<class 'pandas.core.indexes.datetimes.DatetimeIndex'>
datetime64[ns]



DatetimeIndex(['2000-01-02', '2000-01-27', '2000-02-21', '2000-03-17'], dtype='datetime64[ns]', freq=None)

DatetimeIndex(['2000-01-02', '2000-01-27', '2000-02-21', '2000-03-17'], dtype='datetime64[ns]', freq='25D')

DatetimeIndex(['2000-01-01 12:00:00', '2000-01-01 12:30:00'], dtype='datetime64[ns]', freq='30T')

In [7]:
# [2] Selecting from a Series with date strings (i2 comes from the previous cell)

sr = pd.Series(range(4), index=i2)
r1 = sr['2000-01-27']  # exact date -> a single value
r2 = sr['2000-01-01':'2000-02-20']  # slice between two date strings
r3 = sr['2000-01']  # year-month string -> every entry in January 2000
r4 = sr['2000']  # year string -> every entry in 2000
sr; r1; r2; r3; r4

2000-01-02    0
2000-01-27    1
2000-02-21    2
2000-03-17    3
Freq: 25D, dtype: int64

1

2000-01-02    0
2000-01-27    1
Freq: 25D, dtype: int64

2000-01-02    0
2000-01-27    1
Freq: 25D, dtype: int64

2000-01-02    0
2000-01-27    1
2000-02-21    2
2000-03-17    3
Freq: 25D, dtype: int64

### [예제10] Series, DataFrame의 Format 변환!

In [None]:
#[1] Round-trip a DataFrame through Excel and CSV files, and convert it to a dict

df = pd.DataFrame({'n':['kim','lee','park'],'a':[19,20,22]})

# The writers below do not create missing directories, so make sure it exists.
os.makedirs('out_data', exist_ok=True)

df.to_excel('out_data/test1.xlsx', index=False)  # index=False: do not write the row labels
r1 = pd.read_excel('out_data/test1.xlsx')
df.to_csv('out_data/test2.csv', index=False)
r2 = pd.read_csv('out_data/test2.csv')
r3 = df.to_dict()
r1; r2; r3

In [None]:
# [2] Convert a named Series (with a named index) to other formats

idx = pd.Index(['kim','lee'], name='name')
sr = pd.Series([10,20], index=idx, name='score')

r4 = sr.to_numpy()  # values as an ndarray
r5 = sr.to_list()   # values as a Python list
r6 = sr.to_dict()   # index label -> value
r7 = sr.to_frame()  # one-column DataFrame
# to_csv does not create missing directories, so make sure it exists.
os.makedirs('out_data', exist_ok=True)
sr.to_csv('out_data/test3.csv')
r8 = pd.read_csv('out_data/test3.csv')
sr; r4; r5; r6; r7; r8