In [6]:
#!pip install pandas

In [7]:
#!pip install seaborn

In [8]:
import pandas as pd 

# Show the installed pandas version.
pd.__version__

'2.0.3'

## pandas의 데이터 구조
- Series: 1차원 데이터 구조로, 컬럼의 개수가 1개인 데이터
- DataFrame: 2차원 데이터 구조로, 컬럼의 개수가 여러 개인 데이터

In [9]:
# Build a Series from a plain list; no index is given, so the labels are 0..4 (see output).
s1 = pd.Series([10, 20, 30, 40, 50])
s1

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [10]:
# Index labels of s1; with no explicit index this is a RangeIndex (see output).
s1.index

RangeIndex(start=0, stop=5, step=1)

In [11]:
# The data of s1 without its index, returned as a NumPy array (see output).
s1.values

array([10, 20, 30, 40, 50], dtype=int64)

In [12]:
# Mixing strings and integers in one Series yields dtype object (see output).
# Note: this rebinds s1, replacing the integer Series created earlier.
s1 = pd.Series(['a', 'b', 'c', 1, 2, 3])
s1

0    a
1    b
2    c
3    1
4    2
5    3
dtype: object

In [13]:
import numpy as np 

# With np.nan present, the integers are stored as floats (dtype float64 in the output).
s3 = pd.Series([np.nan, 10, 30]) # np.nan marks a missing value
s3

0     NaN
1    10.0
2    30.0
dtype: float64

In [14]:
# Use date strings as the index labels instead of the default 0, 1, ...
index_date = ['2018-10-07', '2018-10-08']
s4 = pd.Series([200, 195], index = index_date)
s4

2018-10-07    200
2018-10-08    195
dtype: int64

In [15]:
# Both index labels are the same date string here.
index_date = ['2018-10-08', '2018-10-08']
s4 = pd.Series([200, 195], index = index_date)
s4

# As a rule, index labels should be unique; in practice pandas accepts duplicates (see output).

2018-10-08    200
2018-10-08    195
dtype: int64

In [16]:
# A dict passed to pd.Series: the keys become the index labels and the
# values become the data (see output).
data_dict = {
    '국어' : 100, 
    '영어' : 95
}

s5 = pd.Series(data_dict)
s5

국어    100
영어     95
dtype: int64

## 날짜 데이터 
- p.242

In [17]:
# start and end use different separators ('/' vs '.'); the output shows both are
# parsed, giving one entry per day from 2024-01-01 through 2024-01-07 (freq='D').
pd.date_range(start='2024/01/01', end='2024.01.07')

DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04',
               '2024-01-05', '2024-01-06', '2024-01-07'],
              dtype='datetime64[ns]', freq='D')

In [18]:
# periods=7 replaces the end date: seven consecutive daily entries beginning at start.
pd.date_range(start='2024/01/01', periods = 7)

DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04',
               '2024-01-05', '2024-01-06', '2024-01-07'],
              dtype='datetime64[ns]', freq='D')

In [19]:
# freq='2D' steps two days at a time; periods=4 yields four entries (see output).
pd.date_range(start='2024/01/01', periods = 4, freq = '2D')

DatetimeIndex(['2024-01-01', '2024-01-03', '2024-01-05', '2024-01-07'], dtype='datetime64[ns]', freq='2D')

In [20]:
# Four hourly timestamps starting at 08:00 on 2024-01-01.
# The lowercase 'h' alias is used because the uppercase 'H' alias is deprecated
# since pandas 2.2; the generated timestamps are the same.
pd.date_range(start='2024-01-01 08:00', periods = 4, freq = 'h')

DatetimeIndex(['2024-01-01 08:00:00', '2024-01-01 09:00:00',
               '2024-01-01 10:00:00', '2024-01-01 11:00:00'],
              dtype='datetime64[ns]', freq='H')

## DataFrame을 활용한 데이터 생성

In [21]:
#!pip install seaborn



In [22]:
import seaborn as sns # visualization library (statistical plots)

# Show the installed seaborn version.
sns.__version__

'0.12.2'

In [23]:
# List the names of the example datasets that seaborn can load (see output).
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'dowjones',
 'exercise',
 'flights',
 'fmri',
 'geyser',
 'glue',
 'healthexp',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'seaice',
 'taxis',
 'tips',
 'titanic']

In [24]:
# Load the example dataset named 'titanic' through seaborn and display it.
# The notebook shows a truncated preview (first and last rows).
titanic = sns.load_dataset('titanic')
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [25]:
# See p. 247
# p. 249: the dict-based style is the preferred one here

# Column-oriented data: each key maps to a list holding one entry per row.
table_data = {
    '연도' : [2015, 2016, 2016, 2017, 2017], 
    '지사' : ['한국', '한국', '미국', '한국', '미국'], 
    '고객 수' : [200, 250, 450, 300, 500]
}

table_data

{'연도': [2015, 2016, 2016, 2017, 2017],
 '지사': ['한국', '한국', '미국', '한국', '미국'],
 '고객 수': [200, 250, 450, 300, 500]}

In [26]:
# Build a DataFrame from the dict: keys become column names and the list
# items become the rows (see output).
data = pd.DataFrame(table_data)
data

Unnamed: 0,연도,지사,고객 수
0,2015,한국,200
1,2016,한국,250
2,2016,미국,450
3,2017,한국,300
4,2017,미국,500


In [27]:
# Row labels; no index was supplied, so this is the default RangeIndex (see output).
data.index

RangeIndex(start=0, stop=5, step=1)

In [28]:
# Shape of the underlying value array as (rows, columns); (5, 3) for this frame.
data.values.shape

(5, 3)

- 데이터 가공 시 numpy 메서드와 pandas 메서드를 조합해서 처리하는 경우가 많음
    + 파이썬 기초 문법(for-loop) 대신 벡터화(vectorization) 연산으로 처리
    + for-loop로 처리할 때보다 속도가 매우 빠름

In [29]:
# Column labels of the DataFrame (see output).
data.columns

Index(['연도', '지사', '고객 수'], dtype='object')

In [30]:
# Display the titanic DataFrame again; the notebook shows a truncated preview.
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


## 데이터 연산 

In [31]:
# Unlike Python lists and NumPy arrays, pandas objects of different sizes can
# still be combined in arithmetic.
# NOTE(review): s1 and s2 below have the same length, so this cell only shows
# element-wise addition and does not demonstrate the different-size case.

s1 = pd.Series([1, 2, 3, 4, 5])
s2 = pd.Series([10, 20, 30, 40, 50])
s1 + s2

0    11
1    22
2    33
3    44
4    55
dtype: int64

In [32]:
# Three numeric columns (A, B, C) with five rows each; no index is given,
# so the rows are labelled 0..4.
table_data1 = dict(
    A=[1, 2, 3, 4, 5],
    B=[10, 20, 30, 40, 50],
    C=[100, 200, 300, 400, 500],
)
df1 = pd.DataFrame(data=table_data1)
df1

Unnamed: 0,A,B,C
0,1,10,100
1,2,20,200
2,3,30,300
3,4,40,400
4,5,50,500


In [33]:
# Same column names as A/B/C above but only three rows (labels 0..2).
table_data2 = dict(
    A=[6, 7, 8],
    B=[60, 70, 80],
    C=[600, 700, 800],
)
df2 = pd.DataFrame(data=table_data2)
df2

Unnamed: 0,A,B,C
0,6,60,600
1,7,70,700
2,8,80,800


In [34]:
# Addition is aligned on row labels: labels 0-2 exist in both frames and are summed,
# while labels 3 and 4 exist only in df1 and come out as NaN. The result columns
# are shown as floats (see output).
df1 + df2

Unnamed: 0,A,B,C
0,7.0,70.0,700.0
1,9.0,90.0,900.0
2,11.0,110.0,1100.0
3,,,
4,,,


In [35]:
# Row labels are the year strings '2012'..'2016'; the column order follows columns_list.
index_list = [str(year) for year in range(2012, 2017)]
columns_list = ['봄', '여름', '가을', '겨울']

# One list per season column, each holding one value per row label in index_list order.
table_data3 = {
    '봄': [256.5, 264.3, 215.9, 223.2, 312.8],
    '여름': [770.6, 567.5, 599.8, 387.1, 446.2],
    '가을': [363.5, 231.2, 293.1, 247.7, 381.6],
    '겨울': [139.3, 59.9, 76.9, 109.1, 108.1],
}

df3 = pd.DataFrame(data=table_data3, index=index_list, columns=columns_list)
df3

Unnamed: 0,봄,여름,가을,겨울
2012,256.5,770.6,363.5,139.3
2013,264.3,567.5,231.2,59.9
2014,215.9,599.8,293.1,76.9
2015,223.2,387.1,247.7,109.1
2016,312.8,446.2,381.6,108.1


In [36]:
# Mean of each column: one value per season column (see output).
df3.mean()

봄     254.54
여름    554.24
가을    303.42
겨울     98.66
dtype: float64

In [37]:
# Maximum of each column (see output).
df3.max()

봄     312.8
여름    770.6
가을    381.6
겨울    139.3
dtype: float64

In [38]:
# Minimum of each column (see output).
df3.min()

봄     215.9
여름    387.1
가을    231.2
겨울     59.9
dtype: float64

In [41]:
# axis=1 aggregates across the columns, giving one mean per row (year label).
df3.mean(axis=1)

2012    382.475
2013    280.725
2014    296.425
2015    241.775
2016    312.175
dtype: float64

In [42]:
# Row-wise maximum: the largest of the four column values for each year label.
df3.max(axis=1)

2012    770.6
2013    567.5
2014    599.8
2015    387.1
2016    446.2
dtype: float64

In [44]:
# Row-wise standard deviation; the output values match the sample formula (n - 1 denominator).
df3.std(axis=1)

2012    274.472128
2013    211.128782
2014    221.150739
2015    114.166760
2016    146.548658
dtype: float64