# 데이터프레임

- 행과 열로 구성된 표 형태의 데이터

In [2]:
import pandas as pd

# 데이터 확인

In [5]:
df = pd.read_csv("./data/gapminder.tsv", sep = "\t")

In [3]:
df.head()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106


In [6]:
type(df)

pandas.core.frame.DataFrame

## 행

- 가로로 나열되는 각 데이터의 단위
    - 로우(row) 또는 케이스(case)라고도 불림

In [7]:
df.shape # 행, 열

(1704, 6)

## 열

- 세로로 나열되는 속성
    - 컬럼(column) 또는 변수(variable)라고도 불림

In [8]:
df.columns

Index(['country', 'continent', 'year', 'lifeExp', 'pop', 'gdpPercap'], dtype='object')

## 판다스와 파이썬 자료형

- 문자열
    - 파이썬 : str
    - 판다스 : object
    
- 정수
    - 파이썬 : int 
    - 판다스 : int64
    
- 실수
    - 파이썬 : float
    - 판다스 : float64
    
- datetime
    - 파이썬 : datetime
    - 판다스 : datetime64

In [10]:
# 데이터프레임 값의 자료형 확인 
df.dtypes

country       object
continent     object
year           int64
lifeExp      float64
pop            int64
gdpPercap    float64
dtype: object

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   country    1704 non-null   object 
 1   continent  1704 non-null   object 
 2   year       1704 non-null   int64  
 3   lifeExp    1704 non-null   float64
 4   pop        1704 non-null   int64  
 5   gdpPercap  1704 non-null   float64
dtypes: float64(2), int64(2), object(2)
memory usage: 80.0+ KB


# 데이터 추출

In [22]:
df["country"]

0       Afghanistan
1       Afghanistan
2       Afghanistan
3       Afghanistan
4       Afghanistan
           ...     
1699       Zimbabwe
1700       Zimbabwe
1701       Zimbabwe
1702       Zimbabwe
1703       Zimbabwe
Name: country, Length: 1704, dtype: object

In [23]:
type(df["country"])

pandas.core.series.Series

In [24]:
country_se = df["country"]

In [27]:
# 데이터 앞부분 확인
country_se.head()

0    Afghanistan
1    Afghanistan
2    Afghanistan
3    Afghanistan
4    Afghanistan
Name: country, dtype: object

In [26]:
# 데이터 뒷부분 확인
country_se.tail()

1699    Zimbabwe
1700    Zimbabwe
1701    Zimbabwe
1702    Zimbabwe
1703    Zimbabwe
Name: country, dtype: object

In [29]:
df[["country", "continent", "year"]]

Unnamed: 0,country,continent,year
0,Afghanistan,Asia,1952
1,Afghanistan,Asia,1957
2,Afghanistan,Asia,1962
3,Afghanistan,Asia,1967
4,Afghanistan,Asia,1972
...,...,...,...
1699,Zimbabwe,Africa,1987
1700,Zimbabwe,Africa,1992
1701,Zimbabwe,Africa,1997
1702,Zimbabwe,Africa,2002


In [30]:
subset = df[["country", "continent", "year"]]

In [31]:
type(subset)

pandas.core.frame.DataFrame

In [32]:
subset.head()

Unnamed: 0,country,continent,year
0,Afghanistan,Asia,1952
1,Afghanistan,Asia,1957
2,Afghanistan,Asia,1962
3,Afghanistan,Asia,1967
4,Afghanistan,Asia,1972


## 행 단위 데이터 추출 

- loc
    - 인덱스(행에 붙은 레이블)를 기준으로 행 데이터 추출
- iloc
    - 행 번호(0부터 시작하는 정수 위치)를 기준으로 행 데이터 추출

### loc

- 인덱스(index) : 각 행을 식별하는 레이블
    - 데이터프레임을 만들 때 따로 지정하지 않으면 0부터 시작하는 정수 인덱스가 자동으로 부여됨
        - 현재 gapminder 데이터의 왼쪽에 세로로 나열된 0, 1, 2...

In [33]:
df.head()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106


In [34]:
df.loc[0]

country      Afghanistan
continent           Asia
year                1952
lifeExp           28.801
pop              8425333
gdpPercap     779.445314
Name: 0, dtype: object

In [35]:
df.loc[99]

country      Bangladesh
continent          Asia
year               1967
lifeExp          43.453
pop            62821884
gdpPercap    721.186086
Name: 99, dtype: object

In [36]:
type(df.loc[0])

pandas.core.series.Series

In [39]:
# ord(): given a character, returns that character's Unicode code point
# chr(): given a Unicode code point, returns the corresponding character

ord("a"), ord("z")

(97, 122)

In [40]:
# Print the characters for code points 97..122 ("a".."z") on one line.
# print() puts a space between the unpacked items and end=" " adds the
# trailing space, so the output has a space after every letter.
print(*(chr(code) for code in range(97, 123)), end=" ")

a b c d e f g h i j k l m n o p q r s t u v w x y z 

In [41]:
# Build a 26-row demo frame whose index labels are the letters "a".."z"
# rather than the default 0..25, so label-based lookup (loc) can be
# contrasted with position-based lookup (iloc).
letters = [chr(code) for code in range(ord("a"), ord("z") + 1)]
df_index = pd.DataFrame({"value": list(range(len(letters)))}, index=letters)

In [42]:
df_index.head()

Unnamed: 0,value
a,0
b,1
c,2
d,3
e,4


In [45]:
df_index.loc["a"]

value    0
Name: a, dtype: int64

In [49]:
# 하나의 행을 데이터프레임으로 추출
df_index.loc[["a"]]

Unnamed: 0,value
a,0


In [50]:
# 여러 행 추출
df_index.loc[["a", "c"]]

Unnamed: 0,value
a,0
c,2


In [65]:
# Extracting the last row with loc: start by computing the last row number
# (row count minus one, because row numbers start at 0).
num_rows = len(df_index.index)
last_row = num_rows - 1

In [66]:
last_row

25

In [68]:
# NOTE: loc looks rows up by index *label*, not by position. df_index is
# labelled "a".."z", so the integer 25 is not one of its labels and this
# lookup raises KeyError. A row number has to go through iloc instead.
df_index.loc[last_row]

KeyError: 25

In [58]:
df.tail()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
1699,Zimbabwe,Africa,1987,62.351,9216418,706.157306
1700,Zimbabwe,Africa,1992,60.377,10704340,693.420786
1701,Zimbabwe,Africa,1997,46.809,11404948,792.44996
1702,Zimbabwe,Africa,2002,39.989,11926563,672.038623
1703,Zimbabwe,Africa,2007,43.487,12311143,469.709298


### iloc

In [61]:
df.iloc[1]

country      Afghanistan
continent           Asia
year                1957
lifeExp           30.332
pop              9240934
gdpPercap      820.85303
Name: 1, dtype: object

In [62]:
df_index.iloc[2]

value    2
Name: c, dtype: int64

In [63]:
df.iloc[-1]

country        Zimbabwe
continent        Africa
year               2007
lifeExp          43.487
pop            12311143
gdpPercap    469.709298
Name: 1703, dtype: object

In [64]:
df.iloc[[0, 99, 999]]

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
99,Bangladesh,Asia,1967,43.453,62821884,721.186086
999,Mongolia,Asia,1967,51.253,1149500,1226.04113


# 행과 열 추출

In [70]:
df.loc[:, "year"]

0       1952
1       1957
2       1962
3       1967
4       1972
        ... 
1699    1987
1700    1992
1701    1997
1702    2002
1703    2007
Name: year, Length: 1704, dtype: int64

In [71]:
df.loc[:, ["year", "pop"]]

Unnamed: 0,year,pop
0,1952,8425333
1,1957,9240934
2,1962,10267083
3,1967,11537966
4,1972,13079460
...,...,...
1699,1987,9216418
1700,1992,10704340
1701,1997,11404948
1702,2002,11926563


In [74]:
df.iloc[:, [2, 4, -1]].head()

Unnamed: 0,year,pop,gdpPercap
0,1952,8425333,779.445314
1,1957,9240934,820.85303
2,1962,10267083,853.10071
3,1967,11537966,836.197138
4,1972,13079460,739.981106


In [75]:
df.iloc[:5, :3].head()

Unnamed: 0,country,continent,year
0,Afghanistan,Asia,1952
1,Afghanistan,Asia,1957
2,Afghanistan,Asia,1962
3,Afghanistan,Asia,1967
4,Afghanistan,Asia,1972


In [76]:
df.iloc[:, 0:6:2].head()

Unnamed: 0,country,year,pop
0,Afghanistan,1952,8425333
1,Afghanistan,1957,9240934
2,Afghanistan,1962,10267083
3,Afghanistan,1967,11537966
4,Afghanistan,1972,13079460


In [77]:
df.iloc[[0, 99, 999], [0, 3, 5]]

Unnamed: 0,country,lifeExp,gdpPercap
0,Afghanistan,28.801,779.445314
99,Bangladesh,43.453,721.186086
999,Mongolia,51.253,1226.04113


In [78]:
df.loc[[0, 99, 999], ["country", "lifeExp", "gdpPercap"]]

Unnamed: 0,country,lifeExp,gdpPercap
0,Afghanistan,28.801,779.445314
99,Bangladesh,43.453,721.186086
999,Mongolia,51.253,1226.04113


# 기초적인 통계 계산

In [80]:
df.describe()

Unnamed: 0,year,lifeExp,pop,gdpPercap
count,1704.0,1704.0,1704.0,1704.0
mean,1979.5,59.474439,29601210.0,7215.327081
std,17.26533,12.917107,106157900.0,9857.454543
min,1952.0,23.599,60011.0,241.165876
25%,1965.75,48.198,2793664.0,1202.060309
50%,1979.5,60.7125,7023596.0,3531.846988
75%,1993.25,70.8455,19585220.0,9325.462346
max,2007.0,82.603,1318683000.0,113523.1329


## 그룹화한 데이터의 평균 구하기

In [83]:
df.head()

Unnamed: 0,country,continent,year,lifeExp,pop,gdpPercap
0,Afghanistan,Asia,1952,28.801,8425333,779.445314
1,Afghanistan,Asia,1957,30.332,9240934,820.85303
2,Afghanistan,Asia,1962,31.997,10267083,853.10071
3,Afghanistan,Asia,1967,34.02,11537966,836.197138
4,Afghanistan,Asia,1972,36.088,13079460,739.981106


In [85]:
df.groupby("year")["lifeExp"].mean()

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64

In [88]:
df.groupby(["year", "continent"])[["lifeExp", "gdpPercap"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,lifeExp,gdpPercap
year,continent,Unnamed: 2_level_1,Unnamed: 3_level_1
1952,Africa,39.1355,1252.572466
1952,Americas,53.27984,4079.062552
1952,Asia,46.314394,5195.484004
1952,Europe,64.4085,5661.057435
1952,Oceania,69.255,10298.08565
1957,Africa,41.266346,1385.236062
1957,Americas,55.96028,4616.043733
1957,Asia,49.318544,5787.73294
1957,Europe,66.703067,6963.012816
1957,Oceania,70.295,11598.522455


## 그룹화한 데이터 개수 세기

In [89]:
df.groupby("continent")["country"].nunique() # number of distinct countries in each continent

continent
Africa      52
Americas    25
Asia        33
Europe      30
Oceania      2
Name: country, dtype: int64

In [90]:
df.groupby("continent")["country"].value_counts() # number of rows for each country, listed within its continent

continent  country       
Africa     Algeria           12
           Angola            12
           Benin             12
           Botswana          12
           Burkina Faso      12
                             ..
Europe     Switzerland       12
           Turkey            12
           United Kingdom    12
Oceania    Australia         12
           New Zealand       12
Name: country, Length: 142, dtype: int64

In [92]:
df.groupby("continent")["country"].count() # number of non-null country entries (i.e. rows) in each continent

continent
Africa      624
Americas    300
Asia        396
Europe      360
Oceania      24
Name: country, dtype: int64

# 데이터 생성

In [93]:
pd.Series(["banana", 42])

0    banana
1        42
dtype: object

In [95]:
pd.Series(["Wes McKinney", "Creator of Pandas"])

0         Wes McKinney
1    Creator of Pandas
dtype: object

In [96]:
pd.Series(["Wes McKinney", "Creator of Pandas"], index = ["Person", "Who"])

Person         Wes McKinney
Who       Creator of Pandas
dtype: object

In [102]:
# Build a small two-person frame by hand, one inner list per person.
# The values in each row follow the order given in `columns`, and the
# people's names are used as the index labels instead of 0, 1.
scientists = pd.DataFrame(
    data=[
        ["Chemist", "1920-07-25", 37, "1958-04-16"],
        ["Statistician", "1876-06-13", 61, "1937-10-16"],
    ],
    index=["Rosaline Franklin", "William Gosset"],
    columns=["Occupation", "Born", "Age", "Died"],
)

In [103]:
scientists

Unnamed: 0,Occupation,Born,Age,Died
Rosaline Franklin,Chemist,1920-07-25,37,1958-04-16
William Gosset,Statistician,1876-06-13,61,1937-10-16


# 시리즈 다루기

In [104]:
scientists.loc["William Gosset"]

Occupation    Statistician
Born            1876-06-13
Age                     61
Died            1937-10-16
Name: William Gosset, dtype: object

In [105]:
type(scientists.loc["William Gosset"])

pandas.core.series.Series

In [106]:
scientists.loc["William Gosset"].index

Index(['Occupation', 'Born', 'Age', 'Died'], dtype='object')

In [107]:
scientists.loc["William Gosset"].keys()

Index(['Occupation', 'Born', 'Age', 'Died'], dtype='object')

In [111]:
scientists.loc["William Gosset"].values

array(['Statistician', '1876-06-13', 61, '1937-10-16'], dtype=object)

In [113]:
scientists.loc["William Gosset"].index[0]

'Occupation'

## 시리즈에 기초통계 메서드 사용

In [114]:
scientists["Age"].mean()

49.0

In [115]:
scientists["Age"].min()

37

In [116]:
scientists["Age"].max()

61

In [117]:
scientists["Age"].std()

16.97056274847714

In [119]:
scientists = pd.read_csv("./data/scientists.csv")

In [127]:
scientists.head()

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist


In [128]:
scientists["Age"].max()

90

In [129]:
scientists["Age"].mean()

59.125

In [130]:
scientists["Age"].std()

18.325918413937288

In [132]:
scientists["Age"].min()

37

# 시리즈 다루기 응용

In [135]:
# 평균보다 나이가 많은 사람의 데이터 추출
ages = scientists["Age"]

In [136]:
ages[ages > ages.mean()]

1    61
2    90
3    66
7    77
Name: Age, dtype: int64

In [137]:
ages > ages.mean()

0    False
1     True
2     True
3     True
4    False
5    False
6    False
7     True
Name: Age, dtype: bool

In [138]:
# A hand-written list of booleans works as a mask just like the comparison
# result does: only the entries at positions holding True are returned.
ages[[True, True, False, False, True, True, False, True]]

0    37
1    61
4    56
5    45
7    77
Name: Age, dtype: int64