# Pandas Tutorial

In [1]:
!pip install pandas numpy



In [2]:
import numpy as np  # 여러개의 숫자를 동시에 잘 다룰 수 있게 하는 라이브러리, 벡터, 행렬 연산에 도움을 주는 라이브러리
import pandas as pd

# 기본적인 자료구조

1. Series: 어떤 타입의 데이터든지 가질 수 있는 레이블이 있는 일차원 배열입니다. 예를 들어 정수, 문자열, 파이썬 객체 등
2. DataFrame: 데이터를 2차원 배열이나 행과 열이 있는 테이블처럼 보유하는 2차원 데이터 구조


# 객체 생성

값의 리스트를 전달하여 시리즈를 생성하고, 판다스가 기본 RangeIndex를 생성하도록 합니다.

## Series

In [3]:
# Build a Series from a plain list of values. No index is passed, so pandas
# supplies the default integer labels; np.nan stands in for a missing entry.
s = pd.Series(
    data=[1, 3, 5, np.nan, 6, 8],
)

In [4]:
s

Unnamed: 0,0
0,1.0
1,3.0
2,5.0
3,
4,6.0
5,8.0


# DataFrame

## 여러가지 DataFrame 만드는 방법

In [5]:
# Row-oriented construction: every dict is one record, and its keys become
# the column names of the resulting DataFrame.
data = [
    dict(name="genji", hp=200),
    dict(name="doomfist", hp=450),
    dict(name="merci", hp=200),
]

df = pd.DataFrame(data=data)

In [6]:
df

Unnamed: 0,name,hp
0,genji,200
1,doomfist,450
2,merci,200


In [7]:
# Column-oriented construction: every key is a column name mapped to the
# list of values stored in that column.
data = dict(
    name=["genji", "doomfist", "merci"],
    hp=[200, 450, 200],
)
df = pd.DataFrame(data=data)

In [8]:
df

Unnamed: 0,name,hp
0,genji,200
1,doomfist,450
2,merci,200


data, column name, index를 step by step 으로 만들어보기

In [9]:
# 15 x 4 grid of floats in which every cell encodes its own position as
# "<row>.<col>" (for example row 3, column 2 holds 3.2).
mat = [
    [float(f"{r}.{c}") for c in range(4)]
    for r in range(15)
]

In [10]:
mat

[[0.0, 0.1, 0.2, 0.3],
 [1.0, 1.1, 1.2, 1.3],
 [2.0, 2.1, 2.2, 2.3],
 [3.0, 3.1, 3.2, 3.3],
 [4.0, 4.1, 4.2, 4.3],
 [5.0, 5.1, 5.2, 5.3],
 [6.0, 6.1, 6.2, 6.3],
 [7.0, 7.1, 7.2, 7.3],
 [8.0, 8.1, 8.2, 8.3],
 [9.0, 9.1, 9.2, 9.3],
 [10.0, 10.1, 10.2, 10.3],
 [11.0, 11.1, 11.2, 11.3],
 [12.0, 12.1, 12.2, 12.3],
 [13.0, 13.1, 13.2, 13.3],
 [14.0, 14.1, 14.2, 14.3]]

In [11]:
df = pd.DataFrame(mat)

In [12]:
df

Unnamed: 0,0,1,2,3
0,0.0,0.1,0.2,0.3
1,1.0,1.1,1.2,1.3
2,2.0,2.1,2.2,2.3
3,3.0,3.1,3.2,3.3
4,4.0,4.1,4.2,4.3
5,5.0,5.1,5.2,5.3
6,6.0,6.1,6.2,6.3
7,7.0,7.1,7.2,7.3
8,8.0,8.1,8.2,8.3
9,9.0,9.1,9.2,9.3


컬럼명 추가

In [13]:
# Rebuild the frame from mat, naming the four columns A-D instead of relying
# on the default integer column labels.
df = pd.DataFrame(mat, columns=["A", "B", "C", "D"])

In [14]:
df

Unnamed: 0,A,B,C,D
0,0.0,0.1,0.2,0.3
1,1.0,1.1,1.2,1.3
2,2.0,2.1,2.2,2.3
3,3.0,3.1,3.2,3.3
4,4.0,4.1,4.2,4.3
5,5.0,5.1,5.2,5.3
6,6.0,6.1,6.2,6.3
7,7.0,7.1,7.2,7.3
8,8.0,8.1,8.2,8.3
9,9.0,9.1,9.2,9.3


인덱스 추가

In [15]:
dates = pd.date_range("20240101", periods=15) # date_range(): 15 consecutive daily timestamps starting 2024-01-01

In [16]:
dates

DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04',
               '2024-01-05', '2024-01-06', '2024-01-07', '2024-01-08',
               '2024-01-09', '2024-01-10', '2024-01-11', '2024-01-12',
               '2024-01-13', '2024-01-14', '2024-01-15'],
              dtype='datetime64[ns]', freq='D')

In [17]:
# Same data once more, now with the 15 dates as row labels and A-D as
# column labels.
df = pd.DataFrame(mat, index=dates, columns=["A", "B", "C", "D"])

In [18]:
df

Unnamed: 0,A,B,C,D
2024-01-01,0.0,0.1,0.2,0.3
2024-01-02,1.0,1.1,1.2,1.3
2024-01-03,2.0,2.1,2.2,2.3
2024-01-04,3.0,3.1,3.2,3.3
2024-01-05,4.0,4.1,4.2,4.3
2024-01-06,5.0,5.1,5.2,5.3
2024-01-07,6.0,6.1,6.2,6.3
2024-01-08,7.0,7.1,7.2,7.3
2024-01-09,8.0,8.1,8.2,8.3
2024-01-10,9.0,9.1,9.2,9.3


# 데이터 조회하기

In [19]:
df.head(2)

Unnamed: 0,A,B,C,D
2024-01-01,0.0,0.1,0.2,0.3
2024-01-02,1.0,1.1,1.2,1.3


In [20]:
df.tail(2)

Unnamed: 0,A,B,C,D
2024-01-14,13.0,13.1,13.2,13.3
2024-01-15,14.0,14.1,14.2,14.3


In [21]:
df.index

DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04',
               '2024-01-05', '2024-01-06', '2024-01-07', '2024-01-08',
               '2024-01-09', '2024-01-10', '2024-01-11', '2024-01-12',
               '2024-01-13', '2024-01-14', '2024-01-15'],
              dtype='datetime64[ns]', freq='D')

In [22]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [23]:
df.describe()

Unnamed: 0,A,B,C,D
count,15.0,15.0,15.0,15.0
mean,7.0,7.1,7.2,7.3
std,4.472136,4.472136,4.472136,4.472136
min,0.0,0.1,0.2,0.3
25%,3.5,3.6,3.7,3.8
50%,7.0,7.1,7.2,7.3
75%,10.5,10.6,10.7,10.8
max,14.0,14.1,14.2,14.3


In [24]:
df

Unnamed: 0,A,B,C,D
2024-01-01,0.0,0.1,0.2,0.3
2024-01-02,1.0,1.1,1.2,1.3
2024-01-03,2.0,2.1,2.2,2.3
2024-01-04,3.0,3.1,3.2,3.3
2024-01-05,4.0,4.1,4.2,4.3
2024-01-06,5.0,5.1,5.2,5.3
2024-01-07,6.0,6.1,6.2,6.3
2024-01-08,7.0,7.1,7.2,7.3
2024-01-09,8.0,8.1,8.2,8.3
2024-01-10,9.0,9.1,9.2,9.3


In [25]:
df.T # Transpose of the DataFrame: rows and columns are swapped

Unnamed: 0,2024-01-01,2024-01-02,2024-01-03,2024-01-04,2024-01-05,2024-01-06,2024-01-07,2024-01-08,2024-01-09,2024-01-10,2024-01-11,2024-01-12,2024-01-13,2024-01-14,2024-01-15
A,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0,14.0
B,0.1,1.1,2.1,3.1,4.1,5.1,6.1,7.1,8.1,9.1,10.1,11.1,12.1,13.1,14.1
C,0.2,1.2,2.2,3.2,4.2,5.2,6.2,7.2,8.2,9.2,10.2,11.2,12.2,13.2,14.2
D,0.3,1.3,2.3,3.3,4.3,5.3,6.3,7.3,8.3,9.3,10.3,11.3,12.3,13.3,14.3


In [26]:
df.sort_index(axis=1, ascending=False)  # axis=1 sorts by column labels; ascending=False gives descending order (True would be ascending)

Unnamed: 0,D,C,B,A
2024-01-01,0.3,0.2,0.1,0.0
2024-01-02,1.3,1.2,1.1,1.0
2024-01-03,2.3,2.2,2.1,2.0
2024-01-04,3.3,3.2,3.1,3.0
2024-01-05,4.3,4.2,4.1,4.0
2024-01-06,5.3,5.2,5.1,5.0
2024-01-07,6.3,6.2,6.1,6.0
2024-01-08,7.3,7.2,7.1,7.0
2024-01-09,8.3,8.2,8.1,8.0
2024-01-10,9.3,9.2,9.1,9.0


- 오름차순 정렬 (작은 값에서 큰 값으로)
df.sort_values(by='A', ascending=True)

- 내림차순 정렬 (큰 값에서 작은 값으로)
df.sort_values(by='A', ascending=False)


In [None]:
# Sum down the rows: one total per column.
df.sum(axis=0)

# Sum across the columns: one total per row.
df.sum(axis=1)

In [27]:
df

Unnamed: 0,A,B,C,D
2024-01-01,0.0,0.1,0.2,0.3
2024-01-02,1.0,1.1,1.2,1.3
2024-01-03,2.0,2.1,2.2,2.3
2024-01-04,3.0,3.1,3.2,3.3
2024-01-05,4.0,4.1,4.2,4.3
2024-01-06,5.0,5.1,5.2,5.3
2024-01-07,6.0,6.1,6.2,6.3
2024-01-08,7.0,7.1,7.2,7.3
2024-01-09,8.0,8.1,8.2,8.3
2024-01-10,9.0,9.1,9.2,9.3


In [28]:
df_2 = df.sort_values(by="B", ascending=False) # sort rows by column B, largest first; the sorted result is bound to df_2

In [29]:
df

Unnamed: 0,A,B,C,D
2024-01-01,0.0,0.1,0.2,0.3
2024-01-02,1.0,1.1,1.2,1.3
2024-01-03,2.0,2.1,2.2,2.3
2024-01-04,3.0,3.1,3.2,3.3
2024-01-05,4.0,4.1,4.2,4.3
2024-01-06,5.0,5.1,5.2,5.3
2024-01-07,6.0,6.1,6.2,6.3
2024-01-08,7.0,7.1,7.2,7.3
2024-01-09,8.0,8.1,8.2,8.3
2024-01-10,9.0,9.1,9.2,9.3


In [None]:
df_2

Unnamed: 0,A,B,C,D
2024-01-15,14.0,14.1,14.2,14.3
2024-01-14,13.0,13.1,13.2,13.3
2024-01-13,12.0,12.1,12.2,12.3
2024-01-12,11.0,11.1,11.2,11.3
2024-01-11,10.0,10.1,10.2,10.3
2024-01-10,9.0,9.1,9.2,9.3
2024-01-09,8.0,8.1,8.2,8.3
2024-01-08,7.0,7.1,7.2,7.3
2024-01-07,6.0,6.1,6.2,6.3
2024-01-06,5.0,5.1,5.2,5.3


## Selection

특정 값에 대한 조회 방법은 여러가지가 있습니다.
[]를 이용해 일반적인 리스트나 numpy의 요소 접근 방식처럼 조회할 수도 있고, DataFrame.at[], DataFrame.iat[], DataFrame.loc[], DataFrame.iloc[] 인덱서를 사용해서 접근할 수도 있습니다.

> 만약 Production 용 코드라면, DataFrame.at[], DataFrame.iat[], DataFrame.loc[], DataFrame.iloc[]의 사용을 권장합니다.


### GetItem

In [30]:
df["A"]

Unnamed: 0,A
2024-01-01,0.0
2024-01-02,1.0
2024-01-03,2.0
2024-01-04,3.0
2024-01-05,4.0
2024-01-06,5.0
2024-01-07,6.0
2024-01-08,7.0
2024-01-09,8.0
2024-01-10,9.0


In [31]:
df[0:3]

Unnamed: 0,A,B,C,D
2024-01-01,0.0,0.1,0.2,0.3
2024-01-02,1.0,1.1,1.2,1.3
2024-01-03,2.0,2.1,2.2,2.3


In [32]:
df["20240102":"20240104"]

Unnamed: 0,A,B,C,D
2024-01-02,1.0,1.1,1.2,1.3
2024-01-03,2.0,2.1,2.2,2.3
2024-01-04,3.0,3.1,3.2,3.3


### Selection by Label

In [33]:
dates[0]

Timestamp('2024-01-01 00:00:00')

In [34]:
df.loc[dates[0]]

Unnamed: 0,2024-01-01
A,0.0
B,0.1
C,0.2
D,0.3


In [35]:
df.loc[:, ["A", "B"]]

Unnamed: 0,A,B
2024-01-01,0.0,0.1
2024-01-02,1.0,1.1
2024-01-03,2.0,2.1
2024-01-04,3.0,3.1
2024-01-05,4.0,4.1
2024-01-06,5.0,5.1
2024-01-07,6.0,6.1
2024-01-08,7.0,7.1
2024-01-09,8.0,8.1
2024-01-10,9.0,9.1


In [36]:
df.loc["20240102":"20240104", ["A", "B"]]

Unnamed: 0,A,B
2024-01-02,1.0,1.1
2024-01-03,2.0,2.1
2024-01-04,3.0,3.1


In [37]:
df.loc[dates[0], "A"]

0.0

- .loc (Label based Indexing)
  - 인덱스 라벨(이름)을 기준으로 데이터 선택
  - df.loc[행 라벨, 열 라벨]
- .iloc (Integer-location based Indexing)
  - 정수 인덱스(위치 기반)를 기준으로 데이터 선택
  - df.iloc[행 번호, 열 번호]

In [None]:
# .loc selects by index *label*. This DataFrame is indexed by dates, so the
# labels passed to .loc must be dates; integer labels such as 2 do not exist
# in the index and would raise an error.

# Select the row labelled with the third date (returned as a Series)
df.loc[dates[2]]

# Select column A of the row labelled with the third date (a single value)
df.loc[dates[2], "A"]

# Select the rows labelled from the first to the third date; label slices
# include both end points
df.loc[dates[0]:dates[2]]

# Select the rows matching a condition (here: column A greater than 5)
df.loc[df["A"] > 5]

In [38]:
df.at[dates[0], "A"] # label-based access to a single value; faster than .loc when reading or writing one element

0.0

### Selection by position

In [39]:
df

Unnamed: 0,A,B,C,D
2024-01-01,0.0,0.1,0.2,0.3
2024-01-02,1.0,1.1,1.2,1.3
2024-01-03,2.0,2.1,2.2,2.3
2024-01-04,3.0,3.1,3.2,3.3
2024-01-05,4.0,4.1,4.2,4.3
2024-01-06,5.0,5.1,5.2,5.3
2024-01-07,6.0,6.1,6.2,6.3
2024-01-08,7.0,7.1,7.2,7.3
2024-01-09,8.0,8.1,8.2,8.3
2024-01-10,9.0,9.1,9.2,9.3


In [40]:
df.iloc[3]

Unnamed: 0,2024-01-04
A,3.0
B,3.1
C,3.2
D,3.3


In [41]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2024-01-04,3.0,3.1
2024-01-05,4.0,4.1


In [42]:
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,A,C
2024-01-02,1.0,1.2
2024-01-03,2.0,2.2
2024-01-05,4.0,4.2


In [43]:
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2024-01-02,1.0,1.1,1.2,1.3
2024-01-03,2.0,2.1,2.2,2.3


In [None]:
df.iloc[:, 1:3]

Unnamed: 0,B,C
2024-01-01,0.1,0.2
2024-01-02,1.1,1.2
2024-01-03,2.1,2.2
2024-01-04,3.1,3.2
2024-01-05,4.1,4.2
2024-01-06,5.1,5.2
2024-01-07,6.1,6.2
2024-01-08,7.1,7.2
2024-01-09,8.1,8.2
2024-01-10,9.1,9.2


In [44]:
df.iloc[1, 1]

1.1

In [45]:
df.iat[1, 1]

1.1

# 😵‍💫 [] , loc, at, iloc, iat
pandas에서 데이터를 선택하고 조작하는 데 사용되는 다양한 메서드들인 [], loc, at, iloc, iat의 차이점은 아래와 같습니다.

1. [] (대괄호) 인덱싱:
  - 가장 기본적인 인덱싱 방식입니다.
  - 열 이름으로 열을 선택하거나 불리언 배열을 사용하여 행을 필터링할 수 있습니다.
  - 예: df['column_name'] 또는 df[df['column_name'] > 0].
2. loc:
  - 레이블 기반 인덱싱을 위해 사용됩니다.
  - 행과 열 모두에 대해 레이블 이름(인덱스)을 사용하여 데이터를 선택합니다.
  - 슬라이스, 단일 레이블, 레이블 리스트, 불리언 배열 등 다양한 방식으로 선택할 수 있습니다.
  - 예: df.loc[0:5, 'column_name'].
3. at:
  - loc와 비슷하지만 단일 셀에 대한 빠른 접근을 제공합니다.
  - 레이블 기반 인덱싱으로, 오직 단일 레이블의 행과 열에 대한 스칼라 값을 얻는 데 사용됩니다.
  - 예: df.at[4, 'column_name'].
4. iloc:
  - 위치 기반 인덱싱을 위해 사용됩니다.
  - 정수형 위치 인덱스를 사용하여 행과 열의 위치에 따라 데이터를 선택합니다.
  - 슬라이스, 단일 정수, 정수 리스트, 불리언 배열 등 다양한 방식으로 선택할 수 있습니다.
  - 예: df.iloc[0:5, 1:3].
5. iat:
  - iloc와 비슷하지만 단일 셀에 대한 빠른 접근을 제공합니다.
  - 위치 기반 인덱싱으로, 오직 정수형 위치 인덱스를 사용하여 단일 스칼라 값을 얻습니다.
  - 예: df.iat[1, 2].
이러한 메서드들의 주요 차이점은 레이블 기반 인덱싱(loc, at)과 위치 기반 인덱싱(iloc, iat) 사이, 그리고 단일 값에 대한 빠른 접근(at, iat)과 다양한 형태의 데이터 선택(loc, iloc) 사이에 있습니다. [] 인덱싱은 가장 기본적이지만, 상대적으로 제한적인 기능을 제공합니다.

### Boolean Indexing

In [46]:
df

Unnamed: 0,A,B,C,D
2024-01-01,0.0,0.1,0.2,0.3
2024-01-02,1.0,1.1,1.2,1.3
2024-01-03,2.0,2.1,2.2,2.3
2024-01-04,3.0,3.1,3.2,3.3
2024-01-05,4.0,4.1,4.2,4.3
2024-01-06,5.0,5.1,5.2,5.3
2024-01-07,6.0,6.1,6.2,6.3
2024-01-08,7.0,7.1,7.2,7.3
2024-01-09,8.0,8.1,8.2,8.3
2024-01-10,9.0,9.1,9.2,9.3


In [47]:
df[df["A"] > 4]

Unnamed: 0,A,B,C,D
2024-01-06,5.0,5.1,5.2,5.3
2024-01-07,6.0,6.1,6.2,6.3
2024-01-08,7.0,7.1,7.2,7.3
2024-01-09,8.0,8.1,8.2,8.3
2024-01-10,9.0,9.1,9.2,9.3
2024-01-11,10.0,10.1,10.2,10.3
2024-01-12,11.0,11.1,11.2,11.3
2024-01-13,12.0,12.1,12.2,12.3
2024-01-14,13.0,13.1,13.2,13.3
2024-01-15,14.0,14.1,14.2,14.3


In [48]:
df["A"] > 4

Unnamed: 0,A
2024-01-01,False
2024-01-02,False
2024-01-03,False
2024-01-04,False
2024-01-05,False
2024-01-06,True
2024-01-07,True
2024-01-08,True
2024-01-09,True
2024-01-10,True


In [49]:
df > 4

Unnamed: 0,A,B,C,D
2024-01-01,False,False,False,False
2024-01-02,False,False,False,False
2024-01-03,False,False,False,False
2024-01-04,False,False,False,False
2024-01-05,False,True,True,True
2024-01-06,True,True,True,True
2024-01-07,True,True,True,True
2024-01-08,True,True,True,True
2024-01-09,True,True,True,True
2024-01-10,True,True,True,True


In [50]:
df[df > 4]

Unnamed: 0,A,B,C,D
2024-01-01,,,,
2024-01-02,,,,
2024-01-03,,,,
2024-01-04,,,,
2024-01-05,,4.1,4.2,4.3
2024-01-06,5.0,5.1,5.2,5.3
2024-01-07,6.0,6.1,6.2,6.3
2024-01-08,7.0,7.1,7.2,7.3
2024-01-09,8.0,8.1,8.2,8.3
2024-01-10,9.0,9.1,9.2,9.3


In [51]:
df[df > 4].isna() # isna() flags missing values (NaN): True where the df > 4 filter blanked the cell

Unnamed: 0,A,B,C,D
2024-01-01,True,True,True,True
2024-01-02,True,True,True,True
2024-01-03,True,True,True,True
2024-01-04,True,True,True,True
2024-01-05,True,False,False,False
2024-01-06,False,False,False,False
2024-01-07,False,False,False,False
2024-01-08,False,False,False,False
2024-01-09,False,False,False,False
2024-01-10,False,False,False,False


In [52]:
# The inner mask is True exactly where "df > 4" did not hold, so indexing
# with it keeps those values and turns every value greater than 4 into NaN.
df[df[df > 4].isna()]

Unnamed: 0,A,B,C,D
2024-01-01,0.0,0.1,0.2,0.3
2024-01-02,1.0,1.1,1.2,1.3
2024-01-03,2.0,2.1,2.2,2.3
2024-01-04,3.0,3.1,3.2,3.3
2024-01-05,4.0,,,
2024-01-06,,,,
2024-01-07,,,,
2024-01-08,,,,
2024-01-09,,,,
2024-01-10,,,,


## Setting

In [53]:
# Integers 1..6 labelled with six consecutive days starting 2024-01-02.
s1 = pd.Series(
    data=range(1, 7),
    index=pd.date_range(start="20240102", periods=6),
)

In [54]:
s1

Unnamed: 0,0
2024-01-02,1
2024-01-03,2
2024-01-04,3
2024-01-05,4
2024-01-06,5
2024-01-07,6


In [55]:
df

Unnamed: 0,A,B,C,D
2024-01-01,0.0,0.1,0.2,0.3
2024-01-02,1.0,1.1,1.2,1.3
2024-01-03,2.0,2.1,2.2,2.3
2024-01-04,3.0,3.1,3.2,3.3
2024-01-05,4.0,4.1,4.2,4.3
2024-01-06,5.0,5.1,5.2,5.3
2024-01-07,6.0,6.1,6.2,6.3
2024-01-08,7.0,7.1,7.2,7.3
2024-01-09,8.0,8.1,8.2,8.3
2024-01-10,9.0,9.1,9.2,9.3


In [56]:
# Assigning a Series as a new column aligns it on the index: dates that are
# not present in s1 end up as NaN in column F.
df["F"] = s1

In [57]:
df

Unnamed: 0,A,B,C,D,F
2024-01-01,0.0,0.1,0.2,0.3,
2024-01-02,1.0,1.1,1.2,1.3,1.0
2024-01-03,2.0,2.1,2.2,2.3,2.0
2024-01-04,3.0,3.1,3.2,3.3,3.0
2024-01-05,4.0,4.1,4.2,4.3,4.0
2024-01-06,5.0,5.1,5.2,5.3,5.0
2024-01-07,6.0,6.1,6.2,6.3,6.0
2024-01-08,7.0,7.1,7.2,7.3,
2024-01-09,8.0,8.1,8.2,8.3,
2024-01-10,9.0,9.1,9.2,9.3,


In [58]:
dates[0]

Timestamp('2024-01-01 00:00:00')

In [59]:
# Set a single element addressed by its labels (row label, column label)
df.at[dates[0], "A"] = 100

In [60]:
df

Unnamed: 0,A,B,C,D,F
2024-01-01,100.0,0.1,0.2,0.3,
2024-01-02,1.0,1.1,1.2,1.3,1.0
2024-01-03,2.0,2.1,2.2,2.3,2.0
2024-01-04,3.0,3.1,3.2,3.3,3.0
2024-01-05,4.0,4.1,4.2,4.3,4.0
2024-01-06,5.0,5.1,5.2,5.3,5.0
2024-01-07,6.0,6.1,6.2,6.3,6.0
2024-01-08,7.0,7.1,7.2,7.3,
2024-01-09,8.0,8.1,8.2,8.3,
2024-01-10,9.0,9.1,9.2,9.3,


In [61]:
# Set a single element addressed by its integer position (row 0, column 1)
df.iat[0, 1] = 200

In [62]:
df

Unnamed: 0,A,B,C,D,F
2024-01-01,100.0,200.0,0.2,0.3,
2024-01-02,1.0,1.1,1.2,1.3,1.0
2024-01-03,2.0,2.1,2.2,2.3,2.0
2024-01-04,3.0,3.1,3.2,3.3,3.0
2024-01-05,4.0,4.1,4.2,4.3,4.0
2024-01-06,5.0,5.1,5.2,5.3,5.0
2024-01-07,6.0,6.1,6.2,6.3,6.0
2024-01-08,7.0,7.1,7.2,7.3,
2024-01-09,8.0,8.1,8.2,8.3,
2024-01-10,9.0,9.1,9.2,9.3,


In [63]:
df.loc[:, "D"] = [500] * len(df) # list of 500 repeated once per row (15 rows here), written into every row of column D

In [67]:
len(df)

15

In [68]:
df

Unnamed: 0,A,B,C,D,F
2024-01-01,100.0,200.0,0.2,500.0,
2024-01-02,1.0,1.1,1.2,500.0,1.0
2024-01-03,2.0,2.1,2.2,500.0,2.0
2024-01-04,3.0,3.1,3.2,500.0,3.0
2024-01-05,4.0,4.1,4.2,500.0,4.0
2024-01-06,5.0,5.1,5.2,500.0,5.0
2024-01-07,6.0,6.1,6.2,500.0,6.0
2024-01-08,7.0,7.1,7.2,500.0,
2024-01-09,8.0,8.1,8.2,500.0,
2024-01-10,9.0,9.1,9.2,500.0,


In [69]:
df2 = df.copy()

In [70]:
# Replace every value greater than 5 with its negation; all other cells
# (including NaN) are left untouched.
df2[df2 > 5] = -df2

In [71]:
df2

Unnamed: 0,A,B,C,D,F
2024-01-01,-100.0,-200.0,0.2,-500.0,
2024-01-02,1.0,1.1,1.2,-500.0,1.0
2024-01-03,2.0,2.1,2.2,-500.0,2.0
2024-01-04,3.0,3.1,3.2,-500.0,3.0
2024-01-05,4.0,4.1,4.2,-500.0,4.0
2024-01-06,5.0,-5.1,-5.2,-500.0,5.0
2024-01-07,-6.0,-6.1,-6.2,-500.0,-6.0
2024-01-08,-7.0,-7.1,-7.2,-500.0,
2024-01-09,-8.0,-8.1,-8.2,-500.0,
2024-01-10,-9.0,-9.1,-9.2,-500.0,


### 두개의 DataFrame 이어붙이기

### concat() VS merge()
- concat(): 행 또는 열을 단순하게 이어 붙이는 작업
  - 따라서, 인덱스가 중복될 수 있음
- merge(): JOIN 연산처럼, 두 데이터프레임을 특정 열을 기준으로 병합
  - 공통된 열을 기준으로 병합

In [72]:
# Stack the rows of df2 underneath the rows of df. Both frames carry the
# same date labels, so every index label appears twice in df3.
df3 = pd.concat([df, df2])

In [73]:
df3

Unnamed: 0,A,B,C,D,F
2024-01-01,100.0,200.0,0.2,500.0,
2024-01-02,1.0,1.1,1.2,500.0,1.0
2024-01-03,2.0,2.1,2.2,500.0,2.0
2024-01-04,3.0,3.1,3.2,500.0,3.0
2024-01-05,4.0,4.1,4.2,500.0,4.0
2024-01-06,5.0,5.1,5.2,500.0,5.0
2024-01-07,6.0,6.1,6.2,500.0,6.0
2024-01-08,7.0,7.1,7.2,500.0,
2024-01-09,8.0,8.1,8.2,500.0,
2024-01-10,9.0,9.1,9.2,500.0,


## 기본적인 연산

In [74]:
df.describe()

Unnamed: 0,A,B,C,D,F
count,15.0,15.0,15.0,15.0,6.0
mean,13.666667,20.426667,7.2,500.0,3.5
std,24.221203,49.840753,4.472136,0.0,1.870829
min,1.0,1.1,0.2,500.0,1.0
25%,4.5,4.6,3.7,500.0,2.25
50%,8.0,8.1,7.2,500.0,3.5
75%,11.5,11.6,10.7,500.0,4.75
max,100.0,200.0,14.2,500.0,6.0


In [75]:
df["A"].mean()

13.666666666666666

In [76]:
df["A"].std()

24.221202832779934

In [77]:
df["A"].median()

8.0

In [78]:
df["A"].sum()

205.0