Pandas

In [None]:
# Numpy를 기반으로 하여, 표 형태의 데이터를 효율적으로 처리하고 분석할 수 있는 다양한 함수와 자료 구조를 제공
# Series(1차원 배열과 유사하며, 각 요소에 인덱스 부여), DataFrame (표 형태의 2차원 데이터를 나타내며, 행과 열로 구성)

# 특징
# 데이터 읽기/쓰기 : 다양한 형식의 데이터(CSV, Excel, SQL 등)를 읽고 쓸 수 있음
# 데이터 선택 및 필터링 : 원하는 데이터를 선택하고 조건에 맞는 데이터를 추출할 수 있음
# 데이터 변형 : 데이터를 정렬, 그룹화, 평균 등 다양한 방식으로 변형할 수 있음
# 결측치 처리 : 결측치를 찾고 처리하는 다양한 방법을 제공
# 데이터 시각화 : Matplotlib와 연동하여 데이터를 시각화 할 수 있음

Pandas 함수 예시

In [2]:
# read_csv()
# CSV 파일 읽기
import pandas as pd
df = pd.read_csv("../dataset/diabetes.csv", encoding="CP949")
print(df)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  Outcome  
0                       0.627   50        1  
1                  

In [43]:
# head(), tail()
# 데이터프레임의 처음 또는 마지막 부분 확인
df.head()
df.tail()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.34,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1
767,1,93,70,31,0,30.4,0.315,23,0


In [45]:
# shape
# 데이터프레임의 행과 열 개수 확인
df.shape # 출력 : (3행 4열)

(768, 9)

In [46]:
# info()
# 데이터프레임의 요약 정보 확인
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [47]:
# describe()
# 데이터프레임의 기본적인 통계 정보 확인
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [52]:
# loc[]
# 데이터 선택
df.loc[:, "Pregnancies"]
df["Pregnancies"]
# loc[:, "컬럼명"]
# df["컬럼명"]

0       6
1       1
2       8
3       1
4       0
       ..
763    10
764     2
765     5
766     1
767     1
Name: Pregnancies, Length: 768, dtype: int64

In [53]:
# iloc[]
# 데이터 선택
df.iloc[1] # 하나의 행값 전체를 출력

Pregnancies                  1.000
Glucose                     85.000
BloodPressure               66.000
SkinThickness               29.000
Insulin                      0.000
BMI                         26.600
DiabetesPedigreeFunction     0.351
Age                         31.000
Outcome                      0.000
Name: 1, dtype: float64

In [54]:
# groupby
# 데이터 그룹화
df.groupby("Pregnancies").mean()

Unnamed: 0_level_0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Pregnancies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,123.0,67.153153,22.27027,81.675676,34.29009,0.520838,27.603604,0.342342
1,112.748148,67.792593,24.437037,98.674074,31.372593,0.486496,27.37037,0.214815
2,110.796117,63.252427,21.601942,85.84466,30.583495,0.49166,27.194175,0.184466
3,123.586667,66.586667,20.08,87.453333,30.425333,0.432147,29.026667,0.36
4,125.117647,70.029412,15.882353,69.441176,32.141176,0.446353,32.779412,0.338235
5,118.859649,76.210526,17.385965,57.298246,33.192982,0.396421,39.035088,0.368421
6,120.8,68.42,17.64,63.58,30.29,0.42952,39.34,0.32
7,136.444444,70.777778,20.288889,84.466667,32.631111,0.443622,41.111111,0.555556
8,131.736842,75.184211,17.315789,92.815789,31.568421,0.504711,45.368421,0.578947
9,131.392857,77.892857,20.892857,62.428571,31.707143,0.550679,44.178571,0.642857


In [55]:
# sort_values()
# 데이터 정렬
df.sort_values("Pregnancies")

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
16,0,118,84,47,230,45.8,0.551,31,1
736,0,126,86,27,120,27.4,0.515,21,0
713,0,134,58,20,291,26.4,0.352,21,0
727,0,141,84,26,0,32.4,0.433,22,0
681,0,162,76,36,0,49.6,0.364,26,1
...,...,...,...,...,...,...,...,...,...
691,13,158,114,0,0,42.3,0.257,44,1
298,14,100,78,25,184,36.6,0.412,46,1
455,14,175,62,30,0,33.6,0.212,38,1
88,15,136,70,32,110,37.1,0.153,43,1


In [56]:
# fillna()
# 결측치 채우기
df.fillna(0)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [57]:
# dropna()
# 결측치 행 삭제
df.dropna()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1
