In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('./insurance.csv') # Load the data from the CSV file into a DataFrame
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [3]:
# Show the first 5 rows of df.
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
df.describe() # Summary statistics of df (count, mean, quartiles, etc.)

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [5]:
# Create a new column ('double bmi') holding twice the value of 'bmi'.
df['double bmi'] = df['bmi'].mul(2)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,double bmi
0,19,female,27.9,0,yes,southwest,16884.924,55.8
1,18,male,33.77,1,no,southeast,1725.5523,67.54
2,28,male,33.0,3,no,southeast,4449.462,66.0
3,33,male,22.705,0,no,northwest,21984.47061,45.41
4,32,male,28.88,0,no,northwest,3866.8552,57.76


In [6]:
# Create a new column ('debt') and fill every row with 0
df['debt'] = 0
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,double bmi,debt
0,19,female,27.9,0,yes,southwest,16884.924,55.8,0
1,18,male,33.77,1,no,southeast,1725.5523,67.54,0
2,28,male,33.0,3,no,southeast,4449.462,66.0,0
3,33,male,22.705,0,no,northwest,21984.47061,45.41,0
4,32,male,28.88,0,no,northwest,3866.8552,57.76,0


In [7]:
# Fill 'debt' with consecutive integers starting at 0, one per row.
row_count = len(df)
df['debt'] = range(row_count)
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,double bmi,debt
0,19,female,27.900,0,yes,southwest,16884.92400,55.80,0
1,18,male,33.770,1,no,southeast,1725.55230,67.54,1
2,28,male,33.000,3,no,southeast,4449.46200,66.00,2
3,33,male,22.705,0,no,northwest,21984.47061,45.41,3
4,32,male,28.880,0,no,northwest,3866.85520,57.76,4
...,...,...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830,61.94,1333
1334,18,female,31.920,0,no,northeast,2205.98080,63.84,1334
1335,18,female,36.850,0,no,southeast,1629.83350,73.70,1335
1336,21,female,25.800,0,no,southwest,2007.94500,51.60,1336


In [8]:
# Show the row at position 1 (iloc selects by integer position).
df.iloc[1]

age                  18
sex                male
bmi               33.77
children              1
smoker               no
region        southeast
charges       1725.5523
double bmi        67.54
debt                  1
Name: 1, dtype: object

In [9]:
# Set the 'debt' value of indices 3, 5, 8 to 12, 13, 14.
def change(x):
    """Translate a key into its debt amount.

    Returns 12, 13 or 14 when ``x`` equals 3, 5 or 8 respectively,
    and NaN for any other value.
    """
    debt_by_key = {3: 12, 5: 13, 8: 14}
    return debt_by_key.get(x, np.nan)
# Look the debt up by row label (the index) rather than by the current 'debt'
# value. Mapping over the 'debt' values only worked while 'debt' happened to
# equal the row position, and running the cell a second time turned 12/13/14
# back into NaN. Keyed on the index, the cell gives the same result every run.
# Rows other than 3, 5 and 8 become NaN, so the column ends up float.
# Equivalent without the helper:
#   df['debt'] = pd.Series([12, 13, 14], index=[3, 5, 8])
df['debt'] = df.index.to_series().map(change)
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,double bmi,debt
0,19,female,27.900,0,yes,southwest,16884.92400,55.80,
1,18,male,33.770,1,no,southeast,1725.55230,67.54,
2,28,male,33.000,3,no,southeast,4449.46200,66.00,
3,33,male,22.705,0,no,northwest,21984.47061,45.41,12.0
4,32,male,28.880,0,no,northwest,3866.85520,57.76,
...,...,...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830,61.94,
1334,18,female,31.920,0,no,northeast,2205.98080,63.84,
1335,18,female,36.850,0,no,southeast,1629.83350,73.70,
1336,21,female,25.800,0,no,southwest,2007.94500,51.60,


In [10]:
# Drop the 'double bmi' column.
# df is rebound to the result instead of being dropped in place, and
# errors='ignore' turns a second run of this cell into a no-op instead of a
# KeyError for the already-removed column.
df = df.drop(columns='double bmi', errors='ignore')
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,debt
0,19,female,27.900,0,yes,southwest,16884.92400,
1,18,male,33.770,1,no,southeast,1725.55230,
2,28,male,33.000,3,no,southeast,4449.46200,
3,33,male,22.705,0,no,northwest,21984.47061,12.0
4,32,male,28.880,0,no,northwest,3866.85520,
...,...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830,
1334,18,female,31.920,0,no,northeast,2205.98080,
1335,18,female,36.850,0,no,southeast,1629.83350,
1336,21,female,25.800,0,no,southwest,2007.94500,


In [11]:
# Show only the 'age' and 'sex' columns.
wanted_columns = ['age', 'sex']
df[wanted_columns]

Unnamed: 0,age,sex
0,19,female
1,18,male
2,28,male
3,33,male
4,32,male
...,...,...
1333,50,male
1334,18,female
1335,18,female
1336,21,female


In [12]:
# Define a function and apply it element-wise: age + 1.
# The result is only displayed; df itself is not modified.
def add_one(age):
    """Return ``age`` increased by one."""
    return age + 1

df['age'].apply(add_one)

0       20
1       19
2       29
3       34
4       33
        ..
1333    51
1334    19
1335    19
1336    22
1337    62
Name: age, Length: 1338, dtype: int64

In [13]:
# Round every value of 'charges' to two decimal places and store it back.
df['charges'] = df['charges'].round(2)
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,debt
0,19,female,27.900,0,yes,southwest,16884.92,
1,18,male,33.770,1,no,southeast,1725.55,
2,28,male,33.000,3,no,southeast,4449.46,
3,33,male,22.705,0,no,northwest,21984.47,12.0
4,32,male,28.880,0,no,northwest,3866.86,
...,...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.55,
1334,18,female,31.920,0,no,northeast,2205.98,
1335,18,female,36.850,0,no,southeast,1629.83,
1336,21,female,25.800,0,no,southwest,2007.94,


In [14]:
# Define a function and apply it to every cell: wrap each value of the
# DataFrame in square brackets [].
def brackets(x):
    """Return the string form of ``x`` enclosed in square brackets."""
    return f"[{x!s}]"
# Apply brackets() to every cell of df, producing a new DataFrame of strings.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map,
# so use map when this pandas provides it and fall back to applymap otherwise.
# The check is made on the class so a column named 'map' cannot be mistaken
# for the method.
if hasattr(pd.DataFrame, 'map'):
    brackets_df = df.map(brackets)
else:
    brackets_df = df.applymap(brackets)
brackets_df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,debt
0,[19],[female],[27.9],[0],[yes],[southwest],[16884.92],[nan]
1,[18],[male],[33.77],[1],[no],[southeast],[1725.55],[nan]
2,[28],[male],[33.0],[3],[no],[southeast],[4449.46],[nan]
3,[33],[male],[22.705],[0],[no],[northwest],[21984.47],[12.0]
4,[32],[male],[28.88],[0],[no],[northwest],[3866.86],[nan]
...,...,...,...,...,...,...,...,...
1333,[50],[male],[30.97],[3],[no],[northwest],[10600.55],[nan]
1334,[18],[female],[31.92],[0],[no],[northeast],[2205.98],[nan]
1335,[18],[female],[36.85],[0],[no],[southeast],[1629.83],[nan]
1336,[21],[female],[25.8],[0],[no],[southwest],[2007.94],[nan]


In [15]:
# Sort the columns alphabetically by label (sort_index along the column axis).
df.sort_index(axis='columns')

Unnamed: 0,age,bmi,charges,children,debt,region,sex,smoker
0,19,27.900,16884.92,0,,southwest,female,yes
1,18,33.770,1725.55,1,,southeast,male,no
2,28,33.000,4449.46,3,,southeast,male,no
3,33,22.705,21984.47,0,12.0,northwest,male,no
4,32,28.880,3866.86,0,,northwest,male,no
...,...,...,...,...,...,...,...,...
1333,50,30.970,10600.55,3,,northwest,male,no
1334,18,31.920,2205.98,0,,northeast,female,no
1335,18,36.850,1629.83,0,,southeast,female,no
1336,21,25.800,2007.94,0,,southwest,female,no


In [16]:
# Sort the rows by value: ascending order of 'age'.
df.sort_values(by='age')

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,debt
1248,18,female,39.820,0,no,southeast,1633.96,
482,18,female,31.350,0,no,southeast,1622.19,
492,18,female,25.080,0,no,northeast,2196.47,
525,18,female,33.880,0,no,southeast,11482.63,
529,18,male,25.460,0,no,northeast,1708.00,
...,...,...,...,...,...,...,...,...
398,64,male,25.600,2,no,southwest,14988.43,
335,64,male,34.500,0,no,southwest,13822.80,
378,64,female,30.115,3,no,northwest,16455.71,
1265,64,male,23.760,0,yes,southeast,26926.51,


In [17]:
# Mean of each numeric column.
# df also holds text columns (sex, smoker, region). Without numeric_only=True
# pandas 1.x warns that it is silently dropping them, and pandas 2.x raises
# TypeError, so the numeric-only intent is stated explicitly.
df.mean(numeric_only=True)

  df.mean()


age            39.207025
bmi            30.663397
children        1.094918
charges     13270.422280
debt           13.000000
dtype: float64

In [18]:
# Count how many males and females there are.
# Series.value_counts is the documented way to count the values of one column.
# DataFrame.value_counts(subset=...) expects column labels, not a Series.
df['sex'].value_counts()

sex
male      676
female    662
dtype: int64

In [19]:
# Show the rows where 'children' is exactly 3.
has_three_children = df['children'].eq(3)
df.loc[has_three_children]

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,debt
2,28,male,33.000,3,no,southeast,4449.46,
7,37,female,27.740,3,no,northwest,7281.51,
25,59,female,27.720,3,no,southeast,14001.13,
36,62,female,32.965,3,no,northwest,15612.19,
54,40,female,28.690,3,no,northwest,8059.68,
...,...,...,...,...,...,...,...,...
1301,62,male,30.875,3,yes,northwest,46718.16,
1314,30,female,23.655,3,yes,northwest,18765.88,
1320,31,male,31.065,3,no,northwest,5425.02,
1332,52,female,44.700,3,no,southwest,11411.68,


In [20]:
# Show the rows where 'children' is 5 or more.
has_five_or_more = df['children'].ge(5)
df.loc[has_five_or_more]

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,debt
32,19,female,28.6,5,no,southwest,4687.8,
71,31,male,28.5,5,no,northeast,6799.46,
166,20,female,37.0,5,no,southwest,4830.63,
413,25,male,23.9,5,no,southwest,5080.1,
425,45,male,24.31,5,no,southeast,9788.87,
438,52,female,46.75,5,no,southeast,12592.53,
568,49,female,31.9,5,no,southwest,11552.9,
640,33,male,42.4,5,no,southwest,6666.24,
877,33,male,33.44,5,no,southeast,6653.79,
932,46,male,25.8,5,no,southwest,10096.97,


In [21]:
# Show the rows where 'bmi' is below 17.
bmi_under_17 = df['bmi'].lt(17)
df.loc[bmi_under_17]

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,debt
172,18,male,15.96,0,no,northeast,1694.8,
428,21,female,16.815,1,no,northeast,3167.46,
1226,38,male,16.815,2,no,northeast,6640.54,


In [22]:
# Show the rows whose 'debt' is not null.
debt_present = df['debt'].notna()
df.loc[debt_present]

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,debt
3,33,male,22.705,0,no,northwest,21984.47,12.0
5,31,female,25.74,0,no,southeast,3756.62,13.0
8,37,male,29.83,2,no,northeast,6406.41,14.0


In [23]:
# Show only the rows where 'age' is greater than 60.
older_than_60 = df['age'].gt(60)
df.loc[older_than_60]

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,debt
11,62,female,26.290,0,yes,southeast,27808.73,
26,63,female,23.085,0,no,northeast,14451.84,
33,63,male,28.310,0,no,northwest,13770.10,
36,62,female,32.965,3,no,northwest,15612.19,
62,64,male,24.700,1,no,northwest,30166.62,
...,...,...,...,...,...,...,...,...
1301,62,male,30.875,3,yes,northwest,46718.16,
1321,62,male,26.695,0,yes,northeast,28101.33,
1322,62,male,38.830,0,no,southeast,12981.35,
1325,61,male,33.535,0,no,northeast,13143.34,
