## 데이터 불러오기

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the mtcars dataset from the working directory and preview the first rows.
df = pd.read_csv(filepath_or_buffer='mtcars.csv')
df.head(n=5)

Unnamed: 0,model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


## 1. 데이터 type(object, int, float...)

In [4]:
# Show the dtype of every column
df.dtypes

model     object
mpg      float64
cyl        int64
disp     float64
hp         int64
drat     float64
wt       float64
qsec     float64
vs         int64
am         int64
gear       int64
carb       int64
dtype: object

In [6]:
# Change the dtype of a single column.
# Work on a copy (df1) so the original frame df keeps its dtypes.
df1 = df.copy().astype({'cyl': 'object'})
print(df1.dtypes)

model     object
mpg      float64
cyl       object
disp     float64
hp         int64
drat     float64
wt       float64
qsec     float64
vs         int64
am         int64
gear       int64
carb       int64
dtype: object


In [7]:
# 데이터 타입 변경 2개
df1 = df1.astype({'cyl' : 'int', 'gear' : 'object'})
print(df1.dtypes)

model     object
mpg      float64
cyl        int32
disp     float64
hp         int64
drat     float64
wt       float64
qsec     float64
vs         int64
am         int64
gear      object
carb       int64
dtype: object


In [11]:
# Inspect the cyl column of the original frame; the astype calls above were applied to df1, not df
df['cyl']

0     6
1     6
2     4
3     6
4     8
5     6
6     8
7     4
8     4
9     6
10    6
11    8
12    8
13    8
14    8
15    8
16    8
17    4
18    4
19    4
20    4
21    8
22    8
23    8
24    8
25    4
26    4
27    4
28    8
29    6
30    8
31    4
Name: cyl, dtype: int64

In [12]:
# Frequency of each distinct cyl value in the converted copy df1
df1['cyl'].value_counts()

8    14
4    11
6     7
Name: cyl, dtype: int64

## 2. 기초 통계량(평균, 중앙값, IQR, 표준편차..)

### 1) 중심 측도를 나타내는 값(평균, 중앙값, 최빈값)

In [14]:
# Reload mtcars for the summary-statistics section and preview the first rows.
df = pd.read_csv(filepath_or_buffer='mtcars.csv')
df.head(n=5)

Unnamed: 0,model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [16]:
df.shape # (rows, columns)

(32, 12)

In [22]:
# Arithmetic mean of the mpg column.
mpg_mean = df.loc[:, 'mpg'].mean()

In [23]:
# Show the mean of mpg
print(mpg_mean)

20.090625000000003


In [24]:
# Compute the median

In [25]:
# Median (50th percentile) of the mpg column.
mpg_median = df.loc[:, 'mpg'].median()

In [26]:
# Show the median of mpg
print(mpg_median)

19.2


In [29]:
# Mode (most frequent value).
# NOTE: despite the mpg_ prefix, this is the mode of the cyl column;
# the name is kept because later cells reference mpg_mode.
mpg_mode = df.loc[:, 'cyl'].mode()

In [30]:
# mode() returns a Series, so this prints the index and dtype along with the value
print(mpg_mode)

0    8
Name: cyl, dtype: int64


In [32]:
# Take the first entry of the mode Series to get the bare value
print(mpg_mode[0])

8


In [34]:
# Cross-check the mode: frequency of each distinct cyl value
df['cyl'].value_counts()

8    14
4    11
6     7
Name: cyl, dtype: int64

### 2) 산포도를 나타내는 값(분산, 표준편차, IQR, 범위(min-max)..)

In [36]:
# Variance of mpg (pandas uses the sample variance, ddof=1, by default).
mpg_var = df.loc[:, 'mpg'].var()

In [37]:
# Show the variance of mpg
print(mpg_var)

36.32410282258065


In [38]:
# Standard deviation of mpg (sample standard deviation, ddof=1, by default).
mpg_std = df.loc[:, 'mpg'].std()

In [39]:
# Show the standard deviation of mpg
print(mpg_std)

6.026948052089105


In [41]:
# IQR, step 1: first quartile (25th percentile) of mpg.
Q1 = df.loc[:, 'mpg'].quantile(q=0.25)
print(Q1)

15.425


In [43]:
# IQR, step 2: third quartile (75th percentile) of mpg.
Q3 = df.loc[:, 'mpg'].quantile(q=0.75)
print(Q3)

22.8


In [45]:
# Interquartile range: distance between the third and first quartiles
IQR = Q3 - Q1
print(IQR)

7.375


In [46]:
# Second quartile (50th percentile), printed next to the median computed
# earlier to show that the two values coincide.
Q2 = df.loc[:, 'mpg'].quantile(q=0.5)
print(Q2, mpg_median, sep='\n')

19.2
19.2


In [47]:
# Largest mpg value (upper end of the range).
mpg_max = df.loc[:, 'mpg'].max()
print(mpg_max)

33.9


In [49]:
# Smallest mpg value (lower end of the range).
mpg_min = df.loc[:, 'mpg'].min()
print(mpg_min)

10.4


In [50]:
# Range: difference between the maximum and minimum mpg
mpg_range = mpg_max - mpg_min
print(mpg_range)

23.5


### 3) 분포의 비대칭도 및 뾰족한 정도(왜도, 첨도)

In [52]:
# Skewness of mpg; the positive value printed below indicates a longer right tail.
mpg_skew = df.loc[:, 'mpg'].skew()
print(mpg_skew)

0.6723771376290805


In [53]:
# Kurtosis of mpg. pandas reports excess kurtosis (Fisher's definition),
# so a normal distribution scores 0.
mpg_kurt = df.loc[:, 'mpg'].kurt()
print(mpg_kurt)

-0.0220062914240855


### 4) 기타(합계, 절댓값, 데이터 수..)

In [54]:
# Sum of all mpg values.
mpg_sum = df.loc[:, 'mpg'].sum()
print(mpg_sum)

642.9000000000001


In [55]:
# Absolute value
# Subtract in the reverse order on purpose to get a negative number,
# then show that abs() recovers the IQR magnitude.
IQR2 = Q1 - Q3
print(IQR2)
print(abs(IQR2))

-7.375
7.375


In [56]:
# Number of observations in the mpg column (every row is counted, including
# any missing values; .count() would exclude them).
df['mpg'].size

32

### 5) 그룹화하여 계산하기 (groupby 활용)

In [57]:
# Per-species mean of every measurement column.
# NOTE: df is rebound to the iris dataset here; the mtcars frame is no longer
# reachable through df after this cell runs.
import seaborn as sns

df = sns.load_dataset('iris')
print(df.head(n=5))
df.groupby(by='species').mean()

   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa


Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,5.006,3.428,1.462,0.246
versicolor,5.936,2.77,4.26,1.326
virginica,6.588,2.974,5.552,2.026


In [58]:
# Per-species median of every measurement column
df.groupby('species').median()

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,5.0,3.4,1.5,0.2
versicolor,5.9,2.8,4.35,1.3
virginica,6.5,3.0,5.55,2.0
