## 데이터 불러오기

In [4]:
!pip install pandas
!pip install numpy

Collecting pandas
  Using cached pandas-2.2.2-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting numpy>=1.26.0 (from pandas)
  Using cached numpy-1.26.4-cp312-cp312-win_amd64.whl.metadata (61 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2024.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-2.2.2-cp312-cp312-win_amd64.whl (11.5 MB)
Using cached numpy-1.26.4-cp312-cp312-win_amd64.whl (15.5 MB)
Using cached pytz-2024.1-py2.py3-none-any.whl (505 kB)
Using cached tzdata-2024.1-py2.py3-none-any.whl (345 kB)
Installing collected packages: pytz, tzdata, numpy, pandas
Successfully installed numpy-1.26.4 pandas-2.2.2 pytz-2024.1 tzdata-2024.1


In [5]:
import pandas as pd
import numpy as np

In [6]:
# Read mtcars.csv from the working directory into df and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="mtcars.csv")
df.head(n=5)

Unnamed: 0,car,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


## 1. 데이터 타입(object, int, float 등)

In [7]:
# Check the data type of every column
df.dtypes

car      object
mpg     float64
cyl       int64
disp    float64
hp        int64
drat    float64
wt      float64
qsec    float64
vs        int64
am        int64
gear      int64
carb      int64
dtype: object

In [8]:
# Change the dtype of one column.
# The conversion is applied to a copy, so the original df keeps its dtypes.
df1 = df.copy().astype({'cyl': 'object'})
print(df1.dtypes)

car      object
mpg     float64
cyl      object
disp    float64
hp        int64
drat    float64
wt      float64
qsec    float64
vs        int64
am        int64
gear      int64
carb      int64
dtype: object


In [9]:
# Change the dtype of two columns with a single astype call.
# NOTE(review): 'int' resolves to the platform's default integer; the recorded
# output shows int32 for cyl (it was int64 originally). Use 'int64' if a fixed
# width is required.
df1 = df1.astype(dtype={'cyl': 'int', 'gear': 'object'})
print(df1.dtypes)

car      object
mpg     float64
cyl       int32
disp    float64
hp        int64
drat    float64
wt      float64
qsec    float64
vs        int64
am        int64
gear     object
carb      int64
dtype: object


In [10]:
# Count how many rows fall under each distinct cyl value in the converted frame
df1['cyl'].value_counts()

cyl
8    14
4    11
6     7
Name: count, dtype: int64

## 2. 기초통계량(평균, 중앙값, IQR, 표준편차 등)

### 1) 중심측도를 나타내는 값(평균, 중앙값, 최빈값)

In [11]:
# Load mtcars.csv into df again for the summary-statistics section and preview it
df = pd.read_csv('mtcars.csv')
df.head()

Unnamed: 0,car,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [12]:
df.shape  # (number of rows, number of columns)

(32, 12)

In [15]:
# Mean: arithmetic average of the mpg column
mpg_mean = df['mpg'].mean()
print(mpg_mean)

20.090625000000003


In [14]:
# Same mean, printed directly without storing it in a variable
print(df['mpg'].mean())

20.090625000000003


In [16]:
# Median: middle value of the mpg column
mpg_median = df['mpg'].median()
print(mpg_median)

19.2


In [17]:
# Mode: most frequent value of cyl.
# mode() returns a Series (there can be more than one mode), as the printed output shows.
cyl_mode = df['cyl'].mode()
print(cyl_mode)

0    8
Name: cyl, dtype: int64


In [18]:
# Extract the scalar: the entry labelled 0 is the first mode in the returned Series
print(cyl_mode[0])

8


In [19]:
# Frequency table of cyl, to cross-check the mode found above
df['cyl'].value_counts()

cyl
8    14
4    11
6     7
Name: count, dtype: int64

### 2) 산포도를 나타내는 값(분산, 표준편차, IQR, 범위(최대-최소) 등)

In [20]:
# Variance of mpg (pandas' var() uses ddof=1 by default, i.e. the sample variance)
mpg_var = df['mpg'].var()
print(mpg_var)

36.32410282258064


In [21]:
# Standard deviation of mpg (pandas' std() uses ddof=1 by default, i.e. the sample standard deviation)
mpg_std = df['mpg'].std()
print(mpg_std)

6.026948052089104


In [22]:
# IQR, step 1: first quartile (25th percentile) and third quartile (75th percentile) of mpg
Q1 = df['mpg'].quantile(q=0.25)
Q3 = df['mpg'].quantile(q=0.75)
print(Q1, Q3, sep='\n')

15.425
22.8


In [23]:
# IQR, step 2: interquartile range = Q3 - Q1 (quartiles computed in the previous cell)
IQR = Q3-Q1
print(IQR)

7.375


In [24]:
# The second quartile (50th percentile) is the median; print both to compare them
Q2 = df['mpg'].quantile(q=0.50)
print(Q2, df['mpg'].median(), sep='\n')

19.2
19.2


In [25]:
# Range = maximum - minimum
mpg_min, mpg_max = df['mpg'].min(), df['mpg'].max()
mpg_range = mpg_max - mpg_min
print(mpg_range)

23.5


### 3) 분포의 형태(왜도, 첨도)

In [26]:
# Skewness: measures asymmetry of the distribution (positive = longer right tail)
mpg_skew = df['mpg'].skew()
print(mpg_skew)

0.6723771376290805


In [27]:
# Kurtosis: pandas' kurt() reports excess kurtosis (Fisher's definition, normal distribution == 0.0)
mpg_kurt = df['mpg'].kurt()
print(mpg_kurt)

-0.0220062914240855


### 4) 기타(합계, 절댓값, 데이터 수 등)

In [28]:
# Sum of all mpg values
mpg_sum = df['mpg'].sum()
print(mpg_sum)

642.9000000000001


In [29]:
# Absolute value: Q1 - Q3 is the IQR with its sign flipped, and abs() drops the sign
IQR2 = Q1 - Q3
print(IQR2, abs(IQR2), sep='\n')

-7.375
7.375


In [30]:
# Number of observations: len() counts every entry in the column
print(len(df['mpg']))

32


### 5) 그룹화하여 계산하기 (groupby 활용)

In [32]:
!pip install seaborn

Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting matplotlib!=3.6.1,>=3.4 (from seaborn)
  Using cached matplotlib-3.8.4-cp312-cp312-win_amd64.whl.metadata (5.9 kB)
Collecting contourpy>=1.0.1 (from matplotlib!=3.6.1,>=3.4->seaborn)
  Using cached contourpy-1.2.1-cp312-cp312-win_amd64.whl.metadata (5.8 kB)
Collecting cycler>=0.10 (from matplotlib!=3.6.1,>=3.4->seaborn)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib!=3.6.1,>=3.4->seaborn)
  Using cached fonttools-4.51.0-cp312-cp312-win_amd64.whl.metadata (162 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib!=3.6.1,>=3.4->seaborn)
  Using cached kiwisolver-1.4.5-cp312-cp312-win_amd64.whl.metadata (6.5 kB)
Collecting pillow>=8 (from matplotlib!=3.6.1,>=3.4->seaborn)
  Using cached pillow-10.3.0-cp312-cp312-win_amd64.whl.metadata (9.4 kB)
Collecting pyparsing>=2.3.1 (from matplotlib!=3.6.1,>=3.4->seaborn)
  Using cached pyparsing-3.

In [33]:
# Goal of this section: compute per-species statistics for each variable.
import seaborn as sns

# From here on df refers to the iris dataset, no longer to mtcars.
df = sns.load_dataset(name='iris')
print(df.head(n=5))

   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa


In [37]:
# Mean of every measurement column, computed separately for each species
df.groupby('species').mean()

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,5.006,3.428,1.462,0.246
versicolor,5.936,2.77,4.26,1.326
virginica,6.588,2.974,5.552,2.026


In [38]:
# Median of every measurement column, computed separately for each species
df.groupby('species').median()

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
setosa,5.0,3.4,1.5,0.2
versicolor,5.9,2.8,4.35,1.3
virginica,6.5,3.0,5.55,2.0
