# 라이브러리 가져오기

In [10]:
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib as mpl

# Print the version of each imported library, one per line.
for library in (np, pd, sns, mpl):
    print(library.__version__)

2.2.4
2.2.3
0.13.2
3.10.1


# 샘플데이터 가져오기

In [16]:
# Load seaborn's "iris" example dataset.
iris = sns.load_dataset("iris")
# Preview only the first row to check the column layout.
iris.head(n=1)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa


In [20]:
# Load seaborn's "tips" example dataset.
tips = sns.load_dataset("tips")
# Preview only the first row to check the column layout.
tips.head(n=1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2


# 결측치 확인
- ## 데이터가 비어 있나?

In [17]:
# Count the missing values in each column (isna is the same check as isnull).
iris.isna().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [18]:
# (rows, columns) of the iris DataFrame
iris.shape

(150, 5)

In [21]:
# (rows, columns) of the tips DataFrame
tips.shape

(244, 7)

In [35]:
# Selecting a single column by name
a = tips['day']
type(a)          # Series

pandas.core.series.Series

In [37]:
type(tips)      # DataFrame

pandas.core.frame.DataFrame

In [38]:
# Number of rows for each value of the 'day' column
tips['day'].value_counts()

day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64

# 상위 5개만 보기
- https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.nlargest.html
- 관련 메서드
    + DataFrame.nsmallest
    + DataFrame.sort_values
    + DataFrame.head


In [40]:
# Take the 5 rows with the largest sepal_length values,
# ordered from largest to smallest.
iris.nlargest(n=5, columns="sepal_length")

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
131,7.9,3.8,6.4,2.0,virginica
117,7.7,3.8,6.7,2.2,virginica
118,7.7,2.6,6.9,2.3,virginica
122,7.7,2.8,6.7,2.0,virginica
135,7.7,3.0,6.1,2.3,virginica


# 필터링
- NumPy와 문법 동일

In [42]:
# Compute the mean of the 'tip' column, then keep only the rows whose
# tip is above that mean.
mean_tip = tips['tip'].mean()
# Boolean-mask filtering, same syntax as NumPy: a[a > 12]
tips[tips['tip'] > mean_tip]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
5,25.29,4.71,Male,No,Sun,Dinner,4
7,26.88,3.12,Male,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
232,11.61,3.39,Male,No,Sat,Dinner,2
234,15.53,3.00,Male,Yes,Sat,Dinner,2
238,35.83,4.67,Female,No,Sat,Dinner,3
239,29.03,5.92,Male,No,Sat,Dinner,3


In [45]:
# The mean can also be computed inline, in a single expression.
tips.loc[tips['tip'].gt(tips['tip'].mean())].head(n=1)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
2,21.01,3.5,Male,No,Sun,Dinner,3


- ### smoker가 No인 것만 조회

In [52]:
# Check which values the 'day' column holds, and how often each occurs
tips['day'].value_counts()

day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64

In [49]:
# NOTE: a notebook cell renders only its last expression, so the first two
# filters below are evaluated but not shown.

# smoker is 'No'
tips[tips['smoker'] == 'No']

# day is Saturday
tips[tips['day'] == 'Sat']

# time is dinner
tips[tips['time'] == 'Dinner']

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [54]:
# Rows where day is Saturday. The original row labels are kept, so the
# result starts at index 19; resetting the index would renumber it from 0.
tips.loc[tips['day'].eq('Sat')]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
19,20.65,3.35,Male,No,Sat,Dinner,3
20,17.92,4.08,Male,No,Sat,Dinner,2
21,20.29,2.75,Female,No,Sat,Dinner,2
22,15.77,2.23,Female,No,Sat,Dinner,2
23,39.42,7.58,Male,No,Sat,Dinner,4
...,...,...,...,...,...,...,...
238,35.83,4.67,Female,No,Sat,Dinner,3
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2


In [56]:
# Reset the index so the row labels are renumbered from 0;
# drop=True discards the old labels instead of keeping them as a column.
(
    tips.loc[tips['day'] == 'Sat']
    .reset_index(drop=True)
)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,20.65,3.35,Male,No,Sat,Dinner,3
1,17.92,4.08,Male,No,Sat,Dinner,2
2,20.29,2.75,Female,No,Sat,Dinner,2
3,15.77,2.23,Female,No,Sat,Dinner,2
4,39.42,7.58,Male,No,Sat,Dinner,4
...,...,...,...,...,...,...,...
82,35.83,4.67,Female,No,Sat,Dinner,3
83,29.03,5.92,Male,No,Sat,Dinner,3
84,27.18,2.00,Female,Yes,Sat,Dinner,2
85,22.67,2.00,Male,Yes,Sat,Dinner,2


# loc vs iloc
- 코드 비교

## loc

In [60]:
# A boolean mask also works as the row selector: tips.loc[tips['day'] == 'Sat']
# tips.loc[rows, columns] - label-based; the end of a slice IS included
tips.loc[0:2, ['total_bill', 'tip', 'day']]

Unnamed: 0,total_bill,tip,day
0,16.99,1.01,Sun
1,10.34,1.66,Sun
2,21.01,3.5,Sun


In [61]:
# iloc - position-based; the end of a slice is NOT included
tips.iloc[0:1, [0, 1, 4]]

Unnamed: 0,total_bill,tip,day
0,16.99,1.01,Sun


In [63]:
# Saturday rows, restricted to three columns, with the index renumbered from 0.
(
    tips.loc[tips['day'].eq('Sat'), ['total_bill', 'tip', 'day']]
    .reset_index(drop=True)
)

Unnamed: 0,total_bill,tip,day
0,20.65,3.35,Sat
1,17.92,4.08,Sat
2,20.29,2.75,Sat
3,15.77,2.23,Sat
4,39.42,7.58,Sat
...,...,...,...
82,35.83,4.67,Sat
83,29.03,5.92,Sat
84,27.18,2.00,Sat
85,22.67,2.00,Sat


In [65]:
# Select everything: all rows and all columns
tips.loc[:,:]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [68]:
# Rows whose total_bill is 11 or less; show the first five after
# renumbering the index from 0.
(
    tips.loc[tips['total_bill'].le(11)]
    .reset_index(drop=True)
    .head()
)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,10.34,1.66,Male,No,Sun,Dinner,3
1,8.77,2.0,Male,No,Sun,Dinner,2
2,10.27,1.71,Male,No,Sun,Dinner,2
3,10.33,1.67,Female,No,Sun,Dinner,3
4,9.55,1.45,Male,No,Sat,Dinner,2


In [69]:
# Rows whose time is Dinner, selected through loc; show the first five
# after renumbering the index from 0.
(
    tips.loc[tips['time'].eq('Dinner'), :]
    .reset_index(drop=True)
    .head()
)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [84]:
# Goal: rows where time is Dinner AND total_bill is 11 or less.

# Approach 1 - filter in two steps (this intermediate value is not rendered,
# because a cell shows only its last expression).
result = tips.loc[tips['total_bill'].le(11)]
result.loc[result['time'].eq('Dinner')].reset_index(drop=True).head()

# Approach 2 - combine both conditions with &:
#   tips.loc[(condition 1) & (condition 2), :]
(
    tips.loc[tips['time'].eq('Dinner') & tips['total_bill'].le(11), :]
    .reset_index(drop=True)
    .head()
)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,10.34,1.66,Male,No,Sun,Dinner,3
1,8.77,2.0,Male,No,Sun,Dinner,2
2,10.27,1.71,Male,No,Sun,Dinner,2
3,10.33,1.67,Female,No,Sun,Dinner,3
4,9.55,1.45,Male,No,Sat,Dinner,2


In [93]:
# iris: keep rows where the species is virginica OR sepal_length is at
# least 5 (conditions joined with |), and return only the sepal_length,
# petal_length and species columns.
(
    iris.loc[
        iris['species'].eq('virginica') | iris['sepal_length'].ge(5),
        ['sepal_length', 'petal_length', 'species'],
    ]
    .reset_index(drop=True)
    .head()
)

Unnamed: 0,sepal_length,petal_length,species
0,5.1,1.4,setosa
1,5.0,1.4,setosa
2,5.4,1.7,setosa
3,5.0,1.5,setosa
4,5.4,1.5,setosa


# 파일 입출력
- csv
- excel

In [100]:
import seaborn as sns
import pandas as pd

# Reload iris and keep only the sepal_length and species columns.
iris = sns.load_dataset('iris')
export_columns = ['sepal_length', 'species']
result = iris.loc[:, export_columns]
result

Unnamed: 0,sepal_length,species
0,5.1,setosa
1,4.9,setosa
2,4.7,setosa
3,4.6,setosa
4,5.0,setosa
...,...,...
145,6.7,virginica
146,6.3,virginica
147,6.5,virginica
148,6.2,virginica


# CSV

In [101]:
# Export to CSV in the current working directory; index=False leaves the
# row labels out of the file.
result.to_csv(path_or_buf='iris_result.csv', index=False)

In [103]:
# Save the CSV inside the lab01 folder.
# to_csv does not create missing directories, so make sure the folder
# exists first (a no-op when it is already there).
from pathlib import Path

Path('lab01').mkdir(parents=True, exist_ok=True)
result.to_csv('lab01/iris_result.csv', index=False)

In [104]:
# Save the CSV inside the dataset folder.
# to_csv does not create missing directories, so make sure the folder
# exists first (a no-op when it is already there).
from pathlib import Path

Path('dataset').mkdir(parents=True, exist_ok=True)
result.to_csv('dataset/iris_result.csv', index=False)

In [106]:
# Load a file - read iris_result.csv back from the dataset folder
iris_df = pd.read_csv('dataset/iris_result.csv')
iris_df

Unnamed: 0,sepal_length,species
0,5.1,setosa
1,4.9,setosa
2,4.7,setosa
3,4.6,setosa
4,5.0,setosa
...,...,...
145,6.7,virginica
146,6.3,virginica
147,6.5,virginica
148,6.2,virginica
