In [1]:
# !pip install seaborn

In [2]:
import pandas as pd

pd.__version__

'2.1.4'

## pandas의 데이터 구조
- p.237
- Series 데이터 구조, 1차원 데이터: 컬럼의 개수가 1개인 데이터
- DataFrame 데이터 구조, 컬럼 개수가 여러 개인 데이터

In [8]:
s1 = pd.Series([10, 20, 30, 40, 50])
s1

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [9]:
s1.index

RangeIndex(start=0, stop=5, step=1)

In [10]:
s1.values

array([10, 20, 30, 40, 50], dtype=int64)

In [11]:
s2 = pd.Series(['a', 'b', 'c', 1, 2, 3])
s2

0    a
1    b
2    c
3    1
4    2
5    3
dtype: object

In [12]:
import numpy as np

s3 = pd.Series([np.nan, 10, 30])  # NaN: 결측치, missing values
s3

0     NaN
1    10.0
2    30.0
dtype: float64

In [13]:
index_date = ['2018-10-07', '2018-10-08']
s4 = pd.Series([200, 195], index = index_date)
s4

2018-10-07    200
2018-10-08    195
dtype: int64

In [14]:
index_date = ['2018-10-08', '2018-10-08']
s4 = pd.Series([200, 195], index = index_date)
s4

# index 설정의 기본 원칙은 중복이 되면 안 된다. 코드 상으로는 중복 허용하더라.

2018-10-08    200
2018-10-08    195
dtype: int64

In [3]:
data_dict = {
    '국어': 100,
    '영어': 95,
    '수학': 90
}
s5 = pd.Series(data_dict)
s5

국어    100
영어     95
수학     90
dtype: int64

## 날짜 데이터

In [16]:
pd.date_range(start='2024/01/01', end='2024.01.07')

DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04',
               '2024-01-05', '2024-01-06', '2024-01-07'],
              dtype='datetime64[ns]', freq='D')

In [17]:
pd.date_range(start='2024/01/01', periods = 7)

DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04',
               '2024-01-05', '2024-01-06', '2024-01-07'],
              dtype='datetime64[ns]', freq='D')

In [18]:
pd.date_range(start='2024/01/01', periods = 4, freq = '2D')

DatetimeIndex(['2024-01-01', '2024-01-03', '2024-01-05', '2024-01-07'], dtype='datetime64[ns]', freq='2D')

In [19]:
pd.date_range(start='2024-01-01 08:00', periods = 4, freq = 'H')

DatetimeIndex(['2024-01-01 08:00:00', '2024-01-01 09:00:00',
               '2024-01-01 10:00:00', '2024-01-01 11:00:00'],
              dtype='datetime64[ns]', freq='H')

## DataFrame을 활용한 데이터 생성

In [20]:
# !pip install seaborn

In [5]:
import seaborn as sns  # 시각화 라이브러리(통계 시각화)
sns.__version__

'0.13.1'

In [22]:
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'dowjones',
 'exercise',
 'flights',
 'fmri',
 'geyser',
 'glue',
 'healthexp',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'seaice',
 'taxis',
 'tips',
 'titanic']

In [6]:
iris = sns.load_dataset('iris')
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [24]:
# p.247은 참고만 하세요
# p.249 딕셔너리 스타일... 강사님 선호
table_data = {
    '연도': [2015, 2016, 2016, 2017, 2017],
    '지사': ['한국', '한국', '미국', '한국', '미국'],
    '고객 수': [200, 250, 450, 300, 500]
}
table_data

{'연도': [2015, 2016, 2016, 2017, 2017],
 '지사': ['한국', '한국', '미국', '한국', '미국'],
 '고객 수': [200, 250, 450, 300, 500]}

In [25]:
data = pd.DataFrame(table_data)
data

Unnamed: 0,연도,지사,고객 수
0,2015,한국,200
1,2016,한국,250
2,2016,미국,450
3,2017,한국,300
4,2017,미국,500


In [26]:
data.index

RangeIndex(start=0, stop=5, step=1)

In [27]:
data.values

array([[2015, '한국', 200],
       [2016, '한국', 250],
       [2016, '미국', 450],
       [2017, '한국', 300],
       [2017, '미국', 500]], dtype=object)

In [28]:
data.values.shape

(5, 3)

- 데이터 가공 시, NumPy 메서드와 pandas 메서드를 조합해서 처리하는 경우가 많음
    + vectorization으로 처리 / 파이썬 기초문법 (for-loop) 대신
    + 속도가 매우 빠름

In [29]:
data.columns

Index(['연도', '지사', '고객 수'], dtype='object')

## 스터디
- https://pandas.pydata.org/docs/getting_started/intro_tutorials/03_subset_data.html

In [37]:
titanic = sns.load_dataset('titanic')
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


### the age of the Titanic passengers

In [31]:
ages = titanic["age"]
ages.head()

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: age, dtype: float64

In [25]:
type(titanic["age"])

pandas.core.series.Series

In [26]:
titanic["age"].shape

(891,)

### the age and sex of the Titanic passengers

In [32]:
age_sex = titanic[["age", "sex"]]
age_sex.head()

Unnamed: 0,age,sex
0,22.0,male
1,38.0,female
2,26.0,female
3,35.0,female
4,35.0,male


In [16]:
type(titanic[["age", "sex"]])

pandas.core.frame.DataFrame

In [17]:
titanic[["age", "sex"]].shape

(891, 2)

### the passengers older than 35 years

In [33]:
above_35 = titanic[titanic["age"] > 35]
above_35.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
11,1,1,female,58.0,0,0,26.55,S,First,woman,False,C,Southampton,yes,True
13,0,3,male,39.0,1,5,31.275,S,Third,man,True,,Southampton,no,False
15,1,2,female,55.0,0,0,16.0,S,Second,woman,False,,Southampton,yes,True


In [19]:
titanic["age"] > 35

0      False
1       True
2      False
3      False
4      False
       ...  
886    False
887    False
888    False
889    False
890    False
Name: age, Length: 891, dtype: bool

In [20]:
above_35.shape

(217, 15)

### the Titanic passengers from cabin class 2 and 3

In [34]:
class_23 = titanic[titanic["pclass"].isin([2, 3])]
class_23.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False


In [23]:
class_23 = titanic[(titanic["pclass"] == 2) | (titanic["pclass"] == 3)]  # | = or
class_23.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False


### passenger data for which the age is known

In [35]:
age_no_na = titanic[titanic["age"].notna()]
age_no_na.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [30]:
age_no_na.shape

(714, 15)

### rows 10 till 25 and columns 3 to 5

In [38]:
titanic.iloc[9:25, 2:5]

Unnamed: 0,sex,age,sibsp
9,female,14.0,1
10,female,4.0,1
11,female,58.0,0
12,male,20.0,0
13,male,39.0,1
14,female,14.0,0
15,female,55.0,0
16,male,2.0,4
17,male,,0
18,female,31.0,1


In [40]:
titanic.iloc[0:3, 3] = "anonymous"
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,anonymous,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,anonymous,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,anonymous,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


## 데이터 연산

In [32]:
import pandas as pd
import seaborn as sns
import numpy as np

s1 = pd.Series([1, 2, 3, 4, 5])
s2 = pd.Series([10, 20, 30, 40, 50])
s1 + s2

0    11
1    22
2    33
3    44
4    55
dtype: int64

In [33]:
s2 - s1

0     9
1    18
2    27
3    36
4    45
dtype: int64

In [34]:
s1 * s2

0     10
1     40
2     90
3    160
4    250
dtype: int64

In [35]:
s1 / s2

0    0.1
1    0.1
2    0.1
3    0.1
4    0.1
dtype: float64

In [41]:
# 리스트와 NumPy의 배열과 다르게, 서로 크기가 달라도 연산할 수 있음

s1 = pd.Series([1, 2, 3, 4])
s2 = pd.Series([10, 20, 30, 40, 50])
s1 + s2

0    11.0
1    22.0
2    33.0
3    44.0
4     NaN
dtype: float64

In [42]:
s2 - s1

0     9.0
1    18.0
2    27.0
3    36.0
4     NaN
dtype: float64

In [38]:
s1 * s2

0     10.0
1     40.0
2     90.0
3    160.0
4      NaN
dtype: float64

In [39]:
s1 / s2

0    0.1
1    0.1
2    0.1
3    0.1
4    NaN
dtype: float64

In [40]:
table_data1 = {'A': [1, 2, 3, 4, 5],
              'B': [10, 20, 30, 40, 50],
              'C': [100, 200, 300, 400, 500]}
df1 = pd.DataFrame(table_data1)
df1

Unnamed: 0,A,B,C
0,1,10,100
1,2,20,200
2,3,30,300
3,4,40,400
4,5,50,500


In [41]:
table_data2 = {'A': [6, 7, 8],
              'B': [60, 70, 80],
              'C': [600, 700, 800]}
df2 = pd.DataFrame(table_data2)
df2

Unnamed: 0,A,B,C
0,6,60,600
1,7,70,700
2,8,80,800


In [42]:
df1 + df2

Unnamed: 0,A,B,C
0,7.0,70.0,700.0
1,9.0,90.0,900.0
2,11.0,110.0,1100.0
3,,,
4,,,


In [43]:
df1 - df2

Unnamed: 0,A,B,C
0,-5.0,-50.0,-500.0
1,-5.0,-50.0,-500.0
2,-5.0,-50.0,-500.0
3,,,
4,,,


In [44]:
df1 * df2

Unnamed: 0,A,B,C
0,6.0,600.0,60000.0
1,14.0,1400.0,140000.0
2,24.0,2400.0,240000.0
3,,,
4,,,


In [45]:
df1 / df2

Unnamed: 0,A,B,C
0,0.166667,0.166667,0.166667
1,0.285714,0.285714,0.285714
2,0.375,0.375,0.375
3,,,
4,,,


In [46]:
table_data3 = {'봄':  [256.5, 264.3, 215.9, 223.2, 312.8],
              '여름': [770.6, 567.5, 599.8, 387.1, 446.2],
              '가을': [363.5, 231.2, 293.1, 247.7, 381.6],
              '겨울': [139.3, 59.9, 76.9, 109.1, 108.1]}
columns_list = ['봄', '여름', '가을', '겨울']
index_list = ['2012', '2013', '2014', '2015', '2016']

df3 = pd.DataFrame(table_data3, columns = columns_list, index = index_list)
df3

Unnamed: 0,봄,여름,가을,겨울
2012,256.5,770.6,363.5,139.3
2013,264.3,567.5,231.2,59.9
2014,215.9,599.8,293.1,76.9
2015,223.2,387.1,247.7,109.1
2016,312.8,446.2,381.6,108.1


In [47]:
df3.mean()

봄     254.54
여름    554.24
가을    303.42
겨울     98.66
dtype: float64

In [48]:
df3.max()

봄     312.8
여름    770.6
가을    381.6
겨울    139.3
dtype: float64

In [49]:
df3.min()  # 최솟값

봄     215.9
여름    387.1
가을    231.2
겨울     59.9
dtype: float64

In [50]:
df3.std()

봄      38.628267
여름    148.888895
가을     67.358496
겨울     30.925523
dtype: float64

In [51]:
df3.mean(axis=1)  # axis=1: 행 단위 집계 연산

2012    382.475
2013    280.725
2014    296.425
2015    241.775
2016    312.175
dtype: float64

In [52]:
df3.max(axis=1)

2012    770.6
2013    567.5
2014    599.8
2015    387.1
2016    446.2
dtype: float64

In [53]:
df3.std(axis=1)

2012    274.472128
2013    211.128782
2014    221.150739
2015    114.166760
2016    146.548658
dtype: float64

In [54]:
df3.mean(axis=1)

2012    382.475
2013    280.725
2014    296.425
2015    241.775
2016    312.175
dtype: float64

In [55]:
df3.describe()

Unnamed: 0,봄,여름,가을,겨울
count,5.0,5.0,5.0,5.0
mean,254.54,554.24,303.42,98.66
std,38.628267,148.888895,67.358496,30.925523
min,215.9,387.1,231.2,59.9
25%,223.2,446.2,247.7,76.9
50%,256.5,567.5,293.1,108.1
75%,264.3,599.8,363.5,109.1
max,312.8,770.6,381.6,139.3


In [44]:
import pandas as pd
import numpy as np

KTX_data = {'경부선 KTX': [39060, 39896, 42005, 43621, 41702, 41266, 32427],
            '호남선 KTX': [7313, 6967, 6873, 6626, 8675, 10622, 9228],
            '경전선 KTX': [3627, 4168, 4088, 4424, 4606, 4984, 5570],
            '전라선 KTX': [309, 1771, 1954, 2244, 3146, 3945, 5766],
            '동해선 KTX': [np.nan,np.nan, np.nan, np.nan, 2395, 3786, 6667]}
index_list = ['2011', '2012', '2013', '2014', '2015', '2016', '2017']

df_KTX = pd.DataFrame(KTX_data, index = index_list)
df_KTX

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX
2011,39060,7313,3627,309,
2012,39896,6967,4168,1771,
2013,42005,6873,4088,1954,
2014,43621,6626,4424,2244,
2015,41702,8675,4606,3146,2395.0
2016,41266,10622,4984,3945,3786.0
2017,32427,9228,5570,5766,6667.0


In [45]:
# p.259
df_KTX.head()

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX
2011,39060,7313,3627,309,
2012,39896,6967,4168,1771,
2013,42005,6873,4088,1954,
2014,43621,6626,4424,2244,
2015,41702,8675,4606,3146,2395.0


In [46]:
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.head.html#pandas.DataFrame.head

In [47]:
df_KTX.head(3)

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX
2011,39060,7313,3627,309,
2012,39896,6967,4168,1771,
2013,42005,6873,4088,1954,


In [48]:
df_KTX.tail(3)

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX
2015,41702,8675,4606,3146,2395.0
2016,41266,10622,4984,3945,3786.0
2017,32427,9228,5570,5766,6667.0


## 행 선택(= 행 추출)
- Slicing과 동일한 문법으로 추출 가능

In [49]:
df_KTX[0:2]

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX
2011,39060,7313,3627,309,
2012,39896,6967,4168,1771,


In [50]:
df_KTX[2:5]

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX
2013,42005,6873,4088,1954,
2014,43621,6626,4424,2244,
2015,41702,8675,4606,3146,2395.0


## loc vs iloc를 활용한 열 추출, 행 추출
- 두 가지의 차이 기억하기
- 한 가지만 기억하고자 한다면, loc만 기억하자

In [53]:
# df_KTX.loc[행, 열]

In [52]:
df_KTX.loc['2011']

경부선 KTX    39060.0
호남선 KTX     7313.0
경전선 KTX     3627.0
전라선 KTX      309.0
동해선 KTX        NaN
Name: 2011, dtype: float64

In [54]:
df_KTX.loc['2013':'2016']

Unnamed: 0,경부선 KTX,호남선 KTX,경전선 KTX,전라선 KTX,동해선 KTX
2013,42005,6873,4088,1954,
2014,43621,6626,4424,2244,
2015,41702,8675,4606,3146,2395.0
2016,41266,10622,4984,3945,3786.0


In [57]:
df_KTX['경부선 KTX']

2011    39060
2012    39896
2013    42005
2014    43621
2015    41702
2016    41266
2017    32427
Name: 경부선 KTX, dtype: int64

In [64]:
df_KTX.loc['2013':'2016', '경부선 KTX']

2013    42005
2014    43621
2015    41702
2016    41266
Name: 경부선 KTX, dtype: int64

In [65]:
df_KTX.loc['2013':'2016', ['경부선 KTX', '호남선 KTX']]

Unnamed: 0,경부선 KTX,호남선 KTX
2013,42005,6873
2014,43621,6626
2015,41702,8675
2016,41266,10622


In [66]:
df_KTX.loc['2013':'2016', ['호남선 KTX']]

Unnamed: 0,호남선 KTX
2013,6873
2014,6626
2015,8675
2016,10622


In [67]:
# df_KTX.loc[행조건식, [컬럼명]]

In [68]:
iris['sepal_length'] >= 7

0      False
1      False
2      False
3      False
4      False
       ...  
145    False
146    False
147    False
148    False
149    False
Name: sepal_length, Length: 150, dtype: bool

In [69]:
iris = sns.load_dataset('iris')

# sepal_length의 길이가 7cm 이상인 것만 조회
iris.loc[iris['sepal_length'] >= 7 , :]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
50,7.0,3.2,4.7,1.4,versicolor
102,7.1,3.0,5.9,2.1,virginica
105,7.6,3.0,6.6,2.1,virginica
107,7.3,2.9,6.3,1.8,virginica
109,7.2,3.6,6.1,2.5,virginica
117,7.7,3.8,6.7,2.2,virginica
118,7.7,2.6,6.9,2.3,virginica
122,7.7,2.8,6.7,2.0,virginica
125,7.2,3.2,6.0,1.8,virginica
129,7.2,3.0,5.8,1.6,virginica


In [70]:
# species가 virginica인 것만 조회
iris.loc[iris['species'] == "virginica" , :]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
100,6.3,3.3,6.0,2.5,virginica
101,5.8,2.7,5.1,1.9,virginica
102,7.1,3.0,5.9,2.1,virginica
103,6.3,2.9,5.6,1.8,virginica
104,6.5,3.0,5.8,2.2,virginica
105,7.6,3.0,6.6,2.1,virginica
106,4.9,2.5,4.5,1.7,virginica
107,7.3,2.9,6.3,1.8,virginica
108,6.7,2.5,5.8,1.8,virginica
109,7.2,3.6,6.1,2.5,virginica


In [71]:
titanic = sns.load_dataset('titanic')
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [72]:
# embark_town 칼럼에서 Southampton 거주민의 모든 데이터를 조회하세요
titanic.loc[titanic['embark_town'] == "Southampton" , :]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
883,0,2,male,28.0,0,0,10.5000,S,Second,man,True,,Southampton,no,True
884,0,3,male,25.0,0,0,7.0500,S,Third,man,True,,Southampton,no,True
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True


In [73]:
# age 평균 구하고, 평균 이상인 데이터만 조회하기
titanic['age'].mean()

29.69911764705882

In [74]:
titanic.loc[titanic['age'] >= titanic['age'].mean() , :]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
11,1,1,female,58.0,0,0,26.5500,S,First,woman,False,C,Southampton,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
873,0,3,male,47.0,0,0,9.0000,S,Third,man,True,,Southampton,no,True
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False
881,0,3,male,33.0,0,0,7.8958,S,Third,man,True,,Southampton,no,True
885,0,3,female,39.0,0,5,29.1250,Q,Third,woman,False,,Queenstown,no,False


In [75]:
var = 'age'
titanic.loc[titanic[var] >= titanic[var].mean() , :]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
11,1,1,female,58.0,0,0,26.5500,S,First,woman,False,C,Southampton,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
873,0,3,male,47.0,0,0,9.0000,S,Third,man,True,,Southampton,no,True
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False
881,0,3,male,33.0,0,0,7.8958,S,Third,man,True,,Southampton,no,True
885,0,3,female,39.0,0,5,29.1250,Q,Third,woman,False,,Queenstown,no,False


In [76]:
tips = sns.load_dataset('tips')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [80]:
# total_bill의 중간값 이하인 것만 조회
# 중간값: 서열을 매겼을 때 가운데의 값

var = 'total_bill'
tips.loc[tips[var] <= tips[var].median(), :]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
6,8.77,2.00,Male,No,Sun,Dinner,2
8,15.04,1.96,Male,No,Sun,Dinner,2
9,14.78,3.23,Male,No,Sun,Dinner,2
...,...,...,...,...,...,...,...
232,11.61,3.39,Male,No,Sat,Dinner,2
233,10.77,1.47,Male,No,Sat,Dinner,2
234,15.53,3.00,Male,Yes,Sat,Dinner,2
235,10.07,1.25,Male,No,Sat,Dinner,2


In [79]:
# day가 Sun인 데이터만 조회하기

tips.loc[tips['day'] == "Sun", :]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
186,20.90,3.50,Female,Yes,Sun,Dinner,3
187,30.46,2.00,Male,Yes,Sun,Dinner,5
188,18.15,3.50,Female,Yes,Sun,Dinner,3
189,23.10,4.00,Male,Yes,Sun,Dinner,3


In [89]:
# tip의 평균보다 크면서, time이 Dinner인 데이터만 조회
# pandas의 다중 조건 행 추출
# tips.loc[(조건1) & (조건2), :]
tips.loc[(tips['tip'] >= tips['tip'].mean()) & (tips['time'] == "Dinner"), :]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
5,25.29,4.71,Male,No,Sun,Dinner,4
7,26.88,3.12,Male,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
232,11.61,3.39,Male,No,Sat,Dinner,2
234,15.53,3.00,Male,Yes,Sat,Dinner,2
238,35.83,4.67,Female,No,Sat,Dinner,3
239,29.03,5.92,Male,No,Sat,Dinner,3


In [101]:
# 다중조건 예시
# &, AND 조건 / |, OR 조건
tips.loc[(tips['tip'] >= tips['tip'].mean()) &
    (tips['time'] == "Dinner") &
    (tips['size'] == 3) &
    (tips['sex'] == "Male") &
    (tips['day'] == "Sat"), :]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
19,20.65,3.35,Male,No,Sat,Dinner,3
35,24.06,3.6,Male,No,Sat,Dinner,3
39,31.27,5.0,Male,No,Sat,Dinner,3
65,20.08,3.15,Male,No,Sat,Dinner,3
170,50.81,10.0,Male,Yes,Sat,Dinner,3
206,26.59,3.41,Male,Yes,Sat,Dinner,3
231,15.69,3.0,Male,Yes,Sat,Dinner,3
239,29.03,5.92,Male,No,Sat,Dinner,3
