# Chapter 2: DataFrame 필수 연산

In [3]:
import pandas as pd
import numpy as np
pd.options.display.max_columns = 40

## DataFrame에서 복수의 열 선택
- 방법 1. DataFrame[[사용할 컬럼들]]
- 방법 2. 방법 1과 동일하지만, 이중 대괄호 안에 열 이름을 길게 나열하고 싶지 않을 때 사용 -> 아래 '사용할 column을 list로 미리 지정 후 indexing' 절에서 다루는 방식
    - cols = [사용할 컬럼들]
    - DataFrame[cols]

In [4]:
movie = pd.read_csv(r'C:\Users\user\jupyterpractice\EDA\Pandas-Cookbook-master\data\movie.csv')
movie_actor_director = movie[['actor_1_name', 'actor_2_name', 'actor_3_name', 'director_name']]  # Note: the column names must be wrapped in a list (double brackets); without it the selection raises an error.
# Equivalent alternative: build cols = [columns to select] first, then index with movie[cols].
movie_actor_director.head()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,director_name
0,CCH Pounder,Joel David Moore,Wes Studi,James Cameron
1,Johnny Depp,Orlando Bloom,Jack Davenport,Gore Verbinski
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Sam Mendes
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Christopher Nolan
4,Doug Walker,Rob Walker,,Doug Walker


In [5]:
movie[['director_name']].head()

Unnamed: 0,director_name
0,James Cameron
1,Gore Verbinski
2,Sam Mendes
3,Christopher Nolan
4,Doug Walker


## 기존에 쓰던 방식) 사용할 column을 list로 미리 지정 후 indexing

In [8]:
cols =['actor_1_name', 'actor_2_name', 'actor_3_name', 'director_name']
movie_actor_director = movie[cols]
movie_actor_director.head(3)

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,director_name
0,CCH Pounder,Joel David Moore,Wes Studi,James Cameron
1,Johnny Depp,Orlando Bloom,Jack Davenport,Gore Verbinski
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Sam Mendes


## ★★메서드를 사용한 열 선택
- select_dtypes & filter 두 가지 메서드 존재
- 1. select_dtypes
    - select_dtypes : 특정 데이터 타입의 열만 뽑아 낼 수 있음
    - dataframe.select_dtypes(include=['data type'])
- 2. filter
    - filter method : 사용된 매개변수에 따라 열 이름 or 인덱스 레이블을 검색
    - 매개변수
        - like : 문자열 검색
        - regex : 정규식 검색
        - **items : 정확한 열 이름의 리스트를 입력으로 받아들임**
            - items는 columns 명칭으로 구성된 cols list로 dataframe[cols]하는 방식과 동일
            - but, dataframe[cols]는 컬럼명 잘못 입력 -> Error
            - filter method + items 매개변수 -> 컬럼명 잘못 입력해도 Error X. 잘 입력된 것만 extraction
    - dataframe.filter(like='검색하려는 문자열') -> like 매개변수로 전달된 문자열을 포함한 column을 모두 추출

In [15]:
# data set loading & index를 movie_title로 해보기
movie = pd.read_csv(r'C:\Users\user\jupyterpractice\EDA\Pandas-Cookbook-master\data\movie.csv', index_col='movie_title')
movie.dtypes.value_counts()

float64    13
object     11
int64       3
dtype: int64

In [7]:
# select_dtypes method: extract only the columns of the requested data type(s).
# Here: select only the integer columns.
movie.select_dtypes(include=['int']).head()

Unnamed: 0_level_0,num_voted_users,cast_total_facebook_likes,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Avatar,886204,4834,33000
Pirates of the Caribbean: At World's End,471220,48350,0
Spectre,275868,11700,85000
The Dark Knight Rises,1144337,106759,164000
Star Wars: Episode VII - The Force Awakens,8,143,0


In [8]:
# 숫자로된 모든 열을 선택하려면 include 매개변수에 'number' 전달
movie.select_dtypes(include=['number']).head()

Unnamed: 0_level_0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Avatar,723.0,178.0,0.0,855.0,1000.0,760505847.0,886204,4834,0.0,3054.0,237000000.0,2009.0,936.0,7.9,1.78,33000
Pirates of the Caribbean: At World's End,302.0,169.0,563.0,1000.0,40000.0,309404152.0,471220,48350,0.0,1238.0,300000000.0,2007.0,5000.0,7.1,2.35,0
Spectre,602.0,148.0,0.0,161.0,11000.0,200074175.0,275868,11700,1.0,994.0,245000000.0,2015.0,393.0,6.8,2.35,85000
The Dark Knight Rises,813.0,164.0,22000.0,23000.0,27000.0,448130642.0,1144337,106759,0.0,2701.0,250000000.0,2012.0,23000.0,8.5,2.35,164000
Star Wars: Episode VII - The Force Awakens,,,131.0,,131.0,,8,143,0.0,,,,12.0,7.1,,0


- filter method : 사용된 매개변수에 따라 열 이름 or 인덱스 레이블을 검색

In [16]:
# like 매개변수에 입력한 키워드 -> 열 or index에 해당 키워드가 포함되면 추출
movie.filter(like='facebook').head()

Unnamed: 0_level_0,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,cast_total_facebook_likes,actor_2_facebook_likes,movie_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Avatar,0.0,855.0,1000.0,4834,936.0,33000
Pirates of the Caribbean: At World's End,563.0,1000.0,40000.0,48350,5000.0,0
Spectre,0.0,161.0,11000.0,11700,393.0,85000
The Dark Knight Rises,22000.0,23000.0,27000.0,106759,23000.0,164000
Star Wars: Episode VII - The Force Awakens,131.0,,131.0,143,12.0,0


In [17]:
# The regex parameter of filter selects columns whose names match a regular expression.
# The pattern is written as a raw string so the backslash reaches the regex engine
# unchanged; a plain '\d' is an invalid string escape that newer Python versions warn about.
movie.filter(regex=r'\d').head()  # r'\d': every column whose name contains a digit

Unnamed: 0_level_0,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,actor_1_name,actor_3_name,actor_2_facebook_likes
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Avatar,855.0,Joel David Moore,1000.0,CCH Pounder,Wes Studi,936.0
Pirates of the Caribbean: At World's End,1000.0,Orlando Bloom,40000.0,Johnny Depp,Jack Davenport,5000.0
Spectre,161.0,Rory Kinnear,11000.0,Christoph Waltz,Stephanie Sigman,393.0
The Dark Knight Rises,23000.0,Christian Bale,27000.0,Tom Hardy,Joseph Gordon-Levitt,23000.0
Star Wars: Episode VII - The Force Awakens,,Rob Walker,131.0,Doug Walker,,12.0


In [19]:
# items 매개변수로는 list type의 입력이 들어감 -> 정확한 열 이름이 들어감
# 장점 : dataframe[cols]는 잘못된 column명이 있으면 error -> but filter는 error없이 정확히 일치하는 column만 데려옴
movie.filter(items=['actor_1_name', 'asdf']).head()

Unnamed: 0_level_0,actor_1_name
movie_title,Unnamed: 1_level_1
Avatar,CCH Pounder
Pirates of the Caribbean: At World's End,Johnny Depp
Spectre,Christoph Waltz
The Dark Knight Rises,Tom Hardy
Star Wars: Episode VII - The Force Awakens,Doug Walker


## 열 이름 일목요연하게 정렬하기
- 가이드 라인
    - 1. 각 열을 연속 / 불연속에 따라 분류
    - 2. 연속 / 불연속 내에서 공통적인 열은 그룹으로 만들기
    - 3. 그룹 내 가장 중요한 열이 가장 먼저 나오게 하고, 범주형 열을 연속형보다 먼저 나오게 하기
- 추가로 볼 논문 : Tidy Data (http://bit.ly/2v1hvH5)

In [25]:
movie = pd.read_csv(r'C:\Users\user\jupyterpractice\EDA\Pandas-Cookbook-master\data\movie.csv')

In [26]:
movie.head(2)

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,actor_1_name,movie_title,num_voted_users,cast_total_facebook_likes,actor_3_name,facenumber_in_poster,plot_keywords,movie_imdb_link,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,886204,4834,Wes Studi,0.0,avatar|future|marine|native|paraplegic,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End,471220,48350,Jack Davenport,0.0,goddess|marriage ceremony|marriage proposal|pi...,http://www.imdb.com/title/tt0449088/?ref_=fn_t...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0


In [27]:
movie.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [28]:
# Discrete (categorical) column groups
disc_core = [
    'movie_title',
    'title_year',
    'content_rating',
    'genres',
]
disc_people = [
    'director_name',
    'actor_1_name',
    'actor_2_name',
    'actor_3_name',
]
disc_other = [
    'color',
    'country',
    'language',
    'plot_keywords',
    'movie_imdb_link',
]

# Continuous (numeric) column groups
cont_fb = [
    'director_facebook_likes',
    'actor_1_facebook_likes',
    'actor_2_facebook_likes',
    'actor_3_facebook_likes',
    'cast_total_facebook_likes',
    'movie_facebook_likes',
]
cont_finance = [
    'budget',
    'gross',
]
cont_num_reviews = [
    'num_voted_users',
    'num_user_for_reviews',
    'num_critic_for_reviews',
]
cont_other = [
    'imdb_score',
    'duration',
    'aspect_ratio',
    'facenumber_in_poster',
]

In [29]:
# Concatenate the groups: discrete columns first, continuous columns after.
new_col_order = (
    disc_core + disc_people + disc_other
    + cont_fb + cont_finance + cont_num_reviews + cont_other
)

# Sets are unordered, so set equality only checks that both sides hold the same
# members; this confirms that no column was left out of the new ordering.
set(movie.columns) == set(new_col_order)

True

In [30]:
movie2 = movie[new_col_order]
movie2.head()

Unnamed: 0,movie_title,title_year,content_rating,genres,director_name,actor_1_name,actor_2_name,actor_3_name,color,country,language,plot_keywords,movie_imdb_link,director_facebook_likes,actor_1_facebook_likes,actor_2_facebook_likes,actor_3_facebook_likes,cast_total_facebook_likes,movie_facebook_likes,budget,gross,num_voted_users,num_user_for_reviews,num_critic_for_reviews,imdb_score,duration,aspect_ratio,facenumber_in_poster
0,Avatar,2009.0,PG-13,Action|Adventure|Fantasy|Sci-Fi,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Color,USA,English,avatar|future|marine|native|paraplegic,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,0.0,1000.0,936.0,855.0,4834,33000,237000000.0,760505847.0,886204,3054.0,723.0,7.9,178.0,1.78,0.0
1,Pirates of the Caribbean: At World's End,2007.0,PG-13,Action|Adventure|Fantasy,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Color,USA,English,goddess|marriage ceremony|marriage proposal|pi...,http://www.imdb.com/title/tt0449088/?ref_=fn_t...,563.0,40000.0,5000.0,1000.0,48350,0,300000000.0,309404152.0,471220,1238.0,302.0,7.1,169.0,2.35,0.0
2,Spectre,2015.0,PG-13,Action|Adventure|Thriller,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Color,UK,English,bomb|espionage|sequel|spy|terrorist,http://www.imdb.com/title/tt2379713/?ref_=fn_t...,0.0,11000.0,393.0,161.0,11700,85000,245000000.0,200074175.0,275868,994.0,602.0,6.8,148.0,2.35,1.0
3,The Dark Knight Rises,2012.0,PG-13,Action|Thriller,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Color,USA,English,deception|imprisonment|lawlessness|police offi...,http://www.imdb.com/title/tt1345836/?ref_=fn_t...,22000.0,27000.0,23000.0,23000.0,106759,164000,250000000.0,448130642.0,1144337,2701.0,813.0,8.5,164.0,2.35,0.0
4,Star Wars: Episode VII - The Force Awakens,,,Documentary,Doug Walker,Doug Walker,Rob Walker,,,,,,http://www.imdb.com/title/tt5289954/?ref_=fn_t...,131.0,131.0,12.0,,143,0,,,8,,,7.1,,,0.0


## 전체 DataFrame에 대한 연산

In [31]:
pd.options.display.max_rows = 8
movie = pd.read_csv(r'C:\Users\user\jupyterpractice\EDA\Pandas-Cookbook-master\data\movie.csv')
movie.shape

(4916, 28)

In [32]:
movie.size

137648

In [33]:
# numpy에서 ndim은 차원의 수를 반환함
movie.ndim

2

In [34]:
# len은 row의 수를 반환함
len(movie)

4916

In [35]:
# count method로 각 열의 누락값을 제외한 실제 값의 개수를 알아낸다.
movie.count()

color                     4897
director_name             4814
num_critic_for_reviews    4867
duration                  4901
                          ... 
actor_2_facebook_likes    4903
imdb_score                4916
aspect_ratio              4590
movie_facebook_likes      4916
Length: 28, dtype: int64

In [36]:
movie.min()

num_critic_for_reviews        1
duration                      7
director_facebook_likes       0
actor_3_facebook_likes        0
                           ... 
actor_2_facebook_likes        0
imdb_score                  1.6
aspect_ratio               1.18
movie_facebook_likes          0
Length: 19, dtype: object

In [37]:
# 위의 모든 descriptive statistics(기술적인 통계)를 반환하는 describe() method
# 결과는 descriptive statistics를 index로 가지는 DataFrame
movie.describe()

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
count,4867.0,4901.0,4814.0,4893.0,4909.0,4054.0,4916.0,4916.0,4903.0,4895.0,4432.0,4810.0,4903.0,4916.0,4590.0,4916.0
mean,137.988905,107.090798,691.014541,631.276313,6494.488491,47644510.0,82644.92,9579.815907,1.37732,267.668846,36547490.0,2002.447609,1621.923516,6.437429,2.222349,7348.294142
std,120.239379,25.286015,2832.954125,1625.874802,15106.986884,67372550.0,138322.2,18164.31699,2.023826,372.934839,100242700.0,12.453977,4011.299523,1.127802,1.40294,19206.016458
min,1.0,7.0,0.0,0.0,0.0,162.0,5.0,0.0,0.0,1.0,218.0,1916.0,0.0,1.6,1.18,0.0
25%,49.0,93.0,7.0,132.0,607.0,5019656.0,8361.75,1394.75,0.0,64.0,6000000.0,1999.0,277.0,5.8,1.85,0.0
50%,108.0,103.0,48.0,366.0,982.0,25043960.0,33132.5,3049.0,1.0,153.0,19850000.0,2005.0,593.0,6.6,2.35,159.0
75%,191.0,118.0,189.75,633.0,11000.0,61108410.0,93772.75,13616.75,2.0,320.5,43000000.0,2011.0,912.0,7.2,2.35,2000.0
max,813.0,511.0,23000.0,23000.0,640000.0,760505800.0,1689764.0,656730.0,43.0,5060.0,4200000000.0,2016.0,137000.0,9.5,16.0,349000.0


In [38]:
pd.options.display.max_rows = 10

In [39]:
# percentiles 매개변수를 통해 정확한 분위수를 지정할 수 있음
movie.describe(percentiles=[.01, .3, .99])

Unnamed: 0,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,facenumber_in_poster,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
count,4867.0,4901.0,4814.0,4893.0,4909.0,4054.0,4916.0,4916.0,4903.0,4895.0,4432.0,4810.0,4903.0,4916.0,4590.0,4916.0
mean,137.988905,107.090798,691.014541,631.276313,6494.488491,47644510.0,82644.92,9579.815907,1.37732,267.668846,36547490.0,2002.447609,1621.923516,6.437429,2.222349,7348.294142
std,120.239379,25.286015,2832.954125,1625.874802,15106.986884,67372550.0,138322.2,18164.31699,2.023826,372.934839,100242700.0,12.453977,4011.299523,1.127802,1.40294,19206.016458
min,1.0,7.0,0.0,0.0,0.0,162.0,5.0,0.0,0.0,1.0,218.0,1916.0,0.0,1.6,1.18,0.0
1%,2.0,43.0,0.0,0.0,6.08,8474.8,53.0,6.0,0.0,1.94,60000.0,1951.0,0.0,3.1,1.33,0.0
30%,60.0,95.0,11.0,176.0,694.0,7914069.0,11864.5,1684.5,0.0,80.0,8000000.0,2000.0,345.0,6.0,1.85,0.0
50%,108.0,103.0,48.0,366.0,982.0,25043960.0,33132.5,3049.0,1.0,153.0,19850000.0,2005.0,593.0,6.6,2.35,159.0
99%,546.68,189.0,16000.0,11000.0,44920.0,326412800.0,681584.6,62413.9,8.0,1999.24,200000000.0,2016.0,17000.0,8.5,4.0,93850.0
max,813.0,511.0,23000.0,23000.0,640000.0,760505800.0,1689764.0,656730.0,43.0,5060.0,4200000000.0,2016.0,137000.0,9.5,16.0,349000.0


In [40]:
pd.options.display.max_rows = 8

In [42]:
# 결측치(누락값)의 개수 세기 : isnull().sum() method chaining
movie.isnull().sum()

color                      19
director_name             102
num_critic_for_reviews     49
duration                   15
                         ... 
actor_2_facebook_likes     13
imdb_score                  0
aspect_ratio              326
movie_facebook_likes        0
Length: 28, dtype: int64

#### skipna 매개변수 : 결측치(누락값)을 무시하지 않는 방법
- pandas는 default로 수치열의 누락값을 무시하고 통계값을 냄 -> skipna=True가 default인 것.
- skipna = False로 해줌으로써, 하나라도 누락값(결측치)가 있으면 NaN을 반환하도록 할 수 있음.

In [29]:
movie.min(skipna=False)

num_critic_for_reviews     NaN
duration                   NaN
director_facebook_likes    NaN
actor_3_facebook_likes     NaN
                          ... 
actor_2_facebook_likes     NaN
imdb_score                 1.6
aspect_ratio               NaN
movie_facebook_likes       0.0
Length: 16, dtype: float64

# DataFrame method chain으로 묶기
- Series와 달리 DataFrame에 대한 method chaining은 각 단계에서 반환되는 객체 유형을 정확히 숙지해야 함

In [66]:
movie = pd.read_csv(r'C:\Users\user\jupyterpractice\EDA\Pandas-Cookbook-master\data\movie.csv')
movie.isnull().head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,actor_1_name,movie_title,num_voted_users,cast_total_facebook_likes,actor_3_name,facenumber_in_poster,plot_keywords,movie_imdb_link,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,True,False,True,True,False,True,False,False,True,False,False,False,False,False,True,False,True,False,True,True,True,True,True,True,False,False,True,False


In [56]:
# sum()은 isnull() Boolean 값에 대해 True = 1, False = 0으로 계산하여 합산함
movie.isnull().sum().head()

# 반환 값이 Series 형태임에 주목!

color                       19
director_name              102
num_critic_for_reviews      49
duration                    15
director_facebook_likes    102
dtype: int64

In [57]:
# 한 번 더 sum() method를 사용하면 전체 누락값(결측치)의 개수를 알려줌
movie.isnull().sum().sum()

2654

In [58]:
# 누락값의 존재 여부를 파악하는 다른 방법. any method를 두번 사용하였음
# isnull().any() 까지는 각 열에 하나라도 True값이 있으면 True로 반환함. 여기까지는 Series로 반환됨
# 마지막 any() method를 적용하면 True/False로 된 Series에 적용되어 다시 한번 True가 1개라도 있으면 True를 반환하게 되는 것임
# 결과적으로 isnull().any().any()가 True라면 전체 DataFrame 중에서 최소한 1개 이상의 누락값(결측치)이 있다는 의미가 됨
movie.isnull().any().any()

True

#### isnull method의 작동원리 
- boolean 형태의 데이터 형식을 반환함

In [59]:
movie.isnull().dtypes.value_counts()

bool    28
dtype: int64

#### object type column에 대한 누락값(결측치) 살펴보기

In [62]:
# Original expectation: applying max/min to object-dtype columns that contain missing values returns nothing.
# NOTE(review): the recorded output still contains movie_title. Presumably only 'color' (which has
# missing values) is dropped while movie_title has none and is kept — confirm against the pandas
# version in use. 'color' is also listed twice in the selection.
movie[['color', 'movie_title', 'color']].max() # original author's open question: why was a result returned here?

movie_title    Æon Flux
dtype: object

In [65]:
# 빈 문자열로 결측치(누락값)을 채운 뒤, 기술 통계 메서드 적용
movie.select_dtypes(['object']).fillna('').max()

color                                                          Color
director_name                                          Étienne Faure
actor_2_name                                           Zubaida Sahar
genres                                                       Western
                                         ...                        
movie_imdb_link    http://www.imdb.com/title/tt5574490/?ref_=fn_t...
language                                                        Zulu
country                                                 West Germany
content_rating                                                     X
Length: 12, dtype: object

In [69]:
# For readability, put each step of the method chain on its own line.
# The chain is wrapped in parentheses rather than joined with backslashes:
# a comment after a backslash continuation is a SyntaxError, while inside
# parentheses every step can carry its own comment.
(
    movie.select_dtypes(['object'])  # keep only the object-dtype columns
    .fillna('')                      # replace missing values with empty strings
    .max()                           # column-wise maximum
)

color                                                          Color
director_name                                          Étienne Faure
actor_2_name                                           Zubaida Sahar
genres                                                       Western
                                         ...                        
movie_imdb_link    http://www.imdb.com/title/tt5574490/?ref_=fn_t...
language                                                        Zulu
country                                                 West Germany
content_rating                                                     X
Length: 12, dtype: object

## DataFrame에서 연산자 이용

In [2]:
import pandas as pd
college = pd.read_csv(r'C:\Users\user\jupyterpractice\EDA\Pandas-Cookbook-master\data\college.csv')
college + 5

TypeError: can only concatenate str (not "int") to str

In [3]:
# 인덱스 레이블로는 학교 이름을 사용
college = pd.read_csv(r'C:\Users\user\jupyterpractice\EDA\Pandas-Cookbook-master\data\college.csv', index_col='INSTNM')

# UGDS_로 시작하는 모든 열을 선택
# 학부생의 인종별 비율을 나타내는 열들
college_ugds_ = college.filter(like='UGDS_')

In [4]:
college == 'asdf'

  res_values = method(rvalues)


Unnamed: 0_level_0,CITY,STABBR,HBCU,MENONLY,WOMENONLY,RELAFFIL,SATVRMID,SATMTMID,DISTANCEONLY,UGDS,...,UGDS_2MOR,UGDS_NRA,UGDS_UNKN,PPTUG_EF,CURROPER,PCTPELL,PCTFLOAN,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alabama A & M University,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
University of Alabama at Birmingham,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
Amridge University,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
University of Alabama in Huntsville,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
Alabama State University,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SAE Institute of Technology San Francisco,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
Rasmussen College - Overland Park,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
National Personal Training Institute of Cleveland,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
Bay Area Medical Academy - San Jose Satellite Location,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [5]:
college_ugds_.head(3)

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01
Amridge University,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715


In [6]:
# Add 0.00501 to every value (shown for the first three rows of college_ugds_).
college_ugds_.head(3) + .00501

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.03831,0.94031,0.01051,0.00691,0.00741,0.00691,0.00501,0.01091,0.01881
University of Alabama at Birmingham,0.59721,0.26501,0.03331,0.05681,0.00721,0.00571,0.04181,0.02291,0.01501
Amridge University,0.30401,0.42421,0.01191,0.00841,0.00501,0.00501,0.00501,0.00501,0.27651


In [7]:
# Floor-divide the shifted values by .01 to obtain whole-number percentages;
# together with the +.00501 shift this rounds each value to the nearest integer percent.
(college_ugds_.head(3) + .00501) // .01

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,3.0,94.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
University of Alabama at Birmingham,59.0,26.0,3.0,5.0,0.0,0.0,4.0,2.0,1.0
Amridge University,30.0,42.0,1.0,0.0,0.0,0.0,0.0,0.0,27.0


In [8]:
# Divide by 100 to turn the whole-number percentages back into proportions,
# which completes the manual rounding to two decimal places.
college_ugds_op_round = (college_ugds_ + .00501) // .01 / 100
college_ugds_op_round.head(3)

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.03,0.94,0.01,0.0,0.0,0.0,0.0,0.01,0.01
University of Alabama at Birmingham,0.59,0.26,0.03,0.05,0.0,0.0,0.04,0.02,0.01
Amridge University,0.3,0.42,0.01,0.0,0.0,0.0,0.0,0.0,0.27


#### round 반올림 활용 : .round(n) n자리까지만 나타냄. 활용 전 아주 작은 값 더해놓기 

In [10]:
# round 메서드를 사용하면 자동반올림이 가능
# Numpy는 정확히 중간에 있는 수를 짝수쪽으로 맞춰버리므로 반올림 하기 전에 아주 작은 값을 더해야함

college_ugds_round = (college_ugds_ + .00001).round(2)
college_ugds_round.head(3)

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.03,0.94,0.01,0.0,0.0,0.0,0.0,0.01,0.01
University of Alabama at Birmingham,0.59,0.26,0.03,0.05,0.0,0.0,0.04,0.02,0.01
Amridge University,0.3,0.42,0.01,0.0,0.0,0.0,0.0,0.0,0.27


In [11]:
# equals method: check whether the manual rounding and the round() result are identical.
college_ugds_op_round.equals(college_ugds_round)

True

#### DataFrame에서의 반올림
- 컴퓨터 수치 연산에 대한 가이드 : What every computer scientist should know about floating-point arithmetic (http://bit.ly/2vmYZKi) 

In [47]:
college_ugds_op_round_methods = college_ugds_.add(.00501).floordiv(.01).div(100)

## 누락값 비교

In [13]:
# pandas는 NumPy NaN (np.nan) 객체를 사용해 누락값을 나타냄
# 이 객체는 스스로가 자신과 같다는 등식이 성립하지 않는 특이한 객체임.
# python의 None의 경우에는 스스로를 비교할 때 True를 반환함

import numpy as np
print('NumPy의 NaN은 스스로가 자신과 같다는 등식이 성립하지 않음',np.nan == np.nan)
print('Python의 None은 등식이 성립함',None == None)

NumPy의 NaN은 스스로가 자신과 같다는 등식이 성립하지 않음 False
Python의 None은 등식이 성립함 True


In [14]:
print(5 > np.nan)
print(np.nan > 5)
print(5 != np.nan)

False
False
True


In [16]:
college = pd.read_csv(r'C:\Users\user\jupyterpractice\EDA\Pandas-Cookbook-master\data\college.csv', index_col='INSTNM')
college_ugds_ = college.filter(like='UGDS_')

In [17]:
college_ugds_.head(3) == .0019

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,False,False,False,True,False,True,False,False,False
University of Alabama at Birmingham,False,False,False,False,False,False,False,False,False
Amridge University,False,False,False,False,False,False,False,False,False


#### 결측치(누락값)이 아닌 값들만 추출하는 새로운 접근
-> `==` 연산자로 DataFrame을 자기 자신과 비교하면, NaN은 자기 자신과도 같지 않으므로 제대로 작동하지 않음.  
-> 누락값의 개수는 isnull().sum()으로, 두 DataFrame 전체의 동일 여부는 equals 메서드로 확인해야 함

In [18]:
# The == operator does not give the answer one might expect here.
# It compares the two DataFrames element by element and returns a DataFrame of
# booleans rather than a single True/False for the whole frame.
college_self_compare = college_ugds_ == college_ugds_ 
college_self_compare.head(3)

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,True,True,True,True,True,True,True,True,True
University of Alabama at Birmingham,True,True,True,True,True,True,True,True,True
Amridge University,True,True,True,True,True,True,True,True,True


In [21]:
# Use the all method to check whether each column contains only True values.
# The result is not what one would expect: every column reports False,
# i.e. each column of the self-comparison contains at least one False.
college_self_compare.all()

UGDS_WHITE    False
UGDS_BLACK    False
UGDS_HISP     False
UGDS_ASIAN    False
UGDS_AIAN     False
UGDS_NHPI     False
UGDS_2MOR     False
UGDS_NRA      False
UGDS_UNKN     False
dtype: bool

In [22]:
# The cause: missing values are never considered equal to each other.

# np.nan is not equal even to itself, so comparing with == np.nan yields False
# for every element and each column sums to 0.
(college_ugds_ == np.nan).sum()

UGDS_WHITE    0
UGDS_BLACK    0
UGDS_HISP     0
UGDS_ASIAN    0
UGDS_AIAN     0
UGDS_NHPI     0
UGDS_2MOR     0
UGDS_NRA      0
UGDS_UNKN     0
dtype: int64

In [23]:
# 누락값(결측치)를 세는 가장 근본적인 방법
college_ugds_.isnull().sum()

UGDS_WHITE    661
UGDS_BLACK    661
UGDS_HISP     661
UGDS_ASIAN    661
UGDS_AIAN     661
UGDS_NHPI     661
UGDS_2MOR     661
UGDS_NRA      661
UGDS_UNKN     661
dtype: int64

#### DataFrame 2개 간의 비교 : == 연산자 대신 equals method 사용
- DataFrame의 eq method는 == 연산자와 동일하게 원소끼리의 비교를 한다
- eq method는 equals와는 전혀 비슷하지 않음.
- college_ugds_.eq(.0019) 와 college_ugds_ == .0019는 같은 식임

In [27]:
# 두 DataFrame 전체를 비교하는 정확한 방법은 == 연산자가 아니라 equals method임
college_ugds_.equals(college_ugds_)

True

#### assert_frame_equal 함수
- pandas.testing sub package 내부에 unit test 때 개발자들이 반드시 사용해야하는 함수가 존재
- assert_frame_equal 함수는 두 DataFrame이 서로 같지 않으면 AssertionError를 발생.
- 두 DataFrame이 동일한 경우에는 None을 반환, 즉 아무 출력 없이 넘어감

In [28]:
# The pandas.testing subpackage provides assert_frame_equal, a function meant
# for developers to use in unit tests.
# It raises AssertionError when the two DataFrames are not equal, and returns
# None (the cell produces no output) when they are equal.

from pandas.testing import assert_frame_equal

In [29]:
# Comparing the frame with itself raises nothing, so the cell has no output.
assert_frame_equal(college_ugds_, college_ugds_)

## DataFrame 연산의 방향 바꾸기

In [31]:
# Reload the college data with INSTNM as the index, then keep only the
# columns whose label contains 'UGDS_'.
college = pd.read_csv(r'C:\Users\user\jupyterpractice\EDA\Pandas-Cookbook-master\data\college.csv', index_col='INSTNM')
college_ugds_ = college.filter(like='UGDS_')
college_ugds_.head(3)

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01
Amridge University,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715


In [32]:
# count returns the number of non-missing values.
# Its axis parameter defaults to 0, so the result holds one number per column:
# the count of non-missing rows in that column.
college_ugds_.count()

# Same as college_ugds_.count(axis=0) or college_ugds_.count(axis='index').

UGDS_WHITE    6874
UGDS_BLACK    6874
UGDS_HISP     6874
UGDS_ASIAN    6874
UGDS_AIAN     6874
UGDS_NHPI     6874
UGDS_2MOR     6874
UGDS_NRA      6874
UGDS_UNKN     6874
dtype: int64

In [64]:
# The default axis written out explicitly; the counts match the bare count().
college_ugds_.count(axis=0)

UGDS_WHITE    6874
UGDS_BLACK    6874
UGDS_HISP     6874
UGDS_ASIAN    6874
              ... 
UGDS_NHPI     6874
UGDS_2MOR     6874
UGDS_NRA      6874
UGDS_UNKN     6874
Length: 9, dtype: int64

In [65]:
# axis='index' is the string spelling of axis=0; the counts are the same again.
college_ugds_.count(axis='index')

UGDS_WHITE    6874
UGDS_BLACK    6874
UGDS_HISP     6874
UGDS_ASIAN    6874
              ... 
UGDS_NHPI     6874
UGDS_2MOR     6874
UGDS_NRA      6874
UGDS_UNKN     6874
Length: 9, dtype: int64

In [33]:
# Setting count's axis parameter to 'columns' (or 1) flips the direction of
# the operation: it now counts the non-missing values in each row.
college_ugds_.count(axis='columns').head()

INSTNM
Alabama A & M University               9
University of Alabama at Birmingham    9
Amridge University                     9
University of Alabama in Huntsville    9
Alabama State University               9
dtype: int64

In [34]:
# Instead of counting the non-missing values, add up the values across each
# row (axis='columns' sums along the columns, giving one total per school).
# The shares in a row are expected to add up to 1.
# A total just below 1 does not by itself mean a value is missing: University
# of Alabama at Birmingham sums to 0.9999 with all 9 values present.
college_ugds_.sum(axis='columns').head()

INSTNM
Alabama A & M University               1.0000
University of Alabama at Birmingham    0.9999
Amridge University                     1.0000
University of Alabama in Huntsville    1.0000
Alabama State University               1.0000
dtype: float64

In [35]:
# To get a feel for the distribution of each column, take the median along
# axis='index', which yields one median per column.
college_ugds_.median(axis='index')

UGDS_WHITE    0.55570
UGDS_BLACK    0.10005
UGDS_HISP     0.07140
UGDS_ASIAN    0.01290
UGDS_AIAN     0.00260
UGDS_NHPI     0.00000
UGDS_2MOR     0.01750
UGDS_NRA      0.00000
UGDS_UNKN     0.01430
dtype: float64

#### cumsum method와 axis = 1 방향 : 누적 데이터 생성

In [36]:
# cumsum with axis=1 accumulates the race shares within each row:
# along a single row the values build up from left to right.
# With the default axis=0 the accumulation instead runs down each column,
# from top to bottom.
college_ugds_cumsum = college_ugds_.cumsum(axis=1)
college_ugds_cumsum.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9686,0.9741,0.976,0.9784,0.9803,0.9803,0.9862,1.0
University of Alabama at Birmingham,0.5922,0.8522,0.8805,0.9323,0.9345,0.9352,0.972,0.9899,0.9999
Amridge University,0.299,0.7182,0.7251,0.7285,0.7285,0.7285,0.7285,0.7285,1.0
University of Alabama in Huntsville,0.6988,0.8243,0.8625,0.9001,0.9144,0.9146,0.9318,0.965,1.0
Alabama State University,0.0158,0.9366,0.9487,0.9506,0.9516,0.9522,0.962,0.9863,1.0


In [70]:
# Sort by the UGDS_HISP column in descending order: reading that column from
# top to bottom, the values get smaller. Rows that are entirely NaN show up
# at the end of the result.
college_ugds_cumsum.sort_values('UGDS_HISP', ascending=False)

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
New Beginning College of Cosmetology,0.8957,0.9305,1.0001,1.0001,1.0001,1.0001,1.0001,1.0001,1.0001
Virginia University of Lynchburg,0.0120,0.9921,1.0001,1.0001,1.0001,1.0001,1.0001,1.0001,1.0001
Turning Point Beauty College,0.1915,0.2341,1.0001,1.0001,1.0001,1.0001,1.0001,1.0001,1.0001
First Coast Barber Academy,0.1667,0.9445,1.0001,1.0001,1.0001,1.0001,1.0001,1.0001,1.0001
...,...,...,...,...,...,...,...,...,...
Rasmussen College - Overland Park,,,,,,,,,
National Personal Training Institute of Cleveland,,,,,,,,,
Bay Area Medical Academy - San Jose Satellite Location,,,,,,,,,
Excel Learning Center-San Antonio South,,,,,,,,,


## 대학 캠퍼스의 다양성 지수 확인

In [37]:
# Display the diversity-index table, using its School column as the index.
# The result is only shown, not assigned to a name.
pd.read_csv(r'C:\Users\user\jupyterpractice\EDA\Pandas-Cookbook-master\data\college_diversity.csv', index_col='School')

Unnamed: 0_level_0,Diversity Index
School,Unnamed: 1_level_1
"Rutgers University--Newark Newark, NJ",0.76
"Andrews University Berrien Springs, MI",0.74
"Stanford University Stanford, CA",0.74
"University of Houston Houston, TX",0.74
"University of Nevada--Las Vegas Las Vegas, NV",0.74
"University of San Francisco San Francisco, CA",0.74
"San Francisco State University San Francisco, CA",0.73
"University of Illinois--Chicago Chicago, IL",0.73
"New Jersey Institute of Technology Newark, NJ",0.72
"Texas Woman's University Denton, TX",0.72


In [38]:
# Reload the college data indexed by INSTNM and rebuild college_ugds_ from
# the columns whose label contains 'UGDS_'.
college = pd.read_csv(r'C:\Users\user\jupyterpractice\EDA\Pandas-Cookbook-master\data\college.csv', index_col='INSTNM')
college_ugds_ = college.filter(like='UGDS_')
college_ugds_.head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,0.0333,0.9353,0.0055,0.0019,0.0024,0.0019,0.0,0.0059,0.0138
University of Alabama at Birmingham,0.5922,0.26,0.0283,0.0518,0.0022,0.0007,0.0368,0.0179,0.01
Amridge University,0.299,0.4192,0.0069,0.0034,0.0,0.0,0.0,0.0,0.2715
University of Alabama in Huntsville,0.6988,0.1255,0.0382,0.0376,0.0143,0.0002,0.0172,0.0332,0.035
Alabama State University,0.0158,0.9208,0.0121,0.0019,0.001,0.0006,0.0098,0.0243,0.0137


In [73]:
# Count the missing values in each row and list the rows with the most;
# the top rows are missing all 9 columns.
college_ugds_.isnull().sum(axis=1).sort_values(ascending=False).head()

INSTNM
Excel Learning Center-San Antonio South         9
Philadelphia College of Osteopathic Medicine    9
Assemblies of God Theological Seminary          9
Episcopal Divinity School                       9
Phillips Graduate Institute                     9
dtype: int64

In [39]:
# dropna(how='all') removes the rows in which all nine race columns are missing.
college_ugds_ = college_ugds_.dropna(how='all')

In [40]:
# Re-count missing values per column; every column now reports 0.
college_ugds_.isnull().sum()

UGDS_WHITE    0
UGDS_BLACK    0
UGDS_HISP     0
UGDS_ASIAN    0
UGDS_AIAN     0
UGDS_NHPI     0
UGDS_2MOR     0
UGDS_NRA      0
UGDS_UNKN     0
dtype: int64

In [41]:
# ge is the method form of the greater-than-or-equal operator (>=);
# it returns boolean values.
college_ugds_.ge(.15).head()

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Alabama A & M University,False,True,False,False,False,False,False,False,False
University of Alabama at Birmingham,True,True,False,False,False,False,False,False,False
Amridge University,True,True,False,False,False,False,False,False,True
University of Alabama in Huntsville,True,False,False,False,False,False,False,False,False
Alabama State University,False,True,False,False,False,False,False,False,False


In [43]:
# Use sum() along the columns to count the True values for each school, i.e.
# how many race columns are at least .15. The result is returned as a Series.
diversity_metric = college_ugds_.ge(.15).sum(axis='columns')
diversity_metric.head()

INSTNM
Alabama A & M University               1
University of Alabama at Birmingham    2
Amridge University                     3
University of Alabama in Huntsville    1
Alabama State University               1
dtype: int64

In [44]:
# Use the value_counts method to see how the metric is distributed.
diversity_metric.value_counts()

1    3042
2    2884
3     876
4      63
0       7
5       2
dtype: int64

In [45]:
# In the value_counts() result, index 5 has a count of 2: two schools have
# five different race categories at 15% or more.
# Use sort_values() to find out which schools they are.
diversity_metric.sort_values(ascending=False).head()

INSTNM
Regency Beauty Institute-Austin          5
Central Texas Beauty College-Temple      5
Sullivan and Cogliano Training Center    4
Ambria College of Nursing                4
Berkeley College-New York                4
dtype: int64

In [46]:
# Use loc to pull out the rows of the two top-scoring schools only.
college_ugds_.loc[
    [
        'Regency Beauty Institute-Austin',
        'Central Texas Beauty College-Temple',
    ]
]

Unnamed: 0_level_0,UGDS_WHITE,UGDS_BLACK,UGDS_HISP,UGDS_ASIAN,UGDS_AIAN,UGDS_NHPI,UGDS_2MOR,UGDS_NRA,UGDS_UNKN
INSTNM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Regency Beauty Institute-Austin,0.1867,0.2133,0.16,0.0,0.0,0.0,0.1733,0.0,0.2667
Central Texas Beauty College-Temple,0.1616,0.2323,0.2626,0.0202,0.0,0.0,0.1717,0.0,0.1515


In [47]:
# See how schools that US News ranked at the top for diversity score on this
# simple metric. Only the first five schools of that top-10 list are used.
us_news_top = [
    'Rutgers University-Newark',
    'Andrews University',
    'Stanford University',
    'University of Houston',
    'University of Nevada-Las Vegas',
]

In [48]:
# Look up the diversity metric by label for the schools listed in us_news_top.
diversity_metric.loc[us_news_top]

INSTNM
Rutgers University-Newark         4
Andrews University                3
Stanford University               3
University of Houston             3
University of Nevada-Las Vegas    3
dtype: int64