# DataFrame Load & Save

In [1]:
import numpy as np
import pandas as pd

### read_csv()

In [2]:
# Read the comma-separated user records into a DataFrame and display it.
df = pd.read_csv(
    filepath_or_buffer='./data/users.csv',
)
df

Unnamed: 0,id,first_name,last_name,email,gender,ip_address
0,1,Norbie,Wrassell,nwrassell0@bbc.co.uk,Male,201.234.222.219
1,2,Tybalt,Covendon,tcovendon1@indiegogo.com,Male,90.113.205.35
2,3,Elyn,Volk,evolk2@umich.edu,Female,254.131.69.38
3,4,Todd,Carriage,tcarriage3@youku.com,Male,242.31.116.66
4,5,Pryce,Brookzie,pbrookzie4@dmoz.org,Agender,212.63.28.128
...,...,...,...,...,...,...
995,996,Allsun,McTeague,amcteaguern@moonfruit.com,Female,134.34.119.166
996,997,Viva,Epinoy,vepinoyro@cnbc.com,Female,93.2.144.250
997,998,Heall,Mallett,hmallettrp@google.it,Male,254.24.191.245
998,999,Conroy,Scandrett,cscandrettrq@cornell.edu,Male,150.254.219.221


In [None]:
# Even when the file extension (file type) differs, read_csv() can handle the
# file as long as its contents are CSV data.
df = pd.read_csv(
    filepath_or_buffer='./data/users.txt',
)
df

Unnamed: 0,id,first_name,last_name,email,gender,ip_address
0,1,Norbie,Wrassell,nwrassell0@bbc.co.uk,Male,201.234.222.219
1,2,Tybalt,Covendon,tcovendon1@indiegogo.com,Male,90.113.205.35
2,3,Elyn,Volk,evolk2@umich.edu,Female,254.131.69.38
3,4,Todd,Carriage,tcarriage3@youku.com,Male,242.31.116.66
4,5,Pryce,Brookzie,pbrookzie4@dmoz.org,Agender,212.63.28.128
...,...,...,...,...,...,...
995,996,Allsun,McTeague,amcteaguern@moonfruit.com,Female,134.34.119.166
996,997,Viva,Epinoy,vepinoyro@cnbc.com,Female,93.2.144.250
997,998,Heall,Mallett,hmallettrp@google.it,Male,254.24.191.245
998,999,Conroy,Scandrett,cscandrettrq@cornell.edu,Male,150.254.219.221


In [None]:
# When the delimiter is a character other than a comma (a tab here), pass it
# through the sep argument and read_csv() can still parse the file.
df = pd.read_csv(
    filepath_or_buffer='./data/users.tsv',
    sep='\t',
)
df

Unnamed: 0,id,first_name,last_name,email,gender,ip_address
0,1,Norbie,Wrassell,nwrassell0@bbc.co.uk,Male,201.234.222.219
1,2,Tybalt,Covendon,tcovendon1@indiegogo.com,Male,90.113.205.35
2,3,Elyn,Volk,evolk2@umich.edu,Female,254.131.69.38
3,4,Todd,Carriage,tcarriage3@youku.com,Male,242.31.116.66
4,5,Pryce,Brookzie,pbrookzie4@dmoz.org,Agender,212.63.28.128
...,...,...,...,...,...,...
995,996,Allsun,McTeague,amcteaguern@moonfruit.com,Female,134.34.119.166
996,997,Viva,Epinoy,vepinoyro@cnbc.com,Female,93.2.144.250
997,998,Heall,Mallett,hmallettrp@google.it,Male,254.24.191.245
998,999,Conroy,Scandrett,cscandrettrq@cornell.edu,Male,150.254.219.221


In [11]:
# By default read_csv() uses the top row of the file as the header.
# For a header-less file like this one, header=None keeps that row as data
# and, on its own, would label the columns with numbers.
# To give the columns explicit labels, pass names with one entry per column.
df = pd.read_csv(
    filepath_or_buffer='./data/users_headless.csv',
    header=None,
    names=[
        'user_id',
        'f_name',
        'l_name',
        'email',
        'gender',
        'ip_addr',
    ],
)
df

Unnamed: 0,user_id,f_name,l_name,email,gender,ip_addr
0,1,Norbie,Wrassell,nwrassell0@bbc.co.uk,Male,201.234.222.219
1,2,Tybalt,Covendon,tcovendon1@indiegogo.com,Male,90.113.205.35
2,3,Elyn,Volk,evolk2@umich.edu,Female,254.131.69.38
3,4,Todd,Carriage,tcarriage3@youku.com,Male,242.31.116.66
4,5,Pryce,Brookzie,pbrookzie4@dmoz.org,Agender,212.63.28.128
...,...,...,...,...,...,...
995,996,Allsun,McTeague,amcteaguern@moonfruit.com,Female,134.34.119.166
996,997,Viva,Epinoy,vepinoyro@cnbc.com,Female,93.2.144.250
997,998,Heall,Mallett,hmallettrp@google.it,Male,254.24.191.245
998,999,Conroy,Scandrett,cscandrettrq@cornell.edu,Male,150.254.219.221


### df.to_csv() 파일 저장

In [13]:
# Keep only the rows whose gender column equals 'Agender', then print a
# summary of the filtered frame.
# Note: this rebinds df, so the unfiltered frame is no longer reachable
# under that name in later cells.
df = df.loc[df['gender'].eq('Agender')]
df.info()

<class 'pandas.DataFrame'>
Index: 23 entries, 4 to 957
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   user_id  23 non-null     int64
 1   f_name   23 non-null     str  
 2   l_name   23 non-null     str  
 3   email    23 non-null     str  
 4   gender   23 non-null     str  
 5   ip_addr  23 non-null     str  
dtypes: int64(1), str(5)
memory usage: 1.3 KB


In [19]:
# Save the frame as CSV with the default options written out explicitly:
# index=True writes the row index, header=True writes the column names.
df.to_csv(
    './data/user_Agender.csv',
    index=True,
    header=True,
)

In [20]:
# Save the frame as CSV without the row index (the header row is still written).
# The target path is shared with the other to_csv() calls in this section,
# so the file holds the result of whichever call ran last.
df.to_csv(
    path_or_buf='./data/user_Agender.csv',
    index=False,
)

In [21]:
# Save the frame as CSV with neither the row index nor the header row,
# leaving only the data rows in the file.
# The target path is shared with the other to_csv() calls in this section,
# so the file holds the result of whichever call ran last.
df.to_csv(
    path_or_buf='./data/user_Agender.csv',
    index=False,
    header=False,
)

### [참고] HTML 문서 내 table load
- 정적 웹 페이지 내 table 태그 부분만 추출해 DataFrame 객체로 변환 가능
- parsing 처리를 위한 lxml 패키지 필요

In [24]:
# Browser-like User-Agent sent along with the page request.
# NOTE(review): presumably the site rejects the default client User-Agent — confirm.
header = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/144.0.0.0 Safari/537.36'
    ),
}

# Fetch the page and parse its <table> elements into DataFrames;
# storage_options carries the request header defined above.
tables = pd.read_html(
    io='https://ko.wikipedia.org/wiki/%EB%8C%80%ED%95%9C%EB%AF%BC%EA%B5%AD%EC%9D%98_%EC%98%81%ED%99%94_%ED%9D%A5%ED%96%89_%EA%B8%B0%EB%A1%9D',
    storage_options=header,
)

# Number of tables extracted from the page.
len(tables)

7

In [32]:
# Take the first table parsed from the page and show only the rows whose
# '감독' (director) column equals '봉준호'.
movie_df = tables[0]
movie_df.loc[movie_df['감독'].eq('봉준호')]

Unnamed: 0,순위,제목,감독,한국내 배급사,개봉일,관객수,기타
9,10,《괴물》,봉준호,쇼박스,2006-07-27,13019740,영화진흥위원회 공식통계 기준
30,31,《기생충》,봉준호,CJ엔터테인먼트,2019-05-30,10313735,영화진흥위원회 발권통계 기준
34,35,《설국열차》,봉준호,CJ E&M,2013-08-01,9354547,영화진흥위원회 발권통계 기준
