# How to load data

# 0. Example dataset
## Pima Indians Dataset
- A population of women who were at least 21 years old, of Pima Indian heritage and living near Phoenix, Arizona, was tested for diabetes according to World Health Organization criteria.
- The data were collected by the US National Institute of Diabetes and Digestive and Kidney Diseases.
- https://archive.ics.uci.edu/ml/datasets/Pima+Indians+Diabetes

# 1. Read the data file from local storage

## Using pandas

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Using pandas.read_csv()
# pandas can load files in many different formats.
# Type pd.read_ and press [TAB] to list the available reader functions.

# If the first row of a file holds the variable names, header=0 (or the
# default header='infer') loads them as column names in one step.
# NOTE: header=True is not valid; read_csv rejects a bool for header.
# header=None tells pandas the file has no header row, so columns are numbered.
data = pd.read_csv('data/pima-indians-diabetes.data', header = None)

In [3]:
# Take a quick look at the rough shape of the data.
# Because it was read with pandas, the loaded data is a DataFrame.
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# Check the number of data points (rows) and the number of variables (columns).
data.shape

(768, 9)

In [5]:
# How to define the variable (column) names of the data:
# assign a list with one name per column to DataFrame.columns.
variables = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
data.columns = variables

In [6]:
# Show the first rows again to confirm the new column names are in place.
data.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [7]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
data.describe()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [8]:
# The variable names can also be supplied up front, directly in
# pandas.read_csv(), through the names argument.
data2 = pd.read_csv('data/pima-indians-diabetes.data', names = variables)
data2.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## Using numpy

In [9]:
# Using numpy.loadtxt
# The data is loaded by specifying the delimiter.
raw_data = open('data/pima-indians-diabetes.data', 'rb')
data = np.loadtxt(raw_data, delimiter=',')

# Whenever possible, close whatever you have opened.
# NOTE: if np.loadtxt raises, this close() is never reached; a with
# statement closes the file in that case as well.
raw_data.close()

In [10]:
# Data read with numpy is an array.
data

array([[   6.   ,  148.   ,   72.   , ...,    0.627,   50.   ,    1.   ],
       [   1.   ,   85.   ,   66.   , ...,    0.351,   31.   ,    0.   ],
       [   8.   ,  183.   ,   64.   , ...,    0.672,   32.   ,    1.   ],
       ..., 
       [   5.   ,  121.   ,   72.   , ...,    0.245,   30.   ,    0.   ],
       [   1.   ,  126.   ,   60.   , ...,    0.349,   47.   ,    1.   ],
       [   1.   ,   93.   ,   70.   , ...,    0.315,   23.   ,    0.   ]])

In [11]:
# Number of rows and columns in the array.
data.shape

(768, 9)

In [12]:
# With a with statement the file is closed automatically when the block
# ends, so no explicit close() call is needed.
with open('data/pima-indians-diabetes.data', 'rb') as raw_data:
    data2 = np.loadtxt(raw_data, delimiter=',')

In [13]:
# Show the array that was loaded inside the with block.
data2

array([[   6.   ,  148.   ,   72.   , ...,    0.627,   50.   ,    1.   ],
       [   1.   ,   85.   ,   66.   , ...,    0.351,   31.   ,    0.   ],
       [   8.   ,  183.   ,   64.   , ...,    0.672,   32.   ,    1.   ],
       ..., 
       [   5.   ,  121.   ,   72.   , ...,    0.245,   30.   ,    0.   ],
       [   1.   ,  126.   ,   60.   , ...,    0.349,   47.   ,    1.   ],
       [   1.   ,   93.   ,   70.   , ...,    0.315,   23.   ,    0.   ]])

In [14]:
# Intentional failure: raw_data was already closed when the earlier with
# block ended, so this raises an error saying the file is closed.
data3 = np.loadtxt(raw_data, delimiter=',')

ValueError: I/O operation on closed file.

# 2. Read the data file from a URL

## Using pandas 

In [15]:
# pandas can read data straight from a URL very easily:
# pass the URL to read_csv in place of a file path.
url = "https://goo.gl/vhm1eU"
data = pd.read_csv(url, header = None)

In [16]:
# Preview the first rows of the DataFrame loaded from the URL.
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [17]:
# Number of rows and columns in the DataFrame loaded from the URL.
data.shape

(768, 9)

## Using numpy 

In [18]:
# urllib.request.urlopen can be used to read the data behind a URL.
# Import the submodule explicitly: a bare "import urllib" does not guarantee
# that urllib.request is loaded, so urllib.request.urlopen could fail with
# AttributeError unless some other import happened to load it first.
import urllib.request

# The response object works as a context manager, so the connection is
# closed when the block ends, even if np.loadtxt raises.
with urllib.request.urlopen(url) as raw_data:
    data = np.loadtxt(raw_data, delimiter=',')

In [19]:
# Show the array loaded from the URL.
data

array([[   6.   ,  148.   ,   72.   , ...,    0.627,   50.   ,    1.   ],
       [   1.   ,   85.   ,   66.   , ...,    0.351,   31.   ,    0.   ],
       [   8.   ,  183.   ,   64.   , ...,    0.672,   32.   ,    1.   ],
       ..., 
       [   5.   ,  121.   ,   72.   , ...,    0.245,   30.   ,    0.   ],
       [   1.   ,  126.   ,   60.   , ...,    0.349,   47.   ,    1.   ],
       [   1.   ,   93.   ,   70.   , ...,    0.315,   23.   ,    0.   ]])

In [20]:
# Number of rows and columns in the array loaded from the URL.
data.shape

(768, 9)