## 초기환경설정 및 라이브러리 로딩(import convention)

In [6]:
import pandas as pd # panel data & python data analysis ==> DataFrame과 Series 객체를 다루는 주요 모듈
import numpy as np # Numerical Python 고성능의 수치 계산 ==> ndarray N차원 배열객체

## 콤마기호로 구분되어 있는 abc.csv 파일 데이터 로딩

In [7]:
# abc.csv라는 raw데이터 파일을 메모리에 로딩해 abc 데이터프레임으로 생성함
abc = pd.read_csv('abc.csv', skipinitialspace=True)

In [8]:
# 메모리에 로딩된 abc 객체의 자료형식
type(abc)

pandas.core.frame.DataFrame

In [9]:
# abc 데이터프레임 앞부분 간략조회
abc.head()

Unnamed: 0,gender,job,age,position,address,total,check,price,survey
0,1,1.0,26.0,2.0,Seoul,66.3,5.0,5.1,3.0
1,1,2.0,54.0,5.0,Busan,63.7,,,
2,1,2.0,41.0,4.0,,61.1,5.0,4.7,2.0
3,2,,45.0,4.0,Gwangju,59.8,7.0,,
4,1,3.0,70.0,5.0,Suwon,650.0,5.0,5.0,4.0


In [10]:
# abc 데이터프레임 뒷부분 간략조회
abc.tail()

Unnamed: 0,gender,job,age,position,address,total,check,price,survey
145,2,1.0,38.0,3.0,Busan,87.1,2.0,5.4,4.0
146,2,3.0,,3.0,Seoul,81.9,12.0,,
147,2,3.0,63.0,5.0,Gwangju,84.5,7.0,336.5,5.0
148,1,2.0,41.0,4.0,Busan,80.6,-1.0,6.2,4.0
149,2,3.0,27.0,2.0,Seoul,76.7,4.0,5.9,3.0


In [11]:
# abc 데이터프레임의 원하는 레코드 부분 간략조회
abc[5:10]

Unnamed: 0,gender,job,age,position,address,total,check,price,survey
5,1,2.0,57.0,,Daejeon,70.2,7.0,5.4,5.0
6,2,1.0,36.0,3.0,Busan,59.8,5.0,,
7,1,2.0,,3.0,Jeju,65.0,12.0,675.0,3.0
8,1,1.0,56.0,5.0,,57.2,3.0,4.4,4.0
9,1,2.0,37.0,3.0,Busan,63.7,4.0,4.9,3.0


## 데이터 구조 파악

In [12]:
# abc 데이터프레임의 기본내부 데이터 구조정보 파악
abc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 9 columns):
gender      150 non-null int64
job         142 non-null float64
age         134 non-null float64
position    138 non-null float64
address     138 non-null object
total       150 non-null float64
check       139 non-null float64
price       90 non-null float64
survey      93 non-null float64
dtypes: float64(7), int64(1), object(1)
memory usage: 10.6+ KB


## 데이터 기술통계분석

In [13]:
# abc 데이터프레임의 기술통계분석(descriptive analysis)
abc.describe()

Unnamed: 0,gender,job,age,position,total,check,price,survey
count,150.0,142.0,134.0,138.0,150.0,139.0,90.0,93.0
mean,1.526667,2.105634,44.141791,3.471014,79.508,4.330935,15.211111,3.107527
std,0.500961,0.796337,14.926967,1.450859,49.453207,3.288955,94.985705,0.840062
min,1.0,1.0,20.0,1.0,5.0,-5.0,-345.6,1.0
25%,1.0,1.0,30.0,2.0,66.3,2.5,4.925,3.0
50%,2.0,2.0,44.0,4.0,75.4,4.0,5.7,3.0
75%,2.0,3.0,56.75,5.0,83.2,5.0,6.4,3.0
max,2.0,3.0,70.0,5.0,650.0,14.0,675.0,5.0


## 인덱싱/슬라이싱/필터링/샘플링

In [14]:
# age 변수 조회
abc['age']

0      26.0
1      54.0
2      41.0
3      45.0
4      70.0
5      57.0
6      36.0
7       NaN
8      56.0
9      37.0
10     29.0
11     35.0
12     56.0
13     20.0
14     63.0
15     49.0
16     49.0
17     49.0
18     25.0
19     57.0
20     56.0
21     21.0
22     69.0
23     63.0
24     30.0
25     34.0
26     26.0
27     59.0
28     38.0
29     57.0
       ... 
120    48.0
121    22.0
122    48.0
123    21.0
124    51.0
125    64.0
126    27.0
127    64.0
128    54.0
129     NaN
130    42.0
131    54.0
132    25.0
133    54.0
134    60.0
135     NaN
136    63.0
137    43.0
138    21.0
139    23.0
140    24.0
141    33.0
142    56.0
143    45.0
144    60.0
145    38.0
146     NaN
147    63.0
148    41.0
149    27.0
Name: age, Length: 150, dtype: float64

In [15]:
x = abc['age']
type(x)

pandas.core.series.Series

## age변수를 Series방식으로 조회

In [16]:
# age변수의 내용 조회
abc['age'].head(3)

0    26.0
1    54.0
2    41.0
Name: age, dtype: float64

In [17]:
x1 = abc['age'].head(3)
type(x1)

pandas.core.series.Series

In [18]:
# age 변수의 내용 조회
abc.age.head(3)

0    26.0
1    54.0
2    41.0
Name: age, dtype: float64

In [19]:
x2 = abc.age.head(3)
type(x2)

pandas.core.series.Series

## age변수를 DataFrame 방식으로 조회

In [20]:
# age변수 내용조회
abc[['age']].head(3)

Unnamed: 0,age
0,26.0
1,54.0
2,41.0


In [21]:
x3 = abc[['age']].head(3)
type(x3)

pandas.core.frame.DataFrame

In [25]:
# age변수내용을 컬럼자리수로 조회
abc[['age']].head(3)

Unnamed: 0,age
0,26.0
1,54.0
2,41.0


In [26]:
x4 = abc[['age']].head(3)
type(x4)

pandas.core.frame.DataFrame

## 2개 변수 이상 동시 인덱싱

In [27]:
# gender와 position 변수컬럼의 동시 인덱싱
abc[['gender', 'position']].head(3)

Unnamed: 0,gender,position
0,1,2.0
1,1,5.0
2,1,4.0


In [28]:
abc.head(3)

Unnamed: 0,gender,job,age,position,address,total,check,price,survey
0,1,1.0,26.0,2.0,Seoul,66.3,5.0,5.1,3.0
1,1,2.0,54.0,5.0,Busan,63.7,,,
2,1,2.0,41.0,4.0,,61.1,5.0,4.7,2.0


In [29]:
# gender와 position 변수컬럼의 자리수를 이용한 동시 인덱싱
abc[[0, 3]].head(3)

KeyError: '[0 3] not in index'

In [30]:
# 필요한 변수컬럼의 자유로운 인덱싱
abc[[1, 3]+list(range(6, 9))+[4]].head(3)

KeyError: '[1 3 6 7 8 4] not in index'

## .ix방식을 활용한 인덱싱

In [33]:
# job 변수컬럼 인덱싱
abc.loc[ : , 'job'].head(3)

0    1.0
1    2.0
2    2.0
Name: job, dtype: float64

In [34]:
# job 변수컬럼의 자리수를 이용한 인덱싱
abc.ix[ : , 1].head(3)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


0    1.0
1    2.0
2    2.0
Name: job, dtype: float64

In [35]:
# gender와 position 변수컬럼의 동시 인덱싱
abc.ix[:,['gender', 'position']].head(3)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


Unnamed: 0,gender,position
0,1,2.0
1,1,5.0
2,1,4.0


In [36]:
# gender와 position 변수컬럼의 자리수를 이용한 동시 인덱싱
abc.ix[ : , [0, 3]].head(3)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


Unnamed: 0,gender,position
0,1,2.0
1,1,5.0
2,1,4.0


## 슬라이싱 방식을 이용한 변수컬럼 선택

In [37]:
abc.head(3)

Unnamed: 0,gender,job,age,position,address,total,check,price,survey
0,1,1.0,26.0,2.0,Seoul,66.3,5.0,5.1,3.0
1,1,2.0,54.0,5.0,Busan,63.7,,,
2,1,2.0,41.0,4.0,,61.1,5.0,4.7,2.0


In [38]:
# 슬라이싱을 통한 선택
abc.ix[ : , 5:9].head(3)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


Unnamed: 0,total,check,price,survey
0,66.3,5.0,5.1,3.0
1,63.7,,,
2,61.1,5.0,4.7,2.0


In [39]:
abc.ix[ : , : ].head(3)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,gender,job,age,position,address,total,check,price,survey
0,1,1.0,26.0,2.0,Seoul,66.3,5.0,5.1,3.0
1,1,2.0,54.0,5.0,Busan,63.7,,,
2,1,2.0,41.0,4.0,,61.1,5.0,4.7,2.0


In [40]:
abc.ix[ : , [1]+list(range(5, 8))+[3, 8]].head(3)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,job,total,check,price,position,survey
0,1.0,66.3,5.0,5.1,2.0,3.0
1,2.0,63.7,,,5.0,
2,2.0,61.1,5.0,4.7,4.0,2.0


## 슬라이싱을 활용한 레코드(관찰치) 선택

In [41]:
# 첫번째 레코드만 조회
abc[0:1]

Unnamed: 0,gender,job,age,position,address,total,check,price,survey
0,1,1.0,26.0,2.0,Seoul,66.3,5.0,5.1,3.0


In [42]:
# 5번째~9번째 까지의 레코드 조회
abc[5:10]

Unnamed: 0,gender,job,age,position,address,total,check,price,survey
5,1,2.0,57.0,,Daejeon,70.2,7.0,5.4,5.0
6,2,1.0,36.0,3.0,Busan,59.8,5.0,,
7,1,2.0,,3.0,Jeju,65.0,12.0,675.0,3.0
8,1,1.0,56.0,5.0,,57.2,3.0,4.4,4.0
9,1,2.0,37.0,3.0,Busan,63.7,4.0,4.9,3.0


In [43]:
# 5번째 레코드만 조회
abc.ix[5,:]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


gender            1
job               2
age              57
position        NaN
address     Daejeon
total          70.2
check             7
price           5.4
survey            5
Name: 5, dtype: object

In [44]:
type(abc.ix[5,:])

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


pandas.core.series.Series

In [45]:
# 5번째 레코드만 조회
abc.ix[[5],:]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


Unnamed: 0,gender,job,age,position,address,total,check,price,survey
5,1,2.0,57.0,,Daejeon,70.2,7.0,5.4,5.0


In [46]:
type(abc.ix[[5],:])

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


pandas.core.frame.DataFrame

In [47]:
# 필요한 레코드를 선별 인덱싱하여 조회
abc.ix[[3,7],:]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


Unnamed: 0,gender,job,age,position,address,total,check,price,survey
3,2,,45.0,4.0,Gwangju,59.8,7.0,,
7,1,2.0,,3.0,Jeju,65.0,12.0,675.0,3.0


In [48]:
# 필요한 레코드를 슬라이싱 방식으로 조회
abc.ix[5:10,:]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


Unnamed: 0,gender,job,age,position,address,total,check,price,survey
5,1,2.0,57.0,,Daejeon,70.2,7.0,5.4,5.0
6,2,1.0,36.0,3.0,Busan,59.8,5.0,,
7,1,2.0,,3.0,Jeju,65.0,12.0,675.0,3.0
8,1,1.0,56.0,5.0,,57.2,3.0,4.4,4.0
9,1,2.0,37.0,3.0,Busan,63.7,4.0,4.9,3.0
10,2,,29.0,2.0,Suwon,70.2,5.0,,


In [49]:
# 필요한 레코드를 선별 조회
abc.ix[[3]+list(range(10,15))+[7,9],:]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


Unnamed: 0,gender,job,age,position,address,total,check,price,survey
3,2,,45.0,4.0,Gwangju,59.8,7.0,,
10,2,,29.0,2.0,Suwon,70.2,5.0,,
11,1,3.0,35.0,2.0,Daejeon,62.4,,,
12,1,1.0,56.0,5.0,Seoul,62.4,4.0,,
13,2,3.0,20.0,1.0,Busan,55.9,5.0,4.3,1.0
14,1,2.0,63.0,,,75.4,12.0,225.8,4.0
7,1,2.0,,3.0,Jeju,65.0,12.0,675.0,3.0
9,1,2.0,37.0,3.0,Busan,63.7,4.0,4.9,3.0


## 인덱싱과 슬라이싱을 활용한 필요데이터 조회

In [50]:
abc.ix[[3]+list(range(10,15))+[7,9], [1,3]+list(range(6,9))+[4]]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,job,position,check,price,survey,address
3,,4.0,7.0,,,Gwangju
10,,2.0,5.0,,,Suwon
11,3.0,2.0,,,,Daejeon
12,1.0,5.0,4.0,,,Seoul
13,3.0,1.0,5.0,4.3,1.0,Busan
14,2.0,,12.0,225.8,4.0,
7,2.0,3.0,12.0,675.0,3.0,Jeju
9,2.0,3.0,4.0,4.9,3.0,Busan


## 필터링을 활용한 필요 데이터 조회

In [51]:
# 성별이 1인 남자만 조회
abc[abc.gender==1].head(3)

Unnamed: 0,gender,job,age,position,address,total,check,price,survey
0,1,1.0,26.0,2.0,Seoul,66.3,5.0,5.1,3.0
1,1,2.0,54.0,5.0,Busan,63.7,,,
2,1,2.0,41.0,4.0,,61.1,5.0,4.7,2.0


In [52]:
# 성별이 1인 남자만 조회
abc[abc['gender']==1].head(3)

Unnamed: 0,gender,job,age,position,address,total,check,price,survey
0,1,1.0,26.0,2.0,Seoul,66.3,5.0,5.1,3.0
1,1,2.0,54.0,5.0,Busan,63.7,,,
2,1,2.0,41.0,4.0,,61.1,5.0,4.7,2.0


In [53]:
# 총거래금액이 70만원 이상인 고객 조회
abc[abc.total>=70].head(3)

Unnamed: 0,gender,job,age,position,address,total,check,price,survey
4,1,3.0,70.0,5.0,Suwon,650.0,5.0,5.0,4.0
5,1,2.0,57.0,,Daejeon,70.2,7.0,5.4,5.0
10,2,,29.0,2.0,Suwon,70.2,5.0,,


In [54]:
# 총거래금액이 70만원이상이고 성별이 남자인 고객의 "총거래금액(total)" 데이터 조회
abc[(abc['total']>=70) & (abc['gender']==1)].total.head(5)

4     650.0
5      70.2
14     75.4
15     74.1
16     70.2
Name: total, dtype: float64

In [None]:
# 총거래금액이 70만원이상이고 성별이 남자인 고객의 "총거래금액(total)"과 "성별(age)" 데이터 조회
abc.ix[(abc['total']>=70) & (abc['gender']==1),['total','age']].head(5)

In [None]:
# 총거래금액이 70만원이상이고 성별이 남자인 고객의 모든 변수컬럼 데이터 조회
abc.ix[(abc['total']>=70) & (abc['gender']==1),:].head(5)

## 레코드(관찰치)의 무작위 추출

In [55]:
# abc 데이터프레임 객체에서 인덱스번호 20개를 무작위로 선택추출
x = np.random.choice(abc.index.values, 20)

In [56]:
# 무작위 추출된 인덱스번호 20개 목록조회
x

array([135,  82,  16, 135,  41, 147,  20,  91,  54,  80,  15, 139,  16,
        68,  11, 100,  63,  54,  34,  13], dtype=int64)

In [57]:
# 무작위 추출된 인덱스번호 갯수확인
len(x)

20

In [58]:
# 무작위 추출된 인덱스번호 20개를 오름차순으로 정렬 
sorted(x)

[11,
 13,
 15,
 16,
 16,
 20,
 34,
 41,
 54,
 54,
 63,
 68,
 80,
 82,
 91,
 100,
 135,
 135,
 139,
 147]

In [59]:
# 무작위 추출된 인덱스번호 20개를 내림차순으로 정렬 
sorted(x, reverse=True)

[147,
 139,
 135,
 135,
 100,
 91,
 82,
 80,
 68,
 63,
 54,
 54,
 41,
 34,
 20,
 16,
 16,
 15,
 13,
 11]

In [60]:
# 10개의 인덱스번호를 무작위로 추출하여 레코드를 선별하여 부분 데이터프레임을 만듬
choice = np.random.choice(abc.index.values, 10)
abc.ix[choice,:]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,gender,job,age,position,address,total,check,price,survey
23,1,2.0,63.0,5.0,Seoul,66.3,2.0,4.1,5.0
0,1,1.0,26.0,2.0,Seoul,66.3,5.0,5.1,3.0
22,1,1.0,69.0,5.0,Busan,59.8,4.0,,3.0
14,1,2.0,63.0,,,75.4,12.0,225.8,4.0
51,1,2.0,41.0,4.0,Busan,83.2,2.0,6.4,3.0
118,2,3.0,53.0,5.0,,10.0,2.0,7.7,4.0
63,2,2.0,28.0,,Seoul,79.3,-5.0,3.3,3.0
55,1,2.0,22.0,1.0,,74.1,12.0,4.7,3.0
82,2,2.0,29.0,2.0,,75.4,4.0,5.8,3.0
64,2,1.0,70.0,5.0,Seoul,72.8,1.0,-345.6,3.0


In [61]:
# abc 객체에 속한 .sample 속성을 활용한 필요 레코드갯수를 직접지정하는 무작위추출방법
abc.sample(n=10)

Unnamed: 0,gender,job,age,position,address,total,check,price,survey
65,2,2.0,24.0,1.0,Busan,87.1,4.0,,
51,1,2.0,41.0,4.0,Busan,83.2,2.0,6.4,3.0
4,1,3.0,70.0,5.0,Suwon,650.0,5.0,5.0,4.0
103,2,3.0,45.0,4.0,Suwon,81.9,1.0,,
144,2,3.0,60.0,5.0,Seoul,87.1,4.0,6.7,3.0
9,1,2.0,37.0,3.0,Busan,63.7,4.0,4.9,3.0
14,1,2.0,63.0,,,75.4,12.0,225.8,4.0
78,2,2.0,48.0,4.0,,78.0,4.0,,
40,1,3.0,28.0,2.0,Busan,65.0,3.0,5.0,3.0
130,2,3.0,42.0,4.0,Jeju,96.2,2.0,,


In [62]:
# abc 객체에 속한 .sample 속성을 활용한 필요 레코드 선택비율을 직접지정하는 무작위추출방법
abc.sample(frac=0.05, replace = True) # 복원 추출

Unnamed: 0,gender,job,age,position,address,total,check,price,survey
35,1,3.0,23.0,1.0,Suwon,65.0,4.0,4.0,3.0
86,2,3.0,67.0,5.0,Busan,87.1,4.0,6.7,3.0
19,1,1.0,57.0,5.0,Seoul,66.3,,,
139,2,3.0,23.0,1.0,Busan,89.7,2.0,6.9,1.0
50,2,2.0,61.0,5.0,Seoul,91.0,11.0,7.0,4.0
127,2,1.0,64.0,,Seoul,79.3,12.0,,
88,1,2.0,42.0,4.0,Suwon,72.8,4.0,5.6,3.0
99,2,2.0,43.0,4.0,Jeju,74.1,12.0,5.7,3.0


## drop속성을  활용한 샘플 선택방법

In [64]:
# abc 데이터프레임에서 무작위로 100개의 레코드 인덱스를 추출
rows = np.random.choice(abc.index.values, 100, replace = False)
rows

array([134, 108,  85, 138, 106,  23, 109, 149,  69,  95,  78, 113,  62,
        67,  50,  80, 127,  63,  10, 102, 128,  45,   8,  24,   3,  86,
        47, 145,  79,  74, 131, 125, 144,  21,  37,  26,  65,  12, 116,
        72, 146, 141,  40,  15,  35,  49, 143,  64,   4,  55,  20, 129,
       142,  42,  13, 136,  46,  84, 115, 122,   5,  59,  99, 148,  48,
        57,   0, 126,  36,  56,  17, 107,  82, 132,  76,  58,  25,  97,
        89,   9, 133,  53, 105,  33, 112, 100,  77,  71,  54,  22, 147,
       117,  98,  70,  96,  11, 104, 123,  44,  43], dtype=int64)

In [65]:
# 추출된 인덱스번호의 데이터형식
type(rows)

numpy.ndarray

In [66]:
# 추출된 인덱스번호의 소팅
rows.sort()
rows

array([  0,   3,   4,   5,   8,   9,  10,  11,  12,  13,  15,  17,  20,
        21,  22,  23,  24,  25,  26,  33,  35,  36,  37,  40,  42,  43,
        44,  45,  46,  47,  48,  49,  50,  53,  54,  55,  56,  57,  58,
        59,  62,  63,  64,  65,  67,  69,  70,  71,  72,  74,  76,  77,
        78,  79,  80,  82,  84,  85,  86,  89,  95,  96,  97,  98,  99,
       100, 102, 104, 105, 106, 107, 108, 109, 112, 113, 115, 116, 117,
       122, 123, 125, 126, 127, 128, 129, 131, 132, 133, 134, 136, 138,
       141, 142, 143, 144, 145, 146, 147, 148, 149], dtype=int64)

In [67]:
# 추출된 인덱스번호를 활용한 레코드 선택
abc.ix[rows,:].head()

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


Unnamed: 0,gender,job,age,position,address,total,check,price,survey
0,1,1.0,26.0,2.0,Seoul,66.3,5.0,5.1,3.0
3,2,,45.0,4.0,Gwangju,59.8,7.0,,
4,1,3.0,70.0,5.0,Suwon,650.0,5.0,5.0,4.0
5,1,2.0,57.0,,Daejeon,70.2,7.0,5.4,5.0
8,1,1.0,56.0,5.0,,57.2,3.0,4.4,4.0


In [68]:
# 추출된 인덱스번호를 제외한 나머지 레코드 선택
abc.drop(rows).head()

Unnamed: 0,gender,job,age,position,address,total,check,price,survey
1,1,2.0,54.0,5.0,Busan,63.7,,,
2,1,2.0,41.0,4.0,,61.1,5.0,4.7,2.0
6,2,1.0,36.0,3.0,Busan,59.8,5.0,,
7,1,2.0,,3.0,Jeju,65.0,12.0,675.0,3.0
14,1,2.0,63.0,,,75.4,12.0,225.8,4.0


In [69]:
# .sample 속성을 활용해 50%의 레코드 무작위 인덱싱
sample1 = abc.sample(frac=0.50)

In [70]:
# 무작위 추출된 50%의 인덱싱번호를 활용한 레코드 선택
sample1.head()

Unnamed: 0,gender,job,age,position,address,total,check,price,survey
8,1,1.0,56.0,5.0,,57.2,3.0,4.4,4.0
34,2,1.0,49.0,4.0,Busan,63.7,3.0,,
1,1,2.0,54.0,5.0,Busan,63.7,,,
126,2,,27.0,2.0,Seoul,80.6,2.0,6.2,2.0
106,2,2.0,27.0,2.0,Daejeon,63.7,3.0,4.9,3.0


In [71]:
# 앞서 선택된 50%의 sample1 데이터프레임의 인덱스번호를 제외한 나머지 레코드 선택
sample2 = abc.drop(sample1.index)

In [72]:
sample2.head()

Unnamed: 0,gender,job,age,position,address,total,check,price,survey
0,1,1.0,26.0,2.0,Seoul,66.3,5.0,5.1,3.0
5,1,2.0,57.0,,Daejeon,70.2,7.0,5.4,5.0
11,1,3.0,35.0,2.0,Daejeon,62.4,,,
12,1,1.0,56.0,5.0,Seoul,62.4,4.0,,
16,1,2.0,49.0,4.0,Daejeon,70.2,3.0,,


## end of documents