# ADP 실기 데이터 분석 전문가 2절 "데이터 전처리" Python Code 구현

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler, MinMaxScaler

## 1. 제어문

### 입력값이 정수인지 확인하고, 정수라면 짝수인지 홀수인지 판별

In [18]:
# Read one line from standard input; input() returns it as a str.
x = input()
def even_or_odd(x):
    """Print whether ``x`` is an even or an odd integer.

    ``x`` is converted with ``int()``. When the conversion fails, a message
    asking for an integer is printed instead of raising.

    Args:
        x: Value to classify, typically the string returned by ``input()``.

    Returns:
        None. The verdict is written to standard output.
    """
    # Guard only the conversion, and catch only the errors int() raises for
    # bad input. A bare ``except`` would also swallow KeyboardInterrupt and
    # SystemExit, and would hide unrelated failures in the printing code.
    try:
        x = int(x)
    except (TypeError, ValueError, OverflowError):
        print('정수를 입력해주세요.')
        return

    if x % 2 == 0:
        print("{0}는 짝수입니다.".format(x))
    else:
        print("{0}는 홀수입니다".format(x))

# Classify the value read above.
even_or_odd(x)        

1.5
정수를 입력해주세요.


### print문 반복

In [20]:
# Materialise the range directly; a pass-through comprehension adds nothing.
years = list(range(2015, 2019))

# Emit one line per year.
for year in years:
    print('The year is {0}'.format(year))

The year is 2015
The year is 2016
The year is 2017
The year is 2018


### 파생변수 생성 in iris

In [47]:
# Load the iris dataset and keep the pieces that later cells use.
iris = load_iris()

iris_data, iris_target = iris.data, iris.target
iris_target_names, iris_feature_names = iris.target_names, iris.feature_names

In [88]:
# Build the frame directly with the short column labels used by later cells.
iris_df = pd.DataFrame(
    iris_data,
    columns=['sepal length', 'sepal width', 'petal length', 'petal width'],
)
print(iris_df.shape)
iris_df.head()

(150, 4)


Unnamed: 0,sepal length,sepal width,petal length,petal width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


인덱스를 1부터 시작   
ID 추가,   
종(타깃) 이름 추가

In [89]:
# Label the rows 1..n. Assigning an explicit range (rather than adding 1 to
# the current index) keeps the cell idempotent: re-running it does not shift
# the labels a second time.
iris_df.index = pd.RangeIndex(1, len(iris_df) + 1)
# Index the name array with the integer codes to get one species name per row.
iris_df['Species'] = iris_target_names[iris_target]
# Number the rows from the frame's own length instead of a hard-coded 151.
iris_df['ID'] = np.arange(1, len(iris_df) + 1)
iris_df

Unnamed: 0,sepal length,sepal width,petal length,petal width,Species,ID
1,5.1,3.5,1.4,0.2,setosa,1
2,4.9,3.0,1.4,0.2,setosa,2
3,4.7,3.2,1.3,0.2,setosa,3
4,4.6,3.1,1.5,0.2,setosa,4
5,5.0,3.6,1.4,0.2,setosa,5
...,...,...,...,...,...,...
146,6.7,3.0,5.2,2.3,virginica,146
147,6.3,2.5,5.0,1.9,virginica,147
148,6.5,3.0,5.2,2.0,virginica,148
149,6.2,3.4,5.4,2.3,virginica,149


ID 변수의 값이 짝수이면 A, 홀수이면 B로 분류하는 파생변수 생성

In [94]:
# Even IDs go to group 'A', every other ID to group 'B'.
iris_df['Group'] = iris_df['ID'].map(lambda row_id: 'B' if row_id % 2 else 'A')
iris_df.head()

Unnamed: 0,sepal length,sepal width,petal length,petal width,Species,ID,Group
1,5.1,3.5,1.4,0.2,setosa,1,B
2,4.9,3.0,1.4,0.2,setosa,2,A
3,4.7,3.2,1.3,0.2,setosa,3,B
4,4.6,3.1,1.5,0.2,setosa,4,A
5,5.0,3.6,1.4,0.2,setosa,5,B


파생변수 생성: sum length = sepal length + petal length

In [95]:
# Derived column: element-wise sum of the two length measurements.
iris_df['sum length'] = iris_df['sepal length'].add(iris_df['petal length'])
iris_df.head()

Unnamed: 0,sepal length,sepal width,petal length,petal width,Species,ID,Group,sum length
1,5.1,3.5,1.4,0.2,setosa,1,B,6.5
2,4.9,3.0,1.4,0.2,setosa,2,A,6.3
3,4.7,3.2,1.3,0.2,setosa,3,B,6.0
4,4.6,3.1,1.5,0.2,setosa,4,A,6.1
5,5.0,3.6,1.4,0.2,setosa,5,B,6.4


### 함수 정의를 통한 파생변수 생성

In [101]:
# Six students labelled s1..s6 together with their scores.
df_dict = {
    'student_id': ['s' + str(number) for number in range(1, 7)],
    'score': [55, 90, 85, 71, 63, 99],
}
score_df = pd.DataFrame(df_dict)
# Shift the default 0-based row labels so that they start at 1.
score_df.index = score_df.index + 1
score_df

Unnamed: 0,student_id,score
1,s1,55
2,s2,90
3,s3,85
4,s4,71
5,s5,63
6,s6,99


In [105]:
def grade_by_score(score):
    """Return the grade label for a numeric score.

    Scores of 90 and above map to '수', 80 and above to '우', 70 and above
    to '미', 60 and above to '양', and anything else to '가'.

    Args:
        score: Numeric score to grade.

    Returns:
        The grade label as a one-character string.
    """
    # Cut-offs are listed from highest to lowest, so the first match wins.
    cutoffs = ((90, '수'), (80, '우'), (70, '미'), (60, '양'))
    for lower_bound, label in cutoffs:
        if score >= lower_bound:
            return label
    return '가'

# Pass the function itself to apply(); wrapping it in a lambda only adds an
# extra call per element.
score_df['grade'] = score_df['score'].apply(grade_by_score)
score_df

Unnamed: 0,student_id,score,grade
1,s1,55,가
2,s2,90,수
3,s3,85,우
4,s4,71,미
5,s5,63,양
6,s6,99,수


## 표준화와 정규화

In [9]:
iris = load_iris()

# Fit each scaler on the full feature matrix and transform it in one step.
scaler = StandardScaler()
normalizer = MinMaxScaler()
iris_scaled = scaler.fit_transform(iris.data)
iris_norm = normalizer.fit_transform(iris.data)

# Show the first four rows of the raw, StandardScaler and MinMaxScaler data.
preview = slice(4)
print(iris.data[preview], '\n')
print(iris_scaled[preview], '\n')
print(iris_norm[preview])

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]] 

[[-0.90068117  1.01900435 -1.34022653 -1.3154443 ]
 [-1.14301691 -0.13197948 -1.34022653 -1.3154443 ]
 [-1.38535265  0.32841405 -1.39706395 -1.3154443 ]
 [-1.50652052  0.09821729 -1.2833891  -1.3154443 ]] 

[[0.22222222 0.625      0.06779661 0.04166667]
 [0.16666667 0.41666667 0.06779661 0.04166667]
 [0.11111111 0.5        0.05084746 0.04166667]
 [0.08333333 0.45833333 0.08474576 0.04166667]]


## 데이터 결합 및 요약

In [24]:
# Row-wise concatenation
customer1 = pd.DataFrame({
    'id': ['c01', 'c02', 'c03', 'c04'],
    'last_name': ['Lee', 'Kim', 'Choi', 'Park'],
})
customer2 = pd.DataFrame({
    'id': ['c05', 'c06', 'c07'],
    'last_name': ['Lim', 'Bae', 'Kim'],
})
# ignore_index=True renumbers the stacked rows instead of keeping each
# frame's own 0-based labels.
id_name = pd.concat([customer1, customer2], axis='index', ignore_index=True)
print(id_name)

# Column-wise concatenation
age_income = pd.DataFrame({
    'age': [20, 25, 37, 40, 32, 45, 37],
    'income': [2500, 6400, 0, 7000, 3400, 3800, 5010],
})
df = pd.concat([id_name, age_income], axis='columns')
df

    id last_name
0  c01       Lee
1  c02       Kim
2  c03      Choi
3  c04      Park
4  c05       Lim
5  c06       Bae
6  c07       Kim


Unnamed: 0,id,last_name,age,income
0,c01,Lee,20,2500
1,c02,Kim,25,6400
2,c03,Choi,37,0
3,c04,Park,40,7000
4,c05,Lim,32,3400
5,c06,Bae,45,3800
6,c07,Kim,37,5010


In [32]:
# Merge two frames on a shared key
id_name = df.iloc[:4, :2]
# Rows from position 2 onward, columns 0 and 2, with row labels renumbered from 0.
id_age = df.iloc[2:, [0, 2]].reset_index(drop=True)

print(id_name)
print(id_age)

# Outer join on 'id': ids found in only one frame are kept, with NaN in the
# column that the other frame would have supplied.
id_name.merge(id_age, on='id', how='outer')

    id last_name
0  c01       Lee
1  c02       Kim
2  c03      Choi
3  c04      Park
    id  age
0  c03   37
1  c04   40
2  c05   32
3  c06   45
4  c07   37


Unnamed: 0,id,last_name,age
0,c01,Lee,
1,c02,Kim,
2,c03,Choi,37.0
3,c04,Park,40.0
4,c05,,32.0
5,c06,,45.0
6,c07,,37.0


### 데이터 요약

In [146]:
# Group by one column and compute summary statistics
iris = load_iris()
columns = ['sepal length', 'sepal width', 'petal length', 'petal width']
iris_df = pd.DataFrame(iris.data, columns=columns)
iris_df['target'] = iris.target
# Index the name array with the integer codes: one vectorised lookup instead
# of a Python-level lambda call per row.
iris_df['target_names'] = iris.target_names[iris.target]

# Aggregate once, on the measurement columns selected by name. The earlier
# form recomputed the same groupby/mean for every print and dropped the
# 'target' column by position, which breaks as soon as the column order changes.
species_means = iris_df.groupby('target_names')[columns].mean()

print(species_means['sepal width'], '\n')
print(species_means[['sepal width', 'petal width']], '\n')
print(species_means)

target_names
setosa        3.428
versicolor    2.770
virginica     2.974
Name: sepal width, dtype: float64 

              sepal width  petal width
target_names                          
setosa              3.428        0.246
versicolor          2.770        1.326
virginica           2.974        2.026 

              sepal length  sepal width  petal length  petal width
target_names                                                      
setosa               5.006        3.428         1.462        0.246
versicolor           5.936        2.770         4.260        1.326
virginica            6.588        2.974         5.552        2.026


In [153]:
# Boolean flag: True where petal length is below 1.5.
iris_df['thrs'] = iris_df['petal length'].lt(1.5)
# Mean per (species, flag) pair; the trailing column of the result is dropped.
# NOTE(review): presumably that trailing column is 'target' — confirm against
# the column order of iris_df at this point.
grouped_means = iris_df.groupby(['target_names', 'thrs']).mean()
grouped_means.iloc[:, :-1]

Unnamed: 0_level_0,Unnamed: 1_level_0,sepal length,sepal width,petal length,petal width
target_names,thrs,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
setosa,False,5.107692,3.515385,1.588462,0.273077
setosa,True,4.895833,3.333333,1.325,0.216667
versicolor,False,5.936,2.77,4.26,1.326
virginica,False,6.588,2.974,5.552,2.026


In [85]:
# Build a data frame with a pivot table: count 'sepal width' entries for each
# (target, target_names) combination.
pv = iris_df.pivot_table(
    values='sepal width',
    index='target',
    columns='target_names',
    aggfunc='count',
)
# Combinations that never occur come back as NaN; show them as 0.
pv.fillna(0)

target_names,setosa,versicolor,virginica
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,50.0,0.0,0.0
1,0.0,50.0,0.0
2,0.0,0.0,50.0


In [121]:
# Print summary statistics
iris_data = iris_df.iloc[:, :4]
first_rows = iris_data[:4]
# Row-wise maximum over the first four rows.
print(first_rows.max(axis=1), '\n')
# Column-wise mean over all rows, rounded to two decimals.
print(iris_data.mean(axis=0).round(2), '\n')
# NOTE(review): this repeats the first print verbatim, as the original cell
# does — possibly a different statistic was intended; confirm.
print(first_rows.max(axis=1))

0    5.1
1    4.9
2    4.7
3    4.6
dtype: float64 

sepal length    5.84
sepal width     3.06
petal length    3.76
petal width     1.20
dtype: float64 

0    5.1
1    4.9
2    4.7
3    4.6
dtype: float64


In [134]:
# Apply a function to a data frame column
def grown(x):
    """Return a size label for ``x``.

    Values above 5 are labelled 'adult', values above 3 (up to and including
    5) 'child', and everything else 'kid'.

    Args:
        x: Numeric value to label.

    Returns:
        One of 'adult', 'child' or 'kid'.
    """
    # The comparisons keep their original order so that any value failing
    # both of them falls through to 'kid'.
    return 'adult' if x > 5 else ('child' if x > 3 else 'kid')
    
# Pass the function itself to apply(); the lambda wrapper only added an extra
# call per element.
iris_df['sepal length'].apply(grown)

0      adult
1      child
2      child
3      child
4      child
       ...  
145    adult
146    adult
147    adult
148    adult
149    adult
Name: sepal length, Length: 150, dtype: object