# 데이터 전처리 방법 실습

In [None]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd # 데이터를 가공할때

# 데이터 로딩 및 df 프레임 만들기

In [3]:
df = pd.read_csv("./data_ml/Data.csv")

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 452.0+ bytes


In [5]:
df.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [6]:
df['Purchased'].value_counts()

Purchased
No     5
Yes    5
Name: count, dtype: int64

In [7]:
df.columns

Index(['Country', 'Age', 'Salary', 'Purchased'], dtype='object')

In [9]:
df['Country'].unique()  

array(['France', 'Spain', 'Germany'], dtype=object)

# 결측치(NaN)

In [11]:
# 결측치 확인
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 452.0+ bytes


In [12]:
df.isna()

Unnamed: 0,Country,Age,Salary,Purchased
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,True,False
5,False,False,False,False
6,False,True,False,False
7,False,False,False,False
8,False,False,False,False
9,False,False,False,False


In [10]:
df.isna().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

## 결측치 처리 방법
- 삭제하던가, 채우던가

### 삭제하기

In [16]:
# 삭제 . nan이 값이 포함된 행 데이터 삭제
df = df.dropna()

### 채우기 전략
- 0, 평균, 중앙값, 최빈값

In [18]:
df.fillna(0)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
5,France,35.0,58000.0,Yes
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [19]:
df['Age'].mean()

np.float64(38.625)

In [20]:
df.mean(numeric_only=True   )

Age          38.625
Salary    65250.000
dtype: float64

In [None]:
# 숫자 컬럼의 합이 평균값으로 채구익

In [21]:
#우리가 예측하고자 하는 것 y
y = df['Purchased']
y

0     No
1    Yes
2     No
3     No
5    Yes
7    Yes
8     No
9    Yes
Name: Purchased, dtype: object

In [22]:
# x 컬럼 : Country, Age, Salary
X = df[['Country', 'Age', 'Salary']]
X

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
5,France,35.0,58000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


In [24]:
df['Country'].value_counts()

Country
France     4
Spain      2
Germany    2
Name: count, dtype: int64

# 레이블 인코딩 , One Hot Encoding
- 범주형(문자) 테이블 -> 숫자로 변환
- 레이블 인코딩 : 문자열 -> 0, 1, 2
- One Hot Encoding : 

### 레이블 인코딩 하는 방법

In [25]:
# 라이브러리 로딩
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [26]:
# 객체 생성
encoder = LabelEncoder()

In [27]:
encoder

## 원핫인코딩 하는 방법

In [28]:
from sklearn.compose import ColumnTransformer

In [32]:
# 객체생성
# ct = ColumnTransformer( [('encoder', OneHotEncoder(), [컬럼인덱스1, 컬럼인덱스2])] )
ct = ColumnTransformer( [('encoder', OneHotEncoder(), [0])] ) 

In [35]:
X["Country"]

0     France
1      Spain
2    Germany
3      Spain
5     France
7     France
8    Germany
9     France
Name: Country, dtype: object

In [40]:
X['Country'] = encoder.fit_transform(X['Country'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Country'] = encoder.fit_transform(X['Country'])


In [41]:
# 변환
ct.fit_transform(X)

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [42]:
X.head()

Unnamed: 0,Country,Age,Salary
0,0,44.0,72000.0
1,2,27.0,48000.0
2,1,30.0,54000.0
3,2,38.0,61000.0
5,0,35.0,58000.0


In [48]:
# 머신러닝을 위한 one-hot-encoding 전체 코드
ct = ColumnTransformer( [('encoder', OneHotEncoder(), [0])], remainder='passthrough' )  
X = ct.fit_transform(X)

## Y데이터 인코딩

In [46]:
y.value_counts()

Purchased
No     4
Yes    4
Name: count, dtype: int64

In [None]:
# 연구자가 연구하고싶어하는 대상을 = 1, 아닌것 = 0 으로 매핑
y.map({'No':0, 'Yes':1})

0    0
1    1
2    0
3    0
5    1
7    1
8    0
9    1
Name: Purchased, dtype: int64

In [51]:
X[:5]


array([[1.0e+00, 0.0e+00, 0.0e+00, 4.4e+01, 7.2e+04],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.7e+01, 4.8e+04],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.0e+01, 5.4e+04],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.8e+01, 6.1e+04],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.5e+01, 5.8e+04]])

In [52]:
# x,y 인코딩이 끝났으면, Feature Scaling(특성 스케일링) 진행


## 피처스케일링

In [56]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [68]:
# 객체 생성
# 표준화(평균 0, 표준편차 1인 정규분포를 따르는 범위 데이터로 변환)
scaler = StandardScaler()

# 변환
scaler.fit_transform(X)

array([[ 1.        , -0.57735027, -0.57735027,  0.69985807,  0.58989097],
       [-1.        , -0.57735027,  1.73205081, -1.51364653, -1.50749915],
       [-1.        ,  1.73205081, -0.57735027, -1.12302807, -0.98315162],
       [-1.        , -0.57735027,  1.73205081, -0.08137885, -0.37141284],
       [ 1.        , -0.57735027, -0.57735027, -0.47199731, -0.6335866 ],
       [ 1.        , -0.57735027, -0.57735027,  1.22068269,  1.20162976],
       [-1.        ,  1.73205081, -0.57735027,  1.48109499,  1.55119478],
       [ 1.        , -0.57735027, -0.57735027, -0.211585  ,  0.1529347 ]])

In [59]:
# 정규화(0~1 사이의 범위 데이터로 변환)
# scaler = MinMaxScaler()
# scaler.fit_transform(X)

In [None]:
# 데이터 전처리한 
# 1. 결측치 처리
# 2. x,y 데이터 분리
# 3. 범주형 데이터 인코딩(One-Hot-Encoding, Label-Encoding)(문자열 -> 숫자 )
# 4. Feature Scaling(특성 스케일링) : 데이터의 수준을 맞추기 위함, X(독립변수) 한 함

# Train, Test 데이터 분할하기

In [69]:
X_scaled = scaler.fit_transform(X)

In [70]:
X_scaled

array([[ 1.        , -0.57735027, -0.57735027,  0.69985807,  0.58989097],
       [-1.        , -0.57735027,  1.73205081, -1.51364653, -1.50749915],
       [-1.        ,  1.73205081, -0.57735027, -1.12302807, -0.98315162],
       [-1.        , -0.57735027,  1.73205081, -0.08137885, -0.37141284],
       [ 1.        , -0.57735027, -0.57735027, -0.47199731, -0.6335866 ],
       [ 1.        , -0.57735027, -0.57735027,  1.22068269,  1.20162976],
       [-1.        ,  1.73205081, -0.57735027,  1.48109499,  1.55119478],
       [ 1.        , -0.57735027, -0.57735027, -0.211585  ,  0.1529347 ]])

In [71]:
from sklearn.model_selection import train_test_split

In [76]:
# train : test = 8 : 2
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [77]:
X_train.shape, y_train.shape

((6, 5), (6,))