In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [6]:
# Toy data set with four rows: a nominal column (color), an ordinal column
# (size), a numeric column (price) and a yes/no label column.
df = pd.DataFrame(
    data=[
        ['green', 'L', 10.1, 'yes'],
        ['green', 'M', 13.5, 'no'],
        ['red', 'XL', 15.3, 'yes'],
        ['blue', 'S', 12.7, 'yes'],
    ],
    columns=['color', 'size', 'price', 'label'],
)

In [7]:
# Echo the frame to confirm the four rows and the assigned column names.
df

Unnamed: 0,color,size,price,label
0,green,L,10.1,yes
1,green,M,13.5,no
2,red,XL,15.3,yes
3,blue,S,12.7,yes


In [8]:
# Pull the 'label' column out of the frame; the result is a pandas Series.
y = df.loc[:, 'label']

In [9]:
# Approach 1 - vectorised conditional: 'no' becomes 0, every other value becomes 1.
y1 = np.where(y != 'no', 1, 0)

In [10]:
# Echo the np.where result: 0 where the label is 'no', 1 elsewhere.
y1

array([1, 0, 1, 1])

In [13]:
# Approach 2 - explicit lookup table applied element-wise with Series.map,
# then converted to a NumPy array.
d = dict(no=0, yes=1)
y2 = np.asarray(y.map(d))

In [14]:
# Echo the dictionary-mapped labels ('no' -> 0, 'yes' -> 1).
y2

array([1, 0, 1, 1], dtype=int64)

In [15]:
from sklearn.preprocessing import LabelEncoder

In [17]:
# Approach 3 - let scikit-learn's LabelEncoder learn the distinct label values.
encoder = LabelEncoder()
# fit() is the last expression of the cell, so its return value (the encoder
# itself) is what the notebook echoes as `LabelEncoder()`.
encoder.fit(y)

LabelEncoder()

In [18]:
# Label values learned by fit(); here 'no' sits at index 0 and 'yes' at index 1,
# which matches the integers produced by transform() below.
encoder.classes_

array(['no', 'yes'], dtype=object)

In [19]:
# Encode the labels as integers using the classes learned by the fitted encoder.
y3 = encoder.transform(y)

In [20]:
# Echo the LabelEncoder result for comparison with y1 and y2.
y3

array([1, 0, 1, 1])

## 순서가 있는 특성 매핑 : size 변환

In [22]:
# Feature frame: every column of df except 'label'. drop() returns a new
# frame, so df itself is left untouched.
X = df.drop(columns=['label'])
X

Unnamed: 0,color,size,price
0,green,L,10.1
1,green,M,13.5
2,red,XL,15.3
3,blue,S,12.7


In [23]:
# Ordinal encoding for the size feature: a larger integer means a larger size.
# Each size gets its own rank ('XS' used to share rank 1 with 'S', which made
# the two sizes indistinguishable after encoding).
size_mapping = {'XS': 0, 'S': 1, 'M': 2, 'L': 3, 'XL': 4}
# Map from the untouched strings in df['size'] rather than from X['size'], so
# re-running this cell does not turn already-encoded integers into NaN.
X['size'] = df['size'].map(size_mapping)

In [24]:
# Echo X to check that 'size' now holds integer ranks instead of strings.
X

Unnamed: 0,color,size,price
0,green,3,10.1
1,green,2,13.5
2,red,4,15.3
3,blue,1,12.7


## 순서가 없는(명목형) 특성에 대한 원-핫 인코딩

In [27]:
# One-hot encode only the nominal 'color' column; 'size' (ordinal ranks) and
# 'price' (numeric) pass through unchanged and stay in front of the dummies.
# dtype=int keeps the indicator columns as 0/1 on every pandas version
# (pandas >= 2.0 would otherwise return boolean True/False columns).
X_dummy = pd.get_dummies(X, columns=['color'], dtype=int)

In [28]:
# Echo the encoded frame: one indicator column per distinct color value.
X_dummy

Unnamed: 0,size,price,color_blue,color_green,color_red
0,3,10.1,0,1,0
1,2,13.5,0,1,0
2,4,15.3,0,0,1
3,1,12.7,1,0,0


## 와인 데이터

In [29]:
# Download the UCI Wine data set. header=None tells pandas the file has no
# header row, so the columns receive integer labels starting at 0.
path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
wine = pd.read_csv(filepath_or_buffer=path, header=None)

In [30]:
# Preview the raw frame. Column 0 is used as the target further down and
# columns 1-13 as the features.
wine

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050
2,1,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,3,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740
174,3,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750
175,3,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835
176,3,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840


In [31]:
# Target vector: the first column of the wine frame as a NumPy array.
# NOTE: this rebinds `y`, which earlier held the toy 'label' Series.
y = np.asarray(wine.iloc[:, 0])

In [32]:
# Echo the full target array; the class ids 1, 2 and 3 appear in contiguous runs.
y

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3], dtype=int64)

In [34]:
# Feature matrix: every column after the first one, as a NumPy array.
# NOTE: this rebinds `X`, which earlier held the toy feature DataFrame.
X = wine.loc[:, wine.columns[1:]].to_numpy()
X

array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,
        1.065e+03],
       [1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,
        1.050e+03],
       [1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,
        1.185e+03],
       ...,
       [1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,
        8.350e+02],
       [1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,
        8.400e+02],
       [1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,
        5.600e+02]])

In [36]:
from sklearn.model_selection import train_test_split

In [38]:
# Hold out 30% of the samples for testing. stratify=y splits per class so both
# parts keep the class proportions of y; random_state=0 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=0,
)

In [39]:
from sklearn.preprocessing import StandardScaler

In [40]:
# Standardize the training features. fit_transform(X_train) learns the
# per-column statistics from the training split and applies them in one call;
# it is equivalent to scaler.fit(X_train) followed by scaler.transform(X_train).
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)

In [42]:
# Sanity check on the standardized training data: every column should have a
# mean of ~0 and a std of ~1. describe() reports the sample std (ddof=1), so
# the value is shown as 1.004 rather than exactly 1.
pd.DataFrame(X_train_std).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
count,124.0,124.0,124.0,124.0,124.0,124.0,124.0,124.0,124.0,124.0,124.0,124.0,124.0
mean,6.890098e-15,1.683241e-16,3.79966e-15,2.856138e-16,-3.178461e-16,1.662649e-15,1.598184e-16,-1.198862e-15,1.888274e-15,-1.615195e-15,-5.783904e-16,8.631089e-16,-1.334058e-16
std,1.004057,1.004057,1.004057,1.004057,1.004057,1.004057,1.004057,1.004057,1.004057,1.004057,1.004057,1.004057,1.004057
min,-1.971837,-1.380243,-2.554934,-2.525252,-2.069521,-2.187238,-1.691226,-1.964528,-2.068907,-1.419256,-2.147813,-1.947267,-1.465378
25%,-0.8150037,-0.6275963,-0.5499141,-0.6917901,-0.7889079,-0.8535947,-0.7985839,-0.8092365,-0.6496852,-0.8006399,-0.7436984,-0.8974047,-0.782356
50%,0.0381987,-0.4351583,-0.05565323,-0.09065504,-0.113029,0.1385058,0.09896271,-0.2315907,-0.07775997,-0.1438903,0.04894675,0.2823374,-0.2499217
75%,0.8033481,0.698088,0.7370293,0.5705935,0.5094911,0.8297234,0.8322044,0.6142478,0.561949,0.4704883,0.7510038,0.7946413,0.8610452
max,2.181832,2.947475,3.152379,3.065304,3.693237,2.529306,2.95836,2.161513,3.286008,3.351712,2.109824,1.992422,2.843268


In [43]:
# Scale the test split with the statistics learned from the training split;
# the scaler is deliberately not re-fit on the test data.
X_test_std = scaler.transform(X_test)

In [45]:
# Summary of the scaled test split. Means and stds are only approximately 0
# and 1 because the scaling parameters came from the training split.
pd.DataFrame(X_test_std).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
count,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0,54.0
mean,-0.131835,-0.049174,-0.226279,-0.303835,0.153107,-0.159365,-0.112663,-0.168936,-0.279228,-0.044005,0.048611,-0.036483,-0.080326
std,0.945881,0.841129,1.059069,0.979626,1.04497,1.049986,0.925997,1.078199,0.864142,0.938784,1.112748,1.079511,0.885251
min,-2.433355,-1.243398,-3.823226,-2.765706,-1.286924,-1.992071,-1.563706,-1.634445,-2.085853,-1.614162,-1.876049,-1.918405,-1.351669
25%,-0.815004,-0.642564,-0.689799,-1.112585,-0.717763,-1.016234,-0.928557,-0.953648,-0.844563,-0.777336,-0.788992,-1.139126,-0.767758
50%,-0.156125,-0.400947,-0.260818,-0.316081,-0.077456,-0.178641,-0.004034,-0.602934,-0.30653,-0.207447,0.139535,0.195751,-0.260678
75%,0.566516,0.569796,0.457259,0.472909,0.829644,0.508511,0.662995,0.511097,0.269632,0.491674,0.784974,0.946168,0.396222
max,1.647442,1.929691,1.995996,2.163601,4.475833,2.480514,1.653729,2.409076,2.77763,2.368706,3.42335,1.876974,2.228625


## 차원 축소

In [46]:
from sklearn.decomposition import PCA

In [47]:
# PCA configured to keep only the first two principal components.
pca = PCA(n_components = 2)

In [48]:
# Learn the principal axes from the standardized training features only.
pca.fit(X_train_std)

PCA(n_components=2)

In [49]:
# Project the standardized training data onto the two fitted components.
X_train_pca = pca.transform(X_train_std)

In [50]:
# Shape of the PCA input: 124 training samples x 13 standardized features.
# NOTE(review): X_train_pca.shape would show the reduced shape - confirm which one was meant.
X_train_std.shape

(124, 13)

In [51]:
# Loadings of the fitted components: one row per component, one column per
# original feature (2 x 13 here).
pca.components_

array([[-0.13724218,  0.24724326, -0.02545159,  0.20694508, -0.15436582,
        -0.39376952, -0.41735106,  0.30572896, -0.30668347,  0.07554066,
        -0.32613263, -0.36861022, -0.29669651],
       [ 0.50303478,  0.16487119,  0.24456476, -0.11352904,  0.28974518,
         0.05080104, -0.02287338,  0.09048885,  0.00835233,  0.54977581,
        -0.20716433, -0.24902536,  0.38022942]])

In [52]:
pca.explained_variance_ratio_ # All 13 components could be kept, but using only 2 is also acceptable (the two ratios shown sum to about 0.55)

array([0.36951469, 0.18434927])