In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [None]:
# Import dataset
dataset = pd.read_csv('tugaspreprocessing.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values
print(X)
print(y)

[[113. 224. 213.  nan 122.]
 [316. 285. 169. 121.  nan]
 [158. 144. 503. 204. 277.]
 [244. 155. 110. 164. 165.]
 [112. 297. 239. 238.  85.]
 [289.  79. 125. 408.  24.]
 [106.  77. 119. 104. 102.]
 [131. 204. 248.  nan  16.]
 [289. 302. 200. 201. 181.]]
[105  26 318 109 177 103 133  79  19]


In [None]:
# Menghilangkan missing value NaN
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:, 1:5])
X[:, 1:5] = imputer.transform(X[:, 1:5])
print(X)

[[113.         224.         213.         205.71428571 122.        ]
 [316.         285.         169.         121.         121.5       ]
 [158.         144.         503.         204.         277.        ]
 [244.         155.         110.         164.         165.        ]
 [112.         297.         239.         238.          85.        ]
 [289.          79.         125.         408.          24.        ]
 [106.          77.         119.         104.         102.        ]
 [131.         204.         248.         205.71428571  16.        ]
 [289.         302.         200.         201.         181.        ]]


In [None]:
# Encoding data kategori (Atribut)
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
X = np.array(ct.fit_transform(X))
print(X)

[[  0.           0.           1.           0.           0.
    0.           0.           0.         224.         213.
  205.71428571 122.        ]
 [  0.           0.           0.           0.           0.
    0.           0.           1.         285.         169.
  121.         121.5       ]
 [  0.           0.           0.           0.           1.
    0.           0.           0.         144.         503.
  204.         277.        ]
 [  0.           0.           0.           0.           0.
    1.           0.           0.         155.         110.
  164.         165.        ]
 [  0.           1.           0.           0.           0.
    0.           0.           0.         297.         239.
  238.          85.        ]
 [  0.           0.           0.           0.           0.
    0.           1.           0.          79.         125.
  408.          24.        ]
 [  1.           0.           0.           0.           0.
    0.           0.           0.          77.         119.


In [None]:
# Encoding data kategori (class)
le = LabelEncoder()
y = le.fit_transform(y)
print(y)

[4 1 8 5 7 3 6 2 0]


In [None]:
# membagi dataset ke dalam training set dan test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
print(X_train)

[[  1.           0.           0.           0.           0.
    0.           0.           0.          77.         119.
  104.         102.        ]
 [  0.           0.           0.           1.           0.
    0.           0.           0.         204.         248.
  205.71428571  16.        ]
 [  0.           0.           0.           0.           0.
    0.           0.           1.         285.         169.
  121.         121.5       ]
 [  0.           0.           1.           0.           0.
    0.           0.           0.         224.         213.
  205.71428571 122.        ]
 [  0.           1.           0.           0.           0.
    0.           0.           0.         297.         239.
  238.          85.        ]
 [  0.           0.           0.           0.           0.
    1.           0.           0.         155.         110.
  164.         165.        ]
 [  0.           0.           0.           0.           0.
    0.           1.           0.          79.         125.


In [None]:
print(X_test)

[[  0.   0.   0.   0.   0.   0.   1.   0. 302. 200. 201. 181.]
 [  0.   0.   0.   0.   1.   0.   0.   0. 144. 503. 204. 277.]]


In [None]:
print(y_train)
print(y_test)

[6 2 1 4 7 5 3]
[0 8]


In [27]:
# Feature Scaling
sc = StandardScaler()
X_train[:, 1:] = sc.fit_transform(X_train[:, 1:])
X_test[:, 1:] = sc.fit_transform(X_test[:, 1:])

In [28]:
print(X_train)
print(X_test)

[[ 1.         -0.40824829 -0.40824829 -0.40824829  0.         -0.40824829
  -0.40824829 -0.40824829 -1.34726299 -1.02292817 -1.0974727   0.22340704]
 [ 0.         -0.40824829 -0.40824829  2.44948974  0.         -0.40824829
  -0.40824829 -0.40824829  0.18434417  1.34554397 -0.0098203  -1.48985457]
 [ 0.         -0.40824829 -0.40824829 -0.40824829  0.         -0.40824829
  -0.40824829  2.44948974  1.16119598 -0.10491571 -0.9156881   0.61187914]
 [ 0.         -0.40824829  2.44948974 -0.40824829  0.         -0.40824829
  -0.40824829 -0.40824829  0.42554215  0.70293525 -0.0098203   0.62183997]
 [ 0.          2.44948974 -0.40824829 -0.40824829  0.         -0.40824829
  -0.40824829 -0.40824829  1.30591477  1.18030173  0.33541768 -0.11526095]
 [ 0.         -0.40824829 -0.40824829 -0.40824829  0.          2.44948974
  -0.40824829 -0.40824829 -0.40659088 -1.18817041 -0.45587999  1.47847077]
 [ 0.         -0.40824829 -0.40824829 -0.40824829  0.         -0.40824829
   2.44948974 -0.40824829 -1.323