<a href="https://colab.research.google.com/github/IlhamThrq/AI_TI-1B_2025/blob/main/DataPreprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Jobsheet 07: Data Pre-processing

## A. LATIHAN

### 1. Data Preparation dengan One Hot Encoding

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [11]:
# membuat data array gender
gender = ['perempuan', 'laki-laki', 'laki-laki', 'perempuan', 'perempuan']

# membuat data frame dari data gender
df = pd.DataFrame({'gender': gender})


In [20]:
# membuat instance dari OneHotEncoder
encoder = OneHotEncoder()

# melakukan encoding pada data frame
encoded_df = pd.DataFrame(encoder.fit_transform(df[['gender']]).toarray(), columns=encoder.get_feature_names_out(['gender']))

In [21]:
# menggabungkan data frame yang sudah di-encode dengan data frame awal
df_encoded = pd.concat([df, encoded_df], axis=1)

# menampilkan hasil
df_encoded


Unnamed: 0,gender,gender_laki-laki,gender_perempuan
0,perempuan,0.0,1.0
1,laki-laki,1.0,0.0
2,laki-laki,1.0,0.0
3,perempuan,0.0,1.0
4,perempuan,0.0,1.0


### 2. Data Preparation dengan Outlier Removal

In [22]:
import pandas as pd
import numpy as np

In [24]:
# membuat data frame contoh
df = pd.DataFrame({
    'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'B': [15, 20, 25, 30, 35, 40, 45, 50, 55, 500]
})

In [25]:
# mencari nilai z-score untuk setiap data point pada kolom B
z_scores = np.abs((df['B'] - df['B'].mean()) / df['B'].std())

# menghilangkan data point yang memiliki z-score lebih besar dari 3
df_clean = df.loc[round(z_scores) < 3]

# menampilkan data frame yang telah di-clean
print(df_clean)

   A   B
0  1  15
1  2  20
2  3  25
3  4  30
4  5  35
5  6  40
6  7  45
7  8  50
8  9  55


### 3. Data Preparation dengan Normalization

In [26]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [27]:
# membuat data frame contoh
df = pd.DataFrame({
    'Nama': ['Andi', 'Budi', 'Cindy', 'Diana', 'Eka', 'lala'],
    'Gaji': [3000000, 5000000, 7000000, 9000000, 11000000, 9000000],
    'Umur': [25, 30, 35, 40, 45, 30]
})

In [28]:
# normalisasi data gaji dan umur menggunakan MinMaxScaler
scaler = MinMaxScaler()
df[['Gaji', 'Umur']] = scaler.fit_transform(df[['Gaji', 'Umur']])

# menampilkan data frame hasil normalisasi
df

Unnamed: 0,Nama,Gaji,Umur
0,Andi,0.0,0.0
1,Budi,0.25,0.25
2,Cindy,0.5,0.5
3,Diana,0.75,0.75
4,Eka,1.0,1.0
5,lala,0.75,0.25


### 4. Data Preparation dengan Standarization

In [29]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [30]:
# membuat data frame contoh
df = pd.DataFrame({
    'Nama': ['Andi', 'Budi', 'Cindy', 'Diana', 'Eka', 'lala'],
    'Gaji': [3000000, 5000000, 7000000, 9000000, 11000000, 9000000],
    'Umur': [25, 30, 35, 40, 45, 30]
})

In [31]:
# normalisasi data gaji dan umur menggunakan MinMaxScaler
scaler = StandardScaler()
df[['Gaji', 'Umur']] = scaler.fit_transform(df[['Gaji', 'Umur']])

# menampilkan data frame hasil normalisasi
df

Unnamed: 0,Nama,Gaji,Umur
0,Andi,-1.612452,-1.364382
1,Budi,-0.868243,-0.620174
2,Cindy,-0.124035,0.124035
3,Diana,0.620174,0.868243
4,Eka,1.364382,1.612452
5,lala,0.620174,-0.620174


### 5. Pembuatan DataSet

In [32]:
from sklearn.model_selection import train_test_split
import pandas as pd

In [33]:
# Membuat data frame contoh
df = pd.DataFrame({
    'X': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Y': [15, 20, 25, 30, 35, 40, 45, 50, 55, 60]

})

In [35]:
# membagi data menjadi data latih dan data uji dengan rasio 80:20
X_train, X_test, y_train, y_test = train_test_split(df[['X']], df[['Y']], test_size=0.2, random_state=42)

In [38]:
#  Menampilkan data latih
print('Data Latih:')
print('X_train:', X_train)
print('y_train:', y_train)

#  Menampilkan data uji
print('\nData Uji')
print('X_test:', X_test)
print('y_test:', y_test)

Data Latih:
X_train:     X
5   6
0   1
7   8
2   3
9  10
4   5
3   4
6   7
y_train:     Y
5  40
0  15
7  50
2  25
9  60
4  35
3  30
6  45

Data Uji
X_test:    X
8  9
1  2
y_test:     Y
8  55
1  20


In [67]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Import data heart.csv
df = pd.read_csv('heart.csv');
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [68]:
# Ambil 13 data dan masukkan kedalam variabel data
data = df.iloc[:, :13]
data

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [69]:
# Rubah data ke array
data = data.values
data

array([[40, 'M', 'ATA', ..., 0.0, 'Up', 0],
       [49, 'F', 'NAP', ..., 1.0, 'Flat', 1],
       [37, 'M', 'ATA', ..., 0.0, 'Up', 0],
       ...,
       [57, 'M', 'ASY', ..., 1.2, 'Flat', 1],
       [57, 'F', 'ATA', ..., 0.0, 'Flat', 1],
       [38, 'M', 'NAP', ..., 0.0, 'Up', 0]], dtype=object)

In [72]:
# Masukkan data pada kolom terakhir dan masukkan ke dalam variabel label
label = df.iloc[:, -1]

In [73]:
# Rubah data ke array
label = label.values
label

array([0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,

In [74]:
# Bagi data menjadi data training dan data testing
data_train, data_test, label_train, label_test = train_test_split(data, label, test_size=0.2, random_state=42)

print('Ukuran data latih:', data_train.shape)
print('Ukuran data uji:', data_test.shape)

Ukuran data latih: (734, 12)
Ukuran data uji: (184, 12)


### 6. Cross Validation

In [75]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

In [77]:
# membuat model Decision Tree
dt = DecisionTreeClassifier()

# melakukan 5-fold cross validation
scores = cross_val_score(dt, data, label, cv=5)

# menampilkan hasil cross validation
print('Hasil cross Validation:', scores)

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/tree/_classes.py", line 1024, in fit
    super()._fit(
  File "/usr/local/lib/python3.11/dist-packages/sklearn/tree/_classes.py", line 252, in _fit
    X, y = validate_data(
           ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/validation.py", line 2956, in validate_data
    X = check_array(X, input_name="X", **check_X_params)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/validation.py", line 1055, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/_array_api.py", line 839, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'F'

--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/tree/_classes.py", line 1024, in fit
    super()._fit(
  File "/usr/local/lib/python3.11/dist-packages/sklearn/tree/_classes.py", line 252, in _fit
    X, y = validate_data(
           ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/validation.py", line 2956, in validate_data
    X = check_array(X, input_name="X", **check_X_params)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/validation.py", line 1055, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/utils/_array_api.py", line 839, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'M'


## B. TUGAS PRAKTIKUM

### 1. Carilah Data pada salah satu sumber data dibawah ini

In [88]:
# Memilih Sumber dari Open Data Pemerintah Jawa Barat
import pandas as pd

# Contoh: data jumlah penduduk per kecamatan
df = pd.read_csv('DataJabar.csv');
df

Unnamed: 0,id,kode_provinsi,nama_provinsi,kode_kabupaten_kota,nama_kabupaten_kota,tempat_kejadian,jumlah_kekerasan,satuan,tahun
0,1,32,JAWA BARAT,3201,KABUPATEN BOGOR,RUMAH TANGGA,3,KASUS,2018
1,2,32,JAWA BARAT,3201,KABUPATEN BOGOR,TEMPAT KERJA,0,KASUS,2018
2,3,32,JAWA BARAT,3201,KABUPATEN BOGOR,LAINNYA,2,KASUS,2018
3,4,32,JAWA BARAT,3201,KABUPATEN BOGOR,SEKOLAH,1,KASUS,2018
4,5,32,JAWA BARAT,3201,KABUPATEN BOGOR,FASILITAS UMUM,0,KASUS,2018
...,...,...,...,...,...,...,...,...,...
1129,1130,32,JAWA BARAT,3279,KOTA BANJAR,TEMPAT KERJA,0,KASUS,2024
1130,1131,32,JAWA BARAT,3279,KOTA BANJAR,LAINNYA,18,KASUS,2024
1131,1132,32,JAWA BARAT,3279,KOTA BANJAR,SEKOLAH,0,KASUS,2024
1132,1133,32,JAWA BARAT,3279,KOTA BANJAR,FASILITAS UMUM,0,KASUS,2024


### 2. Kemudian implementasikan teknik Data Preparation yang telah kalian pelajari dalam data tersebut


In [98]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Pilih kolom kategorikal yang akan di-encode
categorical_cols = ['tempat_kejadian', 'nama_kabupaten_kota', 'tahun']

# Buat encoder dengan output array (bukan sparse matrix)
encoder = OneHotEncoder(sparse_output=False, drop='first')  # drop='first' supaya tidak terjadi multikolinearitas

# Fit dan transform data pada ketiga kolom sekaligus
encoded = encoder.fit_transform(df[categorical_cols])

# Buat DataFrame hasil encoding dengan nama kolom yang jelas
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names_out(categorical_cols))

# Gabungkan dengan data asli yang sudah di-drop kolom asli yang di-encode
df_encoded = pd.concat([df.drop(columns=categorical_cols), encoded_df], axis=1)

# Tampilkan hasil encoding
print(df_encoded.head())


   id  kode_provinsi nama_provinsi  kode_kabupaten_kota  jumlah_kekerasan  \
0   1             32    JAWA BARAT                 3201                 3   
1   2             32    JAWA BARAT                 3201                 0   
2   3             32    JAWA BARAT                 3201                 2   
3   4             32    JAWA BARAT                 3201                 1   
4   5             32    JAWA BARAT                 3201                 0   

  satuan  tempat_kejadian_LAINNYA  tempat_kejadian_LEMBAGA PENDIDIKAN KILAT  \
0  KASUS                      0.0                                       0.0   
1  KASUS                      0.0                                       0.0   
2  KASUS                      1.0                                       0.0   
3  KASUS                      0.0                                       0.0   
4  KASUS                      0.0                                       0.0   

   tempat_kejadian_RUMAH TANGGA  tempat_kejadian_SEKOLAH  ... 

In [99]:
import numpy as np

# Kolom numerik yang dicek outlier-nya
col = 'jumlah_kekerasan'

Q1 = df_encoded[col].quantile(0.25)
Q3 = df_encoded[col].quantile(0.75)
IQR = Q3 - Q1

# Filter data yang bukan outlier
df_no_outlier = df_encoded[(df_encoded[col] >= (Q1 - 1.5 * IQR)) & (df_encoded[col] <= (Q3 + 1.5 * IQR))]

print(f'Sebelum removal: {df_encoded.shape[0]} baris')
print(f'Sesudah removal: {df_no_outlier.shape[0]} baris')

df_no_outlier.head()


Sebelum removal: 1134 baris
Sesudah removal: 986 baris


Unnamed: 0,id,kode_provinsi,nama_provinsi,kode_kabupaten_kota,jumlah_kekerasan,satuan,tempat_kejadian_LAINNYA,tempat_kejadian_LEMBAGA PENDIDIKAN KILAT,tempat_kejadian_RUMAH TANGGA,tempat_kejadian_SEKOLAH,...,nama_kabupaten_kota_KOTA CIREBON,nama_kabupaten_kota_KOTA DEPOK,nama_kabupaten_kota_KOTA SUKABUMI,nama_kabupaten_kota_KOTA TASIKMALAYA,tahun_2019,tahun_2020,tahun_2021,tahun_2022,tahun_2023,tahun_2024
0,1,32,JAWA BARAT,3201,3,KASUS,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,32,JAWA BARAT,3201,0,KASUS,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,32,JAWA BARAT,3201,2,KASUS,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,32,JAWA BARAT,3201,1,KASUS,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,32,JAWA BARAT,3201,0,KASUS,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [100]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Pilih kolom numerik yang ingin dinormalisasi
cols_to_normalize = ['jumlah_kekerasan']

# Terapkan scaler ke data
df_no_outlier[cols_to_normalize] = scaler.fit_transform(df_no_outlier[cols_to_normalize])

# Cek hasilnya
print(df_no_outlier[cols_to_normalize].describe())


       jumlah_kekerasan
count        986.000000
mean           0.145081
std            0.224352
min            0.000000
25%            0.000000
50%            0.050000
75%            0.200000
max            1.000000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_no_outlier[cols_to_normalize] = scaler.fit_transform(df_no_outlier[cols_to_normalize])


In [101]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

cols_to_standardize = ['jumlah_kekerasan']

df_no_outlier[cols_to_standardize] = scaler.fit_transform(df_no_outlier[cols_to_standardize])

print(df_no_outlier[cols_to_standardize].describe())


       jumlah_kekerasan
count      9.860000e+02
mean      -5.765053e-17
std        1.000507e+00
min       -6.469967e-01
25%       -6.469967e-01
50%       -4.240192e-01
75%        2.449135e-01
max        3.812554e+00


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_no_outlier[cols_to_standardize] = scaler.fit_transform(df_no_outlier[cols_to_standardize])


In [102]:
from sklearn.model_selection import train_test_split

# Misal targetnya 'jumlah_kekerasan'
target_col = 'jumlah_kekerasan'

# Fitur (semua kolom kecuali target)
X = df_no_outlier.drop(columns=[target_col])

# Label / target
y = df_no_outlier[target_col]

# Split data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f'Training set size: {X_train.shape[0]}')
print(f'Testing set size: {X_test.shape[0]}')


Training set size: 788
Testing set size: 198
