<a href="https://colab.research.google.com/github/JunetzMasihBelajar/2025_AI_TI1B/blob/main/Jobsheet7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Jobsheet 7: Data Pre-processing

## Latihan


### Latihan 1: Data Preparation dengan One Hot Encoding

In [7]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# membuat data array gender
gender = ['perempuan', 'laki-laki', 'laki-laki', 'perempuan', 'perempuan']

# membuat data frame dari data gender
df = pd.DataFrame({'gender': gender})

# membuat instance dari OneHotEncoder
encoder = OneHotEncoder()

# melakukan encoding pada data frame
encoded_df = pd.DataFrame(
    encoder.fit_transform(df[['gender']]).toarray(),
    columns=encoder.get_feature_names_out(['gender'])
)

# menggabungkan data frame yang sudah di-encode dengan data frame awal
df_encoded = pd.concat([df, encoded_df], axis=1)

# menampilkan hasil
print(df_encoded)


      gender  gender_laki-laki  gender_perempuan
0  perempuan               0.0               1.0
1  laki-laki               1.0               0.0
2  laki-laki               1.0               0.0
3  perempuan               0.0               1.0
4  perempuan               0.0               1.0


### Latihan 2: Data Preparation dengan Outlier Removal

In [8]:
import pandas as pd
import numpy as np

# membuat data frame contoh
df = pd.DataFrame({
    'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'B': [15, 20, 25, 30, 35, 40, 45, 50, 55, 500]
})

# mencari nilai z-score untuk setiap data point pada kolom B
z_scores = np.abs((df['B'] - df['B'].mean()) / df['B'].std())

# menghilangkan data point yang memiliki z-score lebih besar dari 3
df_clean = df.loc[round(z_scores) < 3]

# menampilkan data frame yang telah di-clean
print(df_clean)




   A   B
0  1  15
1  2  20
2  3  25
3  4  30
4  5  35
5  6  40
6  7  45
7  8  50
8  9  55


### Latihan 3: Data Preparation dengan Normalization

In [9]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# membuat data frame contoh
df = pd.DataFrame({
    'Nama': ['Andi', 'Budi', 'Cindy', 'Diana', 'Eka', 'lala'],
    'Gaji': [3000000, 5000000, 7000000, 9000000, 11000000, 9000000],
    'Umur': [25, 30, 35, 40, 45, 30]
})

# normalisasi data gaji dan umur menggunakan MinMaxScaler
scaler = MinMaxScaler()
df[['Gaji', 'Umur']] = scaler.fit_transform(df[['Gaji', 'Umur']])

# menampilkan data frame hasil normalisasi
df

Unnamed: 0,Nama,Gaji,Umur
0,Andi,0.0,0.0
1,Budi,0.25,0.25
2,Cindy,0.5,0.5
3,Diana,0.75,0.75
4,Eka,1.0,1.0
5,lala,0.75,0.25


### Latihan 4: Data Preparation dengan Standarization

In [10]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# membuat data frame contoh
df = pd.DataFrame({
    'Nama': ['Andi', 'Budi', 'Cindy', 'Diana', 'Eka', 'lala'],
    'Gaji': [3000000, 5000000, 7000000, 9000000, 11000000, 9000000],
    'Umur': [25, 30, 35, 40, 45, 30]
})

# normalisasi data gaji dan umur menggunakan MinMaxScaler
scaler = StandardScaler()
df[['Gaji', 'Umur']] = scaler.fit_transform(df[['Gaji', 'Umur']])

# menampilkan data frame hasil normalisasi
df

Unnamed: 0,Nama,Gaji,Umur
0,Andi,-1.612452,-1.364382
1,Budi,-0.868243,-0.620174
2,Cindy,-0.124035,0.124035
3,Diana,0.620174,0.868243
4,Eka,1.364382,1.612452
5,lala,0.620174,-0.620174


### Latihan 5: Pembuatan DataSet

In [11]:
from sklearn.model_selection import train_test_split
import pandas as pd

# membuat data frame contoh
df = pd.DataFrame({
    'X': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Y': [15, 20, 25, 30, 35, 40, 45, 50, 55, 60]
})

# membagi data menjadi data latih dan data uji dengan rasio 80:20
X_train, X_test, y_train, y_test = train_test_split(df[['X']], df[['Y']], test_size=0.2, random_state=42)

# menampilkan data latih
print('Data Latih')
print('X_train:', X_train)
print('y_train:', y_train)

# menampilkan data uji
print('\nData Uji')
print('X_test:', X_test)
print('y_test:', y_test)


Data Latih
X_train:     X
5   6
0   1
7   8
2   3
9  10
4   5
3   4
6   7
y_train:     Y
5  40
0  15
7  50
2  25
9  60
4  35
3  30
6  45

Data Uji
X_test:    X
8  9
1  2
y_test:     Y
8  55
1  20


### Cross Validation


In [14]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Import data heart.csv
df = pd.read_csv('heart.csv');
df

# ambil 13 data dan masukan kedalam variabel data
data = df.iloc[:, :13]
data

# Rubah data ke array
data = data.values
data

# Masukan data pada kolom terakhir dan masukan kedalam variabel label
label = df.iloc[:, -1]
label

# Rubah data ke array
label = label.values
label

# Bagi data menjadi data training dan data testing
data_train, data_test, label_train, label_test = train_test_split(data, label, test_size=0.2, random_state=42)

print('Ukuran data latih:', data_train.shape)
print('Ukuran data uji:', data_test.shape)

Ukuran data latih: (734, 12)
Ukuran data uji: (184, 12)
