# Praktikum: Wisconsin Breast Cancer Dataset  
Tugas:
1. Pisahkan variabel yang dapat digunakan dan tidak dapat digunakan.  
2. Lakukan encoding pada kolom `diagnosis`.  
3. Lakukan standardisasi pada seluruh kolom numerik.  


In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load dataset dari file CSV
df = pd.read_csv('wbc.csv')

# Tampilkan informasi awal dataset
print(f"Jumlah data: {df.shape[0]} baris, {df.shape[1]} kolom")
print(f"\nKolom-kolom dataset:")
print(df.columns.tolist())
print(f"\nTipe data tiap kolom:")
print(df.dtypes)
df.head()

Jumlah data: 569 baris, 33 kolom

Kolom-kolom dataset:
['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se', 'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se', 'fractal_dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32']

Tipe data tiap kolom:
id                           int64
diagnosis                   object
radius_mean                float64
texture_mean               float64
perimeter_mean             float64
area_mean                  float64
smoothness_mean            float64
compactness_mean           float64
concavity_mean             float64
concave points_mean        

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [2]:
# ============================================================
# TUGAS 1: Pisahkan variabel yang dapat dan tidak dapat digunakan
# ============================================================

# Variabel yang TIDAK dapat digunakan untuk analisis:
# - 'id': hanya identifier, tidak memiliki informasi prediktif
# - 'Unnamed: 32' (jika ada): kolom kosong dari trailing comma di CSV

unused_columns = ['id']
# Cek apakah ada kolom kosong/unnamed
for col in df.columns:
    if 'Unnamed' in col or df[col].isna().all():
        unused_columns.append(col)

# Variabel yang DAPAT digunakan:
# - 'diagnosis': variabel target (M/B)
# - Semua kolom numerik lainnya: fitur untuk prediksi

usable_columns = [col for col in df.columns if col not in unused_columns]

print("=" * 60)
print("VARIABEL YANG TIDAK DAPAT DIGUNAKAN:")
print("=" * 60)
for col in unused_columns:
    print(f"  - '{col}': tidak memiliki informasi prediktif")

print("\n" + "=" * 60)
print("VARIABEL YANG DAPAT DIGUNAKAN:")
print("=" * 60)
print(f"  - 'diagnosis': variabel target (M=Malignant, B=Benign)")
print(f"  - {len(usable_columns) - 1} fitur numerik untuk prediksi")
print(f"\nDaftar fitur yang dapat digunakan:")
print(usable_columns)

VARIABEL YANG TIDAK DAPAT DIGUNAKAN:
  - 'id': tidak memiliki informasi prediktif
  - 'Unnamed: 32': tidak memiliki informasi prediktif

VARIABEL YANG DAPAT DIGUNAKAN:
  - 'diagnosis': variabel target (M=Malignant, B=Benign)
  - 30 fitur numerik untuk prediksi

Daftar fitur yang dapat digunakan:
['diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se', 'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se', 'fractal_dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst']


In [3]:
# ============================================================
# TUGAS 2: Encoding kolom 'diagnosis'
# ============================================================
# Kolom diagnosis berisi nilai kategorikal: 'M' (Malignant) dan 'B' (Benign)
# Kita akan mengubahnya menjadi nilai numerik menggunakan LabelEncoder

encoder = LabelEncoder()
df['diagnosis_encoded'] = encoder.fit_transform(df['diagnosis'])

# Tampilkan mapping encoding
print("=" * 60)
print("ENCODING KOLOM 'DIAGNOSIS'")
print("=" * 60)
print(f"Nilai asli: {df['diagnosis'].unique()}")
print(f"Nilai setelah encoding: {df['diagnosis_encoded'].unique()}")
print(f"\nMapping:")
for i, label in enumerate(encoder.classes_):
    print(f"  '{label}' -> {i}")

# Tampilkan distribusi
print(f"\nDistribusi diagnosis:")
print(df['diagnosis'].value_counts())

# Tampilkan contoh hasil encoding
print("\n" + "=" * 60)
print("CONTOH HASIL ENCODING:")
print("=" * 60)
df[['diagnosis', 'diagnosis_encoded']].head(10)

ENCODING KOLOM 'DIAGNOSIS'
Nilai asli: ['M' 'B']
Nilai setelah encoding: [1 0]

Mapping:
  'B' -> 0
  'M' -> 1

Distribusi diagnosis:
diagnosis
B    357
M    212
Name: count, dtype: int64

CONTOH HASIL ENCODING:


Unnamed: 0,diagnosis,diagnosis_encoded
0,M,1
1,M,1
2,M,1
3,M,1
4,M,1
5,M,1
6,M,1
7,M,1
8,M,1
9,M,1


In [4]:
# ============================================================
# TUGAS 3: Standardisasi kolom numerik
# ============================================================
# Standardisasi menggunakan StandardScaler: z = (x - mean) / std
# Hasil: mean = 0, std = 1

scaler = StandardScaler()

# Pilih hanya kolom numerik (kecuali id dan diagnosis_encoded)
numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
# Hapus kolom yang tidak perlu di-standardisasi
cols_to_exclude = ['id', 'diagnosis_encoded']
numeric_cols = [col for col in numeric_cols if col not in cols_to_exclude]

print("=" * 60)
print("STANDARDISASI KOLOM NUMERIK")
print("=" * 60)
print(f"Jumlah kolom yang akan di-standardisasi: {len(numeric_cols)}")
print(f"\nKolom-kolom numerik:")
for i, col in enumerate(numeric_cols, 1):
    print(f"  {i:2d}. {col}")

# Buat dataframe hasil standardisasi
df_standardized = df.copy()
df_standardized[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# Tampilkan statistik sebelum dan sesudah standardisasi
print("\n" + "=" * 60)
print("PERBANDINGAN SEBELUM & SESUDAH STANDARDISASI (5 kolom pertama):")
print("=" * 60)
sample_cols = numeric_cols[:5]

print("\nSebelum standardisasi:")
print(df[sample_cols].describe().loc[['mean', 'std']].round(4))

print("\nSesudah standardisasi:")
print(df_standardized[sample_cols].describe().loc[['mean', 'std']].round(4))

# Tampilkan hasil akhir
print("\n" + "=" * 60)
print("DATAFRAME HASIL STANDARDISASI:")
print("=" * 60)
df_standardized.head()

STANDARDISASI KOLOM NUMERIK
Jumlah kolom yang akan di-standardisasi: 31

Kolom-kolom numerik:
   1. radius_mean
   2. texture_mean
   3. perimeter_mean
   4. area_mean
   5. smoothness_mean
   6. compactness_mean
   7. concavity_mean
   8. concave points_mean
   9. symmetry_mean
  10. fractal_dimension_mean
  11. radius_se
  12. texture_se
  13. perimeter_se
  14. area_se
  15. smoothness_se
  16. compactness_se
  17. concavity_se
  18. concave points_se
  19. symmetry_se
  20. fractal_dimension_se
  21. radius_worst
  22. texture_worst
  23. perimeter_worst
  24. area_worst
  25. smoothness_worst
  26. compactness_worst
  27. concavity_worst
  28. concave points_worst
  29. symmetry_worst
  30. fractal_dimension_worst
  31. Unnamed: 32

PERBANDINGAN SEBELUM & SESUDAH STANDARDISASI (5 kolom pertama):

Sebelum standardisasi:
      radius_mean  texture_mean  perimeter_mean  area_mean  smoothness_mean
mean      14.1273       19.2896          91.969   654.8891           0.0964
std        3

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32,diagnosis_encoded
0,842302,M,1.097064,-2.073335,1.269934,0.984375,1.568466,3.283515,2.652874,2.532475,...,2.303601,2.001237,1.307686,2.616665,2.109526,2.296076,2.750622,1.937015,,1
1,842517,M,1.829821,-0.353632,1.685955,1.908708,-0.826962,-0.487072,-0.023846,0.548144,...,1.535126,1.890489,-0.375612,-0.430444,-0.146749,1.087084,-0.24389,0.28119,,1
2,84300903,M,1.579888,0.456187,1.566503,1.558884,0.94221,1.052926,1.363478,2.037231,...,1.347475,1.456285,0.527407,1.082932,0.854974,1.955,1.152255,0.201391,,1
3,84348301,M,-0.768909,0.253732,-0.592687,-0.764464,3.283553,3.402909,1.915897,1.451707,...,-0.249939,-0.550021,3.394275,3.893397,1.989588,2.175786,6.046041,4.93501,,1
4,84358402,M,1.750297,-1.151816,1.776573,1.826229,0.280372,0.53934,1.371011,1.428493,...,1.338539,1.220724,0.220556,-0.313395,0.613179,0.729259,-0.868353,-0.3971,,1
