In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score
from IPython.display import display

# Load the data from the UCI repository URL.
# The file has no header row, so the column names are supplied explicitly.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
column_names = ["Sample code number", "Clump Thickness", "Uniformity of Cell Size", "Uniformity of Cell Shape",
                "Marginal Adhesion", "Single Epithelial Cell Size", "Bare Nuclei", "Bland Chromatin",
                "Normal Nucleoli", "Mitoses", "Class"]

# Pre-processing:
#   * '?' is the file's placeholder for a missing value. Declaring it via
#     `na_values` lets pandas parse it as NaN, so the affected column is read
#     as numeric instead of being left as an object column of strings (which
#     is what replacing '?' after the fact produces).
#   * Rows containing any missing value are dropped.
#   * Columns that held NaN were parsed as float; casting back to int64 gives
#     every column a real integer dtype. The cast raises if a column turns out
#     not to be integer-valued, which is preferable to silently training on
#     mis-typed data.
# Building `df` in a single chain (no inplace mutation) keeps the cell
# idempotent on re-run.
df = (
    pd.read_csv(url, names=column_names, na_values="?")
    .dropna()
    .astype("int64")
)

# Preview the first rows (note: this is the frame after the rows with
# missing values have been removed).
print("Data Mentah:")
display(df.head(10))

Data Mentah:


Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
5,1017122,8,10,10,8,7,10,9,7,1,4
6,1018099,1,1,1,1,2,10,3,1,1,2
7,1018561,2,1,2,1,2,1,3,1,1,2
8,1033078,2,1,1,1,2,1,1,1,5,2
9,1033078,4,2,1,1,2,1,2,1,1,2


In [None]:
# Separate the features (X) from the label (y).
# "Sample code number" is a sample identifier rather than a measurement, so it
# is excluded from the features together with the label column; leaving it in
# would let the model fit on an arbitrary ID.
X = df.drop(columns=["Sample code number", "Class"])
y = df["Class"].map({2: 0, 4: 1})  # recode the labels 2 -> 0 and 4 -> 1

# Show basic information about the dataset: row count, feature count and
# number of distinct class labels.
display(pd.DataFrame({
    "Info Dataset": ["Jumlah data", "Jumlah fitur", "Jumlah kelas"],
    "Nilai": [len(df), X.shape[1], df["Class"].nunique()]
}).set_index("Info Dataset"))


Unnamed: 0_level_0,Nilai
Info Dataset,Unnamed: 1_level_1
Jumlah data,683
Jumlah fitur,10
Jumlah kelas,2


In [None]:
# Data split: hold out 20% of the rows for testing; the fixed random_state
# makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Summarise how many rows ended up in each partition.
split_summary = pd.DataFrame(
    {"Nilai": [len(X_train), len(X_test)]},
    index=pd.Index(
        ["Jumlah data pelatihan", "Jumlah data pengujian"],
        name="Info Pembagian Data",
    ),
)
display(split_summary)

Unnamed: 0_level_0,Nilai
Info Pembagian Data,Unnamed: 1_level_1
Jumlah data pelatihan,546
Jumlah data pengujian,137


In [None]:
# Modelling with Random Forest
print("\nPemodelan dengan Random Forest:")
# Create the classifier (example hyper-parameters) and fit it on the training
# split; fit() returns the estimator itself, so the call can be chained.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)
print("Model Random Forest berhasil dilatih.")

# Model evaluation
print("\nEvaluasi Model:")
# Predict the labels of the held-out test rows
y_pred = rf_model.predict(X_test)

# Accuracy and precision on the test split. precision_score defaults to
# average='binary', so it does not need to be passed for this two-class task.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)

# Present both metrics in a small table indexed by metric name.
metrics_table = pd.DataFrame(
    {"Nilai": [accuracy, precision]},
    index=pd.Index(["Accuracy", "Precision"], name="Metrik Evaluasi"),
)
display(metrics_table)


Pemodelan dengan Random Forest:
Model Random Forest berhasil dilatih.

Evaluasi Model:


Unnamed: 0_level_0,Nilai
Metrik Evaluasi,Unnamed: 1_level_1
Accuracy,0.956204
Precision,0.981481
