In [None]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the breast cancer dataset that ships with scikit-learn
data = load_breast_cancer()

In [None]:
# Build a pandas DataFrame with one column per feature, then append the
# class label as a trailing 'target' column.
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)

In [None]:
# Show the first five rows of the dataset
print("Lima baris pertama dataset:\n")
display(df.head())

In [None]:
# Print pandas' structural summary of the dataset (df.info())
print("Informasi Dataset:")
df.info()

In [None]:
# Show the descriptive statistics of the dataset (df.describe())
print("\nStatistik Dataset:")
display(df.describe())

In [None]:
# Plot how many samples fall into each target class.
# In scikit-learn's breast cancer dataset, 0 = malignant ("ganas") and
# 1 = benign ("jinak"); the axis label spells this out so the figure can be
# read on its own.
plt.figure(figsize=(8, 6))
sns.countplot(x='target', data=df)
plt.title("Distribusi Variabel Target")
plt.xlabel("Target (0 = ganas, 1 = jinak)")
plt.ylabel("Jumlah sampel")
plt.show()

In [None]:
# Pairwise scatter plots (histograms on the diagonal), coloured by class.
# This is a pair plot, not a correlation matrix.
# Only the first five feature columns are plotted: a pair plot over every
# feature (30 for this dataset, per the scikit-learn docs) would build a
# 30x30 grid of axes, which is very slow to render and unreadable.
pairplot_features = list(df.columns[:5])
sns.pairplot(df, vars=pairplot_features, hue='target', diag_kind='hist')
plt.show()

In [None]:
# Heatmap of the pairwise correlation matrix over every column of df
# (the 'target' column is included).
correlation_matrix = df.corr()
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, cmap='Blues')
plt.show()

In [None]:
# Separate the label from the predictors, then hold out 30% of the rows for
# testing (fixed seed so the split is reproducible).
y = df['target']
X = df.drop(columns='target')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Standardize the features with StandardScaler. The scaler is fitted on the
# training split only and then applied to both splits, so the test set does
# not influence the fitted statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Train a logistic regression classifier on the standardized training features.
# NOTE(review): the original comment called this a "self training" model, but
# the estimator fitted here is a plain LogisticRegression.
model1 = LogisticRegression()
model1.fit(X_train_scaled, y_train)

In [None]:
# Train a random forest (100 trees, fixed seed) on the standardized training features
model2 = RandomForestClassifier(n_estimators=100, random_state=42)
model2.fit(X_train_scaled, y_train)

In [None]:
# Train a decision tree (fixed seed) on the unscaled training features
model3 = DecisionTreeClassifier(random_state=42)
model3.fit(X_train, y_train)

In [None]:
# Evaluate the three models on the test set.
# model1 and model2 were fitted on standardized features, so they predict on
# the standardized test set; model3 was fitted on the raw features.
y_pred1 = model1.predict(X_test_scaled)
y_pred2 = model2.predict(X_test_scaled)
y_pred3 = model3.predict(X_test)

accuracy1 = accuracy_score(y_test, y_pred1)
accuracy2 = accuracy_score(y_test, y_pred2)
accuracy3 = accuracy_score(y_test, y_pred3)

# model1 is a LogisticRegression, so it is reported as such rather than as
# "self training" (the previous label, which did not match the estimator).
for model_label, model_accuracy in (
    ("regresi logistik", accuracy1),
    ("random forest", accuracy2),
    ("decision tree", accuracy3),
):
    print(f"Akurasi model {model_label}: {model_accuracy}")

In [None]:
# Summarise each model's classification report as a bar chart of F1 scores.
report1 = classification_report(y_test, y_pred1, output_dict=True)
report2 = classification_report(y_test, y_pred2, output_dict=True)
report3 = classification_report(y_test, y_pred3, output_dict=True)


def _report_to_frame(report):
    """Convert a classification_report dict into a metrics table.

    The dict is loaded into a DataFrame and transposed so each top-level
    report key becomes a row. The 'support' column and the 'accuracy' row
    are dropped. A new frame is returned; nothing is modified in place, so
    the cell can be re-run safely.
    """
    return (
        pd.DataFrame(report)
        .transpose()
        .drop(columns='support')
        .drop(index='accuracy')
    )


df_report1 = _report_to_frame(report1)
df_report2 = _report_to_frame(report2)
df_report3 = _report_to_frame(report3)

fig, axs = plt.subplots(1, 3, figsize=(15, 5))
fig.suptitle("Classification Report")

# Title every panel so the reader can tell which model it belongs to
# (model1 is a LogisticRegression, model2 a random forest, model3 a tree).
for ax, model_label, report_frame in zip(
    axs,
    ("Regresi Logistik", "Random Forest", "Decision Tree"),
    (df_report1, df_report2, df_report3),
):
    sns.barplot(x=report_frame.index, y=report_frame['f1-score'], ax=ax)
    ax.set_title(model_label)

plt.show()

In [None]:
# Plot the confusion matrix of each model side by side.
cm1 = confusion_matrix(y_test, y_pred1)
cm2 = confusion_matrix(y_test, y_pred2)
cm3 = confusion_matrix(y_test, y_pred3)

cm_fig, cm_axes = plt.subplots(1, 3, figsize=(15, 5))

# model1 is a LogisticRegression, so its panel is titled accordingly (the
# previous "Self Training" title did not match the estimator).
for cm_ax, model_label, matrix in zip(
    cm_axes,
    ("Regresi Logistik", "Random Forest", "Decision Tree"),
    (cm1, cm2, cm3),
):
    # Confusion-matrix cells are integer counts, hence the 'd' format.
    sns.heatmap(matrix, annot=True, cmap='Blues', fmt='d', ax=cm_ax)
    cm_ax.set_title(f"Confusion Matrix {model_label}")
    cm_ax.set_xlabel("Label Prediksi")
    cm_ax.set_ylabel("Label Sebenarnya")

# Keep the axis labels of neighbouring panels from overlapping.
cm_fig.tight_layout()
plt.show()

In [None]:
# Rank the random forest's feature importances (labelled by feature name)
# and chart the ten largest as horizontal bars.
feature_importance = pd.Series(model2.feature_importances_, index=X.columns)
top_ten_importance = feature_importance.nlargest(10)
top_ten_importance.plot.barh()
plt.title("10 Fitur Terpenting Random Forest")
plt.show()

In [None]:
# Draw the fitted decision tree.
# feature_names is passed as a plain list: some scikit-learn releases validate
# this parameter strictly and reject a pandas Index such as X.columns, while a
# list is accepted by every release.
# class_names are given in ascending class order: 0 -> 'ganas' (malignant),
# 1 -> 'jinak' (benign).
plt.figure(figsize=(20, 10))
plot_tree(
    model3,
    feature_names=list(X.columns),
    class_names=['ganas', 'jinak'],
    filled=True,
)
plt.show()