In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier,export_text, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import recall_score, precision_score, f1_score, classification_report,confusion_matrix

from ucimlrepo import fetch_ucirepo

In [2]:
bcw = fetch_ucirepo(id=17)

In [3]:
bcw_X = bcw.data.features
bcw_y = bcw.data.targets
print(bcw_y)

    Diagnosis
0           M
1           M
2           M
3           M
4           M
..        ...
564         M
565         M
566         M
567         M
568         B

[569 rows x 1 columns]


In [4]:
X=bcw_X.iloc[:, 2:].values
y=bcw_y.iloc[:, 0].values

In [5]:
le = LabelEncoder()
y = le.fit_transform(y)

In [6]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [7]:
pca = PCA(n_components=2)
principal_components = pca.fit_transform(X_scaled)

X_pca1 = principal_components[:, 0].reshape(-1, 1)

X_pca2 = principal_components[:, :2]

X_original = X_scaled

In [8]:
X_train_original, X_test_original, y_train_original, y_test_original = train_test_split(
    X_original, y, test_size=0.2, random_state=42
)

# 第一主成分
X_train_pca1, X_test_pca1, y_train_pca1, y_test_pca1 = train_test_split(
    X_pca1, y, test_size=0.2, random_state=42
)

# 第一和第二主成分
X_train_pca2, X_test_pca2, y_train_pca2, y_test_pca2 = train_test_split(
    X_pca2, y, test_size=0.2, random_state=42
)

In [9]:
data = {
    'Data Component': [],
    'F1 Score': [],
    'Precision': [],
    'Recall': [],
    'False Positives (FP)': [],
    'True Positives (TP)': [],
    'False Positive Rate (FPR)': [],
    'True Positive Rate (TPR)': []
}


**original data**

In [10]:
clf_original = DecisionTreeClassifier(min_samples_leaf=2,min_samples_split=5,max_depth=2,criterion='gini',random_state=42)
clf_original.fit(X_train_original, y_train_original)

y_pred_original = clf_original.predict(X_test_original)

f1_original = f1_score(y_test_original, y_pred_original)
precision_original = precision_score(y_test_original, y_pred_original)
recall_original = recall_score(y_test_original, y_pred_original)

cm_original = confusion_matrix(y_test_original, y_pred_original)
tn_original, fp_original, fn_original, tp_original = cm_original.ravel()
fpr_original = fp_original / (fp_original + tn_original)
tpr_original = tp_original / (tp_original + fn_original)

data['Data Component'].append('Original Data')
data['F1 Score'].append(f1_original)
data['Precision'].append(precision_original)
data['Recall'].append(recall_original)
data['False Positives (FP)'].append(fp_original)
data['True Positives (TP)'].append(tp_original)
data['False Positive Rate (FPR)'].append(fpr_original)
data['True Positive Rate (TPR)'].append(tpr_original)

**Using only the first principal component of the data**

In [11]:
clf_pca1 = DecisionTreeClassifier(min_samples_leaf=2,min_samples_split=5,max_depth=2,criterion='gini',random_state=42)
clf_pca1.fit(X_train_pca1, y_train_pca1)

y_pred_pca1 = clf_pca1.predict(X_test_pca1)

f1_pca1 = f1_score(y_test_pca1, y_pred_pca1)
precision_pca1 = precision_score(y_test_pca1, y_pred_pca1)
recall_pca1 = recall_score(y_test_pca1, y_pred_pca1)

cm_pca1 = confusion_matrix(y_test_pca1, y_pred_pca1)
tn_pca1, fp_pca1, fn_pca1, tp_pca1 = cm_pca1.ravel()
fpr_pca1 = fp_pca1 / (fp_pca1 + tn_pca1)
tpr_pca1 = tp_pca1 / (tp_pca1 + fn_pca1)

data['Data Component'].append('First Principal Component')
data['F1 Score'].append(f1_pca1)
data['Precision'].append(precision_pca1)
data['Recall'].append(recall_pca1)
data['False Positives (FP)'].append(fp_pca1)
data['True Positives (TP)'].append(tp_pca1)
data['False Positive Rate (FPR)'].append(fpr_pca1)
data['True Positive Rate (TPR)'].append(tpr_pca1)

**Using the first and second principal components**

In [12]:
clf_pca2 = DecisionTreeClassifier(min_samples_leaf=2,min_samples_split=5,max_depth=2,criterion='gini',random_state=42)
clf_pca2.fit(X_train_pca2, y_train_pca2)

y_pred_pca2 = clf_pca2.predict(X_test_pca2)

f1_pca2 = f1_score(y_test_pca2, y_pred_pca2)
precision_pca2 = precision_score(y_test_pca2, y_pred_pca2)
recall_pca2 = recall_score(y_test_pca2, y_pred_pca2)

tn_pca2, fp_pca2, fn_pca2, tp_pca2 = confusion_matrix(y_test_pca2, y_pred_pca2).ravel()

fpr_pca2 = fp_pca2 / (fp_pca2 + tn_pca2)
tpr_pca2 = tp_pca2 / (tp_pca2 + fn_pca2)

data['Data Component'].append('First and Second Principal Components')
data['F1 Score'].append(f1_pca2)
data['Precision'].append(precision_pca2)
data['Recall'].append(recall_pca2)
data['False Positives (FP)'].append(fp_pca2)
data['True Positives (TP)'].append(tp_pca2)
data['False Positive Rate (FPR)'].append(fpr_pca2)
data['True Positive Rate (TPR)'].append(tpr_pca2)

In [13]:
data_df = pd.DataFrame(data)
print(data_df)

                          Data Component  F1 Score  Precision    Recall  \
0                          Original Data  0.902439   0.948718  0.860465   
1              First Principal Component  0.930233   0.930233  0.930233   
2  First and Second Principal Components  0.891566   0.925000  0.860465   

   False Positives (FP)  True Positives (TP)  False Positive Rate (FPR)  \
0                     2                   37                   0.028169   
1                     3                   40                   0.042254   
2                     3                   37                   0.042254   

   True Positive Rate (TPR)  
0                  0.860465  
1                  0.930233  
2                  0.860465  
