In [None]:
# Dataset visualization
import pandas as pd
from sklearn.datasets import load_breast_cancer

# Load the Breast Cancer dataset
data = load_breast_cancer()
X = data.data
y = data.target
feature_names = data.feature_names

# Create a DataFrame to display the dataset
df = pd.DataFrame(X, columns=feature_names)
df['target'] = y

# Display the feature names and a sample of the dataset
print("Features in the Breast Cancer dataset:")
for feature in feature_names:
    print(feature)

print("\nSample of the Breast Cancer dataset:")
#print(df.head())
print(df)

Features in the Breast Cancer dataset:
mean radius
mean texture
mean perimeter
mean area
mean smoothness
mean compactness
mean concavity
mean concave points
mean symmetry
mean fractal dimension
radius error
texture error
perimeter error
area error
smoothness error
compactness error
concavity error
concave points error
symmetry error
fractal dimension error
worst radius
worst texture
worst perimeter
worst area
worst smoothness
worst compactness
worst concavity
worst concave points
worst symmetry
worst fractal dimension

Sample of the Breast Cancer dataset:
     mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0          17.99         10.38          122.80     1001.0          0.11840   
1          20.57         17.77          132.90     1326.0          0.08474   
2          19.69         21.25          130.00     1203.0          0.10960   
3          11.42         20.38           77.58      386.1          0.14250   
4          20.29         14.34          135.10  

In [None]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, f1_score

# Load the Breast Cancer dataset
data = datasets.load_breast_cancer()
X = data.data
y = data.target

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a decision tree classifier
clf = DecisionTreeClassifier(random_state=42)

# Fit the classifier on the training data
clf.fit(X_train, y_train)

# Get feature importance scores for all features
feature_importance_scores = clf.feature_importances_

# Make predictions on the test data
y_pred = clf.predict(X_test)

# Calculate evaluation metrics using confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Feature Importance Scores:")
for i, score in enumerate(feature_importance_scores):
    print(f"Feature {i+1}: {score}")

print("\nConfusion Matrix:")
print(conf_matrix)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("F1 Score:", f1)

Feature Importance Scores:
Feature 1: 0.0
Feature 2: 0.05847766231107586
Feature 3: 0.0
Feature 4: 0.0
Feature 5: 0.0
Feature 6: 0.0
Feature 7: 0.0
Feature 8: 0.6914195549049809
Feature 9: 0.0
Feature 10: 0.0
Feature 11: 0.0
Feature 12: 0.0
Feature 13: 0.0
Feature 14: 0.011982573676838769
Feature 15: 0.0012367800829339453
Feature 16: 0.0
Feature 17: 0.0062757755065447375
Feature 18: 0.015930814747382796
Feature 19: 0.0
Feature 20: 0.018554466715001834
Feature 21: 0.05229926933685694
Feature 22: 0.017445161675930944
Feature 23: 0.051493960584869665
Feature 24: 0.0
Feature 25: 0.009233190446208121
Feature 26: 0.0
Feature 27: 0.0
Feature 28: 0.06565079001137543
Feature 29: 0.0
Feature 30: 0.0

Confusion Matrix:
[[40  3]
 [ 3 68]]
Accuracy: 0.9473684210526315
Precision: 0.9577464788732394
F1 Score: 0.9577464788732394


In [None]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, f1_score

# Load the Breast Cancer dataset
data = datasets.load_breast_cancer()
X = data.data
y = data.target

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a decision tree classifier
clf = DecisionTreeClassifier(random_state=42)

# Feature selection using SelectFromModel
feature_selector = SelectFromModel(clf)
X_train_selected = feature_selector.fit_transform(X_train, y_train)
X_test_selected = feature_selector.transform(X_test)

# Fit the classifier on the training data with selected features
clf.fit(X_train_selected, y_train)

# Get feature importance scores
feature_importance_scores = clf.feature_importances_

# Make predictions on the test data
y_pred = clf.predict(X_test_selected)

# Calculate evaluation metrics using confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Feature Importance Scores:")
for i, score in enumerate(feature_importance_scores):
    print(f"Feature {i+1}: {score}")

print("\nConfusion Matrix:")
print(conf_matrix)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("F1 Score:", f1)

Feature Importance Scores:
Feature 1: 0.06955994892620462
Feature 2: 0.7068506760241523
Feature 3: 0.06494002451684547
Feature 4: 0.07605396665375148
Feature 5: 0.08259538387904623

Confusion Matrix:
[[38  5]
 [ 3 68]]
Accuracy: 0.9298245614035088
Precision: 0.9315068493150684
F1 Score: 0.9444444444444444


In [None]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, f1_score

# Load the Breast Cancer dataset
data = datasets.load_breast_cancer()
X = data.data
y = data.target

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature selection using SelectKBest with f_classif (ANOVA F-value between label/feature)
k_best_selector = SelectKBest(score_func=f_classif, k=10)  # Select top 10 features
X_train_selected = k_best_selector.fit_transform(X_train, y_train)
X_test_selected = k_best_selector.transform(X_test)

# Create a decision tree classifier
clf = DecisionTreeClassifier(random_state=42)

# Fit the classifier on the training data with selected features
clf.fit(X_train_selected, y_train)

# Make predictions on the test data with selected features
y_pred = clf.predict(X_test_selected)

# Calculate evaluation metrics using confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Selected Features Indices:", k_best_selector.get_support(indices=True))
print("\nConfusion Matrix:")
print(conf_matrix)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("F1 Score:", f1)

Selected Features Indices: [ 0  2  3  6  7 20 22 23 26 27]

Confusion Matrix:
[[38  5]
 [ 3 68]]
Accuracy: 0.9298245614035088
Precision: 0.9315068493150684
F1 Score: 0.9444444444444444


In [None]:
#Wrapper based approach
from sklearn import datasets
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, f1_score

# Load the Breast Cancer dataset
data = datasets.load_breast_cancer()
X = data.data
y = data.target

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a decision tree classifier
clf = DecisionTreeClassifier(random_state=42)

# Feature selection using Recursive Feature Elimination (RFE)
rfe = RFE(estimator=clf, n_features_to_select=12, step=1)
#X_train_selected = rfe.fit_transform(X_train, y_train)
#X_test_selected = rfe.transform(X_test)
X_selected = rfe.fit_transform(X,y)
score=cross_val_score(clf,X_selected,y,cv=5,scoring='accuracy')

# Fit the classifier on the training data with selected features
clf.fit(X_train_selected, y_train)

# Make predictions on the test data with selected features
y_pred = clf.predict(X_test_selected)

# Calculate evaluation metrics using confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Selected Features Support:", rfe.support_)
print("\nConfusion Matrix:")
print(conf_matrix)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("F1 Score:", f1)

Selected Features Support: [False False False False False False False  True False False False False
 False  True  True False False  True False  True  True  True  True  True
  True False  True  True False False]

Confusion Matrix:
[[39  4]
 [ 3 68]]
Accuracy: 0.9385964912280702
Precision: 0.9444444444444444
F1 Score: 0.951048951048951


In [None]:
#Wrapper based approach
from sklearn import datasets
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, f1_score

# Load the Breast Cancer dataset
data = datasets.load_breast_cancer()
X = data.data
y = data.target

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a decision tree classifier
clf = DecisionTreeClassifier(random_state=42)

# Feature selection using Recursive Feature Elimination (RFE)
rfe = RFE(estimator=clf, n_features_to_select=12, step=1)

X_selected = rfe.fit_transform(X, y)

# Perform 5-fold cross-validation with the selected features
cv_scores = cross_val_score(clf, X_selected, y, cv=5, scoring='accuracy')

print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())
print("Standard Deviation of CV Accuracy:", cv_scores.std())

Cross-Validation Scores: [0.93859649 0.92105263 0.93859649 0.96491228 0.92035398]
Mean CV Accuracy: 0.9367023754075454
Standard Deviation of CV Accuracy: 0.016218252342727998


In [None]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, f1_score

# Load the Breast Cancer dataset
data = datasets.load_breast_cancer()
X = data.data
y = data.target

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a decision tree classifier
clf = DecisionTreeClassifier(random_state=42)

# Feature selection using Recursive Feature Elimination (RFE)
rfe = RFE(estimator=clf, n_features_to_select=1, step=1)
rfe.fit(X_train, y_train)

# Retrieve ranking of features based on their accuracy
feature_ranks = sorted(zip(map(lambda x: round(x, 4), rfe.ranking_), data.feature_names))

# Select the top features with the highest accuracy
selected_features = [feature for rank, feature in feature_ranks[:10]]

# Filter the dataset to include only the selected features
X_train_selected = X_train[:, rfe.support_]
X_test_selected = X_test[:, rfe.support_]

# Fit the classifier on the training data with selected features
clf.fit(X_train_selected, y_train)

# Make predictions on the test data with selected features
y_pred = clf.predict(X_test_selected)

# Calculate evaluation metrics using confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Selected Features:")
for rank, feature in feature_ranks[:10]:
    print(f"Feature {feature} - Rank: {rank}")

print("\nConfusion Matrix:")
print(conf_matrix)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("F1 Score:", f1)

Selected Features:
Feature mean concave points - Rank: 1
Feature worst perimeter - Rank: 2
Feature worst texture - Rank: 3
Feature worst concave points - Rank: 4
Feature worst radius - Rank: 5
Feature fractal dimension error - Rank: 6
Feature concave points error - Rank: 7
Feature worst smoothness - Rank: 8
Feature area error - Rank: 9
Feature smoothness error - Rank: 10

Confusion Matrix:
[[34  9]
 [ 9 62]]
Accuracy: 0.8421052631578947
Precision: 0.8732394366197183
F1 Score: 0.8732394366197183


In [None]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load the Breast Cancer dataset
data = datasets.load_breast_cancer()
X = data.data
y = data.target

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize a decision tree classifier
clf = DecisionTreeClassifier(random_state=42)

# Calculate accuracy of each feature
feature_accuracies = []
for i in range(X.shape[1]):
    # Select the current feature
    X_train_feature = X_train[:, i].reshape(-1, 1)
    X_test_feature = X_test[:, i].reshape(-1, 1)

    # Fit the classifier with the current feature
    clf.fit(X_train_feature, y_train)

    # Make predictions
    y_pred = clf.predict(X_test_feature)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    feature_accuracies.append((data.feature_names[i], accuracy))

# Sort feature accuracies in descending order
feature_accuracies.sort(key=lambda x: x[1], reverse=True)

# Display accuracy of each feature
print("Accuracy of each feature:")
for feature, accuracy in feature_accuracies:
    print(f"Feature '{feature}': {accuracy}")

# Select the top features with the highest accuracy
selected_features = [feature for feature, accuracy in feature_accuracies[:10]]

print("\nTop selected features:")
for feature, accuracy in feature_accuracies[:10]:
    print(f"Feature '{feature}': {accuracy}")

Accuracy of each feature:
Feature 'worst area': 0.9210526315789473
Feature 'worst concave points': 0.9122807017543859
Feature 'worst radius': 0.8771929824561403
Feature 'area error': 0.8596491228070176
Feature 'worst perimeter': 0.8596491228070176
Feature 'mean radius': 0.8508771929824561
Feature 'mean concave points': 0.8421052631578947
Feature 'mean concavity': 0.8333333333333334
Feature 'mean area': 0.8157894736842105
Feature 'mean perimeter': 0.8070175438596491
Feature 'worst concavity': 0.7982456140350878
Feature 'perimeter error': 0.7807017543859649
Feature 'radius error': 0.7631578947368421
Feature 'worst compactness': 0.7368421052631579
Feature 'mean compactness': 0.7280701754385965
Feature 'worst fractal dimension': 0.6491228070175439
Feature 'concavity error': 0.6403508771929824
Feature 'concave points error': 0.6403508771929824
Feature 'mean texture': 0.631578947368421
Feature 'worst texture': 0.6052631578947368
Feature 'mean symmetry': 0.5877192982456141
Feature 'worst smoo

In [None]:
#PCA coding
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, f1_score

# Load the Breast Cancer dataset
data = datasets.load_breast_cancer()
X = data.data
y = data.target

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply PCA for feature selection
pca = PCA(n_components=10)  # Select top 10 principal components
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Create a decision tree classifier
clf = DecisionTreeClassifier(random_state=42)

# Fit the classifier on the training data with selected features
clf.fit(X_train_pca, y_train)

# Make predictions on the test data with selected features
y_pred = clf.predict(X_test_pca)

# Calculate evaluation metrics using confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Evaluation Metrics:")
print("\nConfusion Matrix:")
print(conf_matrix)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("F1 Score:", f1)

Evaluation Metrics:

Confusion Matrix:
[[38  5]
 [ 2 69]]
Accuracy: 0.9385964912280702
Precision: 0.9324324324324325
F1 Score: 0.9517241379310345


In [None]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, f1_score

# Load the Breast Cancer dataset
data = datasets.load_breast_cancer()
X = data.data
y = data.target

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply PCA for feature selection
pca = PCA(n_components=10)  # Select top 10 principal components
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Get the explained variance ratio of the selected features
variance_ratio = pca.explained_variance_ratio_

# Create a decision tree classifier
clf = DecisionTreeClassifier(random_state=42)

# Fit the classifier on the training data with selected features
clf.fit(X_train_pca, y_train)

# Make predictions on the test data with selected features
y_pred = clf.predict(X_test_pca)

# Calculate evaluation metrics using confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Explained Variance Ratio of the Selected Features:")
for i in range(len(variance_ratio)):
    print(f"Principal Component {i+1}: {variance_ratio[i]}")

print("\nEvaluation Metrics:")
print("\nConfusion Matrix:")
print(conf_matrix)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("F1 Score:", f1)

Explained Variance Ratio of the Selected Features:
Principal Component 1: 0.9818326416645895
Principal Component 2: 0.016136613191477394
Principal Component 3: 0.0018045063717473996
Principal Component 4: 0.00012677638709809055
Principal Component 5: 8.769173234885731e-05
Principal Component 6: 6.324788770067277e-06
Principal Component 7: 3.948898478820392e-06
Principal Component 8: 8.436724009174741e-07
Principal Component 9: 3.6358271473516936e-07
Principal Component 10: 1.8626617680085465e-07

Evaluation Metrics:

Confusion Matrix:
[[38  5]
 [ 2 69]]
Accuracy: 0.9385964912280702
Precision: 0.9324324324324325
F1 Score: 0.9517241379310345
