# **Import Needed Libraries**

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import random
import warnings

# Suppress every warning category for the rest of the session. This keeps the
# cell outputs tidy, but it also hides any warnings raised by the libraries
# imported above, so remove this line when debugging unexpected results.
warnings.filterwarnings('ignore')


# **Read Data & Preprocessing**

In [None]:
# Load the Breast Cancer Wisconsin (Diagnostic) dataset bundled with scikit-learn
# and wrap it in pandas objects: X holds the features, y the class labels.
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target, name='target')

print(f"Dataset: Breast Cancer Wisconsin (Diagnostic)")
print(f"Number of Samples: {X.shape[0]}")
print(f"Number of Features: {X.shape[1]}")
print(f"Classes: {data.target_names}")
# .tolist() converts the NumPy string scalars to plain Python str, so the list
# prints as 'mean radius' instead of np.str_('mean radius').
print(f"\nFeature Names:\n{data.feature_names.tolist()}")

Dataset: Breast Cancer Wisconsin (Diagnostic)
Number of Samples: 569
Number of Features: 30
Classes: ['malignant' 'benign']

Feature Names:
[np.str_('mean radius'), np.str_('mean texture'), np.str_('mean perimeter'), np.str_('mean area'), np.str_('mean smoothness'), np.str_('mean compactness'), np.str_('mean concavity'), np.str_('mean concave points'), np.str_('mean symmetry'), np.str_('mean fractal dimension'), np.str_('radius error'), np.str_('texture error'), np.str_('perimeter error'), np.str_('area error'), np.str_('smoothness error'), np.str_('compactness error'), np.str_('concavity error'), np.str_('concave points error'), np.str_('symmetry error'), np.str_('fractal dimension error'), np.str_('worst radius'), np.str_('worst texture'), np.str_('worst perimeter'), np.str_('worst area'), np.str_('worst smoothness'), np.str_('worst compactness'), np.str_('worst concavity'), np.str_('worst concave points'), np.str_('worst symmetry'), np.str_('worst fractal dimension')]


In [None]:
# Mount Google Drive when running inside Google Colab. The import is guarded so
# the notebook still runs top-to-bottom in environments where the
# `google.colab` package is not installed.
try:
    from google.colab import drive
except ImportError:
    print("Not running in Google Colab - skipping Google Drive mount")
else:
    drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Data Preprocessing
print(f"\nMissing Values: {X.isnull().sum().sum()}")

# Train-Test Split (80/20), stratified on the target.
# The split is done BEFORE scaling so that the test rows never contribute to
# the scaler's mean/variance (fitting the scaler on all rows leaks test-set
# statistics into training).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Normalize the features: fit on the training rows only, then apply the same
# transformation to the test rows. The original row index is preserved so the
# frames stay aligned with y_train / y_test.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw), columns=X.columns, index=X_train_raw.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw), columns=X.columns, index=X_test_raw.index
)

# Full feature matrix transformed with the train-fitted scaler (kept so the
# name `X_scaled` remains available to any code that expects it).
X_scaled = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)

print(f"\nTrain Set Size: {X_train.shape[0]}")
print(f"Test Set Size: {X_test.shape[0]}")


Missing Values: 0

Train Set Size: 455
Test Set Size: 114


# **Build ML Models**

In [None]:
# ML Implementation - Original Dataset

# Baseline SVM configuration (RBF kernel, fixed seed)
svm_params = dict(kernel='rbf', C=1.0, gamma='scale', random_state=42)
print(f"\nSVM Hyperparameters:")
for param_name in svm_params:
    print(f"  {param_name}: {svm_params[param_name]}")

# Fit on every feature of the training split, then predict the held-out test split
svm_original = SVC(**svm_params).fit(X_train, y_train)
y_pred_original = svm_original.predict(X_test)

# Accuracy plus class-weighted precision / recall / F1, in display order
metrics = {'Accuracy': accuracy_score(y_test, y_pred_original)}
for label, scorer in (('Precision', precision_score), ('Recall', recall_score), ('F1-Score', f1_score)):
    metrics[label] = scorer(y_test, y_pred_original, average='weighted')
cm_original = confusion_matrix(y_test, y_pred_original)

print(f"\nPerformance Metrics (Original Dataset):")
for label in metrics:
    print(f"  {label}: {metrics[label]:.4f}")
print(f"\nConfusion Matrix (Original):\n{cm_original}")


SVM Hyperparameters:
  kernel: rbf
  C: 1.0
  gamma: scale
  random_state: 42

Performance Metrics (Original Dataset):
  Accuracy: 0.9825
  Precision: 0.9825
  Recall: 0.9825
  F1-Score: 0.9825

Confusion Matrix (Original):
[[41  1]
 [ 1 71]]


**Genetic Algorithm (GA) for Feature Selection**

In [None]:
# Genetic Algorithm for Feature Selection

# GA Hyperparameters: population size, number of generations, and the
# probabilities read by crossover() (per parent pair) and mutate() (per gene)
GA_PARAMS = dict(
    POPULATION_SIZE=50,
    GENERATIONS=30,
    CROSSOVER_PROB=0.8,
    MUTATION_PROB=0.1,
)

# Initialize population
def initialize_population(pop_size, num_features):
    """Create the starting population of random binary feature masks.

    Args:
        pop_size: Number of chromosomes to generate.
        num_features: Length of each chromosome (one 0/1 gene per feature).

    Returns:
        A list of `pop_size` lists of 0/1 genes. A mask with no gene set is
        discarded and redrawn, so every chromosome selects at least one feature.
    """
    def _draw():
        return [random.randint(0, 1) for _ in range(num_features)]

    population = []
    while len(population) < pop_size:
        candidate = _draw()
        # Reject the all-zero mask and draw again
        if sum(candidate) != 0:
            population.append(candidate)
    return population

# Fitness function
def fitness_function(chromosome, X_tr, y_tr, X_val, y_val):
    """Score a feature mask by the validation accuracy of an RBF-kernel SVM.

    Args:
        chromosome: Sequence of 0/1 genes; a 1 at position i selects column i.
        X_tr, X_val: Feature matrices indexed as `X[:, columns]`
            (the caller passes NumPy arrays).
        y_tr, y_val: Labels for the training and validation rows.

    Returns:
        Accuracy on the validation rows using only the selected columns, or
        0.0 when the mask selects no feature at all.
    """
    active_columns = [pos for pos, bit in enumerate(chromosome) if bit == 1]
    if not active_columns:
        return 0.0

    classifier = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)
    classifier.fit(X_tr[:, active_columns], y_tr)
    predictions = classifier.predict(X_val[:, active_columns])
    return accuracy_score(y_val, predictions)

# Tournament Selection
def tournament_selection(population, fitness_scores, k=3):
    """Pick a new mating pool by repeated k-way tournaments.

    For each slot in the pool, `k` distinct individuals are sampled and a copy
    of the one with the highest fitness is kept (the first sampled wins ties).

    Args:
        population: List of chromosomes (each must support `.copy()`).
        fitness_scores: Fitness of each chromosome, aligned with `population`.
        k: Tournament size. Values larger than the population are clamped to
            the population size instead of making `random.sample` fail.

    Returns:
        A list with as many chromosomes as `population`; empty if the
        population is empty.

    Raises:
        ValueError: If `k` is smaller than 1 for a non-empty population.
    """
    pop_size = len(population)
    if pop_size == 0:
        return []
    if k < 1:
        raise ValueError("k must be at least 1")

    # random.sample cannot draw more distinct items than exist
    tournament_size = min(k, pop_size)

    selected = []
    for _ in range(pop_size):
        tournament_idx = random.sample(range(pop_size), tournament_size)
        tournament_fitness = [fitness_scores[i] for i in tournament_idx]
        winner_idx = tournament_idx[np.argmax(tournament_fitness)]
        selected.append(population[winner_idx].copy())
    return selected

# Crossover
def crossover(parent1, parent2, crossover_prob=None):
    """Single-point crossover of two chromosomes.

    Args:
        parent1, parent2: Chromosomes (lists of genes); the cut point is chosen
            from the length of `parent1`.
        crossover_prob: Probability of recombining the parents. Defaults to
            GA_PARAMS['CROSSOVER_PROB'] when not given.

    Returns:
        A tuple (child1, child2) of new lists. With probability
        `crossover_prob` the tails after a random cut point are swapped;
        otherwise the children are copies of the parents. Parents with fewer
        than two genes have no valid cut point and are always copied.
    """
    if crossover_prob is None:
        crossover_prob = GA_PARAMS['CROSSOVER_PROB']

    # The probability draw comes first so the random stream is consumed the
    # same way regardless of chromosome length.
    if random.random() < crossover_prob and len(parent1) > 1:
        # Cut strictly inside the chromosome so both children mix both parents
        point = random.randint(1, len(parent1) - 1)
        child1 = parent1[:point] + parent2[point:]
        child2 = parent2[:point] + parent1[point:]
        return child1, child2
    return parent1.copy(), parent2.copy()

# Mutation
def mutate(chromosome, mutation_prob=None):
    """Return a bit-flip-mutated copy of a chromosome.

    Args:
        chromosome: List of 0/1 genes; it is not modified.
        mutation_prob: Per-gene probability of flipping the bit. Defaults to
            GA_PARAMS['MUTATION_PROB'] when not given.

    Returns:
        A new list in which each gene was flipped independently with
        probability `mutation_prob`. If the result would select no feature,
        one randomly chosen gene is set to 1. An empty chromosome is returned
        as an empty list.
    """
    if mutation_prob is None:
        mutation_prob = GA_PARAMS['MUTATION_PROB']

    mutated = chromosome.copy()
    for i in range(len(mutated)):
        if random.random() < mutation_prob:
            mutated[i] = 1 - mutated[i]

    # Repair an all-zero mask; skipped for an empty chromosome, which has no
    # gene that could be switched on.
    if mutated and sum(mutated) == 0:
        mutated[random.randint(0, len(mutated) - 1)] = 1
    return mutated


In [None]:
# Seed Python's `random` module, which drives population initialization,
# tournament sampling, crossover and mutation, so that re-running this cell
# yields the same selected features.
random.seed(42)

# Split training data for GA validation (the test set is never used by the GA)
X_tr_ga, X_val_ga, y_tr_ga, y_val_ga = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)
X_tr_ga_np = X_tr_ga.values
X_val_ga_np = X_val_ga.values
# Converted once here instead of once per chromosome inside the loop
y_tr_ga_np = y_tr_ga.values
y_val_ga_np = y_val_ga.values

# Run Genetic Algorithm
population = initialize_population(GA_PARAMS['POPULATION_SIZE'], X_train.shape[1])

best_fitness_history = []
best_chromosome = None
# Start below any reachable score so the first evaluated generation always
# records a best chromosome, even if every fitness is 0.0.
best_fitness = float('-inf')

for gen in range(GA_PARAMS['GENERATIONS']):
    # Evaluate every chromosome of the current generation
    fitness_scores = [
        fitness_function(chrom, X_tr_ga_np, y_tr_ga_np, X_val_ga_np, y_val_ga_np)
        for chrom in population
    ]
    gen_best_fitness = max(fitness_scores)
    gen_best_idx = fitness_scores.index(gen_best_fitness)

    # Track the best chromosome seen across all generations (strict improvement only)
    if gen_best_fitness > best_fitness:
        best_fitness = gen_best_fitness
        best_chromosome = population[gen_best_idx].copy()

    best_fitness_history.append(best_fitness)

    if (gen + 1) % 5 == 0:
        print(f"Generation {gen+1}/{GA_PARAMS['GENERATIONS']} - Best Fitness: {best_fitness:.4f}")

    selected = tournament_selection(population, fitness_scores)

    # Breed the next generation from consecutive pairs of the mating pool
    next_population = []
    for i in range(0, len(selected), 2):
        parent1 = selected[i]
        # With an odd pool size the last parent is paired with the first one
        parent2 = selected[i + 1] if i + 1 < len(selected) else selected[0]

        child1, child2 = crossover(parent1, parent2)
        child1 = mutate(child1)
        child2 = mutate(child2)

        next_population.append(child1)
        # Do not exceed the configured population size
        if len(next_population) < GA_PARAMS['POPULATION_SIZE']:
            next_population.append(child2)

    population = next_population

if best_chromosome is None:
    raise ValueError("GA evaluated no generation; GA_PARAMS['GENERATIONS'] must be at least 1")

# Get selected features
selected_feature_indices = [i for i, gene in enumerate(best_chromosome) if gene == 1]
selected_feature_names = [X_train.columns[i] for i in selected_feature_indices]

print(f"\nGA Optimization Complete!")
print(f"Original Number of Features: {X_train.shape[1]}")
print(f"Reduced Number of Features: {len(selected_feature_indices)}")
print(f"Best Fitness Score: {best_fitness:.4f}")
print(f"\nSelected Features ({len(selected_feature_indices)}):")
for i, feat in enumerate(selected_feature_names, 1):
    print(f"  {i}. {feat}")

Generation 5/30 - Best Fitness: 0.9890
Generation 10/30 - Best Fitness: 0.9890
Generation 15/30 - Best Fitness: 0.9890
Generation 20/30 - Best Fitness: 0.9890
Generation 25/30 - Best Fitness: 0.9890
Generation 30/30 - Best Fitness: 0.9890

GA Optimization Complete!
Original Number of Features: 30
Reduced Number of Features: 19
Best Fitness Score: 0.9890

Selected Features (19):
  1. mean texture
  2. mean perimeter
  3. mean smoothness
  4. mean compactness
  5. mean concavity
  6. mean fractal dimension
  7. radius error
  8. texture error
  9. perimeter error
  10. area error
  11. smoothness error
  12. compactness error
  13. concavity error
  14. worst radius
  15. worst texture
  16. worst perimeter
  17. worst smoothness
  18. worst symmetry
  19. worst fractal dimension


**GA-Reduced Dataset**

In [None]:

# ML Implementation - GA-Reduced Dataset

# Keep only the columns picked by the GA (positional indices)
X_train_reduced = X_train.iloc[:, selected_feature_indices]
X_test_reduced = X_test.iloc[:, selected_feature_indices]

# Same SVM configuration as the baseline, trained on the reduced feature set
svm_reduced = SVC(**svm_params).fit(X_train_reduced, y_train)
y_pred_reduced = svm_reduced.predict(X_test_reduced)

# Accuracy plus class-weighted precision / recall / F1, in display order
metrics_reduced = {'Accuracy': accuracy_score(y_test, y_pred_reduced)}
for label, scorer in (('Precision', precision_score), ('Recall', recall_score), ('F1-Score', f1_score)):
    metrics_reduced[label] = scorer(y_test, y_pred_reduced, average='weighted')
cm_reduced = confusion_matrix(y_test, y_pred_reduced)

print(f"\nPerformance Metrics (GA-Reduced Dataset):")
for label in metrics_reduced:
    print(f"  {label}: {metrics_reduced[label]:.4f}")
print(f"\nConfusion Matrix (Reduced):\n{cm_reduced}")


Performance Metrics (GA-Reduced Dataset):
  Accuracy: 0.9737
  Precision: 0.9740
  Recall: 0.9737
  F1-Score: 0.9737

Confusion Matrix (Reduced):
[[41  1]
 [ 2 70]]


In [None]:
# Comparative Analysis

# Rows of the table: the four score metrics followed by the feature count
score_names = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
n_features_original = X_train.shape[1]
n_features_reduced = len(selected_feature_indices)

original_column = [f"{metrics[name]:.4f}" for name in score_names]
original_column.append(str(n_features_original))

reduced_column = [f"{metrics_reduced[name]:.4f}" for name in score_names]
reduced_column.append(str(n_features_reduced))

# Signed difference (reduced minus original) for every row
difference_column = [f"{metrics_reduced[name] - metrics[name]:+.4f}" for name in score_names]
difference_column.append(f"{n_features_reduced - n_features_original:+d}")

comparison_df = pd.DataFrame({
    'Metric': score_names + ['Number of Features'],
    'Original Dataset': original_column,
    'GA-Reduced Dataset': reduced_column,
    'Difference': difference_column,
})

print(comparison_df)

               Metric Original Dataset GA-Reduced Dataset Difference
0            Accuracy           0.9825             0.9737    -0.0088
1           Precision           0.9825             0.9740    -0.0085
2              Recall           0.9825             0.9737    -0.0088
3            F1-Score           0.9825             0.9737    -0.0087
4  Number of Features               30                 19        -11


In [None]:
# Insights and Conclusion

# Signed accuracy difference (reduced minus original) and share of features removed
accuracy_change = metrics_reduced['Accuracy'] - metrics['Accuracy']
feature_reduction_pct = (1 - len(selected_feature_indices) / X_train.shape[1]) * 100

print(f"\nKey Findings:")
print(f"  • Feature reduction: {X_train.shape[1]} → {len(selected_feature_indices)} ({feature_reduction_pct:.1f}% reduction)")
print(f"  • Accuracy change: {accuracy_change:+.4f}")

# Report an unchanged accuracy separately: it is not an improvement
if accuracy_change > 0:
    print(f"\n✓ The GA-based dimensionality reduction improved model performance.")
elif accuracy_change == 0:
    print(f"\n✓ The GA-based dimensionality reduction preserved model performance while achieving {feature_reduction_pct:.1f}% feature reduction.")
else:
    print(f"\n✗ The GA-based dimensionality reduction slightly decreased performance but achieved {feature_reduction_pct:.1f}% feature reduction.")




Key Findings:
  • Feature reduction: 30 → 19 (36.7% reduction)
  • Accuracy change: -0.0088

✗ The GA-based dimensionality reduction slightly decreased performance but achieved 36.7% feature reduction.


# **Save Results**

In [None]:
# Save results: write three CSV files into the current working directory

# 1) Metric comparison between the full and the GA-reduced feature sets
comparison_df.to_csv('performance_comparison.csv', index=False)
print("Results saved to 'performance_comparison.csv'")

# 2) Indices and names of the features selected by the GA
selected_features_df = pd.DataFrame({
    'Feature Index': selected_feature_indices,
    'Feature Name': selected_feature_names,
})
selected_features_df.to_csv('selected_features.csv', index=False)
print("Selected features saved to 'selected_features.csv'")

# 3) Unscaled feature table with the target appended as the last column
dataset_full = X.assign(target=y)
dataset_full.to_csv('breast_cancer_dataset.csv', index=False)
print("Dataset saved to 'breast_cancer_dataset.csv'")

print("PROJECT COMPLETE - All Requirements Satisfied")

Results saved to 'performance_comparison.csv'
Selected features saved to 'selected_features.csv'
Dataset saved to 'breast_cancer_dataset.csv'
PROJECT COMPLETE - All Requirements Satisfied
