In [63]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import random
import warnings

warnings.filterwarnings('ignore')




In [64]:
csv_filename = "processed_dataset_2025-12-04 (3).csv"
data = pd.read_csv(csv_filename)
print("\n===== CSV Loaded Successfully =====")
print(f"Rows: {data.shape[0]}")
print(f"Columns: {data.shape[1]}")
print(data.head())
TARGET_COL = "target"
X = data.drop(columns=[TARGET_COL])
y = data[TARGET_COL]
print(f"\nFeatures Shape: {X.shape}")
print(f"Target Shape: {y.shape}")




===== CSV Loaded Successfully =====
Rows: 569
Columns: 31
   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimension  ...  w

# New Section

# 2. Preprocessing

In [65]:
print(f"\nMissing Values: {X.isnull().sum().sum()}")

# Normalize the features
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Train-Test Split (80/20)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

print(f"\nTrain Set Size: {X_train.shape[0]}")
print(f"Test Set Size: {X_test.shape[0]}")



Missing Values: 0

Train Set Size: 455
Test Set Size: 114


In [66]:
svm_params = {'kernel': 'rbf', 'C': 1.0, 'gamma': 'scale', 'random_state': 42}

svm_original = SVC(**svm_params)
svm_original.fit(X_train, y_train)

y_pred_original = svm_original.predict(X_test)

metrics = {
    'Accuracy': accuracy_score(y_test, y_pred_original),
    'Precision': precision_score(y_test, y_pred_original, average='weighted'),
    'Recall': recall_score(y_test, y_pred_original, average='weighted'),
    'F1-Score': f1_score(y_test, y_pred_original, average='weighted'),
}
cm_original = confusion_matrix(y_test, y_pred_original)

print("\n===== Performance (Original Dataset) =====")
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")

print(f"\nConfusion Matrix:\n{cm_original}")



===== Performance (Original Dataset) =====
Accuracy: 0.9825
Precision: 0.9825
Recall: 0.9825
F1-Score: 0.9825

Confusion Matrix:
[[41  1]
 [ 1 71]]


In [67]:
GA_PARAMS = {
    'POPULATION_SIZE': 50,
    'GENERATIONS': 30,
    'CROSSOVER_PROB': 0.8,
    'MUTATION_PROB': 0.1
}

# Initialize population
def initialize_population(pop_size, num_features):
    population = []
    for _ in range(pop_size):
        chromosome = [random.randint(0, 1) for _ in range(num_features)]
        while sum(chromosome) == 0:
            chromosome = [random.randint(0, 1) for _ in range(num_features)]
        population.append(chromosome)
    return population

# Fitness function
def fitness_function(chromosome, X_tr, y_tr, X_val, y_val):
    selected_features = [i for i, gene in enumerate(chromosome) if gene == 1]
    if len(selected_features) == 0:
        return 0.0

    X_tr_selected = X_tr[:, selected_features]
    X_val_selected = X_val[:, selected_features]

    svm = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)
    svm.fit(X_tr_selected, y_tr)
    y_pred = svm.predict(X_val_selected)

    return accuracy_score(y_val, y_pred)

# Tournament Selection
def tournament_selection(population, fitness_scores, k=3):
    selected = []
    for _ in range(len(population)):
        tournament_idx = random.sample(range(len(population)), k)
        tournament_fitness = [fitness_scores[i] for i in tournament_idx]
        winner_idx = tournament_idx[np.argmax(tournament_fitness)]
        selected.append(population[winner_idx].copy())
    return selected

# Crossover
def crossover(parent1, parent2):
    if random.random() < GA_PARAMS['CROSSOVER_PROB']:
        point = random.randint(1, len(parent1) - 1)
        child1 = parent1[:point] + parent2[point:]
        child2 = parent2[:point] + parent1[point:]
        return child1, child2
    return parent1.copy(), parent2.copy()

# Mutation
def mutate(chromosome):
    mutated = chromosome.copy()
    for i in range(len(mutated)):
        if random.random() < GA_PARAMS['MUTATION_PROB']:
            mutated[i] = 1 - mutated[i]
    if sum(mutated) == 0:
        mutated[random.randint(0, len(mutated) - 1)] = 1
    return mutated

# Split training data for GA
X_tr_ga, X_val_ga, y_tr_ga, y_val_ga = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)
X_tr_ga_np = X_tr_ga.values
X_val_ga_np = X_val_ga.values

# Run GA
population = initialize_population(GA_PARAMS['POPULATION_SIZE'], X_train.shape[1])

best_fitness_history = []
best_chromosome = None
best_fitness = 0.0

for gen in range(GA_PARAMS['GENERATIONS']):
    fitness_scores = [
        fitness_function(chrom, X_tr_ga_np, y_tr_ga.values, X_val_ga_np, y_val_ga.values)
        for chrom in population
    ]

    gen_best_fitness = max(fitness_scores)
    gen_best_idx = fitness_scores.index(gen_best_fitness)

    if gen_best_fitness > best_fitness:
        best_fitness = gen_best_fitness
        best_chromosome = population[gen_best_idx].copy()

    best_fitness_history.append(best_fitness)

    if (gen + 1) % 5 == 0:
        print(f"Generation {gen+1} - Best Fitness: {best_fitness:.4f}")

    selected = tournament_selection(population, fitness_scores)

    next_population = []
    for i in range(0, len(selected), 2):
        parent1 = selected[i]
        parent2 = selected[i + 1] if i + 1 < len(selected) else selected[0]

        child1, child2 = crossover(parent1, parent2)
        child1 = mutate(child1)
        child2 = mutate(child2)

        next_population.append(child1)
        if len(next_population) < GA_PARAMS['POPULATION_SIZE']:
            next_population.append(child2)

    population = next_population

selected_feature_indices = [i for i, gene in enumerate(best_chromosome) if gene == 1]
selected_feature_names = [X.columns[i] for i in selected_feature_indices]

print("\n===== GA Feature Selection Complete =====")
print("Selected Features:")
for f in selected_feature_names:
    print(" -", f)


Generation 5 - Best Fitness: 0.9890
Generation 10 - Best Fitness: 0.9890
Generation 15 - Best Fitness: 0.9890
Generation 20 - Best Fitness: 0.9890
Generation 25 - Best Fitness: 0.9890
Generation 30 - Best Fitness: 0.9890

===== GA Feature Selection Complete =====
Selected Features:
 - mean texture
 - mean compactness
 - mean concavity
 - mean fractal dimension
 - perimeter error
 - area error
 - smoothness error
 - compactness error
 - concavity error
 - symmetry error
 - fractal dimension error
 - worst radius
 - worst texture
 - worst smoothness
 - worst concavity
 - worst concave points
 - worst symmetry
 - worst fractal dimension


In [68]:
X_train_reduced = X_train.iloc[:, selected_feature_indices]
X_test_reduced = X_test.iloc[:, selected_feature_indices]

svm_reduced = SVC(**svm_params)
svm_reduced.fit(X_train_reduced, y_train)

y_pred_reduced = svm_reduced.predict(X_test_reduced)

metrics_reduced = {
    'Accuracy': accuracy_score(y_test, y_pred_reduced),
    'Precision': precision_score(y_test, y_pred_reduced, average='weighted'),
    'Recall': recall_score(y_test, y_pred_reduced, average='weighted'),
    'F1-Score': f1_score(y_test, y_pred_reduced, average='weighted'),
}

cm_reduced = confusion_matrix(y_test, y_pred_reduced)

print("\n===== Performance (GA Reduced Dataset) =====")
for metric, value in metrics_reduced.items():
    print(f"{metric}: {value:.4f}")

print(f"\nConfusion Matrix:\n{cm_reduced}")

#=================

comparison_df = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'Number of Features'],
    'Original Dataset': [metrics['Accuracy'], metrics['Precision'], metrics['Recall'], metrics['F1-Score'], X_train.shape[1]],
    'GA-Reduced Dataset': [metrics_reduced['Accuracy'], metrics_reduced['Precision'], metrics_reduced['Recall'], metrics_reduced['F1-Score'], len(selected_feature_indices)],
})

print("\nComparison Table:")
print(comparison_df)



===== Performance (GA Reduced Dataset) =====
Accuracy: 0.9737
Precision: 0.9737
Recall: 0.9737
F1-Score: 0.9736

Confusion Matrix:
[[40  2]
 [ 1 71]]

Comparison Table:
               Metric  Original Dataset  GA-Reduced Dataset
0            Accuracy          0.982456            0.973684
1           Precision          0.982456            0.973711
2              Recall          0.982456            0.973684
3            F1-Score          0.982456            0.973616
4  Number of Features         30.000000           18.000000
