In [3]:
import os
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
import xgboost as xgb
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import fetch_openml

import matplotlib.pyplot as plt

from helpers.datasetHelper import get_samples, split_healthy_data
from MyPSO import MyPSO

# Only reached when every import above resolved; serves as a visible
# "imports finished" marker in the cell output.
print("importing done")



importing done


In [8]:
def load_methylation_data(file_path):
    """Read a methylation CSV and split it into features, target and names.

    The file is parsed with ``pd.read_csv``. Column 0 is skipped (sample
    names), the final column is taken as the target, and every column in
    between is treated as a feature.

    Parameters
    ----------
    file_path : path or buffer accepted by ``pd.read_csv``.

    Returns
    -------
    tuple
        ``(X, y, feature_names)`` where ``X`` holds the values of the feature
        columns, ``y`` the values of the last column, and ``feature_names`` is
        a NumPy array of the feature column labels.
    """
    table = pd.read_csv(file_path)

    # Everything strictly between the first and the last column is a feature.
    feature_block = table.iloc[:, 1:-1]
    target_column = table.iloc[:, -1]

    return (
        feature_block.values,
        target_column.values,
        np.array(feature_block.columns),
    )



In [13]:
# Resolve the CSV that holds the samples used in this notebook.
directory_path = './datasets'
data_path = os.path.join(directory_path, 'DT.Healthy.csv')

# Start of the PSO + XGBoost feature-selection workflow: read the methylation
# samples through the project helper.
print("Loading DNA methylation data...")
data_health = get_samples(data_path)

# split_healthy_data hands back three groups, unpacked here as
# healthy / pre-BRCA / BRCA.
healthy_cases, prebrca_cases, cancer_cases = split_healthy_data(data_health)

# Wrap every group in a DataFrame and add a 'Tag' column naming its class.
healthy_cases, prebrca_cases, cancer_cases = (
    pd.DataFrame(group).assign(Tag=tag)
    for group, tag in (
        (healthy_cases, 'HEALTHY'),
        (prebrca_cases, 'PRE-BRCA'),
        (cancer_cases, 'BRCA'),
    )
)

# Stack the three tagged groups (blood samples) under a fresh 0..n-1 row index.
df_cancer = pd.concat(
    [healthy_cases, prebrca_cases, cancer_cases],
    ignore_index=True,
)

# The final column is used as the target, everything before it as features.
# NOTE(review): this assumes the three groups share identical columns so that
# 'Tag' stays last after the concat - confirm against split_healthy_data.
# Feature cells that cannot be parsed as numbers become NaN (errors='coerce').
X = df_cancer.iloc[:, :-1].apply(pd.to_numeric, errors='coerce')

# Integer-encode the class tags; the fitted encoder is kept for later lookups.
label_encoder = LabelEncoder()
Y = label_encoder.fit_transform(df_cancer.iloc[:, -1])


Loading DNA methylation data...


  df = pd.read_csv(file_path)


In [14]:
# Impute column by column: each NaN is replaced by the smallest value observed
# in the same column (i.e. at the same CpG site).
X = X.apply(lambda cpg_site: cpg_site.fillna(cpg_site.min()), axis='index')



In [None]:
# Show which integer code the fitted LabelEncoder assigned to each class label.
label_mapping = dict(
    zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))
)
print(f"Label encoding mapping: {label_mapping}")

# Position -> item lookup built from whatever iterating data_health yields.
# NOTE(review): presumably these are the feature/column labels - confirm
# against get_samples.
feature_names = dict(enumerate(data_health))

# One feature per column of X.
n_features = len(X.columns)
print(f"Loaded dataset with {n_features} features and {len(Y)} samples")

# Configure the particle swarm used for feature selection.
pso = MyPSO(
    n_particles=30,
    n_features=n_features,
    max_iter=100,
    init_prob=0.3,
    min_features=3,
)


Label encoding mapping: {'BRCA': np.int64(0), 'HEALTHY': np.int64(1), 'PRE-BRCA': np.int64(2)}
Loaded dataset with 27578 features and 790 samples


In [None]:
# Run the swarm search; optimize() returns the best position found together
# with its fitness.
# NOTE(review): the saved output of this cell ends in
# "IndexError: Boolean index has wrong length: 100 instead of 27578".
# 100 equals max_iter and 27578 equals the feature count, so the mask built
# inside MyPSO presumably has the wrong length - confirm in MyPSO.
best_position, best_fitness = pso.optimize(X, Y)

# Sanity check: the returned position must carry one entry per feature.
if len(best_position) != n_features:
    raise ValueError(
        f"Length of best_position ({len(best_position)}) does not match the number of features ({n_features})."
    )


  0%|          | 0/100 [00:00<?, ?it/s]


IndexError: Boolean index has wrong length: 100 instead of 27578