In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Data Acquisition and Preprocessing**

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Step 1: Load the dataset
file_path = '/content/drive/MyDrive/Colab Notebooks/genome/classified_genes_with_labels.csv'
data = pd.read_csv(file_path)

print("Dataset loaded successfully!")
print("First 5 rows of the dataset:")
print(data.head())

# Step 2: Handle missing values
# Separate numeric and non-numeric columns
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
non_numeric_columns = data.select_dtypes(exclude=['float64', 'int64']).columns

# Fill missing values for numeric columns with the column mean
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].mean())

# Fill missing values for non-numeric columns with the mode (most frequent value)
for col in non_numeric_columns:
    data[col] = data[col].fillna(data[col].mode()[0])

print("\nMissing values handled!")

# Step 3: Normalization
# Normalize numeric columns using MinMaxScaler
scaler = MinMaxScaler()
data[numeric_columns] = scaler.fit_transform(data[numeric_columns])

print("\nData normalization complete!")

# Step 4: Filter noise
# Use IQR to remove outliers in numeric columns
Q1 = data[numeric_columns].quantile(0.25)
Q3 = data[numeric_columns].quantile(0.75)
IQR = Q3 - Q1

# Remove rows outside 1.5 * IQR (considered outliers)
data_filtered = data[~((data[numeric_columns] < (Q1 - 1.5 * IQR)) |
                       (data[numeric_columns] > (Q3 + 1.5 * IQR))).any(axis=1)]

print("\nNoise filtering complete!")
print(f"Original dataset size: {data.shape}")
print(f"Filtered dataset size: {data_filtered.shape}")

# Step 5: Save preprocessed data to a new CSV file
output_file_path = '/content/drive/MyDrive/Colab Notebooks/genome/preprocessed_genomic_data.csv'
data_filtered.to_csv(output_file_path, index=False)

print(f"\nPreprocessed data saved to '{output_file_path}'!")

data_filtered

Dataset loaded successfully!
First 5 rows of the dataset:
                      Gene Description Gene Accession Number    1 call    2  \
0  AFFX-BioB-5_at (endogenous control)        AFFX-BioB-5_at -214    A -139   
1  AFFX-BioB-M_at (endogenous control)        AFFX-BioB-M_at -153    A  -73   
2  AFFX-BioB-3_at (endogenous control)        AFFX-BioB-3_at  -58    A   -1   
3  AFFX-BioC-5_at (endogenous control)        AFFX-BioC-5_at   88    A  283   
4  AFFX-BioC-3_at (endogenous control)        AFFX-BioC-3_at -295    A -264   

  call.1    3 call.2    4 call.3  ...  call.33   30  call.34   31  call.35  \
0      A  -76      A -135      A  ...        A -318        A  -32        A   
1      A  -49      A -114      A  ...        A -192        A  -49        A   
2      A -307      A  265      A  ...        A  -95        A   49        A   
3      A  309      A   12      A  ...        A  312        A  230        P   
4      A -376      A -419      A  ...        A -139        A -367        A   

Unnamed: 0,Gene Description,Gene Accession Number,1,call,2,call.1,3,call.2,4,call.3,...,call.33,30,call.34,31,call.35,32,call.36,33,call.37,Class
0,AFFX-BioB-5_at (endogenous control),AFFX-BioB-5_at,0.385214,A,0.376784,A,0.490713,A,0.424123,A,...,A,0.404879,A,0.472855,A,0.528593,A,0.455520,A,IntOGen Tier-1
1,AFFX-BioB-M_at (endogenous control),AFFX-BioB-M_at,0.386412,A,0.378182,A,0.491202,A,0.424505,A,...,A,0.407834,A,0.472540,A,0.529434,A,0.454680,A,IntOGen Tier-2
2,AFFX-BioB-3_at (endogenous control),AFFX-BioB-3_at,0.388278,A,0.379707,A,0.486531,A,0.431416,A,...,A,0.410109,A,0.474360,A,0.530219,A,0.456589,A,FAMSIC driver
3,AFFX-BioC-5_at (endogenous control),AFFX-BioC-5_at,0.391146,A,0.385722,A,0.497683,A,0.426803,A,...,A,0.419655,A,0.477721,P,0.537080,A,0.463288,A,FAMSIC neutral
4,AFFX-BioC-3_at (endogenous control),AFFX-BioC-3_at,0.383623,A,0.374137,A,0.485282,A,0.418944,A,...,A,0.409077,A,0.466633,A,0.527396,A,0.451043,A,FAMSIC neutral
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7121,GB DEF = Calcium/calmodulin-dependent protein ...,U73738_at,0.386962,A,0.378966,A,0.492686,A,0.430559,A,...,A,0.413815,A,0.474508,A,0.530630,A,0.456787,A,dbSNP
7123,CYP4B1 Cytochrome P450; subfamily IVB; polypep...,X16699_at,0.388690,A,0.379368,A,0.493030,A,0.424578,A,...,A,0.413111,A,0.473041,A,0.530873,A,0.457363,A,dbSNP
7125,HMG2 High-mobility group (nonhistone chromosom...,Z17240_at,0.395879,A,0.385976,A,0.506155,P,0.429684,A,...,A,0.417662,A,0.480155,A,0.536220,A,0.461856,A,dbSNP
7126,RB1 Retinoblastoma 1 (including osteosarcoma),L49218_f_at,0.390124,A,0.379961,A,0.492831,A,0.425672,A,...,A,0.412126,A,0.472967,A,0.531640,A,0.457544,A,dbSNP


# **Feature Extension using SincNet:**
# **Structure-level features**
# **Ratio-metric level features**
# **Mutation-level features**

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Load and preprocess dataset from CSV file
def load_data(csv_path):
    data = pd.read_csv(csv_path)

    # Handling non-numeric columns
    data = data.apply(pd.to_numeric, errors='ignore')  # This keeps non-numeric columns as they are

    # Split features and target
    X = data.iloc[:, :-1].values  # All columns except the last one for features
    y = data.iloc[:, -1].values   # Last column as target

    # Encode categorical columns using LabelEncoder
    X = encode_categorical_columns(X)

    # Standardize the features
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    return X, y

# Encode categorical columns using LabelEncoder
def encode_categorical_columns(X):
    label_encoder = LabelEncoder()
    for i in range(X.shape[1]):
        if X[:, i].dtype == 'object':  # Check if the column is categorical
            X[:, i] = label_encoder.fit_transform(X[:, i])
    return X

# Feature Extraction for Specific Levels
def extract_structure_level_features(X):
    return np.mean(X, axis=1), np.std(X, axis=1)

def extract_ratio_metric_level_features(X):
    ratios = np.sum(X, axis=1) / np.max(X, axis=1)
    return ratios

def extract_mutation_level_features(X):
    mutation_level = np.sum(X, axis=1)
    return mutation_level

# Split data into training and testing sets
def split_data(X, y, test_size=0.2, random_state=42):
    return train_test_split(X, y, test_size=test_size, random_state=random_state)

# Check data before splitting
def check_data(X, y):
    print(f"Shape of X before NaN removal: {X.shape}")
    print(f"Shape of y before NaN removal: {y.shape}")

    # Drop NaN rows
    X = X[~np.isnan(y)]  # Drop rows with NaN in target variable
    y = y[~np.isnan(y)]  # Drop NaN in target

    print(f"Shape of X after NaN removal: {X.shape}")
    print(f"Shape of y after NaN removal: {y.shape}")

    return X, y

# Define the SincConv Layer
class SincConv(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, sample_rate=16000):
        super(SincConv, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.sample_rate = sample_rate

        self.low_hz = nn.Parameter(torch.Tensor(out_channels).uniform_(50, 150))
        self.band_hz = nn.Parameter(torch.Tensor(out_channels).uniform_(50, 200))
        self.window = torch.hamming_window(kernel_size)

    def sinc(self, band, t):
        y = torch.where(t == 0, torch.ones_like(t) * 2 * band, torch.sin(2 * 3.1416 * band * t) / (2 * 3.1416 * band * t))
        return y

    def forward(self, x):
        t = torch.linspace(-self.kernel_size // 2, self.kernel_size // 2, self.kernel_size)
        t = t / self.sample_rate
        filters = []
        for i in range(self.out_channels):
            low = self.low_hz[i]
            high = low + torch.abs(self.band_hz[i])
            band = high - low
            filter_i = self.sinc(high, t) - self.sinc(low, t)
            filter_i *= self.window
            filters.append(filter_i)
        filters = torch.stack(filters).unsqueeze(1)  # Shape: (out_channels, 1, kernel_size)
        return nn.functional.conv1d(x, filters, stride=1, padding=self.kernel_size // 2)

# Define the Full Model for Feature Extraction
class SincNet(nn.Module):
    def __init__(self, input_size, output_size):
        super(SincNet, self).__init__()
        self.sinc_conv = SincConv(in_channels=1, out_channels=16, kernel_size=31)

        self.dummy_input = torch.zeros(1, 1, input_size)
        self.flattened_size = self._get_flattened_size(self.dummy_input)

        self.fc1 = nn.Linear(self.flattened_size, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, output_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def _get_flattened_size(self, x):
        x = self.sinc_conv(x)
        return int(np.prod(x.size()))

    def forward(self, x):
        x = x.unsqueeze(1)
        x = self.sinc_conv(x)
        x = x.view(x.size(0), -1)
        x = self.relu(self.fc1(x))
        x = self.dropout(self.relu(self.fc2(x)))
        x = self.fc3(x)
        return x

# Train the model
def train_model(model, X_train_tensor, y_train_tensor, criterion, optimizer, device, epochs=2, batch_size=64):
    model.train()
    total_batches = len(X_train_tensor) // batch_size
    for epoch in range(epochs):
        total_loss = 0
        for i in range(total_batches):
            start_idx = i * batch_size
            end_idx = (i + 1) * batch_size
            X_batch = X_train_tensor[start_idx:end_idx].to(device)
            y_batch = y_train_tensor[start_idx:end_idx].to(device)

            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / total_batches:.4f}")
# Load the input CSV file
input_file_path = '/content/drive/MyDrive/Colab Notebooks/genome/preprocessed_genomic_data.csv'  # Replace with your input CSV file path
data = pd.read_csv(input_file_path)

# Assume 'Class' column contains the target variable (class labels)
class_column = 'Class'  # Replace with the actual column name of your class labels
data_numeric = data.drop(columns=[class_column])  # Exclude the class column for PCA

# Select only numeric columns for PCA
data_numeric = data_numeric.select_dtypes(include=['number'])

# Apply PCA
pca = PCA(n_components=20)  # Specify the number of components as 10
pca_result = pca.fit_transform(data_numeric)

# Convert PCA result to a DataFrame and rename the columns to 10 components
df = pd.DataFrame(pca_result, columns=[f'Feature {i+1}' for i in range(20)])

# Include the 'Class' column with the PCA features
df[class_column] = data[class_column]

# Save the transformed data to a new CSV file
output_file_path = '/content/drive/MyDrive/Colab Notebooks/genome/output_file_with_class.csv'  # Replace with your desired output file path
df.to_csv(output_file_path, index=False)

print("transformation applied and saved successfully with class labels.")
# Make predictions on the test set
def make_predictions(model, X_test_tensor):
    model.eval()  # Set model to evaluation mode
    with torch.no_grad():
        y_pred = model(X_test_tensor)
        _, predicted_labels = torch.max(y_pred, 1)  # Get the predicted class labels
    return predicted_labels

# Save predictions to a CSV file
def save_predictions_to_csv(predictions, y_test, output_csv_path):
    predictions_df = pd.DataFrame({
        'True Label': y_test,
        'Predicted Label': predictions
    })
    predictions_df.to_csv(output_csv_path, index=False)
    print(f"Predictions saved to {output_csv_path}")
df

transformation applied and saved successfully with class labels.


Unnamed: 0,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,...,Feature 12,Feature 13,Feature 14,Feature 15,Feature 16,Feature 17,Feature 18,Feature 19,Feature 20,Class
0,-0.029757,0.000257,-0.007135,0.002315,0.002437,-0.000689,0.002850,-0.001342,-0.002946,0.000348,...,-0.001064,-0.000366,0.002926,0.001846,0.001596,-0.000492,-0.000292,0.000935,-0.000536,IntOGen Tier-1
1,-0.033825,-0.003063,-0.000071,0.000451,-0.001943,-0.000634,0.004267,-0.000250,-0.004179,-0.000230,...,0.001061,0.000847,-0.000441,-0.000271,0.000993,0.000913,0.001327,0.001377,0.000931,IntOGen Tier-2
2,-0.017410,-0.002297,0.000399,0.000375,0.000531,0.004710,0.005079,0.001926,-0.004760,-0.002529,...,-0.000103,-0.002938,-0.000108,-0.003123,0.000219,0.000649,-0.002481,-0.000949,-0.001712,FAMSIC driver
3,0.009684,-0.001301,0.002329,0.006506,-0.001640,-0.003435,0.000406,-0.001286,0.000466,-0.002296,...,-0.001280,-0.000805,-0.000792,0.003243,-0.000172,0.000169,0.002531,0.004212,-0.002720,FAMSIC neutral
4,-0.047998,0.001492,0.001221,0.002905,-0.003264,-0.001112,0.001002,-0.003728,0.000468,0.000084,...,0.000606,-0.000233,-0.002553,0.002168,0.000182,-0.000477,-0.000504,0.001753,0.001367,FAMSIC neutral
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4736,-0.013081,-0.003182,0.002057,-0.001211,-0.002853,0.000126,0.000119,0.001093,0.000821,-0.002621,...,-0.001640,0.000532,0.001548,-0.001176,0.000056,-0.001398,0.001596,-0.001121,0.000595,dbSNP
4737,-0.016748,-0.002296,0.001840,-0.000538,-0.000958,0.000022,0.000746,0.000326,0.000572,0.000823,...,0.000478,0.000163,-0.000598,0.000365,0.000455,0.000041,0.001157,-0.000484,-0.001090,dbSNP
4738,0.029797,-0.005273,0.009732,-0.011336,0.001816,-0.004077,0.000123,-0.002686,0.001373,0.002029,...,-0.006144,-0.002218,0.002516,0.000160,0.000853,-0.001578,0.004948,0.003899,0.001035,dbSNP
4739,-0.011556,-0.000636,0.000029,-0.001387,-0.000309,-0.000536,0.001022,-0.000458,-0.000401,0.000828,...,-0.000298,-0.000621,0.000012,-0.000564,-0.000689,-0.000651,-0.000281,0.000908,-0.001180,dbSNP


# **Feature Selection**
# **Modified Crow Search Optimization (MCFO):**

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Initialize MCFO parameters
class MCFO:
    def __init__(self, n_features, n_crows=30, max_iter=100, p1=0.3, p2=0.7):
        self.n_features = n_features  # Number of features in the dataset
        self.n_crows = n_crows  # Number of crows
        self.max_iter = max_iter  # Maximum number of iterations
        self.p1 = p1  # Probability of selecting feature
        self.p2 = p2  # Probability of feature exploration
        self.best_solution = None
        self.best_fitness = -np.inf

    def initialize_population(self):
        population = np.random.randint(2, size=(self.n_crows, self.n_features))  # Binary representation of feature selection
        return population

    def fitness(self, solution, X_train, y_train):
        # Select features based on binary solution
        selected_features = [i for i, val in enumerate(solution) if val == 1]
        if len(selected_features) == 0:
            return 0  # No features selected
        X_selected = X_train[:, selected_features]
        model = RandomForestClassifier(n_estimators=100)
        model.fit(X_selected, y_train)
        predictions = model.predict(X_selected)
        return accuracy_score(y_train, predictions)

    def update_position(self, crow, best_position):
        # Update the crow's position using MCFO logic
        new_position = crow.copy()
        for i in range(len(crow)):
            if np.random.rand() < self.p1:
                new_position[i] = 1 - crow[i]  # Explore by flipping bits
            if np.random.rand() < self.p2:
                new_position[i] = best_position[i]  # Exploit by moving towards best solution
        return new_position

    def optimize(self, X_train, y_train):
        # Initialize population
        population = self.initialize_population()
        for iter in range(self.max_iter):
            for i in range(self.n_crows):
                fitness_value = self.fitness(population[i], X_train, y_train)
                if fitness_value > self.best_fitness:
                    self.best_fitness = fitness_value
                    self.best_solution = population[i].copy()

            # Display best fitness and best solution for the current iteration
            print(f"Iteration {iter + 1}: Best Fitness = {self.best_fitness:.4f}")
            print(f"Best Solution (Feature Indices): {np.where(self.best_solution == 1)[0]}")

            # Update the positions of crows
            for i in range(self.n_crows):
                population[i] = self.update_position(population[i], self.best_solution)
        return self.best_solution, self.best_fitness


# Load dataset
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/genome/output_file_with_class.csv')  # Replace with your dataset path
X = data.drop('Class', axis=1).values  # Features (replace 'Class' with your target column)
y = data['Class'].values  # Target variable

# Get feature names
feature_names = data.drop('Class', axis=1).columns

# Data preprocessing
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Apply MCFO for feature optimization
mcfo = MCFO(n_features=X_train.shape[1], n_crows=30, max_iter=1, p1=0.3, p2=0.7)  # Max iter set to 1 for demonstration
best_solution, best_fitness = mcfo.optimize(X_train, y_train)

# Ensure that 'Class' column is included
selected_features = [i for i, val in enumerate(best_solution) if val == 1]
selected_feature_names = [feature_names[i] for i in selected_features]  # Get feature names

# Add the 'Class' column to the selected feature list
selected_feature_names.append('Class')

# Select features for training and testing (without 'Class')
X_selected_train = X_train[:, selected_features]
X_selected_test = X_test[:, selected_features]

# Train model with selected features
model = RandomForestClassifier(n_estimators=1)
model.fit(X_selected_train, y_train)

# Evaluate the model
predictions = model.predict(X_selected_test)
accuracy = accuracy_score(y_test, predictions)

# Prepare DataFrame to save selected features and their values
selected_features_df = pd.DataFrame(X_selected_test, columns=selected_feature_names[:-1])  # Exclude 'Class' from columns
selected_features_df['Class'] = y_test

# Save the selected features and class column to a CSV file
output_file_path = '/content/drive/MyDrive/Colab Notebooks/genome/selected_features_output.csv'  # Replace with desired output file path
selected_features_df.to_csv(output_file_path, index=False)

# Print results
print(f"Best feature subset (indices): {selected_features}")
print(f"Model accuracy with selected features: {accuracy:.4f}")
print(f"Selected feature names: {selected_feature_names}")
print(f"Selected features and class saved to: {output_file_path}")
selected_features_df

Iteration 1: Best Fitness = 1.0000
Best Solution (Feature Indices): [ 1  2  3  4  6  8  9 10 12 13 14 16 18 19]
Best feature subset (indices): [1, 2, 3, 4, 6, 8, 9, 10, 12, 13, 14, 16, 18, 19]
Model accuracy with selected features: 0.9979
Selected feature names: ['Feature 2', 'Feature 3', 'Feature 4', 'Feature 5', 'Feature 7', 'Feature 9', 'Feature 10', 'Feature 11', 'Feature 13', 'Feature 14', 'Feature 15', 'Feature 17', 'Feature 19', 'Feature 20', 'Class']
Selected features and class saved to: /content/drive/MyDrive/Colab Notebooks/genome/selected_features_output.csv


Unnamed: 0,Feature 2,Feature 3,Feature 4,Feature 5,Feature 7,Feature 9,Feature 10,Feature 11,Feature 13,Feature 14,Feature 15,Feature 17,Feature 19,Feature 20,Class
0,-0.704864,0.848421,5.162741,-1.914564,0.215428,-0.289876,0.733572,-1.362767,3.021428,0.802039,-1.917211,1.238105,0.239673,-0.236144,dbSNP
1,0.570061,-0.720523,0.128866,-0.286946,-0.857320,-1.174479,-1.328960,1.864637,1.022519,0.436560,-0.891466,1.515292,-1.464749,-0.121962,dbSNP
2,-0.418998,0.533668,-0.152782,-0.293382,-0.117913,0.153903,0.899741,0.255051,0.994294,1.080278,1.362654,1.285985,-0.072750,-0.010947,dbSNP
3,1.614831,1.462299,-0.450938,-0.040287,0.650210,0.310563,-0.222304,-0.126250,-0.631111,-0.497849,-0.871363,0.956866,0.431748,-1.914310,dbSNP
4,2.928062,1.134852,-0.357564,1.253648,0.257373,-1.042706,0.815815,0.103244,-0.013105,-1.046566,-0.216168,0.309978,0.167900,1.255751,dbSNP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
944,1.169212,-0.100425,-1.160145,-0.690307,1.215099,3.736307,-1.163004,0.300380,-0.759685,-0.348492,-1.384946,-1.233796,-0.039368,-1.356900,dbSNP
945,-0.073895,-0.198507,-0.257363,0.201086,0.450445,-0.113235,-0.032105,-0.141729,-0.057574,0.867584,-0.304988,0.226942,-0.178295,-0.301503,dbSNP
946,0.363559,0.569339,-1.015532,-0.393148,0.195625,-0.848998,-0.043954,0.406606,0.352705,0.301001,0.165928,-0.557541,0.236906,-0.177315,dbSNP
947,-1.030131,-0.332184,2.095821,2.430815,-0.683621,0.590276,-0.538411,-0.291628,-1.846115,3.442045,0.252463,0.948508,-2.348558,1.689091,dbSNP


# **Classification**

# **dual-mode impulsive pantograph neural network (DMIP-NN)**

In [17]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load dataset from CSV file
def load_data_from_csv(file_path):
    # Load data from CSV file
    data = pd.read_csv(file_path)

    # Separate features and labels
    X = data.iloc[:, :-1].values  # All columns except the last one as features
    y = data.iloc[:, -1].values   # The last column as the label

    # Check for unique values in the labels
    print("Unique values in label column:", np.unique(y))

    # Convert non-numeric labels to numeric or drop rows with invalid labels
    try:
        y = np.array(y, dtype=np.float32)  # Try casting to float32
    except ValueError as e:
        print(f"Error converting labels to float32: {e}")

        # Handle non-numeric labels (for example, converting 'dbSNP' to 0 or 1)
        y = np.where(y == 'dbSNP', 0, 1)  # Example: convert 'dbSNP' to 0 and others to 1

    return X, y

# Neural Network Model Definition using only standard layers (no custom activation)
def create_dmip_nn(input_dim):
    model = Sequential()
    model.add(Input(shape=(input_dim,)))

    # Add Dense layers with standard ReLU activation
    model.add(Dense(64, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(16, activation='relu'))

    model.add(Dense(1, activation='sigmoid'))  # Sigmoid for binary classification or regression

    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

    return model

# Load your data from a CSV file
file_path = '/content/drive/MyDrive/Colab Notebooks/genome/selected_features_output.csv'  # Replace with your actual CSV file path
X, y = load_data_from_csv(file_path)

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Initialize and train the DMIP-NN model
model = create_dmip_nn(input_dim=X_train.shape[1])
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")

# Predict the labels for the test data
y_pred = (model.predict(X_test) > 0.5).astype("int32")  # Convert predictions to binary (0 or 1)

# Calculate accuracy, precision, recall, and F1-score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
fmeasure = f1_score(y_test, y_pred)

# Print the metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F-measure: {fmeasure:.4f}")


Unique values in label column: ['dbSNP']
Error converting labels to float32: could not convert string to float: 'dbSNP'
Epoch 1/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 19ms/step - accuracy: 0.4903 - loss: 0.6864 - val_accuracy: 1.0000 - val_loss: 0.3871
Epoch 2/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 1.0000 - loss: 0.3143 - val_accuracy: 1.0000 - val_loss: 0.0989
Epoch 3/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 1.0000 - loss: 0.0806 - val_accuracy: 1.0000 - val_loss: 0.0237
Epoch 4/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 1.0000 - loss: 0.0202 - val_accuracy: 1.0000 - val_loss: 0.0096
Epoch 5/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 1.0000 - loss: 0.0098 - val_accuracy: 1.0000 - val_loss: 0.0049
Epoch 6/50
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms