<a href="https://colab.research.google.com/github/Ignas12345/Magistro_projektas/blob/main/knygutes/trecias_bandymas_klasifikuoti.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score


In [None]:
# uzkraunam neapdorotus duomenis
url = 'https://raw.githubusercontent.com/Ignas12345/Magistro_projektas/refs/heads/main/counts_combined.csv'
df = pd.read_csv(url)
gene_names = df["miRNA_ID"].to_numpy()
df_trimmed = df.drop(df.columns[0], axis=1)
full_data = (df_trimmed.to_numpy(dtype=np.int32)).T

In [3]:
#arba uzkraunam isfiltruotus duomenis is praeitos knygutes
url = 'https://raw.githubusercontent.com/Ignas12345/Magistro_projektas/refs/heads/main/filtruoti_duomenys/filtruoti_1_su_310_miRNR/'
filtered_data = (pd.read_csv(url + 'further_filtered_data.csv').values).T
gene_names = pd.read_csv(url + 'further_filtered_gene_names.csv', header = None).values.flatten()
print(np.shape(filtered_data))
print(np.shape(gene_names))

(139, 310)
(310,)


In [4]:
X = filtered_data

In [5]:
# Create labels: first 6 samples -> label 0, rest -> label 1
num_samples = X.shape[0]
labels = torch.zeros(num_samples, dtype=torch.float32)
labels[6:] = 1  # Set the rest of the samples to label 1

# Convert X to a PyTorch tensor
features = torch.tensor(X, dtype=torch.float32)

#netikri papildomi treniravimo duomenys
benign_features = features[:6]  # First 6 samples are benign
benign_labels = labels[:6]

# Duplicate benign samples with slight noise
augmented_features = benign_features + 0.01 * torch.randn_like(benign_features)
features = torch.cat([features, augmented_features])
labels = torch.cat([labels, benign_labels])

mean = features.mean(dim=0)
std = features.std(dim=0)
features = (features - mean) / (std + 1e-8)

# Combine features and labels into a dataset
dataset = TensorDataset(features, labels)

In [9]:
# Create a DataLoader for batching
batch_size = 1  # You can adjust this as needed
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Example: Print the shape of a batch
for batch_features, batch_labels in data_loader:
    print(f"Batch features shape: {batch_features.shape}")
    print(f"Batch labels shape: {batch_labels.shape}")
    break

Batch features shape: torch.Size([1, 310])
Batch labels shape: torch.Size([1])


In [None]:
print(features[0])

In [None]:
for batch_features, batch_labels in data_loader:
    print(f"Batch features shape: {batch_features}")
    print(f"Batch labels shape: {batch_labels}")
    break

Batch features shape: tensor([[-0.4202, -0.4190, -0.4189, -0.5222, -0.4267, -0.4379, -0.5122, -0.3789,
         -0.3905, -0.2518, -0.4331, -0.5286, -0.0767, -0.0914, -0.6333, -0.6347,
          1.5937,  1.5757, -0.6209, -0.8597, -0.6032, -0.6093, -0.3402, -0.4592,
         -0.3929, -0.7102, -0.5144, -0.5066, -0.5726, -0.2692,  0.1066,  0.2820,
         -0.0914, -0.1206, -0.3672,  0.0884, -0.5307, -0.0728, -0.5520, -0.9523,
         -0.4180, -0.5381, -0.2854, -0.7428, -0.3375, -0.3485, -0.3906, -0.4429,
          0.4400, -0.2638, -0.6155, -0.2417,  0.1125, -0.5115,  1.0740, -0.2824,
         -0.1617,  0.0269, -0.0662, -0.4925, -0.1999, -0.0687, -0.1604, -0.6651,
          0.5691, -0.4192, -0.2966, -0.6920, -0.4501, -0.4978, -0.5041, -0.4984,
         -0.3729, -0.4540, -0.0171,  0.3037, -0.3918, -0.3279, -0.5138, -0.3938,
         -0.7908,  0.2592, -0.1739, -0.3517, -0.8469,  0.6807, -0.2942, -0.2999,
         -0.1894, -0.5065, -0.4219,  0.1239, -0.5509, -0.5626, -0.5792, -0.7760,
      

In [6]:
class SparseNN(nn.Module):
    def __init__(self, input_dim, temperature = 1):
        super(SparseNN, self).__init__()
        self.input_dim = input_dim
        self.temperature = temperature
        self.first_layer_weights = nn.Parameter(torch.randn(input_dim))  # 1D tensor for scaling
        self.first_layer_biases = nn.Parameter(torch.randn(input_dim))
        self.output_weights = nn.Parameter(torch.randn(input_dim))  # Raw weights

    def forward(self, x):
        first_weights = torch.sigmoid(self.first_layer_weights)

        first_layer_output = torch.tanh((x * first_weights + self.first_layer_biases) * 100)
        weighted_output = first_layer_output * self.output_weights
        pooled_output = weighted_output.sum(dim=1)

        output = torch.sigmoid(pooled_output / self.temperature)

        return output

def l1_regularization(weight, lambda_l1=0.001):
    return lambda_l1 * weight.abs().sum()



In [15]:
#initialize the model
input_dim = features.shape[1]
model = SparseNN(input_dim, temperature = 2)
lambda_l1 = 0.01

# Define a loss function and optimizer
criterion = nn.BCELoss()  # Binary Cross Entropy Loss for binary classification
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)  # L1 sparsity via weight_decay

In [16]:
# Training Loop
num_epochs = 40  # Adjust based on dataset size and convergence

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    for batch_features, batch_labels in data_loader:
        # Forward pass
        outputs = model(batch_features)  # Ensure shape compatibility for BCELoss
        # Compute binary cross-entropy loss
        loss = criterion(outputs, batch_labels)

        # Add L1 regularization for sparsity
        loss += l1_regularization(model.output_weights, lambda_l1)

        #make probabilities less extreme
        penalty = (outputs * torch.log(outputs + 1e-8) + (1 - outputs) * torch.log(1 - outputs + 1e-8)).mean()
        loss += 0.1 * penalty

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Logging epoch statistics
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {total_loss / len(data_loader):.4f}")

# Evaluation (after training)
model.eval()
with torch.no_grad():
    # Forward pass on the entire dataset
    predictions = model(features)
    predictions = (predictions > 0.5).float()  # Convert probabilities to binary predictions

    # Compute accuracy
    accuracy = accuracy_score(labels.numpy(), predictions.numpy())
    print(f"Final Accuracy: {accuracy:.4f}")

Epoch [1/40], Loss: 3.0852
Epoch [2/40], Loss: 2.9150
Epoch [3/40], Loss: 2.7641
Epoch [4/40], Loss: 2.6711
Epoch [5/40], Loss: 2.6111
Epoch [6/40], Loss: 2.5616
Epoch [7/40], Loss: 2.5109
Epoch [8/40], Loss: 2.4614
Epoch [9/40], Loss: 2.4107
Epoch [10/40], Loss: 2.3553
Epoch [11/40], Loss: 2.3010
Epoch [12/40], Loss: 2.2394
Epoch [13/40], Loss: 2.1782
Epoch [14/40], Loss: 2.1152
Epoch [15/40], Loss: 2.0503
Epoch [16/40], Loss: 1.9823
Epoch [17/40], Loss: 1.9168
Epoch [18/40], Loss: 1.8495
Epoch [19/40], Loss: 1.7789
Epoch [20/40], Loss: 1.7084
Epoch [21/40], Loss: 1.6402
Epoch [22/40], Loss: 1.5722
Epoch [23/40], Loss: 1.5057
Epoch [24/40], Loss: 1.4386
Epoch [25/40], Loss: 1.3703
Epoch [26/40], Loss: 1.3106
Epoch [27/40], Loss: 1.2428
Epoch [28/40], Loss: 1.1858
Epoch [29/40], Loss: 1.1251
Epoch [30/40], Loss: 1.0713
Epoch [31/40], Loss: 1.0193
Epoch [32/40], Loss: 0.9703
Epoch [33/40], Loss: 0.9261
Epoch [34/40], Loss: 0.8818
Epoch [35/40], Loss: 0.8377
Epoch [36/40], Loss: 0.7970
E

In [17]:

probs = 100 * model(features).detach().numpy()
formatted_probs = np.array([f"{p:.2f}%" for p in probs])
print(formatted_probs)


['3.05%' '13.94%' '48.85%' '22.51%' '12.41%' '33.99%' '95.17%' '98.54%'
 '98.00%' '99.28%' '99.94%' '96.49%' '99.69%' '99.59%' '98.18%' '98.35%'
 '99.19%' '98.72%' '99.87%' '97.06%' '99.21%' '83.70%' '92.77%' '98.65%'
 '99.51%' '98.98%' '97.74%' '96.90%' '97.98%' '91.10%' '97.56%' '97.41%'
 '99.60%' '99.48%' '99.76%' '87.38%' '99.70%' '99.64%' '99.89%' '99.81%'
 '70.30%' '99.24%' '98.54%' '98.99%' '96.59%' '98.52%' '99.34%' '92.80%'
 '96.54%' '97.77%' '99.60%' '97.35%' '99.66%' '94.40%' '83.89%' '99.75%'
 '99.90%' '99.92%' '99.04%' '98.92%' '99.52%' '99.76%' '97.44%' '99.72%'
 '99.42%' '99.65%' '88.54%' '99.81%' '96.45%' '97.63%' '97.13%' '87.13%'
 '80.85%' '99.77%' '99.32%' '97.52%' '94.62%' '96.66%' '99.08%' '95.07%'
 '84.82%' '98.45%' '99.73%' '97.11%' '99.09%' '99.58%' '98.53%' '95.60%'
 '99.66%' '97.76%' '99.85%' '99.04%' '93.48%' '99.43%' '99.80%' '98.98%'
 '98.87%' '99.90%' '99.63%' '98.33%' '99.78%' '99.70%' '89.66%' '99.42%'
 '95.42%' '99.13%' '99.87%' '99.83%' '99.07%' '70.40

In [None]:
#jeigu procentai sutampa su tikrove, issaugom svorius (ir jeigu jie naudingi...)

torch.save(model.first_layer_weights.data, "first_layer_weights.pt")

# Save the first layer's biases (if present)
torch.save(model.first_layer_biases.data, "first_layer_biases.pt")

# Save the second layer (sparse) weights
torch.save(model.output_weights_raw.data, "output_weights_raw.pt")

In [18]:
# Svoriu analize is idejos turetu parodyti, kurie genai yra svarbiausi priimant sprendima
output_weights = model.output_weights.data
print(model.output_weights.data)


tensor([-2.5330e-03, -1.2592e+00,  3.1983e-04, -2.1926e-03,  3.7992e-04,
        -3.3820e-04, -4.5530e-01, -3.7946e-04,  3.0042e-04, -2.9972e-04,
         1.4058e-04,  1.3845e-04, -1.3908e-02,  2.0122e-04,  2.7481e-04,
         1.5958e-04,  5.9291e-04,  9.2438e-01, -1.6107e-01,  7.3743e-01,
         2.9973e-04, -1.3522e-03, -3.7082e-04, -1.1151e-03, -1.4761e-01,
        -7.5591e-01, -4.3986e-04, -5.6375e-03,  3.1244e-04, -9.5029e-03,
        -5.5138e-04,  8.8996e-01,  5.1193e-01, -3.3815e-01,  2.6450e-04,
        -3.2485e-01,  2.4615e-04,  2.4104e-04, -6.2923e-01,  1.0488e+00,
         4.6793e-04,  2.5583e-01, -2.0701e-01,  2.5893e-03, -2.0408e-01,
         2.9645e-04,  1.6083e-04, -1.8986e-04, -2.5654e-01, -1.0167e-02,
         5.0028e-01,  2.9206e-04, -3.0392e-04, -2.1084e-01, -5.3479e-01,
        -5.0211e-03, -8.6039e-01,  2.5391e-04, -2.0971e-03, -2.9152e-04,
         3.1914e-04,  4.4486e-04, -8.0783e-01, -1.9709e-03,  4.6935e-04,
        -5.2079e-03,  2.4506e-04,  2.9990e-04, -9.9

In [24]:
sorted_indices = torch.argsort(np.abs(output_weights), descending=True)
values = output_weights[sorted_indices]
genes = gene_names[sorted_indices]

print(sorted_indices[:10])
print(values[:10])
print(gene_names[280])

tensor([287, 280,  70, 125, 263,   1,  76,  85, 168, 285])
tensor([-2.2200, -2.0122,  1.7648,  1.5862,  1.5305, -1.2592, -1.1371, -1.1141,
         1.0997, -1.0876])
hsa-mir-656


In [None]:
model = SparseNN(input_dim, temperature = 5)

loaded_first_layer_weights = torch.load("first_layer_weights.pt")
loaded_first_layer_biases = torch.load("first_layer_biases.pt")
loaded_otput_weights_raw = torch.load("output_weights_raw.pt")

# Assign the loaded values to the model
model.first_layer_weights.data = loaded_first_layer_weights
model.first_layer_biases.data = loaded_first_layer_biases
model.output_weights_raw.data = loaded_otput_weights_raw

probs = 100 * model(features).detach().numpy()
formatted_probs = np.array([f"{p:.2f}%" for p in probs])
print(formatted_probs)

['0.28%' '2.63%' '98.76%' '13.91%' '6.85%' '2.21%' '100.00%' '99.84%'
 '99.97%' '99.61%' '99.25%' '99.98%' '100.00%' '100.00%' '99.84%' '96.65%'
 '99.59%' '99.98%' '100.00%' '99.96%' '99.95%' '100.00%' '99.56%' '99.91%'
 '99.99%' '100.00%' '99.87%' '98.75%' '99.90%' '91.62%' '99.97%' '99.85%'
 '99.57%' '100.00%' '93.07%' '99.83%' '99.97%' '100.00%' '100.00%'
 '100.00%' '78.69%' '85.09%' '99.97%' '99.86%' '99.94%' '99.91%' '98.92%'
 '96.46%' '100.00%' '99.58%' '100.00%' '98.58%' '99.46%' '99.59%'
 '100.00%' '98.60%' '97.74%' '100.00%' '100.00%' '99.84%' '100.00%'
 '100.00%' '93.83%' '84.41%' '99.85%' '99.96%' '100.00%' '99.79%' '99.79%'
 '99.93%' '99.92%' '92.36%' '100.00%' '99.76%' '99.98%' '99.78%' '99.87%'
 '99.77%' '100.00%' '97.00%' '91.85%' '79.52%' '99.78%' '84.40%' '99.07%'
 '100.00%' '99.98%' '99.56%' '99.98%' '100.00%' '100.00%' '95.92%'
 '99.38%' '92.33%' '99.79%' '100.00%' '99.97%' '99.04%' '100.00%'
 '100.00%' '99.99%' '99.54%' '99.99%' '100.00%' '99.99%' '98.96%' '99.99%'


  loaded_first_layer_weights = torch.load("first_layer_weights.pt")
  loaded_first_layer_biases = torch.load("first_layer_biases.pt")
  loaded_otput_weights_raw = torch.load("output_weights_raw.pt")
