<a href="https://colab.research.google.com/github/Ignas12345/Magistro_projektas/blob/main/knygutes/antras_bandymas_klasifikuoti.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score


In [28]:
# uzkraunam neapdorotus duomenis
url = 'https://raw.githubusercontent.com/Ignas12345/Magistro_projektas/refs/heads/main/counts_combined.csv'
df = pd.read_csv(url)
gene_names = df["miRNA_ID"].to_numpy()
df_trimmed = df.drop(df.columns[0], axis=1)
full_data = (df_trimmed.to_numpy(dtype=np.int32)).T

In [29]:
#arba uzkraunam isfiltruotus duomenis is praeitos knygutes
url = 'https://raw.githubusercontent.com/Ignas12345/Magistro_projektas/refs/heads/main/filtruoti_duomenys/filtruoti_1_su_310_miRNR/'
filtered_data = (pd.read_csv(url + 'further_filtered_data.csv').values).T
gene_names = pd.read_csv(url + 'further_filtered_gene_names.csv', header = None).values.flatten()
print(np.shape(filtered_data))
print(np.shape(gene_names))

(139, 310)
(310,)


In [30]:
X = filtered_data

In [31]:
# Create labels: first 6 samples -> label 0, rest -> label 1
num_samples = X.shape[0]
labels = torch.zeros(num_samples, dtype=torch.float32)
labels[6:] = 1  # Set the rest of the samples to label 1

# Convert X to a PyTorch tensor
features = torch.tensor(X, dtype=torch.float32)

#netikri papildomi treniravimo duomenys
benign_features = features[:6]  # First 6 samples are benign
benign_labels = labels[:6]

# Duplicate benign samples with slight noise
augmented_features = benign_features + 0.01 * torch.randn_like(benign_features)
features = torch.cat([features, augmented_features])
labels = torch.cat([labels, benign_labels])

mean = features.mean(dim=0)
std = features.std(dim=0)
features = (features - mean) / (std + 1e-8)

# Combine features and labels into a dataset
dataset = TensorDataset(features, labels)

In [6]:
# Create a DataLoader for batching
batch_size = 1  # You can adjust this as needed
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Example: Print the shape of a batch
for batch_features, batch_labels in data_loader:
    print(f"Batch features shape: {batch_features.shape}")
    print(f"Batch labels shape: {batch_labels.shape}")
    break

Batch features shape: torch.Size([1, 310])
Batch labels shape: torch.Size([1])


In [None]:
print(features[0])

In [32]:
for batch_features, batch_labels in data_loader:
    print(f"Batch features shape: {batch_features}")
    print(f"Batch labels shape: {batch_labels}")
    break

Batch features shape: tensor([[-0.4202, -0.4190, -0.4189, -0.5222, -0.4267, -0.4379, -0.5122, -0.3789,
         -0.3905, -0.2518, -0.4331, -0.5286, -0.0767, -0.0914, -0.6333, -0.6347,
          1.5937,  1.5757, -0.6209, -0.8597, -0.6032, -0.6093, -0.3402, -0.4592,
         -0.3929, -0.7102, -0.5144, -0.5066, -0.5726, -0.2692,  0.1066,  0.2820,
         -0.0914, -0.1206, -0.3672,  0.0884, -0.5307, -0.0728, -0.5520, -0.9523,
         -0.4180, -0.5381, -0.2854, -0.7428, -0.3375, -0.3485, -0.3906, -0.4429,
          0.4400, -0.2638, -0.6155, -0.2417,  0.1125, -0.5115,  1.0740, -0.2824,
         -0.1617,  0.0269, -0.0662, -0.4925, -0.1999, -0.0687, -0.1604, -0.6651,
          0.5691, -0.4192, -0.2966, -0.6920, -0.4501, -0.4978, -0.5041, -0.4984,
         -0.3729, -0.4540, -0.0171,  0.3037, -0.3918, -0.3279, -0.5138, -0.3938,
         -0.7908,  0.2592, -0.1739, -0.3517, -0.8469,  0.6807, -0.2942, -0.2999,
         -0.1894, -0.5065, -0.4219,  0.1239, -0.5509, -0.5626, -0.5792, -0.7760,
      

In [35]:
class SparseNN(nn.Module):
    def __init__(self, input_dim, temperature = 1):
        super(SparseNN, self).__init__()
        self.input_dim = input_dim
        self.temperature = temperature
        self.output_weights = nn.Parameter(torch.randn(input_dim))  # Raw weights

    def forward(self, x):
        first_layer_output = torch.tanh(x)
        weighted_output = first_layer_output * self.output_weights
        pooled_output = weighted_output.sum(dim=1)

        output = torch.sigmoid(pooled_output / self.temperature)

        return output

def l1_regularization(weight, lambda_l1=0.001):
    return lambda_l1 * weight.abs().sum()



In [52]:
#initialize the model
input_dim = features.shape[1]
model = SparseNN(input_dim, temperature = 1)

# Define a loss function and optimizer
criterion = nn.BCELoss()  # Binary Cross Entropy Loss for binary classification
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)  # L1 sparsity via weight_decay

In [50]:
# Training Loop
num_epochs = 50  # Adjust based on dataset size and convergence
lambda_l1 = 0.01  # Regularization strength

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    for batch_features, batch_labels in data_loader:
        # Forward pass
        outputs = model(batch_features)  # Ensure shape compatibility for BCELoss
        # Compute binary cross-entropy loss
        loss = criterion(outputs, batch_labels)

        # Add L1 regularization for sparsity
        loss += l1_regularization(model.output_weights, lambda_l1)

        #make probabilities less extreme
        penalty = (outputs * torch.log(outputs + 1e-8) + (1 - outputs) * torch.log(1 - outputs + 1e-8)).mean()
        loss += 0.1 * penalty

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Logging epoch statistics
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {total_loss / len(data_loader):.4f}")

# Evaluation (after training)
model.eval()
with torch.no_grad():
    # Forward pass on the entire dataset
    predictions = model(features)
    predictions = (predictions > 0.5).float()  # Convert probabilities to binary predictions

    # Compute accuracy
    accuracy = accuracy_score(labels.numpy(), predictions.numpy())
    print(f"Final Accuracy: {accuracy:.4f}")

Epoch [1/50], Loss: 4.2571
Epoch [2/50], Loss: 3.3859
Epoch [3/50], Loss: 3.1003
Epoch [4/50], Loss: 2.9384
Epoch [5/50], Loss: 2.8211
Epoch [6/50], Loss: 2.7203
Epoch [7/50], Loss: 2.6365
Epoch [8/50], Loss: 2.5646
Epoch [9/50], Loss: 2.4977
Epoch [10/50], Loss: 2.4403
Epoch [11/50], Loss: 2.3843
Epoch [12/50], Loss: 2.3319
Epoch [13/50], Loss: 2.2796
Epoch [14/50], Loss: 2.2333
Epoch [15/50], Loss: 2.1835
Epoch [16/50], Loss: 2.1359
Epoch [17/50], Loss: 2.0894
Epoch [18/50], Loss: 2.0425
Epoch [19/50], Loss: 1.9973
Epoch [20/50], Loss: 1.9525
Epoch [21/50], Loss: 1.9076
Epoch [22/50], Loss: 1.8632
Epoch [23/50], Loss: 1.8204
Epoch [24/50], Loss: 1.7756
Epoch [25/50], Loss: 1.7352
Epoch [26/50], Loss: 1.6894
Epoch [27/50], Loss: 1.6479
Epoch [28/50], Loss: 1.6084
Epoch [29/50], Loss: 1.5690
Epoch [30/50], Loss: 1.5294
Epoch [31/50], Loss: 1.4914
Epoch [32/50], Loss: 1.4550
Epoch [33/50], Loss: 1.4175
Epoch [34/50], Loss: 1.3790
Epoch [35/50], Loss: 1.3427
Epoch [36/50], Loss: 1.3064
E

In [51]:

probs = 100 * model(features).detach().numpy()
formatted_probs = np.array([f"{p:.2f}%" for p in probs])
print(formatted_probs)


['3.50%' '5.15%' '77.16%' '20.94%' '2.28%' '18.25%' '88.19%' '66.17%'
 '97.64%' '90.80%' '93.98%' '92.69%' '65.33%' '83.98%' '78.70%' '60.62%'
 '98.07%' '98.99%' '81.91%' '88.80%' '91.57%' '79.83%' '68.42%' '81.81%'
 '88.03%' '91.37%' '87.03%' '83.67%' '89.00%' '46.18%' '93.30%' '87.30%'
 '90.24%' '73.27%' '75.23%' '17.03%' '58.32%' '93.63%' '98.23%' '84.02%'
 '65.12%' '94.84%' '94.27%' '80.27%' '80.97%' '76.63%' '84.98%' '82.26%'
 '88.08%' '92.56%' '56.12%' '86.92%' '91.56%' '14.04%' '84.68%' '55.60%'
 '92.27%' '74.54%' '86.09%' '77.56%' '82.38%' '82.64%' '87.59%' '82.30%'
 '99.43%' '69.31%' '87.07%' '85.58%' '98.56%' '71.37%' '96.13%' '89.19%'
 '93.26%' '60.74%' '79.15%' '96.80%' '89.34%' '93.58%' '87.75%' '71.01%'
 '87.28%' '91.01%' '62.85%' '89.45%' '87.07%' '67.47%' '76.51%' '87.80%'
 '78.26%' '81.98%' '91.69%' '50.00%' '69.15%' '93.37%' '91.01%' '83.51%'
 '76.24%' '83.96%' '93.53%' '91.93%' '97.36%' '87.54%' '79.91%' '75.24%'
 '59.27%' '76.37%' '99.41%' '96.81%' '74.23%' '64.93%'

In [None]:
#jeigu procentai sutampa su tikrove, issaugom svorius (ir jeigu jie naudingi...)

torch.save(model.first_layer_weights.data, "first_layer_weights.pt")

# Save the first layer's biases (if present)
torch.save(model.first_layer_biases.data, "first_layer_biases.pt")

# Save the second layer (sparse) weights
torch.save(model.output_weights_raw.data, "output_weights_raw.pt")

In [53]:
# Svoriu analize is idejos turetu parodyti, kurie genai yra svarbiausi priimant sprendima
output_weights = model.output_weights.data
print(model.output_weights.data)


tensor([-8.0371e-01, -1.9905e-01, -6.9083e-02, -8.4472e-01, -1.3725e+00,
        -1.2039e+00,  2.4423e-01,  8.2587e-02,  2.9562e-01,  5.6423e-01,
         1.1296e+00,  1.3189e+00, -1.0270e+00,  1.8009e-01,  6.7404e-01,
        -1.6188e+00,  2.0592e-01,  1.6052e+00,  4.7599e-01,  8.4591e-01,
         2.5805e-01,  2.6345e-01,  1.9229e+00,  9.7635e-01, -2.5098e+00,
         1.2302e+00, -1.4879e+00,  3.7657e-01, -4.3982e-02, -1.0523e+00,
        -2.1744e-01,  1.1730e+00, -5.2970e-01,  2.0544e+00, -2.0209e-01,
        -9.3559e-01, -1.4347e-01,  8.2741e-01, -2.0529e-01, -1.6451e+00,
        -4.2957e-01, -1.2683e+00,  1.8300e+00,  5.2238e-01, -8.8786e-02,
         8.8914e-01,  1.7513e-01, -4.2077e-02, -4.1244e-01,  8.4879e-01,
         4.3400e-01,  8.8089e-01,  1.3504e+00,  9.5448e-01, -9.4601e-01,
         3.7017e-02,  5.0306e-01,  9.3641e-02,  1.5665e-01,  3.4711e-01,
        -2.2121e-01,  9.0479e-01,  1.3822e+00,  5.3259e-01, -4.8295e-01,
         9.7658e-01, -8.2628e-01, -7.8465e-01,  7.6

In [56]:
sorted_indices = torch.argsort(np.abs(output_weights), descending=True)
values = output_weights[sorted_indices]
genes = gene_names[sorted_indices]

print(sorted_indices[:50])
print(values[:50])
print(gene_names[264])

tensor([258, 264,  24, 202, 180, 289,  33, 116, 266, 308,  22, 225,  42, 221,
         75,  92, 244, 268, 215, 240, 282,  39,  15,  17, 300, 197, 269, 250,
        163,  90, 265,  26, 187, 109, 118, 108, 238, 131, 177, 101, 204, 151,
        156,  62,   4, 171, 162,  52, 284,  11])
tensor([ 2.7127,  2.6269, -2.5098,  2.4584, -2.1656,  2.1328,  2.0544, -1.9919,
        -1.9733,  1.9512,  1.9229,  1.8349,  1.8300,  1.8247, -1.7635, -1.7530,
         1.7185,  1.7174,  1.7011, -1.7006, -1.6783, -1.6451, -1.6188,  1.6052,
         1.5936, -1.5396, -1.5275,  1.5223, -1.5091, -1.5021,  1.4959, -1.4879,
         1.4864, -1.4832, -1.4808, -1.4741,  1.4736,  1.4714, -1.4665,  1.4492,
        -1.4438,  1.3973,  1.3857,  1.3822, -1.3725, -1.3565,  1.3513,  1.3504,
        -1.3249,  1.3189])
hsa-mir-574


In [None]:
model = SparseNN(input_dim, temperature = 5)

loaded_first_layer_weights = torch.load("first_layer_weights.pt")
loaded_first_layer_biases = torch.load("first_layer_biases.pt")
loaded_otput_weights_raw = torch.load("output_weights_raw.pt")

# Assign the loaded values to the model
model.first_layer_weights.data = loaded_first_layer_weights
model.first_layer_biases.data = loaded_first_layer_biases
model.output_weights_raw.data = loaded_otput_weights_raw

probs = 100 * model(features).detach().numpy()
formatted_probs = np.array([f"{p:.2f}%" for p in probs])
print(formatted_probs)

['0.28%' '2.63%' '98.76%' '13.91%' '6.85%' '2.21%' '100.00%' '99.84%'
 '99.97%' '99.61%' '99.25%' '99.98%' '100.00%' '100.00%' '99.84%' '96.65%'
 '99.59%' '99.98%' '100.00%' '99.96%' '99.95%' '100.00%' '99.56%' '99.91%'
 '99.99%' '100.00%' '99.87%' '98.75%' '99.90%' '91.62%' '99.97%' '99.85%'
 '99.57%' '100.00%' '93.07%' '99.83%' '99.97%' '100.00%' '100.00%'
 '100.00%' '78.69%' '85.09%' '99.97%' '99.86%' '99.94%' '99.91%' '98.92%'
 '96.46%' '100.00%' '99.58%' '100.00%' '98.58%' '99.46%' '99.59%'
 '100.00%' '98.60%' '97.74%' '100.00%' '100.00%' '99.84%' '100.00%'
 '100.00%' '93.83%' '84.41%' '99.85%' '99.96%' '100.00%' '99.79%' '99.79%'
 '99.93%' '99.92%' '92.36%' '100.00%' '99.76%' '99.98%' '99.78%' '99.87%'
 '99.77%' '100.00%' '97.00%' '91.85%' '79.52%' '99.78%' '84.40%' '99.07%'
 '100.00%' '99.98%' '99.56%' '99.98%' '100.00%' '100.00%' '95.92%'
 '99.38%' '92.33%' '99.79%' '100.00%' '99.97%' '99.04%' '100.00%'
 '100.00%' '99.99%' '99.54%' '99.99%' '100.00%' '99.99%' '98.96%' '99.99%'


  loaded_first_layer_weights = torch.load("first_layer_weights.pt")
  loaded_first_layer_biases = torch.load("first_layer_biases.pt")
  loaded_otput_weights_raw = torch.load("output_weights_raw.pt")
