## Data Preprocessing

The current dataset contains the sequence for the sgRNA, features like the **log2fc** and **effect** and information about the target sequence for which the sgRNA is being used. My goal is to create a model which can take in a target sequence and it can generate new sgRNA sequences for that target sequence. For that, the first step will be to extract the raw target sequence itself

In [38]:
# Importing the necessary libraries
import pandas as pd
import requests
from Bio.Seq import Seq

In [39]:
# Load the data from CSV
df = pd.read_csv('GenomeCRISPR_full.csv')

In [40]:
# Filter the dataframe to keep the relevant columns and drop NAs and duplicate values
df = df[['sequence', 'symbol', 'ensg', 'log2fc', 'effect']].dropna().drop_duplicates()

In [51]:
len(pd.unique(df['ensg']).tolist())

18950

In [5]:
def fetch_gene_sequence(ensg):
    """Fetches the entire gene sequence from ENSEMBL using the gene's ENSEMBL ID."""
    url = f"https://rest.ensembl.org/lookup/id/{ensg}?expand=1;content-type=application/json"
    response = requests.get(url, headers={"Content-Type": "application/json"})

    if response.status_code == 200:
        gene_data = response.json()
        chromosome = gene_data['seq_region_name']
        start = gene_data['start']
        end = gene_data['end']
        strand = gene_data['strand']

        seq_url = f"https://rest.ensembl.org/sequence/region/human/{chromosome}:{start}..{end}:{strand}?content-type=text/plain"
        seq_response = requests.get(seq_url, headers={"Content-Type": "text/plain"})

        return seq_response.text if seq_response.status_code == 200 else "Failed to fetch sequence"
    else:
        return "Failed to fetch gene data"

In [6]:
def extract_target_region(gene_sequence, sgRNA_sequence, window_size=100):
    """Extracts the target region around the sgRNA binding site within the gene sequence."""
    sgRNA_rc = str(Seq(sgRNA_sequence).reverse_complement())
    start_pos = gene_sequence.find(sgRNA_rc)
    
    if start_pos == -1:
        return "sgRNA binding site not found in the gene sequence"
    
    start = max(0, start_pos - window_size)
    end = min(len(gene_sequence), start_pos + len(sgRNA_rc) + window_size)
    
    return gene_sequence[start:end]

In [7]:
# Initialize a column for target sequences
df['target_region'] = None

# Loop over DataFrame
for index, row in df.iterrows():
    print(f"Processing gene symbol: {row['symbol']} with ENSG ID: {row['ensg']}")
    gene_sequence = fetch_gene_sequence(row['ensg'])
    if gene_sequence is None:
        print(f"Failed to fetch gene sequence for {row['symbol']} (ENSG ID: {row['ensg']})")
        continue  # Skip this entry if fetching the gene sequence fails

    target_region = extract_target_region(gene_sequence, row['sequence'])
    if target_region is not None:
        df.at[index, 'target_region'] = target_region
    else:
        print(f"Target region extraction failed for {row['symbol']} (ENSG ID: {row['ensg']})")
        continue  # Skip this entry if target region extraction fails

Processing gene symbol: A1CF with ENSG ID: ENSG00000148584
Processing gene symbol: A1CF with ENSG ID: ENSG00000148584
Processing gene symbol: A1CF with ENSG ID: ENSG00000148584
Processing gene symbol: A1CF with ENSG ID: ENSG00000148584
Processing gene symbol: A1CF with ENSG ID: ENSG00000148584
Processing gene symbol: A1CF with ENSG ID: ENSG00000148584
Processing gene symbol: A1CF with ENSG ID: ENSG00000148584
Processing gene symbol: A1CF with ENSG ID: ENSG00000148584
Processing gene symbol: A1CF with ENSG ID: ENSG00000148584
Processing gene symbol: A1CF with ENSG ID: ENSG00000148584
Processing gene symbol: A2M with ENSG ID: ENSG00000175899
Processing gene symbol: A2M with ENSG ID: ENSG00000175899
Processing gene symbol: A2M with ENSG ID: ENSG00000175899
Processing gene symbol: A2M with ENSG ID: ENSG00000175899
Processing gene symbol: A2M with ENSG ID: ENSG00000175899
Processing gene symbol: A2M with ENSG ID: ENSG00000175899
Processing gene symbol: A2M with ENSG ID: ENSG00000175899
Proc

KeyboardInterrupt: 

In [None]:
df.to_csv('CRISPR_Genome_preprocessed.csv', index=False)

In [27]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.functional import one_hot

# Define the LSTM model
class SG_RNA_LSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(SG_RNA_LSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_()
        c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_()
        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))
        out = self.fc(out[:, -1, :])  # Decode the hidden state of the last time step
        return out

In [28]:
# Data preprocessing
def encode_sequences(sequences):
    # Example encoding - implement your own encoding logic
    nucleotide_map = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    encoded_data = []
    for seq in sequences:
        encoded_seq = [nucleotide_map[nuc] for nuc in seq]
        encoded_data.append(encoded_seq)
    return encoded_data

# Example sgRNA sequences
sequences = ['GCTAGCTAGCTA', 'TAGCTAGCTAGC']  # Add your sequences
encoded_sequences = encode_sequences(sequences)
max_length = max(len(seq) for seq in encoded_sequences)

In [29]:
# Padding sequences and preparing datasets
padded_sequences = [seq + [4] * (max_length - len(seq)) for seq in encoded_sequences]  # 4 for padding
input_sequences = torch.tensor([seq[:-1] for seq in padded_sequences])
target_sequences = torch.tensor([seq[1:] for seq in padded_sequences])

In [30]:
# Define dataset
dataset = TensorDataset(input_sequences, target_sequences)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

In [31]:
# Model parameters
input_dim = 5  # A, C, G, T, (pad)
hidden_dim = 128
layer_dim = 1
output_dim = 4

In [32]:
model = SG_RNA_LSTM(input_dim=5, hidden_dim=128, layer_dim=1, output_dim=4)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [33]:
input_sequences = one_hot(input_sequences, num_classes=5).float()  # One-hot encode and convert to float
dataset = TensorDataset(input_sequences, target_sequences)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

In [34]:
# Training loop
num_epochs = 20
for epoch in range(num_epochs):
    for inputs, targets in dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        outputs = outputs.permute(0, 2, 1)  # Adjust output to be [batch_size, num_classes, sequence_length]
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

RuntimeError: permute(sparse_coo): number of dimensions in the tensor input does not match the length of the desired ordering of dimensions i.e. input.dim() = 2 is not equal to len(dims) = 3

In [None]:
# Save your model
torch.save(model.state_dict(), 'sgRNA_model.pth')

In [36]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

class SG_RNA_LSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(SG_RNA_LSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.output_dim = output_dim  # Store output_dim as an instance variable
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # Initialize hidden and cell states
        h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).to(x.device)
        c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).to(x.device)
        # Forward propagate LSTM
        out, (hn, cn) = self.lstm(x, (h0, c0))
        # Decode the hidden state of each time step
        out = self.fc(out.contiguous().view(-1, self.hidden_dim))
        return out.view(x.size(0), -1, self.output_dim)

# Define model, loss, and optimizer
model = SG_RNA_LSTM(input_dim=5, hidden_dim=128, layer_dim=1, output_dim=4)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Example input and target data
input_sequences = torch.randn(10, 15, 5)  # Random data for demonstration
target_sequences = torch.randint(0, 4, (10, 15))  # Random target indices

# DataLoader setup
dataset = TensorDataset(input_sequences, target_sequences)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Training loop
num_epochs = 20
for epoch in range(num_epochs):
    for inputs, targets in dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.view(-1, 4), targets.view(-1))  # Flatten outputs and targets for loss calculation
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

Epoch 1, Loss: 1.403838038444519
Epoch 2, Loss: 1.3004004955291748
Epoch 3, Loss: 1.2671236991882324
Epoch 4, Loss: 1.4063835144042969
Epoch 5, Loss: 1.2804505825042725
Epoch 6, Loss: 1.2125598192214966
Epoch 7, Loss: 1.134634256362915
Epoch 8, Loss: 0.906342625617981
Epoch 9, Loss: 1.0873583555221558
Epoch 10, Loss: 1.1719279289245605
Epoch 11, Loss: 0.7548324465751648
Epoch 12, Loss: 0.9888455867767334
Epoch 13, Loss: 0.858740508556366
Epoch 14, Loss: 0.624103844165802
Epoch 15, Loss: 0.4939213693141937
Epoch 16, Loss: 0.2809202969074249
Epoch 17, Loss: 0.432477205991745
Epoch 18, Loss: 0.28100690245628357
Epoch 19, Loss: 0.3669787049293518
Epoch 20, Loss: 0.17170028388500214


In [45]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

class SG_RNA_LSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(SG_RNA_LSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.output_dim = output_dim
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # Batch size dynamically determined by input size
        batch_size = x.size(0)
        h0 = torch.zeros(self.layer_dim, batch_size, self.hidden_dim).to(x.device)
        c0 = torch.zeros(self.layer_dim, batch_size, self.hidden_dim).to(x.device)
        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))
        out = self.fc(out.reshape(-1, self.hidden_dim))
        return out.view(batch_size, -1, self.output_dim)

def encode_sequences(sequences, max_length):
    label_encoder = LabelEncoder()
    # Assume 'N' is treated as padding and should not be predicted
    label_encoder.fit(['A', 'C', 'G', 'T'])  # 'N' is not included in the fit
    encoded_sequences = []
    for seq in sequences:
        seq = [s if s in ['A', 'C', 'G', 'T'] else 'N' for s in seq]  # Replace non-ACGT with 'N'
        encoded_seq = label_encoder.transform([s for s in seq if s != 'N'])  # Only encode ACGT
        pad_length = max_length - len(encoded_seq)
        padded_seq = torch.cat([torch.tensor(encoded_seq, dtype=torch.long), torch.full((pad_length,), -1, dtype=torch.long)], dim=0)
        encoded_sequences.append(padded_seq)
    return torch.stack(encoded_sequences)

# Load data
sequences = df['sequence'].dropna().tolist()

# Encode sequences
max_length = max(len(seq) for seq in sequences)
encoded_sequences = encode_sequences(sequences, max_length)
input_sequences = torch.nn.functional.one_hot(encoded_sequences[:, :-1], num_classes=4).float()
target_sequences = encoded_sequences[:, 1:]  # Shifted by one for prediction

# Remove padding from targets
mask = target_sequences != -1  # Mask for valid positions
input_sequences = input_sequences[mask]
target_sequences = target_sequences[mask]

# Model initialization
model = SG_RNA_LSTM(input_dim=4, hidden_dim=128, layer_dim=1, output_dim=4)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training setup
train_inputs, test_inputs, train_targets, test_targets = train_test_split(input_sequences, target_sequences, test_size=0.2)
train_dataset = TensorDataset(train_inputs, train_targets)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Training loop
num_epochs = 30
for epoch in range(num_epochs):
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.view(-1, 4), targets.view(-1))
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}, Loss: {loss.item()}')

RuntimeError: For unbatched 2-D input, hx and cx should also be 2-D but got (3-D, 3-D) tensors