In [12]:
# Mount Google Drive inside the Colab runtime at /content/drive so that files
# stored on Drive can be opened through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import tensorflow as tf
import numpy as np
import pandas as pd
import requests

In [5]:
def fetch_ensembl_transcripts(gene_symbol, timeout=30):
    """Look up a human gene by symbol on the Ensembl REST API and return its transcripts.

    The request targets the ``lookup/symbol/homo_sapiens`` endpoint with
    ``expand=1`` and asks for a JSON response.

    Args:
        gene_symbol: Gene symbol to look up (interpolated into the URL as-is).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled connection.

    Returns:
        The value stored under the ``'Transcript'`` key of the JSON payload
        (presumably a list of transcript records; the caller indexes it as
        ``transcripts[0]['id']``), or ``None`` when the request fails, the
        server answers with a non-200 status, or the key is absent. A short
        diagnostic is printed in every ``None`` case.
    """
    url = f"https://rest.ensembl.org/lookup/symbol/homo_sapiens/{gene_symbol}?expand=1;content-type=application/json"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Network-level failures (DNS, refused connection, timeout) follow the
        # same "print and return None" convention as HTTP errors below.
        print(f"Error fetching gene data from Ensembl: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error fetching gene data from Ensembl: {response.text}")
        return None

    gene_data = response.json()
    if 'Transcript' not in gene_data:
        print("No transcripts found for gene:", gene_symbol)
        return None
    return gene_data['Transcript']

def fetch_ensembl_sequence(transcript_id, timeout=30):
    """Download the sequence for an Ensembl identifier via the REST API.

    The request targets the ``sequence/id`` endpoint and asks for a JSON
    response.

    Args:
        transcript_id: Ensembl stable identifier (interpolated into the URL as-is).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled connection.

    Returns:
        The value stored under the ``'seq'`` key of the JSON payload, or
        ``None`` when the request fails, the server answers with a non-200
        status, or the key is absent. A short diagnostic is printed in every
        ``None`` case.
    """
    url = f"https://rest.ensembl.org/sequence/id/{transcript_id}?content-type=application/json"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Network-level failures (DNS, refused connection, timeout) follow the
        # same "print and return None" convention as HTTP errors below.
        print(f"Error fetching sequence data from Ensembl: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error fetching sequence data from Ensembl: {response.text}")
        return None

    sequence_data = response.json()
    if 'seq' not in sequence_data:
        print("No sequence found for transcript:", transcript_id)
        return None
    return sequence_data['seq']


In [6]:
class Encoder:
    """Encode an on-target / off-target sequence pair as a per-position matrix.

    Both sequences are left-padded with ``'-'`` up to 24 characters. Every
    position is then described by 7 numbers: the element-wise OR of the two
    5-channel base encodings (A, T, G, C, ``'_'``; ``'-'`` is all zeros)
    followed by a 2-channel mismatch-direction flag.

    Attributes set by the constructor:
        on_seq, off_seq: the padded sequences.
        sgRNA_code, off_code: per-position 5-channel encodings of each sequence.
        on_off_code: the combined 7-channel encoding, one row per position.
        label / value: only present when ``with_category`` / ``with_reg_val``
            are true.
    """

    def __init__(self, on_seq, off_seq, with_category=False, label=None, with_reg_val=False, value=None):
        target_len = 24
        self.on_seq = self._left_pad(on_seq, target_len)
        self.off_seq = self._left_pad(off_seq, target_len)
        # Five channels per symbol; the padding symbol '-' encodes as all zeros.
        self.encoded_dict_indel = {
            'A': [1, 0, 0, 0, 0],
            'T': [0, 1, 0, 0, 0],
            'G': [0, 0, 1, 0, 0],
            'C': [0, 0, 0, 1, 0],
            '_': [0, 0, 0, 0, 1],
            '-': [0, 0, 0, 0, 0],
        }
        # Ranking used only to decide which direction channel a mismatch sets.
        self.direction_dict = {'A': 5, 'G': 4, 'C': 3, 'T': 2, '_': 1}
        if with_category:
            self.label = label
        if with_reg_val:
            self.value = value
        self.encode_on_off_dim7()

    @staticmethod
    def _left_pad(seq, width):
        """Prefix ``seq`` with '-' until it is ``width`` long (no-op if already longer)."""
        return "-" * (width - len(seq)) + seq

    def encode_sgRNA(self):
        """Store the 5-channel encoding of the padded on-target sequence in ``sgRNA_code``."""
        lookup = self.encoded_dict_indel
        rows = []
        for pos, base in enumerate(self.on_seq):
            # An 'N' in the guide is resolved to the off-target base at that position.
            resolved = self.off_seq[pos] if base == "N" else base
            rows.append(lookup[resolved])
        self.sgRNA_code = np.array(rows)

    def encode_off(self):
        """Store the 5-channel encoding of the padded off-target sequence in ``off_code``."""
        lookup = self.encoded_dict_indel
        self.off_code = np.array([lookup[base] for base in self.off_seq])

    def encode_on_off_dim7(self):
        """Build ``on_off_code``: OR-ed base channels plus two direction channels per position."""
        self.encode_sgRNA()
        self.encode_off()
        rank = self.direction_dict
        rows = []
        for pos, on_base in enumerate(self.on_seq):
            union = np.bitwise_or(self.sgRNA_code[pos], self.off_code[pos])
            off_base = self.off_seq[pos]
            if on_base == "N":
                on_base = off_base
            direction = np.zeros(2)
            # Padding never contributes a direction; equal ranks (a match) leave both flags at 0.
            if on_base != "-" and off_base != "-":
                on_rank, off_rank = rank[on_base], rank[off_base]
                if on_rank > off_rank:
                    direction[0] = 1
                elif on_rank < off_rank:
                    direction[1] = 1
            rows.append(np.concatenate((union, direction)))
        self.on_off_code = np.array(rows)

def encode_on_off_seq_pairs(input_file, model=None, output_file="CRISPR_net_results.csv"):
    """Score every on/off-target pair listed in a CSV file with CRISPR-Net.

    The input is read as a header-less, comma-separated file whose two columns
    are named ``on_seq`` and ``off_seq``. Each pair is encoded with ``Encoder``,
    the encodings are stacked into an array of shape ``(n_pairs, 1, 24, 7)``,
    and the predictions are written back as a ``CRISPR_Net_score`` column.

    Args:
        input_file: Path (or file-like object) accepted by ``pandas.read_csv``.
        model: Object exposing ``predict``; passed through to
            ``CRISPR_net_predict``. When omitted, the notebook-level
            ``crispr_net_model`` is used, so that cell must have been run first.
        output_file: Destination CSV for the scored table.

    Returns:
        The DataFrame that was written to ``output_file``.
    """
    if model is None:
        # CRISPR_net_predict requires a model as its first argument; fall back
        # to the one loaded at notebook level when the caller does not pass it.
        model = crispr_net_model

    inputs = pd.read_csv(input_file, delimiter=",", header=None, names=['on_seq', 'off_seq'])
    input_codes = [
        Encoder(on_seq=on_seq, off_seq=off_seq).on_off_code
        for on_seq, off_seq in zip(inputs['on_seq'], inputs['off_seq'])
    ]
    input_codes = np.array(input_codes).reshape((len(input_codes), 1, 24, 7))
    inputs['CRISPR_Net_score'] = CRISPR_net_predict(model, input_codes)
    inputs.to_csv(output_file, index=False)
    return inputs

In [13]:
# Default location of the CRISPR-Net weights on the mounted Google Drive.
DEFAULT_CRISPR_NET_WEIGHTS = "/content/drive/MyDrive/Colab Notebooks/Cas9/CRISPR_Net_CIRCLE_elevation_SITE_weights.h5"


# Function to load the CRISPR-Net model
def load_CRISPR_net_model(json_file_path, weights_file_path=DEFAULT_CRISPR_NET_WEIGHTS):
    """Rebuild the CRISPR-Net Keras model from its JSON architecture and load its weights.

    Args:
        json_file_path: Path to the JSON file describing the model architecture.
        weights_file_path: Path to the weights file. Defaults to the Drive
            location this notebook has always used, so existing calls behave
            the same; pass a different path to use other weights.

    Returns:
        The model returned by ``tf.keras.models.model_from_json`` after
        ``load_weights`` has been applied.
    """
    with open(json_file_path, 'r') as json_file:
        loaded_model_json = json_file.read()

    loaded_model = tf.keras.models.model_from_json(loaded_model_json)
    loaded_model.load_weights(weights_file_path)
    return loaded_model

def find_crispr_targets(sequence, pam="NGG", target_length=21):
    """Return every candidate target site in ``sequence`` that is followed by ``pam``.

    The sequence is scanned left to right on the given strand only. Wherever
    the PAM pattern matches and at least ``target_length`` characters precede
    it, the slice made of those ``target_length`` characters plus the PAM
    itself is reported.

    Args:
        sequence: String to scan.
        pam: PAM pattern. ``'N'`` matches any character; every other character
            must match exactly. The whole pattern is honoured, whatever its
            length (previously the length was assumed to be 3 and the first
            character was always ignored).
        target_length: Number of characters taken immediately upstream of the PAM.

    Returns:
        List of strings, each ``target_length + len(pam)`` long, in order of
        appearance. Empty when nothing matches.
    """
    pam_length = len(pam)
    # Only non-wildcard PAM positions need to be compared against the sequence.
    fixed_positions = [(offset, base) for offset, base in enumerate(pam) if base != "N"]

    targets = []
    # A PAM starting before index ``target_length`` has no room for a full target.
    first_pam_start = max(target_length, 0)
    for pam_start in range(first_pam_start, len(sequence) - pam_length + 1):
        if all(sequence[pam_start + offset] == base for offset, base in fixed_positions):
            targets.append(sequence[pam_start - target_length:pam_start + pam_length])

    return targets


# Predict using CRISPR-Net
def CRISPR_net_predict(model, encoded_pairs):
    """Run ``model.predict`` on ``encoded_pairs`` and return the flattened result."""
    return model.predict(encoded_pairs).flatten()

In [14]:
# Load CRISPR-Net model
# The architecture is read from the JSON file on the mounted Drive;
# load_CRISPR_net_model also loads the trained weights before returning.
crispr_net_model = load_CRISPR_net_model("/content/drive/MyDrive/Colab Notebooks/Cas9/CRISPR_Net_CIRCLE_elevation_SITE_structure.json")

In [25]:
gene_symbol = "FOXA1"

# Both fetch helpers report failure by returning None, so stop with a clear
# message instead of failing later with an opaque TypeError.
transcripts = fetch_ensembl_transcripts(gene_symbol)
if not transcripts:
    raise RuntimeError(f"No Ensembl transcripts available for gene: {gene_symbol}")

# Only the first transcript returned by Ensembl is used.
transcript_id = transcripts[0]['id']
gene_sequence = fetch_ensembl_sequence(transcript_id)
if gene_sequence is None:
    raise RuntimeError(f"No Ensembl sequence available for transcript: {transcript_id}")

gRNA_sites = find_crispr_targets(gene_sequence)
# Score each gRNA site with CRISPR-Net
for gRNA in gRNA_sites:
    # Demonstration only: the site is paired with itself, so every score
    # below is for a perfectly matching on/off-target pair.
    on_target = gRNA
    off_target = gRNA
    encoder = Encoder(on_seq=on_target, off_seq=off_target)
    # The model is fed a batch of one pair shaped (1, 1, 24, 7).
    encoded_pair = encoder.on_off_code.reshape(1, 1, 24, 7)

    prediction = CRISPR_net_predict(crispr_net_model, encoded_pair)
    print(f"on_target: {on_target}, off_target: {off_target}, CRISPR-Net Score: {prediction}")

on_target: AGGCAGCCCGCTCACTTCCCGCGG, off_target: AGGCAGCCCGCTCACTTCCCGCGG, CRISPR-Net Score: [0.5989955]
on_target: CAGCCCGCTCACTTCCCGCGGAGG, off_target: CAGCCCGCTCACTTCCCGCGGAGG, CRISPR-Net Score: [0.6855179]
on_target: ACTTCCCGCGGAGGCGCTCCCCGG, off_target: ACTTCCCGCGGAGGCGCTCCCCGG, CRISPR-Net Score: [0.8278589]
on_target: GCTCCCCGGCGCCGCGCTCCGCGG, off_target: GCTCCCCGGCGCCGCGCTCCGCGG, CRISPR-Net Score: [0.53671783]
on_target: TCCGCGGCAGCCGCCTGCCCCCGG, off_target: TCCGCGGCAGCCGCCTGCCCCCGG, CRISPR-Net Score: [0.17207283]
on_target: GCACGCCGCGCCCCGCAGCTCTGG, off_target: GCACGCCGCGCCCCGCAGCTCTGG, CRISPR-Net Score: [0.27052987]
on_target: CACGCCGCGCCCCGCAGCTCTGGG, off_target: CACGCCGCGCCCCGCAGCTCTGGG, CRISPR-Net Score: [0.5476822]
on_target: GCTCTGGGCTTCCTCTTCGCCCGG, off_target: GCTCTGGGCTTCCTCTTCGCCCGG, CRISPR-Net Score: [0.04343839]
on_target: CTCTGGGCTTCCTCTTCGCCCGGG, off_target: CTCTGGGCTTCCTCTTCGCCCGGG, CRISPR-Net Score: [0.05773075]
on_target: TGGGCTTCCTCTTCGCCCGGGTGG, off_target: T