In [None]:
# Path to the gzipped tar archive holding the dataset.
# It is a relative path, so it is resolved against the current working directory.
tar_file_path = 'RNAStralign.tar.gz' #Change!

# Folder the archive is extracted into.
# The extraction cell creates this folder on disk with os.makedirs before unpacking.
extracted_folder = 'sequences' #Change name?!

# Import files

In [None]:
import tarfile, os

# Make sure the extraction target exists (no error if it is already there)
os.makedirs(extracted_folder, exist_ok=True)

# Unpack the gzip-compressed tar archive into the target folder
with tarfile.open(tar_file_path, mode='r:gz') as tar:
    tar.extractall(path=extracted_folder)

# Show the top-level entries that ended up in the target folder
os.listdir(extracted_folder)

In [None]:
import os
from fnmatch import fnmatch

root = extracted_folder
pattern = "*.ct"

# Walk the whole extracted tree and keep the full path of every file
# whose name matches the pattern
ct_files = []
for dirpath, _dirnames, filenames in os.walk(root):
    ct_files.extend(
        os.path.join(dirpath, filename)
        for filename in filenames
        if fnmatch(filename, pattern)
    )

# Store the paths, one per line, so that later cells can reload them
with open("ct_files.txt", "w") as output:
    output.write("\n".join(ct_files))


### Picking files

In [None]:
import random as rd

def singleFamily(family: str, txt_file="ct_files.txt") -> list:
    '''
    Collects the paths listed in txt_file that belong to one family.

    A line is kept when it does not begin with "#" and contains `family`
    as a substring. Kept lines are returned with surrounding whitespace removed.
    '''
    with open(txt_file, "r") as listing:
        return [
            entry.strip()
            for entry in listing
            if not entry.startswith("#") and family in entry
        ]

def leaveOneFamilyOut(family: str, txt_file="ct_files.txt"):
    '''
    Collects the paths listed in txt_file that do NOT belong to the given family.

    A line is kept when it does not begin with "#" and does not contain `family`
    as a substring. Kept lines are returned with surrounding whitespace removed.
    '''
    with open(txt_file, "r") as listing:
        return [
            entry.strip()
            for entry in listing
            if not (entry.startswith("#") or family in entry)
        ]

def pickFromFamilies(data_size, txt_file="ct_files.txt", family_index=2):
    '''
    Returns a flat list of file paths with data_size randomly chosen files from EACH family.
    If a family does not have enough data, all data from that family is added.

    Parameters:
        data_size: number of files to draw (without replacement) from every family.
        txt_file: text file with one path per line. Blank lines and lines
            starting with "#" are skipped.
        family_index: position of the family folder among the path components
            (default 2, i.e. <root>/<dataset>/<family>/...). Both "/" and "\\"
            are accepted as separators, so lists written on Windows or Linux work.

    Returns:
        A list of path strings, grouped by family in order of first appearance.

    Raises:
        ValueError: if a path has too few components to contain a family folder,
            or if data_size is negative.
    '''
    with open(txt_file, "r") as file: #Read paths to ct_files
        stripped_lines = (line.strip() for line in file)
        paths = [line for line in stripped_lines if line and not line.startswith("#")]

    #Group the paths by family. A dict keeps the order deterministic (first appearance),
    #unlike a set, and matching on the exact path component avoids substring collisions.
    files_by_family = {}
    for path in paths:
        parts = path.replace("\\", "/").split("/")
        if len(parts) <= family_index:
            raise ValueError(f"Cannot find a family folder at position {family_index} in path: {path}")
        files_by_family.setdefault(parts[family_index], []).append(path)

    picked = []
    for family, members in files_by_family.items(): #Pick data_size files from each family
        if len(members) < data_size:
            print("Not enough data in family: ", family, " for size: ", data_size, ".\n Missing", data_size-len(members),"files.","\n Adding all data from family.")
            picked.extend(members)
        else:
            picked.extend(rd.sample(members, data_size))

    return picked


# Set up preprocessing of RNA files   

Below are the functions that are needed to convert the .ct files into images.

In [None]:
### THESE NEED TO BE CHANGED TO FIT WITH LOADING FILES IN COLAB ###

def read_bpseq(file: str) -> tuple:
    """
    Takes a .bpseq file and returns the sequence as a string and a list of base pairs

    Every data line is expected to hold: position, base, partner position
    (partner position '0' meaning the base is unpaired). Positions in the file are
    1-based; the returned pairs are 0-based (position, partner) tuples. A pair is
    reported once for each of its two lines, so both (i, j) and (j, i) appear.

    Blank lines are ignored. Any lines before the first line whose position
    column is '1' are treated as header and skipped.
    """
    with open(file, 'r') as f:
        # Split every line into columns; blank lines give no columns and are dropped
        lines = [columns for columns in (line.split() for line in f) if columns]

    #Remove header - if any
    header_lines = 0
    for line in lines:
        if line[0] == '1':
            break
        header_lines += 1

    lines = lines[header_lines:]

    #Make sequence and bp list
    bases = []
    pairs = []
    for line in lines:
        bases.append(line[1])
        if line[2] != '0':
            pairs.append((int(line[0])-1, int(line[2])-1)) #The files start indexing from 1

    return "".join(bases), pairs

def read_ct(file: str) -> tuple:
    """
    Takes a .ct file and returns the sequence as a string and a list of base pairs

    From every data line the first column (position), second column (base) and
    fifth column (partner position, '0' meaning unpaired) are used. Positions in
    the file are 1-based; the returned pairs are 0-based (position, partner)
    tuples. A pair is reported once for each of its two lines, so both (i, j)
    and (j, i) appear.

    Blank lines are ignored. Any lines before the first line whose position
    column is '1' are treated as header and skipped.
    """
    with open(file, 'r') as f:
        # Split every line into columns; blank lines give no columns and are dropped
        lines = [columns for columns in (line.split() for line in f) if columns]

    #Remove header - if any
    header_lines = 0
    for line in lines:
        if line[0] == '1':
            break
        header_lines += 1

    lines = lines[header_lines:]

    bases = []
    pairs = []
    for line in lines:
        bases.append(line[1])
        if line[4] != '0':
            pairs.append((int(line[0])-1, int(line[4])-1)) #The files start indexing from 1

    return "".join(bases), pairs

In [None]:
import numpy as np

def make_matrix_from_sequence(sequence: str) -> np.array:
    """
    Builds an N x N RGB image (uint8) of every possible pairing in the sequence.

    Cell (i, j) is colored by the two-letter pair sequence[i] + sequence[j]:
    GC = green
    CG = dark green
    UG = blue
    GU = dark blue
    UA = red
    AU = dark red
    Every other combination stays white, and the diagonal (i == j) is dark gray.
    """
    colors = {"invalid_pairing": [255, 255, 255],
              "unpaired": [64, 64, 64],
              "GC": [0, 255, 0],
              "CG": [0, 128, 0],
              "UG": [0, 0, 255],
              "GU": [0, 0, 128],
              "UA": [255, 0, 0],
              "AU": [128, 0, 0]}
    valid_pairs = ("GC", "CG", "UG", "GU", "UA", "AU")

    size = len(sequence)

    # Start from an all-white image; only the diagonal and valid pairs are recolored
    matrix = np.full((size, size, 3), 255, dtype="uint8")

    for row, first_base in enumerate(sequence):
        for col, second_base in enumerate(sequence):
            if row == col:
                # A base cannot pair with itself
                matrix[row, col, :] = colors["unpaired"]
                continue
            candidate = first_base + second_base
            if candidate in valid_pairs:
                matrix[row, col, :] = colors[candidate]

    return matrix


def make_matrix_from_basepairs(sequence: str, pairs: list) -> np.array:
    """
    Builds an N x N RGB image (uint8) marking the given base pairs.

    The image is white everywhere except at the cells (pair[0], pair[1]) of
    every entry in pairs, which are colored black. N is the sequence length.
    """
    size = len(sequence)
    matrix = np.full((size, size, 3), 255, dtype="uint8")

    for pair in pairs:
        row, col = pair[0], pair[1]
        matrix[row, col, :] = [0, 0, 0]  # black

    return matrix

In [None]:
import matplotlib.pyplot as plt

def save_matrix(matrix: np.array, name: str) -> None:
    """
    Writes the matrix to the file `name` as an image using plt.imsave
    """
    plt.imsave(fname=name, arr=matrix)

# Convert sequences
The sequences need to be read from the .ct files and converted into images that can be used as input to the CNN.

In [None]:
import os

def getCtFromTxt(txt_file):
    """
    Takes a txt file containing the path to all the ct files and returns a list of the paths

    Each line is stripped of surrounding whitespace. Blank lines and lines whose
    first non-blank character is "#" are skipped.
    """
    with open(txt_file, 'r') as f:
        stripped_lines = [line.strip() for line in f]
    # Test for emptiness first: indexing an empty line would raise IndexError
    return [path for path in stripped_lines if path and not path.startswith("#")]

def save_to_matrices(file_list, input_matrix_path, output_matrix_path):
    """
    For every file supplied in file_list:
      a matrix of all possible base pairs is saved to input_matrix_path
      a matrix showing the base pairs in the actual sequence is saved to output_matrix_path

    Both images get the same file name: the base name of the .ct file with the
    extension replaced by '.png'. Because only the base name is used, two .ct
    files with the same name in different folders write to the same image.
    """
    for file_name in file_list:
        sequence, pairs = read_ct(file_name)
        image_name = os.path.splitext(os.path.basename(file_name))[0] + '.png'

        input_matrix = make_matrix_from_sequence(sequence)
        save_matrix(input_matrix, os.path.join(input_matrix_path, image_name))

        output_matrix = make_matrix_from_basepairs(sequence, pairs)
        # Save the matrix that was just built (not an unrelated outer-scope name)
        save_matrix(output_matrix, os.path.join(output_matrix_path, image_name))

In [None]:
ct_files = getCtFromTxt("ct_files.txt")

# Keep only the paths that mention the tRNA database
file_list = []
for file_name in ct_files:
    if "tRNA_database" not in file_name:
        continue
    file_list.append(file_name)

file_list

In [None]:
input_matrix_path = "input"
output_matrix_path = "result"

# Make sure both image folders exist before anything is written to them
for image_folder in (input_matrix_path, output_matrix_path):
    os.makedirs(image_folder, exist_ok=True)

# Convert every selected .ct file into an input image and a result image
save_to_matrices(file_list, input_matrix_path, output_matrix_path)


# Set up data for network

### Splitting data into train and validation sets 

In [None]:
def move_files(src_dir, dst_dir, file_list):
    """
    Moves every file named in file_list from src_dir to dst_dir

    file_list holds plain file names (not full paths); each file keeps its
    name in the destination. The move is done with os.rename, so any error it
    raises (e.g. a missing source file) propagates to the caller.
    """
    for filename in file_list:
        src_path = os.path.join(src_dir, filename)
        dst_path = os.path.join(dst_dir, filename)
        os.rename(src_path, dst_path)

In [None]:
import os

#Folder layout: one folder per split, each holding one folder per image kind
directories = ['train', 'validation']
subdirectories = ['input_images', 'output_images']

for directory in directories:
    os.makedirs(directory, exist_ok=True)
    for subdirectory in subdirectories:
        os.makedirs(os.path.join(directory, subdirectory), exist_ok=True)

In [None]:
from sklearn.model_selection import train_test_split

#List input and outputs
#Sorted, because os.listdir returns names in arbitrary order: the two lists are paired
#by position in the split below, and the seeded split is only repeatable for a fixed order
input_images = sorted(os.listdir(input_matrix_path))
output_images = sorted(os.listdir(output_matrix_path))

#Every input image must have an output image with the same name, otherwise the pairs get mixed up
if input_images != output_images:
    raise ValueError("The files in '" + input_matrix_path + "' and '" + output_matrix_path + "' do not match")

#Make split
#NOTE - Change to the fraction we want and whether we want shuffle or not
input_train, input_valid, output_train, output_valid = train_test_split(
    input_images, output_images, train_size=0.8, random_state=42, shuffle=True)


#Move files to train and validation folders
train_input_dir = 'train/input_images'
train_output_dir = 'train/output_images'
val_input_dir = 'validation/input_images'
val_output_dir = 'validation/output_images'

move_files(input_matrix_path, train_input_dir, input_train)
move_files(output_matrix_path, train_output_dir, output_train)
move_files(input_matrix_path, val_input_dir, input_valid)
move_files(output_matrix_path, val_output_dir, output_valid)

### Read data and set up data loader

In [None]:
import os
from torch.utils.data import Dataset
from torchvision import transforms

class ImageToImageDataset(Dataset):
    """
    Dataset of (input image, output image) pairs read from two folders.

    Item idx pairs the idx-th file name of input_dir with the idx-th file name
    of output_dir, both taken in sorted order. When the two folders hold the
    same file names, each input is therefore matched with the output of the
    same name. The length of the dataset is the number of files in input_dir.

    Parameters:
        input_dir: folder with the input images.
        output_dir: folder with the output (target) images.
        transform: optional callable applied to both images of a pair.
    """
    def __init__(self, input_dir, output_dir, transform = None):
        self.input_dir = input_dir
        self.output_dir = output_dir
        # os.listdir gives no ordering guarantee; sort so both lists line up by index
        self.input_files = sorted(os.listdir(input_dir))
        self.output_files = sorted(os.listdir(output_dir))
        self.transform = transform

    def __len__(self):
        """Returns the number of input images."""
        return len(self.input_files)

    def __getitem__(self, idx):
        """Reads the idx-th input/output image pair and applies the transform, if any."""
        input_image =  plt.imread(os.path.join(self.input_dir, self.input_files[idx]))
        output_image = plt.imread(os.path.join(self.output_dir, self.output_files[idx]))

        if self.transform:
            input_image = self.transform(input_image)
            output_image = self.transform(output_image)

        return input_image, output_image


# Statistics passed to transforms.Normalize below.
# Both lists are still empty placeholders and must be filled in before the transform is used.
mean = [] #NOTE - Find these for normalization
std = []

# Transform handed to ImageToImageDataset, which applies it to both the input and the output image
dataTransformer = transforms.Compose([
    transforms.ToTensor(), #Convert image to tensor
    transforms.Normalize(mean=mean, std=std) 
])


In [None]:
from torch.utils.data import DataLoader

# One dataset per split, both using the same transform
train_dataset = ImageToImageDataset(train_input_dir, train_output_dir, dataTransformer)
validation_dataset = ImageToImageDataset(val_input_dir, val_output_dir, dataTransformer)

# Placeholder string, not a usable value: replace with the batch size before running.
# NOTE(review): the images are N x N with N = sequence length (see make_matrix_from_sequence),
# so images of different sequences can differ in size - confirm how they should be batched.
batch_size = '?' #NOTE - Change!

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) #Does shuffle need to be True?
val_loader = DataLoader(validation_dataset, batch_size=batch_size)