Process all the irregularly formatted .txt files into CSV files

In [21]:
import pandas as pd

def ThreeCycle(lines):
    """
    Reshape a flat, one-value-per-line listing into a three-column table.

    The entries are read in consecutive groups of three: the first entry of
    each group goes to 'Chromosome', the second to 'Start' and the third to
    'End'. The first group of three is skipped; the caller in this file only
    passes input whose first three lines are the column headers.

    Values are left as whitespace-stripped strings; no type conversion is
    done here.

    Raises AssertionError if the number of entries is not a multiple of 3.
    """
    assert (len(lines) % 3) == 0, "The number of lines is not divisible by 3."

    # Strip newlines/whitespace, then drop the leading group of three
    cleaned = [entry.strip() for entry in lines]
    body = cleaned[3:]

    # Every third entry, at offsets 0, 1 and 2, forms one column
    chromosome = body[0::3]
    start = body[1::3]
    end = body[2::3]

    assert len(chromosome) == len(start) == len(end), "The lists are not the same length."
    return pd.DataFrame({'Chromosome': chromosome, 'Start': start, 'End': end})


def DNADataProcess(file):
    """
    Read a text file and return a DataFrame with Chromosome, Start, End columns.

    Two layouts are recognised:
      * One value per line, where the first three non-blank lines are
        "chromosome", "start", "end" (case-insensitive). This is reshaped
        with ThreeCycle.
      * A whitespace-separated table whose first row is a header containing
        the columns "Item", "Cell" and "Type". These are renamed to
        Chromosome, Start and End respectively.

    Blank lines anywhere in the file are ignored.

    Returns a DataFrame with Chromosome as int and Start/End as float.
    Raises ValueError if the file contains no non-blank lines, and KeyError
    if a table-style file lacks the "Item", "Cell" or "Type" column.
    """
    with open(file, 'r') as f:
        lines = [line.strip() for line in f]
    # Blank lines (typically a trailing newline) would otherwise become
    # all-None rows, or break the divisible-by-3 check in ThreeCycle.
    lines = [line for line in lines if line]
    if not lines:
        raise ValueError(f"No data found in {file}.")

    header = [line.lower() for line in lines[:3]]
    if header == ["chromosome", "start", "end"]:
        # One value per line: fold every three lines into one row
        data = ThreeCycle(lines)
    else:
        data = pd.DataFrame([line.split() for line in lines])
        # Promote the first row to column names, then drop it from the data
        data.columns = data.iloc[0]
        data = data[1:]
        data = data[["Item", "Cell", "Type"]]
        # Rename the columns
        data.columns = ['Chromosome', 'Start', 'End']

    # Convert all three columns at once; astype returns a new frame, so no
    # column is assigned onto a frame that was derived by slicing.
    return data.astype({'Chromosome': int, 'Start': float, 'End': float})

In [22]:
import numpy as np

def SpacingsPerChromosome(data: pd.DataFrame, metric = 'Interorigin', name:str = 'Dataset') -> tuple:
    """
    Compute normalized spacings between consecutive origins on each chromosome.

    Takes in a dataframe with Chromosome, Start, End columns. Within each
    chromosome the rows are sorted by Start and the spacing between each
    pair of neighbouring rows is computed:
      * metric = 'Interorigin': difference between the midpoints
        ((Start + End) / 2) of neighbouring rows.
      * metric = 'EndToEnd': Start of the next row minus End of the
        previous row.
    Spacings from all chromosomes are pooled, NaN values are dropped, and
    the result is divided by its mean so that the mean spacing is 1.

    The input dataframe is not modified.

    Returns a tuple (name, metric, spacings). spacings is a numpy array, or
    an empty list when no valid spacing could be computed (a message is
    printed in that case).
    Raises AssertionError if metric is invalid or a required column is missing.
    """
    assert metric in ['Interorigin', 'EndToEnd'], "The metric is not valid, must be either Interorigin or EndToEnd."
    assert 'Chromosome' in data.columns, "The dataframe does not have a Chromosome column."
    assert 'Start' in data.columns, "The dataframe does not have a Start column."
    assert 'End' in data.columns, "The dataframe does not have an End column."

    SpacingList = []
    # sort=False keeps the chromosomes in order of first appearance
    for _, chrom_data in data.groupby('Chromosome', sort=False):
        chrom_data = chrom_data.sort_values(by='Start')
        if metric == 'Interorigin':
            # Midpoints are computed locally so the caller's dataframe does
            # not gain a 'Midpoint' column. diff() leaves a leading NaN,
            # which is filtered out below.
            midpoints = (chrom_data['Start'] + chrom_data['End']) / 2
            spacings = midpoints.diff()
        else:
            # Each spacing is the start of the next origin minus the end of the previous origin
            spacings = chrom_data['Start'].values[1:] - chrom_data['End'].values[:-1]
        SpacingList.extend(spacings)

    # Drop NaN values from the list
    SpacingList = [spacing for spacing in SpacingList if not np.isnan(spacing)]
    if len(SpacingList) == 0:
        print(f"No valid spacings found for {name} with metric {metric}.")
        return name, metric, []

    SpacingList = np.array(SpacingList)
    SpacingList = SpacingList / np.mean(SpacingList)  # Normalize spacings to have mean of 1
    print(f"The mean of the {name} spacings is {np.mean(SpacingList)}")

    return name, metric, SpacingList

In [23]:
# List the files in the "rawdata" directory that end in .txt and start with "rep_origins"
import os
# Keep only the directory entries whose names end in ".txt" and begin
# with "rep_origins"; the order returned by os.listdir is preserved.
dir_list = [
    entry
    for entry in os.listdir("rawdata")
    if entry.endswith('.txt') and entry.startswith("rep_origins")
]
print(dir_list)

['rep_origins_arabidopsis.txt', 'rep_origins_candida_CBS138.txt', 'rep_origins_drosophila_Kc.txt', 'rep_origins_drosophila_S2.txt', 'rep_origins_human_K562.txt', 'rep_origins_human_MCF7.txt', 'rep_origins_k_lactis.txt', 'rep_origins_L_waltii.txt', 'rep_origins_mouse_ES1.txt', 'rep_origins_mouse_MEF.txt', 'rep_origins_mouse_P19.txt', 'rep_origins_s_cerevisiae.txt']


In [24]:
def name_extractor(filename: str) -> str:
    """
    Extract the dataset name from a "rep_origins_<name>.<ext>" filename.

    Any leading directory components and the final extension are removed,
    and everything after the first "rep_origins_" in the remaining stem is
    returned. Dots inside the name itself are preserved.

    Raises ValueError if the filename does not contain "rep_origins_".
    """
    # Look at the last path component only, so dots in directory parts
    # (e.g. "./rawdata/...") cannot truncate the name.
    stem, _ = os.path.splitext(os.path.basename(filename))
    # Remove the prefix "rep_origins_"
    _, prefix, name = stem.partition('rep_origins_')
    if not prefix:
        raise ValueError(f"Expected 'rep_origins_' in the filename, got {filename!r}.")
    return name

assert name_extractor('rep_origins_arabidopsis.txt') == 'arabidopsis', "The name extractor is not working."

In [25]:
# Make sure the output folder "processeddata" is present, creating it
# when it is missing, and report which of the two cases applied.
if os.path.exists("processeddata"):
    print("processeddata folder already exists")
else:
    os.makedirs("processeddata")
    print("Created processeddata folder")

processeddata folder already exists


In [26]:
# Convert every selected raw file into a one-column CSV of normalized spacings
for file in dir_list:
    name = name_extractor(file)
    data = DNADataProcess(os.path.join("rawdata", file))
    name, metric, SpacingList = SpacingsPerChromosome(
        data,
        metric='Interorigin',
        name=name,
    )
    # The column is named after the metric that produced the spacings
    temp_df = pd.DataFrame(SpacingList, columns=[f"{metric}_spacing"])
    # Write the result into the processeddata folder, without the index
    temp_df.to_csv(
        os.path.join("processeddata", f"{name}_{metric}_spacing.csv"),
        index=False,
    )

The mean of the arabidopsis spacings is 0.9999999999999999
The mean of the candida_CBS138 spacings is 1.0
The mean of the drosophila_Kc spacings is 1.0
The mean of the drosophila_S2 spacings is 1.0
The mean of the human_K562 spacings is 1.0000000000000002
The mean of the human_MCF7 spacings is 0.9999999999999999
The mean of the k_lactis spacings is 1.0
The mean of the L_waltii spacings is 1.0
The mean of the mouse_ES1 spacings is 1.0
The mean of the mouse_MEF spacings is 1.0
The mean of the mouse_P19 spacings is 0.9999999999999999
The mean of the s_cerevisiae spacings is 1.0
