In [1]:
import gzip
import json
import pandas as pd
from joblib import Parallel, delayed
import itertools


In [2]:
# Decode one line of a JSON-lines file; blank or malformed lines yield None
def parse_json_line(line):
    """Parse a single text line as JSON.

    Args:
        line: One raw line of text (surrounding whitespace is ignored).

    Returns:
        The decoded JSON value, or None when the line is blank or is not
        valid JSON (in the latter case the decode error is printed).
    """
    stripped = line.strip()

    # Nothing to decode on a blank line
    if not stripped:
        return None

    try:
        return json.loads(stripped)
    except json.JSONDecodeError as e:
        # Report the problem but keep going; the caller drops None entries
        print(f"Error decoding JSON: {e}")
        return None

# Load a gzipped JSON-lines file, decoding the individual lines with joblib workers
def read_json_gz_parallel_joblib(file_path, n_jobs=4):
    """Read a gzip-compressed JSON-lines file and return its decoded records.

    The whole file is first read into memory as a list of text lines; each
    line is then handed to `parse_json_line` through joblib's `Parallel`.

    Args:
        file_path: Path of the `.json.gz` file (opened as UTF-8 text).
        n_jobs: Passed straight to `joblib.Parallel` (default 4).

    Returns:
        A list of the decoded JSON values, in file order, with every None
        result (blank or undecodable line) removed.
    """
    with gzip.open(file_path, 'rt', encoding='utf-8') as handle:
        raw_lines = handle.readlines()

    # One delayed parse call per line; Parallel returns results in input order
    parse_tasks = (delayed(parse_json_line)(raw) for raw in raw_lines)
    parsed = Parallel(n_jobs=n_jobs)(parse_tasks)

    # Drop the placeholders left by blank / malformed lines
    return [record for record in parsed if record is not None]

# Load the dataset; n_jobs=24 is machine-specific, so tune it to the available cores
file_path = '../data/dataset0.json.gz'
data = read_json_gz_parallel_joblib(file_path=file_path, n_jobs=24)

# `data` is now a list of decoded JSON objects; peek at the first ten
print(data[:10])

[{'ENST00000000233': {'244': {'AAGACCA': [[0.00299, 2.06, 125.0, 0.0177, 10.4, 122.0, 0.0093, 10.9, 84.1], [0.00631, 2.53, 125.0, 0.00844, 4.67, 126.0, 0.0103, 6.3, 80.9], [0.00465, 3.92, 109.0, 0.0136, 12.0, 124.0, 0.00498, 2.13, 79.6], [0.00398, 2.06, 125.0, 0.0083, 5.01, 130.0, 0.00498, 3.78, 80.4], [0.00664, 2.92, 120.0, 0.00266, 3.94, 129.0, 0.013, 7.15, 82.2], [0.0103, 3.83, 123.0, 0.00598, 6.45, 126.0, 0.0153, 1.09, 74.8], [0.00398, 3.75, 126.0, 0.00332, 4.3, 129.0, 0.00299, 1.93, 81.9], [0.00498, 3.93, 127.0, 0.00398, 2.51, 131.0, 0.0111, 3.47, 79.4], [0.0139, 4.69, 106.0, 0.0136, 6.21, 124.0, 0.00531, 10.6, 85.5], [0.00631, 3.5, 126.0, 0.0222, 5.38, 128.0, 0.00332, 1.72, 79.3], [0.0061, 3.99, 121.0, 0.0121, 7.27, 122.0, 0.00232, 1.27, 78.9], [0.00299, 1.99, 128.0, 0.00427, 4.85, 124.0, 0.00332, 3.18, 80.5], [0.0186, 3.62, 124.0, 0.00428, 2.25, 129.0, 0.00554, 2.78, 80.1], [0.0093, 3.12, 125.0, 0.00398, 8.84, 129.0, 0.00361, 1.86, 82.0], [0.00365, 2.92, 126.0, 0.00698, 3.7, 126

[
    {
        "ENST00000000233": {
            "244": {
                "AAGACCA": [
                    [0.00299, 2.06, 125.0, ...],
                    [0.00631, 2.53, 125.0, ...],
                    ...
                ]
            },
            "261": {
                "CAAACTG": [
                    [0.0126, 1.95, 111.0, ...],
                    [0.00432, 2.35, 111.0, ...],
                    ...
                ]
            }
        }
    },
    ...
]

In [3]:
def flatten_data(json_data):
    """Flatten nested transcript -> position -> sequence records into a DataFrame.

    Args:
        json_data: Iterable of dicts shaped as
            {transcript_id: {position: {sequence_key: sequence_data}}}.

    Returns:
        A DataFrame with one row per (transcript, position, sequence) triple and
        columns 'transcript_id', 'transcript_position' (position cast to int),
        'sequence_key' and 'sequence_data' (stored unchanged).
    """
    records = [
        {
            'transcript_id': transcript_id,
            # Cast so the column can be joined against integer positions later
            'transcript_position': int(position),
            'sequence_key': kmer,
            'sequence_data': reads,
        }
        for entry in json_data
        for transcript_id, positions in entry.items()
        for position, kmers in positions.items()
        for kmer, reads in kmers.items()
    ]
    return pd.DataFrame(records)

# Turn the nested JSON records into a flat table (one row per transcript position)
data_flattened = flatten_data(json_data=data)

In [4]:
# Preview the first five flattened rows
print(data_flattened.head(n=5))

     transcript_id  transcript_position sequence_key  \
0  ENST00000000233                  244      AAGACCA   
1  ENST00000000233                  261      CAAACTG   
2  ENST00000000233                  316      GAAACAG   
3  ENST00000000233                  332      AGAACAT   
4  ENST00000000233                  368      AGGACAA   

                                       sequence_data  
0  [[0.00299, 2.06, 125.0, 0.0177, 10.4, 122.0, 0...  
1  [[0.0126, 1.95, 111.0, 0.0125, 1.27, 108.0, 0....  
2  [[0.00432, 2.02, 104.0, 0.00299, 3.56, 99.3, 0...  
3  [[0.0134, 4.71, 132.0, 0.00447, 4.24, 98.8, 0....  
4  [[0.015, 6.97, 118.0, 0.0106, 3.04, 123.0, 0.0...  


In [5]:
def process_row(row):
    """Split one flattened row into per-position sequences and feature groups.

    Args:
        row: Mapping (dict or pandas Series) with keys 'transcript_id',
            'transcript_position', 'sequence_key' and 'sequence_data'.
            'sequence_data' is a list of per-read value lists, or the string
            form of such a list (e.g. after a CSV round-trip).

    Returns:
        A dict carrying the three identifying fields plus:
        - 'sequence_-1' / 'sequence_0' / 'sequence_+1': the key without its last
          two characters, without its first and last character, and without its
          first two characters (the three overlapping 5-mers of a 7-character key).
        - 'features_-1' / 'features_0' / 'features_+1': for every read, values
          [0:3], [3:6] and [6:] respectively.

    Raises:
        ValueError, SyntaxError: if 'sequence_data' is a string that is not a
            plain Python literal.
    """
    import ast  # local import: only needed for the string fallback below

    sequence_key = row['sequence_key']

    sequence_data = row['sequence_data']
    if isinstance(sequence_data, str):
        # SECURITY: this used to be eval(), which executes arbitrary code found in
        # the cell. literal_eval only accepts literals (lists, numbers, ...), which
        # is all a stringified list of floats needs.
        sequence_data = ast.literal_eval(sequence_data)

    return {
        'transcript_id': row['transcript_id'],
        'transcript_position': row['transcript_position'],
        'sequence_key': sequence_key,
        'sequence_-1': sequence_key[:-2],   # Sequence for -1 position
        'sequence_0': sequence_key[1:-1],   # Sequence for 0 (central) position
        'sequence_+1': sequence_key[2:],    # Sequence for +1 position
        'features_-1': [read[:3] for read in sequence_data],   # Features for -1 position
        'features_0': [read[3:6] for read in sequence_data],   # Features for 0 position
        'features_+1': [read[6:] for read in sequence_data],   # Features for +1 position
    }

# Run process_row over every flattened row and gather the results in a new DataFrame
processed_data = pd.DataFrame(
    [process_row(flat_row) for _, flat_row in data_flattened.iterrows()]
)



In [6]:
# Preview the first five processed rows
processed_data.head(n=5)

Unnamed: 0,transcript_id,transcript_position,sequence_key,sequence_-1,sequence_0,sequence_+1,features_-1,features_0,features_+1
0,ENST00000000233,244,AAGACCA,AAGAC,AGACC,GACCA,"[[0.00299, 2.06, 125.0], [0.00631, 2.53, 125.0...","[[0.0177, 10.4, 122.0], [0.00844, 4.67, 126.0]...","[[0.0093, 10.9, 84.1], [0.0103, 6.3, 80.9], [0..."
1,ENST00000000233,261,CAAACTG,CAAAC,AAACT,AACTG,"[[0.0126, 1.95, 111.0], [0.00432, 2.35, 111.0]...","[[0.0125, 1.27, 108.0], [0.00559, 2.4, 106.0],...","[[0.00996, 2.94, 101.0], [0.00332, 1.39, 94.5]..."
2,ENST00000000233,316,GAAACAG,GAAAC,AAACA,AACAG,"[[0.00432, 2.02, 104.0], [0.0216, 3.29, 107.0]...","[[0.00299, 3.56, 99.3], [0.0246, 4.76, 100.0],...","[[0.00598, 1.44, 87.9], [0.0139, 1.64, 89.2], ..."
3,ENST00000000233,332,AGAACAT,AGAAC,GAACA,AACAT,"[[0.0134, 4.71, 132.0], [0.0134, 8.18, 128.0],...","[[0.00447, 4.24, 98.8], [0.0209, 3.18, 97.5], ...","[[0.00313, 1.22, 83.2], [0.00709, 2.23, 88.9],..."
4,ENST00000000233,368,AGGACAA,AGGAC,GGACA,GACAA,"[[0.015, 6.97, 118.0], [0.00365, 4.93, 116.0],...","[[0.0106, 3.04, 123.0], [0.00568, 3.36, 124.0]...","[[0.011, 3.34, 81.8], [0.0052, 4.39, 87.3], [0..."


In [7]:
# Load the .info file, which is parsed as ordinary CSV
def read_info_file(file_path):
    """Read the `.info` file at `file_path` with `pandas.read_csv` defaults.

    Args:
        file_path: Path (or any other source `pandas.read_csv` accepts).

    Returns:
        The parsed DataFrame.
    """
    return pd.read_csv(file_path)

# Load the .info table that accompanies the dataset
file_path = '../data/data.info'
info_data = read_info_file(file_path=file_path)


In [8]:
# Join `info_data` onto `processed_data` by transcript and position. The right join
# keeps every processed row; the 'gene_id' column is dropped afterwards.
merged_data = (
    info_data
    .merge(processed_data, on=['transcript_id', 'transcript_position'], how='right')
    .drop(columns='gene_id')
)
# Show the first few rows of the merged table
print(merged_data.head())

     transcript_id  transcript_position  label sequence_key sequence_-1  \
0  ENST00000000233                  244      0      AAGACCA       AAGAC   
1  ENST00000000233                  261      0      CAAACTG       CAAAC   
2  ENST00000000233                  316      0      GAAACAG       GAAAC   
3  ENST00000000233                  332      0      AGAACAT       AGAAC   
4  ENST00000000233                  368      0      AGGACAA       AGGAC   

  sequence_0 sequence_+1                                        features_-1  \
0      AGACC       GACCA  [[0.00299, 2.06, 125.0], [0.00631, 2.53, 125.0...   
1      AAACT       AACTG  [[0.0126, 1.95, 111.0], [0.00432, 2.35, 111.0]...   
2      AAACA       AACAG  [[0.00432, 2.02, 104.0], [0.0216, 3.29, 107.0]...   
3      GAACA       AACAT  [[0.0134, 4.71, 132.0], [0.0134, 8.18, 128.0],...   
4      GGACA       GACAA  [[0.015, 6.97, 118.0], [0.00365, 4.93, 116.0],...   

                                          features_0  \
0  [[0.0177, 10.4,

In [11]:
def separate_features_for_corresponding_rows(row):
    """Expand one merged row into one flat record per entry of its feature lists.

    Args:
        row: Mapping (dict or pandas Series) with 'transcript_id',
            'transcript_position', 'label', 'sequence_key', the three
            'sequence_*' fields and the three 'features_*' lists. The i-th
            entries of the three feature lists are treated as belonging together.

    Returns:
        A list of dicts, one per entry of 'features_-1'. Each dict repeats the
        identifying fields and adds dwell_time / sd / mean columns for the
        -1, 0 and +1 positions, taken from elements 0, 1 and 2 of the matching
        feature entry.
    """
    identity = (
        row['transcript_id'],
        row['transcript_position'],
        row['label'],
        row['sequence_key'],
    )

    # Output column names paired with the feature list that fills them
    stat_columns = (
        (('dwell_time_-1', 'sd_-1', 'mean_-1'), row['features_-1']),
        (('dwell_time_0', 'sd_0', 'mean_0'), row['features_0']),
        (('dwell_time_+1', 'sd_+1', 'mean_+1'), row['features_+1']),
    )

    # The -1 list sets the record count; the other lists are assumed equally long
    n_entries = len(stat_columns[0][1])

    records = []
    for i in range(n_entries):
        transcript_id, transcript_position, label, sequence_key = identity
        record = {
            'transcript_id': transcript_id,
            'transcript_position': transcript_position,
            'label': label,
            'sequence_key': sequence_key,
            'sequence_-1': row['sequence_-1'],
            'sequence_0': row['sequence_0'],
            'sequence_+1': row['sequence_+1'],
        }
        for column_names, entries in stat_columns:
            entry = entries[i]
            # Element order within an entry: dwell time, standard deviation, mean
            for slot, column in enumerate(column_names):
                record[column] = entry[slot]
        records.append(record)

    return records

# Expand every merged row into its individual records and collect them in one DataFrame
separated_data = pd.DataFrame([
    read_record
    for _, merged_row in merged_data.iterrows()
    for read_record in separate_features_for_corresponding_rows(merged_row)
])



In [12]:
# Display the expanded table built from `merged_data` (pandas truncates the rendering of long frames)
separated_data

Unnamed: 0,transcript_id,transcript_position,label,sequence_key,sequence_-1,sequence_0,sequence_+1,dwell_time_-1,sd_-1,mean_-1,dwell_time_0,sd_0,mean_0,dwell_time_+1,sd_+1,mean_+1
0,ENST00000000233,244,0,AAGACCA,AAGAC,AGACC,GACCA,0.00299,2.06,125.0,0.01770,10.40,122.0,0.00930,10.90,84.1
1,ENST00000000233,244,0,AAGACCA,AAGAC,AGACC,GACCA,0.00631,2.53,125.0,0.00844,4.67,126.0,0.01030,6.30,80.9
2,ENST00000000233,244,0,AAGACCA,AAGAC,AGACC,GACCA,0.00465,3.92,109.0,0.01360,12.00,124.0,0.00498,2.13,79.6
3,ENST00000000233,244,0,AAGACCA,AAGAC,AGACC,GACCA,0.00398,2.06,125.0,0.00830,5.01,130.0,0.00498,3.78,80.4
4,ENST00000000233,244,0,AAGACCA,AAGAC,AGACC,GACCA,0.00664,2.92,120.0,0.00266,3.94,129.0,0.01300,7.15,82.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11027101,ENST00000641834,1693,0,TTGACAT,TTGAC,TGACA,GACAT,0.00418,7.49,108.0,0.00564,10.20,116.0,0.01000,2.01,76.4
11027102,ENST00000641834,1693,0,TTGACAT,TTGAC,TGACA,GACAT,0.00664,1.91,109.0,0.00598,12.30,110.0,0.01760,2.61,74.6
11027103,ENST00000641834,1693,0,TTGACAT,TTGAC,TGACA,GACAT,0.00721,4.58,105.0,0.00398,6.58,113.0,0.00316,2.28,85.3
11027104,ENST00000641834,1693,0,TTGACAT,TTGAC,TGACA,GACAT,0.00266,2.33,109.0,0.00913,10.40,108.0,0.00664,4.44,76.8


In [13]:
# Write the expanded table (`separated_data`) to CSV without the DataFrame index
separated_data.to_csv(path_or_buf='../data/processed_data.csv', index=False)