In [11]:
import os
import gzip
import json
import pandas as pd

# --------------------------
# CONFIGURATION
# --------------------------
# Both inputs and the merged output of this cell live under one directory.
TASK1_DIR = "task1"
JSON_FILE = os.path.join(TASK1_DIR, "dataset0.json.gz")
INFO_LABEL_FILE = os.path.join(TASK1_DIR, "data.info.labelled")
PARSED_CSV_OUTPUT = os.path.join(TASK1_DIR, "parsed_dataset0.csv")

# Fail fast, before any parsing starts, if the directory or an input is missing.
if not os.path.isdir(TASK1_DIR):
    raise FileNotFoundError(f"Directory '{TASK1_DIR}' not found.")

# (path to check, error message) pairs, checked in this order.
_missing_input_errors = (
    (JSON_FILE, f"JSON file not found: {JSON_FILE}"),
    (INFO_LABEL_FILE, f"Info/label file not found: {INFO_LABEL_FILE}"),
)
for _required_path, _message in _missing_input_errors:
    if not os.path.exists(_required_path):
        raise FileNotFoundError(_message)

# --------------------------
# STEP 1: PARSE dataset0.json.gz INTO ROWS
# --------------------------
# Each non-empty line is parsed as one JSON object nested as
#   {transcript_id: {position: {kmer: [[9 feature values], ...]}}}
# and every 9-value feature list becomes one output row.
rows = []
parsed_count = 0
skipped_positions = 0  # position keys that could not be converted to int
skipped_features = 0   # feature entries that were not a 9-element list/tuple

# Decode explicitly as UTF-8 instead of relying on the platform's default
# text encoding.
with gzip.open(JSON_FILE, "rt", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        obj = json.loads(line)
        for transcript_id, positions in obj.items():
            for position, kmers in positions.items():
                # JSON object keys arrive as strings; store the position as int.
                try:
                    pos_int = int(position)
                except (TypeError, ValueError):
                    # Best effort: skip the malformed position (and everything
                    # nested under it), but count it so the loss is reported.
                    skipped_positions += 1
                    continue
                for kmer, feature_lists in kmers.items():
                    for feats in feature_lists:
                        # Only 9-element lists/tuples are accepted as a feature row.
                        if not (isinstance(feats, (list, tuple)) and len(feats) == 9):
                            skipped_features += 1
                            continue
                        rows.append({
                            "transcript_id": transcript_id,
                            "position": pos_int,
                            "kmer": kmer,
                            "feat1": feats[0],
                            "feat2": feats[1],
                            "feat3": feats[2],
                            "feat4": feats[3],
                            "feat5": feats[4],
                            "feat6": feats[5],
                            "feat7": feats[6],
                            "feat8": feats[7],
                            "feat9": feats[8]
                        })
                        parsed_count += 1

print(f"Parsed {parsed_count} feature rows from {JSON_FILE}")
if skipped_positions or skipped_features:
    print(
        f"Warning: skipped {skipped_positions} malformed position(s) and "
        f"{skipped_features} malformed feature list(s)."
    )

# Convert to DataFrame
df = pd.DataFrame(rows)
print("Parsed DataFrame shape:", df.shape)
if df.shape[0] == 0:
    raise ValueError("No rows parsed from JSON. Check the JSON file and parsing logic.")

# --------------------------
# STEP 2: READ data.info.labelled (CSV with header)
# Expect header: gene_id,transcript_id,transcript_position,label
# --------------------------
# The first line of the file is taken as the header (pandas default).
info_df = pd.read_csv(INFO_LABEL_FILE)

# Validate expected columns exist
expected_cols = {"gene_id", "transcript_id", "transcript_position", "label"}
if not expected_cols.issubset(set(info_df.columns)):
    raise ValueError(
        f"Info file columns mismatch. Expected at least {expected_cols}; found {set(info_df.columns)}"
    )

# Rename the position column to 'position' to match parsed df
info_df = info_df.rename(columns={"transcript_position": "position"})

# A position is valid only if it is numeric AND a whole number. Casting a
# fractional value to int would silently truncate it (244.7 -> 244) and could
# then match the wrong site in the merge. NaN and +/-inf also fail the
# `% 1 == 0` test, so they are rejected here as well.
position_numeric = pd.to_numeric(info_df["position"], errors="coerce")
position_is_valid = position_numeric.notna() & (position_numeric % 1 == 0)

before_drop = len(info_df)
# .copy() so the column assignment below writes to an independent frame.
info_df = info_df.loc[position_is_valid].copy()
dropped = before_drop - len(info_df)
if dropped:
    print(f"Warning: Dropped {dropped} rows from info file due to invalid 'position' values.")
# Fixed-width int64 regardless of platform; safe because only whole numbers remain.
info_df["position"] = position_numeric.loc[position_is_valid].astype("int64")

# Ensure label is an integer column. Values that cannot be parsed become <NA>
# in the nullable Int64 dtype; report them rather than letting them pass unseen.
if not pd.api.types.is_integer_dtype(info_df["label"]):
    info_df["label"] = pd.to_numeric(info_df["label"], errors="coerce").astype("Int64")
n_missing_labels = int(info_df["label"].isna().sum())
if n_missing_labels:
    print(f"Warning: {n_missing_labels} rows in info file have a missing or non-numeric 'label'.")

print("Info DataFrame shape (after cleaning):", info_df.shape)

# --------------------------
# STEP 3: MERGE parsed JSON data with info_df
# --------------------------
# Attach gene_id and label to every parsed row via (transcript_id, position).
join_keys = ["transcript_id", "position"]
info_columns = ["gene_id", "transcript_id", "position", "label"]
merged = pd.merge(
    df,
    info_df[info_columns],
    how="left",       # keep every parsed row, matched or not
    on=join_keys,
    validate="m:1",   # many parsed rows per info row; errors if info repeats a key pair
)

print("Merged DataFrame shape:", merged.shape)

# Report merge stats: rows whose gene_id is non-null after the left join.
matched_mask = merged["gene_id"].notna()
num_with_info = matched_mask.sum()
print(f"Rows with matching gene_id/label from info file: {num_with_info} / {len(merged)}")

# --------------------------
# STEP 4: SAVE RESULT
# --------------------------
# Write the merged table to CSV without the index column.
merged.to_csv(PARSED_CSV_OUTPUT, index=False)
print(f"Saved merged dataframe to: {PARSED_CSV_OUTPUT}")

# Optional: plain-text preview of the first rows
preview_text = merged.head().to_string(index=False)
print("\nFirst 5 rows of merged dataframe:", preview_text, sep="\n")



Parsed 11027106 feature rows from task1/dataset0.json.gz
Parsed DataFrame shape: (11027106, 12)
Info DataFrame shape (after cleaning): (121838, 4)
Merged DataFrame shape: (11027106, 14)
Rows with matching gene_id/label from info file: 11027106 / 11027106
Saved merged dataframe to: task1/parsed_dataset0.csv

First 5 rows of merged dataframe:
  transcript_id  position    kmer   feat1  feat2  feat3   feat4  feat5  feat6   feat7  feat8  feat9         gene_id  label
ENST00000000233       244 AAGACCA 0.00299   2.06  125.0 0.01770  10.40  122.0 0.00930  10.90   84.1 ENSG00000004059      0
ENST00000000233       244 AAGACCA 0.00631   2.53  125.0 0.00844   4.67  126.0 0.01030   6.30   80.9 ENSG00000004059      0
ENST00000000233       244 AAGACCA 0.00465   3.92  109.0 0.01360  12.00  124.0 0.00498   2.13   79.6 ENSG00000004059      0
ENST00000000233       244 AAGACCA 0.00398   2.06  125.0 0.00830   5.01  130.0 0.00498   3.78   80.4 ENSG00000004059      0
ENST00000000233       244 AAGACCA 0.00664 

In [12]:
# From here on, `df` refers to the merged (features + gene_id + label) frame.
df = merged
# Count distinct gene_ids; missing values are excluded from the count.
num_genes = len(df["gene_id"].dropna().unique())
print("Number of unique gene_ids:", num_genes)

Number of unique gene_ids: 3852


In [13]:

import pandas as pd
from sklearn.model_selection import train_test_split

# Assumes `df` is the merged frame with 'gene_id' and 'label' columns.
# The split is made on genes, not rows, so that all rows of a gene end up on
# the same side.
TEST_FRACTION = 0.2  # share of genes (not rows) held out for the test set
SPLIT_SEED = 42      # fixed so the split is reproducible

# One label per gene (its most frequent row label), used only to stratify the
# gene-level split. Rows with a missing label are left out of this vote so a
# gene whose labels are all missing cannot produce an empty mode() and crash.
labelled_rows = df.loc[df["label"].notna(), ["gene_id", "label"]]
gene_labels = (
    labelled_rows.groupby("gene_id")["label"]
    .agg(lambda labels: labels.mode().iloc[0])  # ties resolve to the smallest label
    .reset_index()
)

# Split gene_ids into train/test while stratifying by the per-gene label
train_genes, test_genes = train_test_split(
    gene_labels["gene_id"],
    test_size=TEST_FRACTION,
    stratify=gene_labels["label"],
    random_state=SPLIT_SEED,
)

# Select rows corresponding to train/test genes
train_df = df[df["gene_id"].isin(train_genes)].reset_index(drop=True)
test_df = df[df["gene_id"].isin(test_genes)].reset_index(drop=True)

# Rows with a missing gene_id (or belonging to a gene with no usable label)
# match neither gene list; report them instead of losing them silently.
n_unassigned = len(df) - len(train_df) - len(test_df)
if n_unassigned:
    print(f"Warning: {n_unassigned} rows have no usable gene_id/label and are in neither split.")

# Check sizes
print("Train size:", len(train_df))
print("Test size:", len(test_df))

# Optional: check label balance
print("Train label distribution:\n", train_df['label'].value_counts(normalize=True))
print("Test label distribution:\n", test_df['label'].value_counts(normalize=True))

# Save to CSV next to the other task outputs (same directory as the config cell uses)
train_df.to_csv(os.path.join(TASK1_DIR, "train_set.csv"), index=False)
test_df.to_csv(os.path.join(TASK1_DIR, "test_set.csv"), index=False)

Train size: 8786719
Test size: 2240387
Train label distribution:
 label
0    0.953795
1    0.046205
Name: proportion, dtype: float64
Test label distribution:
 label
0    0.958227
1    0.041773
Name: proportion, dtype: float64


In [14]:
# Leakage check: a gene_id must not appear on both sides of the split.
train_gene_ids = set(train_df["gene_id"])
test_gene_ids = set(test_df["gene_id"])
overlap_genes = train_gene_ids & test_gene_ids

if overlap_genes:
    print("Check failed ❌: Overlapping gene_ids found:", overlap_genes)
else:
    print("Check passed ✅: No gene_ids in test are in train.")


Check passed ✅: No gene_ids in test are in train.
