In [4]:
import os
import json
import pandas as pd

# -----------------------------
# 1️⃣ Choose a folder to explore
# -----------------------------
base_dir = "rna_modification_data"
cell_line_folder = "SGNex_H9_directRNA_replicate2_run1"  # change this if you like
folder_path = os.path.join(base_dir, cell_line_folder)

# Show which files the selected folder contains
folder_contents = os.listdir(folder_path)
print("Files in folder:", folder_contents)

# -----------------------------
# 2️⃣ Read .info file
# -----------------------------
# The complete file text is kept in `info_data`; only a short preview is shown.
info_path = os.path.join(folder_path, "data.info")
with open(info_path, "r") as info_file:
    info_data = info_file.read()

# Header and preview go out in one call; sep="\n" gives the same two-line layout
print("\n--- data.info ---", info_data[:500], sep="\n")  # first 500 characters

# -----------------------------
# 3️⃣ Read .readcount file
# -----------------------------
# Same approach: full text in `readcount_data`, 500-character preview printed.
readcount_path = os.path.join(folder_path, "data.readcount")
with open(readcount_path, "r") as readcount_file:
    readcount_data = readcount_file.read()

print("\n--- data.readcount ---", readcount_data[:500], sep="\n")  # first 500 characters



Files in folder: ['data.json', 'data.info', 'data.index', 'data.readcount']

--- data.info ---
transcript_id,transcript_position,start,end,n_reads
SIRV106,38,0,1204,11
SIRV106,69,1204,2507,11
SIRV106,87,2507,3805,11
SIRV106,126,3805,5040,12
SIRV106,202,5040,6493,12
SIRV106,259,6493,8067,13
SIRV106,323,8067,9369,12
SIRV106,350,9369,10479,11
SIRV106,409,10479,11870,13
SIRV106,424,11870,13104,12
SIRV106,478,13104,14428,12
SIRV106,497,14428,16049,14
SIRV106,537,16049,16889,8
SIRV106,560,16889,18360,14
SIRV106,570,18360,19741,13
SIRV106,593,19741,21314,14
SIRV106,628,21314,23193,16
SIRV106,650

--- data.readcount ---
transcript_id,transcript_position,n_reads
SIRV106,38,11
SIRV106,69,11
SIRV106,87,11
SIRV106,126,12
SIRV106,202,12
SIRV106,259,13
SIRV106,323,12
SIRV106,350,11
SIRV106,409,13
SIRV106,424,12
SIRV106,478,12
SIRV106,497,14
SIRV106,537,8
SIRV106,560,14
SIRV106,570,13
SIRV106,593,14
SIRV106,628,16
SIRV106,650,17
SIRV106,663,18
SIRV106,669,17
SIRV106,692,17
SIRV106,801,18
SIRV106,814,

: 

In [5]:
import os
import json
import pandas as pd

# -----------------------------
# Set folder path
# -----------------------------
base_dir = "rna_modification_data"
cell_line_folder = "SGNex_H9_directRNA_replicate2_run1"  # change if needed
folder_path = os.path.join(base_dir, cell_line_folder)


def load_json_preview(path, max_entries=5):
    """Return a small preview of a JSON file without parsing all of it when possible.

    ``json.load`` reads the complete file before parsing and raises
    "Extra data" when the file holds one JSON record per line. This helper
    parses the file line by line instead and stops after ``max_entries``
    records.

    Parameters
    ----------
    path : str
        Path of the JSON file to preview.
    max_entries : int, default 5
        Maximum number of line-delimited records to parse.

    Returns
    -------
    list or object
        * A list with the first ``max_entries`` records when the file holds
          one JSON value per line.
        * The whole parsed document (exactly what ``json.load`` returns) when
          the file is a single JSON document, on one line or spread over many.

    Raises
    ------
    json.JSONDecodeError
        If the file is empty or is not valid JSON in either layout.
    OSError
        If the file cannot be opened.
    """
    records = []
    with open(path, "r") as f:
        try:
            for line in f:
                if len(records) == max_entries:
                    # More content follows the preview: stop reading here.
                    return records
                if line.strip():  # ignore blank lines
                    records.append(json.loads(line))
        except json.JSONDecodeError:
            # A line is not valid JSON on its own, so this is not a
            # line-delimited file; parse it as one document instead.
            f.seek(0)
            return json.load(f)
    if not records:
        # Empty file: raise the same JSONDecodeError json.load would raise.
        return json.loads("")
    if len(records) == 1:
        # Single-line file: indistinguishable from one compact document, so
        # return the parsed value itself, as json.load would.
        return records[0]
    return records


# -----------------------------
# 1️⃣ Read .info file (CSV)
# -----------------------------
info_path = os.path.join(folder_path, "data.info")
try:
    info_df = pd.read_csv(info_path)
    print("\n--- data.info (first 5 rows) ---")
    print(info_df.head())
except Exception as e:
    # Best-effort exploration: report the problem and continue with the next file.
    print("Error reading data.info:", e)

# -----------------------------
# 2️⃣ Read .readcount file (CSV-like)
# -----------------------------
readcount_path = os.path.join(folder_path, "data.readcount")
try:
    readcount_df = pd.read_csv(readcount_path)
    print("\n--- data.readcount (first 5 rows) ---")
    print(readcount_df.head())
except Exception as e:
    print("Error reading data.readcount:", e)

# -----------------------------
# 3️⃣ Read .json file
# -----------------------------
# NOTE(review): the contiguous start/end columns in data.info look like
# per-record byte ranges, which suggests data.json holds one JSON record per
# line — confirm. For such a file `data_json` is only a preview (the first
# records), not the whole data set.
json_path = os.path.join(folder_path, "data.json")
try:
    data_json = load_json_preview(json_path, max_entries=5)
    if isinstance(data_json, list):
        print("\n--- data.json (first 5 entries) ---")
        for entry in data_json[:5]:
            print(entry)
    else:
        print("\n--- data.json keys ---")
        print(list(data_json.keys()))
except Exception as e:
    print("Error reading data.json:", e)

# -----------------------------
# 4️⃣ Read .index file
# -----------------------------
index_path = os.path.join(folder_path, "data.index")
try:
    with open(index_path, "r") as f:
        index_data = f.read()
    print("\n--- data.index (first 500 characters) ---")
    print(index_data[:500])
except Exception as e:
    print("Error reading data.index:", e)



--- data.info (first 5 rows) ---
  transcript_id  transcript_position  start   end  n_reads
0       SIRV106                   38      0  1204       11
1       SIRV106                   69   1204  2507       11
2       SIRV106                   87   2507  3805       11
3       SIRV106                  126   3805  5040       12
4       SIRV106                  202   5040  6493       12

--- data.readcount (first 5 rows) ---
  transcript_id  transcript_position  n_reads
0       SIRV106                   38       11
1       SIRV106                   69       11
2       SIRV106                   87       11
3       SIRV106                  126       12
4       SIRV106                  202       12


In [3]:
import joblib

# Restore the serialized model object from disk.
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code — only load model files that come from a trusted source.
model = joblib.load("full_xgb_model.pkl")

# Ask the model's booster for the feature names it carries and show them
booster = model.get_booster()
feature_names = booster.feature_names
print("Features expected by the model:", feature_names, sep="\n")


Features expected by the model:
['feat1', 'feat2', 'feat3', 'feat4', 'feat5', 'feat6', 'feat7', 'feat8', 'feat9']


In [None]:
info_path = "rna_modification_data/SGNex_HepG2_directRNA_replicate5_run2/data.info"

# Print up to the first 10 lines of the file, stripped of surrounding whitespace.
try:
    with open(info_path, "r") as info_file:
        # zip() ends at 10 lines or at end of file, whichever comes first
        for _line_no, raw_line in zip(range(10), info_file):
            print(raw_line.strip())
except FileNotFoundError:
    print(f"No data.info file found in {info_path}")
except Exception as err:
    # Any other failure (permissions, decoding, ...) is reported, not raised
    print(f"Error reading {info_path}: {err}")
