In [3]:
import pandas as pd
from pathlib import Path

def union_gene_tables(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame:
    """Return the column-wise union of two tables that describe the same rows.

    Columns present in both inputs are replaced by their row-wise mean
    (pandas skips NaN by default, so a value missing on one side yields the
    other side's value, and a value missing on both sides stays NaN).
    Columns that exist only in ``df2`` are appended after the columns of
    ``df1``.  Neither input frame is modified.

    Rows are paired by index label; for frames loaded with ``pd.read_csv``
    without an index column that is the same as pairing them by position.

    NOTE(review): every shared column is averaged, not only gene columns.
    If both files also share an ID or label column it will be averaged as
    well - confirm that the inputs contain gene columns only.

    Args:
        df1: first table; its column order is kept in the result.
        df2: second table with the same number of rows as ``df1``.

    Returns:
        A new DataFrame with the columns of ``df1`` (shared ones averaged)
        followed by the columns found only in ``df2``.

    Raises:
        ValueError: if the two tables have a different number of rows.
    """
    if df1.shape[0] != df2.shape[0]:
        raise ValueError(f"Row counts differ: df1 has {df1.shape[0]} rows, df2 has {df2.shape[0]} rows. "
                         "Since you said rows are the same serials, check ordering or missing rows.")

    overlap = df1.columns.intersection(df2.columns).tolist()

    # Work on a copy so the caller's df1 keeps its original values.
    combined = df1.copy()

    # One shared column at a time: each step only materialises a temporary
    # two-column frame instead of a second copy of the whole shared sub-table.
    for gene in overlap:
        combined[gene] = pd.concat([df1[gene], df2[gene]], axis=1).mean(axis=1)

    # Dropping an empty list is a no-op, so no special case is needed when
    # the two tables share no columns.
    df2_only = df2.drop(columns=overlap)

    return pd.concat([combined, df2_only], axis=1)


# The guard keeps the file I/O out of the way when this code is imported;
# inside a notebook or when run as a script __name__ is "__main__", so the
# statements below execute exactly as a plain top-level script would.
if __name__ == "__main__":
    # paths - change to your file paths
    f1 = Path('/Users/arujatiwary/Desktop/DL+IS-2/Multi GA.csv')
    f2 = Path('/Users/arujatiwary/Desktop/DL+IS-2/Binary GA.csv')
    out = Path('/Users/arujatiwary/Desktop/DL+IS-2/dataset_union.csv')

    # 1) load without using any column as index (rows are patients in order)
    df1 = pd.read_csv(f1, header=0)   # header assumed present: gene names in first row
    df2 = pd.read_csv(f2, header=0)

    # 2) build the union; raises ValueError if the row counts differ
    df_union = union_gene_tables(df1, df2)
    print("Rows match:", df1.shape[0], "rows")

    # 3) report how many gene columns were shared (and therefore averaged)
    overlap = df1.columns.intersection(df2.columns).tolist()
    print("Number of overlapping gene columns:", len(overlap))

    # Optional: re-order columns (alphabetical) or keep as-is
    # df_union = df_union.reindex(sorted(df_union.columns), axis=1)

    # 4) save
    df_union.to_csv(out, index=False)
    print("Saved union dataset to:", out)
    print("Union shape (rows, cols):", df_union.shape)


Rows match: 10459 rows
Number of overlapping gene columns: 4557
Saved union dataset to: /Users/arujatiwary/Desktop/DL+IS-2/dataset_union.csv
Union shape (rows, cols): (10459, 13470)
