In [1]:
import pandas as pd
import numpy as np
import scanpy as sc

In [2]:
# Load the two signed Granger score matrices (T and A); the first CSV column
# becomes the row index of each frame.
granger_df_all_T, granger_df_all_A = (
    pd.read_csv(score_csv, index_col=0)
    for score_csv in (
        '../results_final_Jan15/granger_all_T_signed_score.csv',
        '../results_final_Jan15/granger_all_A_signed_score.csv',
    )
)

In [4]:
# Read the .h5ad file with scanpy. Only `adata.var['gene_ids']` is used in the
# cells below (period check and the label -> gene-ID mapping).
adata = sc.read_h5ad('../data/rna_figure_ready.h5ad')

In [5]:
# Read the two label lists (one entry per line). They are used further down as
# the column selection (hvg_50) and row selection (hv_tf_50) of the score matrices.
with open('../data/hvg_50.txt', 'r') as hvg_file, \
        open('../data/hv_tf_50.txt', 'r') as hv_tf_file:
    hvg_50 = hvg_file.read().splitlines()
    hv_tf_50 = hv_tf_file.read().splitlines()

In [25]:
# Check that no gene_id contains a period.
# The pattern is a raw string so the backslash reaches the regex engine intact:
# in a normal string literal '\.' is an invalid escape sequence, which Python
# reports as a DeprecationWarning (SyntaxWarning on 3.12+).
assert not adata.var['gene_ids'].str.contains(r'\.').any(), \
    "Some adata.var['gene_ids'] values contain a period"

In [19]:
# Series.to_dict() maps each index label of `adata.var` to its `gene_ids` value.
# The dict is used below to relabel the score matrices and the ranked gene lists.
# NOTE(review): presumably the `adata.var` index holds unique gene names/symbols;
# with duplicate labels only one entry per label would survive — confirm.
gene_name2id = adata.var['gene_ids'].to_dict()

In [21]:
# Relabel rows and columns of the full T score matrix through gene_name2id
# (labels missing from the mapping are left as they are by `rename`), then save.
granger_df_all_T_id = (
    granger_df_all_T
    .rename(index=gene_name2id)
    .rename(columns=gene_name2id)
)
granger_df_all_T_id.to_csv('../results_final_Jan15/granger_all_T_signed_score_id.csv')

In [45]:
# NOTE(review): this writes `granger_df_all_T_id` to the same path that the cell
# creating it already writes to; it looks like a leftover re-run — confirm
# before removing.
granger_df_all_T_id.to_csv('../results_final_Jan15/granger_all_T_signed_score_id.csv')

In [26]:
# Relabel rows and columns of the full A score matrix through gene_name2id
# (labels missing from the mapping are left as they are by `rename`), then save.
granger_df_all_A_id = (
    granger_df_all_A
    .rename(index=gene_name2id)
    .rename(columns=gene_name2id)
)
granger_df_all_A_id.to_csv('../results_final_Jan15/granger_all_A_signed_score_id.csv')

In [43]:
# Restrict both full score matrices to the hv_tf_50 rows and hvg_50 columns
# (`.loc` raises KeyError if a listed label is absent), then save each subset.
granger_df_50_T = granger_df_all_T.loc[hv_tf_50, hvg_50]
granger_df_50_A = granger_df_all_A.loc[hv_tf_50, hvg_50]

for subset_df, subset_csv in (
    (granger_df_50_T, '../results_final_Jan15/granger_50_T_signed_score.csv'),
    (granger_df_50_A, '../results_final_Jan15/granger_50_A_signed_score.csv'),
):
    subset_df.to_csv(subset_csv)

In [34]:
# Relabel rows and columns of the 50-set T and A matrices through gene_name2id
# (labels missing from the mapping are left as they are), then save each.
granger_df_50_T_id = granger_df_50_T.rename(index=gene_name2id).rename(columns=gene_name2id)
granger_df_50_A_id = granger_df_50_A.rename(index=gene_name2id).rename(columns=gene_name2id)

for relabeled_df, relabeled_csv in (
    (granger_df_50_T_id, '../results_final_Jan15/granger_50_T_signed_score_id.csv'),
    (granger_df_50_A_id, '../results_final_Jan15/granger_50_A_signed_score_id.csv'),
):
    relabeled_df.to_csv(relabeled_csv)


In [36]:
# Rank the rows of each 50-set score matrix by their largest absolute score and
# keep the first `n_top` distinct row labels.
#
# For each matrix, every cell becomes a (row, column, |score|) triple; the
# triples are sorted by |score| (largest first) and the first triple met for
# each row label is kept until `n_top` labels have been collected or the
# triples run out.
n_top = 100

ranked = {}
for label, score_df in (('T', granger_df_50_T), ('A', granger_df_50_A)):
    # Take absolute values once per matrix (not once per row).
    abs_scores = score_df.abs()
    pairs = [
        (idx, col, val)
        for idx, row_scores in abs_scores.iterrows()
        for col, val in row_scores.items()
    ]

    # list.sort is stable, so tied scores keep their row-major matrix order.
    pairs.sort(key=lambda x: x[2], reverse=True)

    # dict preserves insertion order: rows end up ordered strongest first.
    best_by_row = {}
    for idx, col, val in pairs:
        if idx in best_by_row:
            continue
        best_by_row[idx] = (idx, col, val)
        if len(best_by_row) == n_top:
            break

    ranked[label] = pd.DataFrame(
        list(best_by_row.values()), columns=['Index', 'Column', 'Value']
    )

# Same result names as before: one table per matrix plus the ranked row labels.
result = ranked['T']
top_100_T = result['Index'].to_list()

result_A = ranked['A']
top_100_A = result_A['Index'].to_list()

In [37]:
# Labels that appear in both the T and the A top-100 lists
top_T_labels = set(top_100_T)
top_A_labels = set(top_100_A)
intersection = top_T_labels.intersection(top_A_labels)
print(f"Number of genes in intersection: {len(intersection)}")

Number of genes in intersection: 87


In [38]:
# Split the two top-100 lists into T-only, A-only and shared labels.
unique_T = set(top_100_T) - intersection
unique_A = set(top_100_A) - intersection

# Build the lists by walking the ranked lists instead of calling list() on the
# sets: the iteration order of a set of strings changes between kernel sessions
# (string hashing is randomized), which would make the exported sheets differ
# from run to run. Shared labels follow their order in top_100_T.
intersection_list = [gene for gene in top_100_T if gene in intersection]
unique_T_list = [gene for gene in top_100_T if gene in unique_T]
unique_A_list = [gene for gene in top_100_A if gene in unique_A]



In [42]:
# One single-column ('Gene') table per sheet, in the order the sheets are written
sheet_contents = (
    ('Unique_T', unique_T_list),
    ('Unique_A', unique_A_list),
    ('Intersection', intersection_list),
    ('Top_100_T', top_100_T),
    ('Top_100_A', top_100_A),
)
data_dict = {
    sheet: pd.DataFrame(genes, columns=['Gene'])
    for sheet, genes in sheet_contents
}

# Workbook keyed by the original labels: one sheet per table
with pd.ExcelWriter('../results_final_Jan15/granger_causality_rankings_50.xlsx') as writer:
    for sheet_name in data_dict:
        data_dict[sheet_name].to_excel(writer, sheet_name=sheet_name, index=False)

# Same tables with 'Gene' passed through gene_name2id; Series.map yields NaN
# for any label that is not a key of the mapping.
data_dict_ids = {
    name: table.assign(Gene=table['Gene'].map(gene_name2id))
    for name, table in data_dict.items()
}

# Workbook keyed by gene IDs, same sheet layout
with pd.ExcelWriter('../results_final_Jan15/granger_causality_rankings_50_ids.xlsx') as writer:
    for sheet_name, df in data_dict_ids.items():
        df.to_excel(writer, sheet_name=sheet_name, index=False)

In [46]:
# Top-200 version of the ranking: rank the rows of each 50-set score matrix by
# their largest absolute score, compare the T and A rankings, and export the
# resulting gene lists to Excel (once with the original labels, once mapped
# through gene_name2id).
n_top = 200

ranked = {}
for label, score_df in (('T', granger_df_50_T), ('A', granger_df_50_A)):
    # Take absolute values once per matrix (not once per row).
    abs_scores = score_df.abs()
    pairs = [
        (idx, col, val)
        for idx, row_scores in abs_scores.iterrows()
        for col, val in row_scores.items()
    ]

    # Sort by absolute value; list.sort is stable, so ties keep matrix order.
    pairs.sort(key=lambda x: x[2], reverse=True)

    # Keep the first (strongest) triple per row until n_top rows are collected;
    # dict insertion order keeps the rows ranked strongest first.
    best_by_row = {}
    for idx, col, val in pairs:
        if idx in best_by_row:
            continue
        best_by_row[idx] = (idx, col, val)
        if len(best_by_row) == n_top:
            break

    ranked[label] = pd.DataFrame(
        list(best_by_row.values()), columns=['Index', 'Column', 'Value']
    )

result = ranked['T']
top_200_T = result['Index'].to_list()

result_A = ranked['A']
top_200_A = result_A['Index'].to_list()

# Find intersection between top 200 genes from T and A
intersection = set(top_200_T) & set(top_200_A)
print(f"Number of genes in intersection: {len(intersection)}")

unique_T = set(top_200_T) - intersection
unique_A = set(top_200_A) - intersection

# Build the lists in rank order rather than via list(set): set iteration order
# for strings changes between kernel sessions (hash randomization), which would
# make the exported sheets differ from run to run.
intersection_list = [gene for gene in top_200_T if gene in intersection]
unique_T_list = [gene for gene in top_200_T if gene in unique_T]
unique_A_list = [gene for gene in top_200_A if gene in unique_A]

# Create a dictionary of dataframes for each list
data_dict = {
    'Unique_T': pd.DataFrame(unique_T_list, columns=['Gene']),
    'Unique_A': pd.DataFrame(unique_A_list, columns=['Gene']),
    'Intersection': pd.DataFrame(intersection_list, columns=['Gene']),
    'Top_200_T': pd.DataFrame(top_200_T, columns=['Gene']),
    'Top_200_A': pd.DataFrame(top_200_A, columns=['Gene'])
}

# Save to Excel with multiple sheets
with pd.ExcelWriter('../results_final_Jan15/granger_causality_rankings_50_top200.xlsx') as writer:
    for sheet_name, df in data_dict.items():
        df.to_excel(writer, sheet_name=sheet_name, index=False)

# Convert gene names to IDs (Series.map yields NaN for labels not in the mapping)
data_dict_ids = {}
for name, df in data_dict.items():
    df_ids = df.copy()
    df_ids['Gene'] = df_ids['Gene'].map(gene_name2id)
    data_dict_ids[name] = df_ids

# Save version with gene IDs to Excel
with pd.ExcelWriter('../results_final_Jan15/granger_causality_rankings_50_top200_ids.xlsx') as writer:
    for sheet_name, df in data_dict_ids.items():
        df.to_excel(writer, sheet_name=sheet_name, index=False)

Number of genes in intersection: 173


In [48]:
# Size of the T-only list; `unique_T_list` is reassigned by every ranking cell,
# so the value reflects whichever of those cells ran most recently.
print(len(unique_T_list))

27


In [49]:
# Top-300 version of the ranking: rank the rows of each 50-set score matrix by
# their largest absolute score, compare the T and A rankings, and export the
# resulting gene lists to Excel (once with the original labels, once mapped
# through gene_name2id).
n_top = 300

ranked = {}
for label, score_df in (('T', granger_df_50_T), ('A', granger_df_50_A)):
    # Take absolute values once per matrix (not once per row).
    abs_scores = score_df.abs()
    pairs = [
        (idx, col, val)
        for idx, row_scores in abs_scores.iterrows()
        for col, val in row_scores.items()
    ]

    # Sort by absolute value; list.sort is stable, so ties keep matrix order.
    pairs.sort(key=lambda x: x[2], reverse=True)

    # Keep the first (strongest) triple per row until n_top rows are collected;
    # dict insertion order keeps the rows ranked strongest first.
    best_by_row = {}
    for idx, col, val in pairs:
        if idx in best_by_row:
            continue
        best_by_row[idx] = (idx, col, val)
        if len(best_by_row) == n_top:
            break

    ranked[label] = pd.DataFrame(
        list(best_by_row.values()), columns=['Index', 'Column', 'Value']
    )

result = ranked['T']
top_300_T = result['Index'].to_list()

result_A = ranked['A']
top_300_A = result_A['Index'].to_list()

# Find intersection between top 300 genes from T and A
intersection = set(top_300_T) & set(top_300_A)
print(f"Number of genes in intersection: {len(intersection)}")

unique_T = set(top_300_T) - intersection
unique_A = set(top_300_A) - intersection

# Build the lists in rank order rather than via list(set): set iteration order
# for strings changes between kernel sessions (hash randomization), which would
# make the exported sheets differ from run to run.
intersection_list = [gene for gene in top_300_T if gene in intersection]
unique_T_list = [gene for gene in top_300_T if gene in unique_T]
unique_A_list = [gene for gene in top_300_A if gene in unique_A]

# Create a dictionary of dataframes for each list
data_dict = {
    'Unique_T': pd.DataFrame(unique_T_list, columns=['Gene']),
    'Unique_A': pd.DataFrame(unique_A_list, columns=['Gene']),
    'Intersection': pd.DataFrame(intersection_list, columns=['Gene']),
    'Top_300_T': pd.DataFrame(top_300_T, columns=['Gene']),
    'Top_300_A': pd.DataFrame(top_300_A, columns=['Gene'])
}

# Save to Excel with multiple sheets
with pd.ExcelWriter('../results_final_Jan15/granger_causality_rankings_50_top300.xlsx') as writer:
    for sheet_name, df in data_dict.items():
        df.to_excel(writer, sheet_name=sheet_name, index=False)

# Convert gene names to IDs (Series.map yields NaN for labels not in the mapping)
data_dict_ids = {}
for name, df in data_dict.items():
    df_ids = df.copy()
    df_ids['Gene'] = df_ids['Gene'].map(gene_name2id)
    data_dict_ids[name] = df_ids

# Save version with gene IDs to Excel
with pd.ExcelWriter('../results_final_Jan15/granger_causality_rankings_50_top300_ids.xlsx') as writer:
    for sheet_name, df in data_dict_ids.items():
        df.to_excel(writer, sheet_name=sheet_name, index=False)

Number of genes in intersection: 263


In [51]:
# Size of the T-only list; `unique_T_list` is reassigned by every ranking cell,
# so the value reflects whichever of those cells ran most recently.
print(len(unique_T_list))

37
