In [None]:
import pandas as pd 
import numpy as np 

import anndata
import scanpy as sc 


In [None]:
adata = anndata.read_csv('../files/Tsukahara_2021/GSE173947_envA_bidirectional_switch_umi_counts.csv')
meta = pd.read_csv('../files/Tsukahara_2021/GSE173947_envA_bidirectional_switch_metadata.csv', index_col = 0 )

# Combine metadata.
# A left join keeps exactly one row per observation, in the order of
# adata.obs; observations without a metadata row get NaN instead of being
# dropped (the default inner merge could shorten the table, which cannot be
# assigned back to adata.obs). `validate` rejects duplicated index labels
# that would otherwise multiply rows.
obs_with_meta = adata.obs.merge(meta, how = 'left',
                                left_index = True, right_index = True,
                                validate = 'one_to_one')
# The merged table must still describe the same observations, in the same order.
assert obs_with_meta.index.equals(adata.obs_names), \
    'metadata join changed the observation index'
adata.obs = obs_with_meta

### TODO Tasks 
- Identify how OR expression changes across the change of environment
- Create category bins for ORs that are up-regulated, unchanged, or down-regulated after the change of environment
- Observe how Rhbdf2 and associated genes change, and check whether the patterns are consistent across bins

In [None]:
# Basic preprocessing: filtering thresholds and normalization target
MIN_GENES_PER_CELL = 200     # cells with fewer expressed genes are removed
MIN_CELLS_PER_GENE = 3       # genes expressed in fewer cells are removed
COUNTS_PER_CELL = 1e4        # target total count per cell after normalization

# Drop low-complexity cells first, then rarely expressed genes
sc.pp.filter_cells(adata, min_genes=MIN_GENES_PER_CELL)
sc.pp.filter_genes(adata, min_cells=MIN_CELLS_PER_GENE)

# Add QC metric columns to adata.obs / adata.var
sc.pp.calculate_qc_metrics(adata, inplace=True)

# Normalize total counts per cell (the log-transform below is currently disabled)
sc.pp.normalize_total(adata, target_sum=COUNTS_PER_CELL)
# sc.pp.log1p(adata) 

In [None]:
# Identify highly variable genes
# sc.pp.highly_variable_genes(adata, flavor='seurat', n_top_genes=2000)

# # Plot highly variable genes
# sc.pl.highly_variable_genes(adata)

# # Filter the data based on highly variable genes
# adata = adata[:, adata.var['highly_variable']] 

In [None]:
# Embedding parameters
SCALE_CLIP = 10      # upper bound passed to sc.pp.scale
N_NEIGHBORS = 10     # neighborhood size for the kNN graph
N_PCS = 50           # number of principal components fed to the graph

# Scale the expression matrix in place, clipping values above SCALE_CLIP
sc.pp.scale(adata, max_value=SCALE_CLIP)

# Principal component analysis with the ARPACK solver
sc.tl.pca(adata, svd_solver='arpack')

# Neighbor graph on the PCA representation, then the UMAP embedding
sc.pp.neighbors(adata, n_neighbors=N_NEIGHBORS, n_pcs=N_PCS)
sc.tl.umap(adata)

In [None]:
# Figure defaults for the plots that follow: 5x5 panels on a white background
sc.set_figure_params(facecolor = 'white', figsize = [5,5])
# UMAP embedding, one point per cell, colored by the 'source' column
sc.pl.umap(adata, size = 10, color = 'source')

In [None]:
# Coloring by 'leiden' needs a column of that name in adata.obs. No earlier
# cell runs the clustering, so compute it on the existing neighbor graph when
# the column is missing (re-running the cell does not re-cluster).
if 'leiden' not in adata.obs:
    sc.tl.leiden(adata)

# UMAP colored by Rhbdf2 expression and by Leiden cluster
sc.pl.umap(adata, color = ['Rhbdf2', 'leiden'], size = 10)

In [None]:
# Distinct environments, taken from the 'source' column that every later
# comparison filters on. Defined here so the notebook runs on a fresh kernel.
environments = list(adata.obs['source'].dropna().unique())

# Map each environment to the labels of the cells recorded in it.
# NOTE(review): each value is a one-element list wrapping the pandas Index;
# kept as-is in case other code indexes it that way -- confirm it is intended.
env_cellindex = {}
for env in environments: 
    env_cellindex[env] = [adata.obs[adata.obs.source == env].index]

In [None]:
# Gene annotation table of the subset of cells whose top receptor is Olfr1377
# and whose source equals `env` (the value left over from the loop above).
is_olfr1377 = adata.obs.top_Olfr == 'Olfr1377'
is_current_env = adata.obs.source == env
adata[is_olfr1377 & is_current_env].var

In [None]:
# Receptors to test: every string label in `top_Olfr` that contains 'Olfr'.
# Non-string entries (e.g. NaN) are ignored instead of raising a TypeError.
top_Olfr = [Olfr for Olfr in adata.obs.top_Olfr.unique()
            if isinstance(Olfr, str) and 'Olfr' in Olfr]

# One record per (receptor, environment pair). The DataFrame is built once at
# the end rather than concatenated inside the loop.
fold_change_records = []

# NOTE(review): adata.X was passed through sc.pp.scale in an earlier cell, so
# the means below are means of scaled values rather than normalized counts --
# confirm this is the quantity the fold change should be based on.

# Visit every unordered pair of environments exactly once.
for i, env1 in enumerate(environments):
    for env2 in environments[i + 1:]:

        # Subset the data for the two sources
        adata_subset = adata[adata.obs['source'].isin([env1, env2])].copy()

        for Olfr in top_Olfr:
            # A receptor label may have no matching gene column; skip it
            # explicitly instead of relying on a catch-all except.
            if Olfr not in adata_subset.var_names:
                continue

            # Keep only this receptor's column, then only the cells whose
            # top receptor is this receptor.
            adata_subset_Olfr = adata_subset[:, Olfr]
            adata_subset_Olfr = adata_subset_Olfr[(adata_subset_Olfr.obs.top_Olfr == Olfr)]

            # The comparison needs cells from both environments; membership
            # is read from 'source', the same column the means are split on.
            in_env1 = (adata_subset_Olfr.obs['source'] == env1).to_numpy()
            in_env2 = (adata_subset_Olfr.obs['source'] == env2).to_numpy()
            if not (in_env1.any() and in_env2.any()):
                continue

            # Mean receptor expression across the cells of each environment
            env1_Olfr_mean = adata_subset_Olfr[in_env1].X.mean()
            env2_Olfr_mean = adata_subset_Olfr[in_env2].X.mean()

            # Fold change of mean expression, env1 relative to env2
            foldchange = float(env1_Olfr_mean / env2_Olfr_mean)

            fold_change_records.append({'Olfr': Olfr,
                                        'env1': env1,
                                        'env2': env2,
                                        'FoldChange': foldchange})

# Result table; the columns are fixed even when no receptor qualified.
results = pd.DataFrame(fold_change_records,
                       columns=['Olfr', 'env1', 'env2', 'FoldChange'])




In [None]:
# Number of distinct environments represented in the current receptor subset.
# Reads 'source', the column the environment comparisons are made on.
len(adata_subset_Olfr.obs['source'].unique())

In [None]:
# Display the current receptor-specific subset (left over from the loop above).
adata_subset_Olfr

In [None]:
# Manual single-receptor subset for inspection: take the Olfr58 gene column,
# then keep only the cells whose `top_Olfr` label is Olfr58.
olfr_name = 'Olfr58'
olfr_column = adata_subset[:, olfr_name]
adata_subset_Olfr = olfr_column[olfr_column.obs.top_Olfr == olfr_name]


In [None]:
# Per-cell annotation table of the current receptor-specific subset.
adata_subset_Olfr.obs

In [None]:
# Expression matrix of the cells in `adata_subset` that come from `env2`
# and whose top receptor is Olfr114.
from_env2 = adata_subset.obs.source == env2
is_olfr114 = adata_subset.obs.top_Olfr == 'Olfr114'
adata_subset[from_env2 & is_olfr114].X

In [None]:
# True when the subset has at most one cell or at most one gene.
min(adata_subset_Olfr.shape) <= 1

In [None]:
# Display the receptor-specific subset again after the manual checks above.
adata_subset_Olfr