# Building an informative conflict table

After running the 'classifying_cell_types' script we'll have a folder for each sample containing everything we need in order to
build a table mapping each cell that was classified with some cell-types to the corresponding (negative) markers it had a conflict with.

Note: cells in that table might still have (and actually PROBABLY have) a final classification once the cell-types with conflicts are removed.

In [4]:
import sys
import os
# Folders holding the project's local modules. They are added to sys.path,
# presumably so the project-local imports in the next cell can be resolved.
# The root is written once and every other folder is derived from it.
_PROJECT_ROOT = r'D:\Technion studies\Keren Laboratory\python_playground\classifying-response-to-immunotherapy'
lib = fr'{_PROJECT_ROOT}\utilities\droplet_dataset'
lib2 = fr'{_PROJECT_ROOT}\utilities'
lib3 = fr'{_PROJECT_ROOT}\data_analysis'
lib4 = _PROJECT_ROOT
lib5 = fr'{_PROJECT_ROOT}\scripts'

# Append each folder only once, so re-running this cell does not keep growing sys.path.
for _lib_dir in (lib, lib2, lib3, lib4, lib5):
    if _lib_dir not in sys.path:
        sys.path.append(_lib_dir)

In [5]:
import sklearn
from droplet_dataset import *
from utilities import *
from matplotlib import pyplot
import pandas as pd
from os.path import join
from classifying_cell_types import *


In [9]:
# Name of this run and the folder that collects the outputs of all runs.
EXPERIMENT_NAME = r'13.11.20_2'
EXPERIMENTS_PATH = r'D:\Technion studies\Keren Laboratory\python_playground\outputs\building_informative_tables'
# Built from the two constants defined right above. The previous version referenced
# EXPERIMENTS_ROOT / EXPERIMENT, which are not defined anywhere in this notebook.
OUTPUT_PATH = join(EXPERIMENTS_PATH, EXPERIMENT_NAME)

# Folder that contains one sub-folder per sample.
SAMPLES_PATH = r'D:\Technion studies\Keren Laboratory\python_playground\outputs\scrublet\5.12.20'

# Every entry of SAMPLES_PATH whose name contains neither 'csv' nor 'xlsx' is treated
# as a sample folder. Sorted so the sample order does not depend on the file system.
samples = sorted(entry for entry in os.listdir(SAMPLES_PATH)
                 if 'csv' not in entry and 'xlsx' not in entry)
# Name of the per-sample table read from each sample folder.
CSV_NAME = 'informative_conflict_table.csv'


# Union the DFs of all samples into one DF

In [12]:
# New version (13.11.20) doesn't require conversion.
# Rows of all samples are collected in a plain list and the DataFrame is built once:
# DataFrame.append in a loop is quadratic and no longer exists in pandas >= 2.0.
conflict_columns = ['sample_id', 'cell_index (from 0)', 'classified cell-types',
                    'problematic markers (potential for conflict)']
conflict_rows = []
for sample in samples:
    sample_path = join(SAMPLES_PATH, sample, CSV_NAME)
    sample_df = pd.read_csv(sample_path).fillna(-1)

    # Per row: replace the first CSV column with the sample id, keep the remaining
    # columns, and add an empty (NaN) markers cell that is filled in further below.
    conflict_rows.extend([sample] + v[1:] + [np.nan] for v in sample_df.values.tolist())

# dtype=object keeps the values exactly as collected, so the markers column can later
# receive strings; the resulting index is a unique 0..N-1 range.
conflict_df = pd.DataFrame(conflict_rows, columns=conflict_columns, dtype=object)


M100


FileNotFoundError: [Errno 2] File b'D:\\Technion studies\\Keren Laboratory\\python_playground\\outputs\\scrublet\\5.12.20\\M100\\informative_conflict_table.csv' does not exist: b'D:\\Technion studies\\Keren Laboratory\\python_playground\\outputs\\scrublet\\5.12.20\\M100\\informative_conflict_table.csv'

In [5]:
conflict_df

Unnamed: 0,sample_id,cell_index (from 0),classified cell-types,problematic markers (potential for conflict)
0,M100,68,myeloid cells_general_immature,
1,M100,233,myeloid cells_general_immature,
2,M100,278,T cells,
3,M100,293,Neutrophils,
4,M100,313,Neutrophils,
...,...,...,...,...
773,M99,5077,myeloid cells_general_immature,
774,M99,5078,Neutrophils,
775,M99,5085,Monocyte_immature;Macrophage_immature,
776,M99,5086,B cells;T cells,


# Find and add the neg markers
Now we have a single DataFrame for all the samples, where each row is a cell that has a conflict.
Next, we find the negative markers responsible for the conflicts and add them to the table.

In [14]:
# Map every sample id to the object returned by extract_droplet_data_from_pickle
# for the file <ROOT_PATH>/<sample>/<PKL_NAME>.
rna_samples = {
    sample_id: extract_droplet_data_from_pickle(join(ROOT_PATH, sample_id, PKL_NAME))
    for sample_id in samples
}

In [15]:
MARKERS_PATH = join(PROJECT_PATH, r'Data\ImmuneCellsMarkersUpdated_12.11.20.xlsx')

# Step 2: Build the positive/negative cell-type marker tables.
# The workbook is opened in a `with` block so its file handle is closed as soon as
# both sheets are read (the previous version left the ExcelFile open).
with pd.ExcelFile(MARKERS_PATH) as xls:
    positive_markers_df = pd.read_excel(xls, sheet_name='and_or')
    negative_markers_df = pd.read_excel(xls, sheet_name='none')

# Convert each sheet with the project helper; negative_markers_table is indexed by
# cell-type name further below.
positive_markers_table = builds_cell_type_markers_table(positive_markers_df)
negative_markers_table = builds_cell_type_markers_table(negative_markers_df)

In [16]:
# For every conflicting cell, list the negative markers of its classified cell-types
# that have a non-zero count in that cell, and store them ';'-joined in the table.
conflict_markers_per_cell = []
for _, row in conflict_df.iterrows():
    cell_idx = row['cell_index (from 0)']
    rna_sample = rna_samples[row['sample_id']]
    problematic_classified_cell_types = row['classified cell-types'].split(';')

    conflict_markers = []
    for cell_type in problematic_classified_cell_types:
        # Negative markers of this cell type, restricted to genes present in the sample.
        markers = [m for m in convert_MHC2_markers_list(negative_markers_table[cell_type])
                   if m in rna_sample.gene_names]
        curr_genes_indexes = find_indexes_of_markers_in_sample(rna_sample.gene_names, markers)
        sample_markers_values = rna_sample.counts[cell_idx, curr_genes_indexes]
        # A marker is kept when its count in this cell is non-zero.
        conflict_markers.extend(m for idx, m in enumerate(markers)
                                if sample_markers_values[idx] != 0)

    # dict.fromkeys removes duplicates while keeping first-seen order, so the joined
    # string is the same on every run (a set gives an arbitrary order).
    conflict_markers_per_cell.append(';'.join(dict.fromkeys(conflict_markers)))

# Write the whole column at once (one value per row, in row order) instead of
# modifying the DataFrame row by row while iterating over it.
conflict_df['problematic markers (potential for conflict)'] = conflict_markers_per_cell


In [10]:
conflict_df

Unnamed: 0,sample_id,cell_index (from 0),classified cell-types,problematic markers (potential for conflict)
0,M100,68,myeloid cells_general_immature,HLA-DRB1;HLA-DQB1;HLA-DPB1;HLA-DRA;HLA-DRB5;HL...
1,M100,233,myeloid cells_general_immature,HLA-DRB1;HLA-DQB1;HLA-DPB1;HLA-DRA;HLA-DRB5;HL...
2,M100,278,T cells,FOXP3
3,M100,293,Neutrophils,CD3E
4,M100,313,Neutrophils,CD3E
...,...,...,...,...
773,M99,5077,myeloid cells_general_immature,HLA-DRB1;HLA-DMA;HLA-DPB1;HLA-DQB1;HLA-DRA;HLA...
774,M99,5078,Neutrophils,CD3E
775,M99,5085,Macrophage_immature;Monocyte_immature,HLA-DRA
776,M99,5086,T cells;B cells,FOXP3;CD3E


# Save it

In [11]:
file_name = r'conflict_summary_15.11.20_2.csv'
# to_csv does not create missing folders, so make sure the output folder exists first.
os.makedirs(OUTPUT_PATH, exist_ok=True)
# index=False: the row index is not written to the file.
conflict_df.to_csv(join(OUTPUT_PATH, file_name), index=False)

# DEBUG ZONE

In [280]:
# Gene symbols to look for in the loaded samples.
MHC2_GENES = ['HLA-DM', 'HLA-DMA', 'HLA-DMB', 'HLA-DO',
             'HLA-DOA', 'HLA-DOB', 'HLA-DP', 'HLA-DPA1',
             'HLA-DPB1', 'HLA-DQ', 'HLA-DQA1', 'HLA-DQA2',
             'HLA-DQB1', 'HLA-DQB2', 'HLA-DR', 'HLA-DRA',
             'HLA-DRB1', 'HLA-DRB3', 'HLA-DRB4', 'HLA-DRB5']

# Collect the MHC2_GENES entries that occur in the gene names of any loaded sample.
# (The unused lookup of sample 'M102' and the commented-out experiments were removed.)
all_genes = []
for rna_sample in rna_samples.values():
    all_genes.extend(g for g in rna_sample.gene_names if g in MHC2_GENES)
# Displayed as the cell output: the distinct genes that were found.
set(all_genes)

{'HLA-DMA',
 'HLA-DMB',
 'HLA-DOA',
 'HLA-DOB',
 'HLA-DPA1',
 'HLA-DPB1',
 'HLA-DQA1',
 'HLA-DQB1',
 'HLA-DQB2',
 'HLA-DRA',
 'HLA-DRB1',
 'HLA-DRB5'}