# Preprocessing for negative binomial mixture model pipeline

In [11]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from Bio.Seq import Seq
import Levenshtein as Lev

# Load in custom functions
from MM_functions import model_fit_functions as ff

import os
# Switch the working directory to the Atlas analysis folder.
path = 'C:/Users/perry/Desktop/Atlas_analysis/'
os.chdir(path)

from datetime import datetime

# Today's date as YYYY-MM-DD, printed so the run date shows up in the cell output.
date = format(datetime.today(), '%Y-%m-%d')
print(date)

2023-10-02


## Loading data

In [22]:
# Work from the repository root so the relative './data/...' paths resolve.
path = 'C:/Users/perry/github_repos/mixed_model_denoising/'
os.chdir(path)

# os.listdir returns entries in arbitrary order, so sort them: the donor number
# assigned to each raw file is then the same on every run and every machine.
# NOTE: the sort is lexicographic (e.g. 'donor10...' sorts before 'donor2...').
raw_files = sorted(os.listdir('./data/raw'))

# Map 1-based donor number -> raw file name.
raw_file_dict = dict(enumerate(raw_files, start=1))

# To-do:
- Make sure all functions in the Python file are actually needed
- Modify them to make them more unique
- Make summary figs here (VRC01, and other visualizations that may be helpful)

In [26]:
# Read every raw file into a DataFrame, keyed by its donor number.
# The first CSV column is used as the row index.
raw_dfs = {}
for donor, donor_file in raw_file_dict.items():
    csv_path = './data/raw/' + donor_file
    raw_dfs[donor] = pd.read_csv(csv_path, sep=',', index_col=0)

In [28]:
# Column names are taken from donor 1's table only
# (presumably every donor file shares the same columns -- verify).
# LSS columns: every column whose name contains '.LSS'.
lss_names = list(filter(lambda col: '.LSS' in col, raw_dfs[1].columns))
# Matching UMI columns: the part of each LSS column name before its first '.'.
umi_names = [lss_col.partition('.')[0] for lss_col in lss_names]

## Filtering
* Separate out donor cells and VRC01 negative control cells
* Remove cells with N > 1 (more than one heavy chain annotation)
* Remove outliers from UMI count distributions (cells at or above the 99th percentile)

In [30]:
# Creating subdirectories to store processed data.
# Each directory is created independently with exist_ok=True: with a single
# try/except around several os.mkdir calls, an already existing
# './data/processed/' raises FileExistsError on the first call and the
# subdirectories below it are never created.
for subdir in ('./data/processed/VRC01_cells', './data/processed/donor_cells'):
    # makedirs also creates any missing parent directory.
    os.makedirs(subdir, exist_ok=True)

# UMI to run pipeline on
umi = 'SARS-2'


def _filter_cells(cells, umi_col):
    """Keep rows with ``N == 1`` whose ``umi_col`` count is below the 99th percentile.

    The percentile is computed after the ``N == 1`` filter. The comparison is
    strict, so rows equal to the cutoff are dropped as well. A table with no
    rows (before or after the ``N`` filter) is returned as-is, so
    ``np.percentile`` is never called on an empty column.

    Parameters
    ----------
    cells : table with an ``'N'`` column and a ``umi_col`` column
        (presumably a pandas DataFrame -- it is indexed with boolean masks
        and later written with ``to_csv``).
    umi_col : name of the UMI count column used for outlier removal.
    """
    if len(cells) == 0:
        return cells
    cells = cells[cells['N'] == 1]
    if len(cells) == 0:
        return cells
    cutoff = np.percentile(cells[umi_col], 99)
    return cells[cells[umi_col] < cutoff]


# Only keeping N (# heavy chains), CDRH3 sequence, and UMI/LSS columns
columns_to_keep = np.concatenate([lss_names, umi_names, ['N', 'CDR3_IMGT.H']])
for donor in raw_dfs:
    donor_df = raw_dfs[donor]
    donor_df = donor_df[columns_to_keep]
    # Separating out VRC01 cells
    lseq_non_vrc01, vrc01_df = ff.separate_vrc01(donor_df)

    # Removing N>1 cells and outliers in UMI counts; the same filter is
    # applied to the donor cells and to the VRC01 cells.
    lseq_non_vrc01 = _filter_cells(lseq_non_vrc01, umi)
    vrc01_df = _filter_cells(vrc01_df, umi)

    lseq_non_vrc01.to_csv('./data/processed/donor_cells/donor' + str(donor) + '_processed.csv')
    vrc01_df.to_csv('./data/processed/VRC01_cells/donor' + str(donor) + '_vrc01_processed.csv')