# Analysis of [Kim et al] and [McFarland et al]

In [1]:
import os, sys
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import umap
from sklearn.preprocessing import StandardScaler
%config IPCompleter.use_jedi = False

## Load data
### Counts without correction

In [4]:
raw_count_files = {
    'cell_line': '../data/McFarland/processed/NSCLC_highly_variable.pkl',
    'tumor': '../data/Kim/processed/lung_data.pkl'
}

raw_data_df = {
    x: pd.read_pickle(file, compression='gzip')
    for x, file in raw_count_files.items()
}

In [5]:
# Change tumor index
tumor_index = np.array([np.array(e) for e in raw_data_df['tumor'].index.str.split('_')])
tumor_index = [
    tumor_index[:,0],
    np.char.array(tumor_index[:,1]) + '-' + np.char.array(tumor_index[:,2]),
    ['TUMOR'] * tumor_index.shape[0],
    ['TUMOR'] * tumor_index.shape[0]
]
raw_data_df['tumor'].index = pd.MultiIndex.from_arrays(tumor_index)
raw_data_df['tumor'].index.names = raw_data_df['cell_line'].index.names

In [6]:
common_genes = np.intersect1d(*[np.array(df.columns) for df in raw_data_df.values()])
raw_data_df = {
    s: df[common_genes]
    for s, df in raw_data_df.items()
}
print('%s common genes'%(common_genes.shape[0]))

1594 common genes


### Remove expt10 (if want to)

In [16]:
raw_data_df['cell_line'] = raw_data_df['cell_line'].loc[
    raw_data_df['cell_line'].index.get_level_values('pool') != 'expt10'
]

### Concatenation

In [8]:
# Create type index
type_index = [[x.upper()] * raw_data_df[x].shape[0] for x in raw_count_files]
type_index = np.concatenate(type_index)

raw_data_df = pd.concat(raw_data_df.values(), axis=0)
raw_data_df['type'] = type_index
raw_data_df.set_index('type', append=True, inplace=True)

### Remove DMSO (if want to)

In [10]:
# Restrict DMSO and untreated (if wanted)
raw_data_df = raw_data_df.loc[
    ~raw_data_df.index.get_level_values('expt').str.contains('(DMSO|Untreated)')
]

  ~raw_data_df.index.get_level_values('expt').str.contains('(DMSO|Untreated)')


### Save

In [12]:
# Save
output_folder = '../data/McFarland_Kim/'
if not os.path.isdir(output_folder):
    os.mkdir(output_folder)
raw_data_df.to_csv(
    '%s/combined_count.csv'%(output_folder), 
    sep=','
)
raw_data_df.to_pickle(
    '%s/combined_count.pkl'%(output_folder), 
    compression='gzip'
)

In [13]:
raw_data_df.xs('CELL_LINE', level='type').to_csv(
    '%s/cell_line_count.csv'%(output_folder), 
    sep=','
)
raw_data_df.xs('TUMOR', level='type').to_csv(
    '%s/tumor_count.csv'%(output_folder), 
    sep=','
)