# Analysis of [Kim et al.] and [Kinker et al.] restricted to NSCLC cell lines

In [1]:
import os, sys
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import umap
from sklearn.preprocessing import StandardScaler
%config IPCompleter.use_jedi = False

## Load data
### Counts without correction

In [2]:
# Locations of the processed, uncorrected count tables, keyed by sample origin.
raw_count_files = {
    'cell_line': '../data/Kinker/processed/NSCLC_data.pkl',
    'tumor': '../data/Kim/processed/lung_data.pkl'
}

# Read each gzip-compressed pickle lazily, in the order of raw_count_files,
# and pair the loaded objects back with their keys.
loaded_frames = (
    pd.read_pickle(path, compression='gzip')
    for path in raw_count_files.values()
)
raw_data_df = dict(zip(raw_count_files, loaded_frames))

In [3]:
# Rebuild the tumor index as a three-level MultiIndex: each label is split on
# '_' into fields, the first field is kept as is, the second and third are
# joined with '-', and a constant 'TUMOR' level is added.
# NOTE(review): assumes every tumor label splits into the same number of
# fields (at least three) -- confirm against the Kim data.
tumor_fields = np.array(
    list(map(np.array, raw_data_df['tumor'].index.str.split('_')))
)
joined_fields = (
    np.char.array(tumor_fields[:, 1])
    + '-'
    + np.char.array(tumor_fields[:, 2])
)
tumor_index = [
    tumor_fields[:, 0],
    joined_fields,
    ['TUMOR'] * tumor_fields.shape[0]
]
raw_data_df['tumor'].index = pd.MultiIndex.from_arrays(tumor_index)

# One upper-cased origin label per row, following the key order of
# raw_count_files (each label repeated once per row of its table).
origins = list(raw_count_files)
type_index = np.repeat(
    [origin.upper() for origin in origins],
    [raw_data_df[origin].shape[0] for origin in origins],
)

In [4]:
# Restrict every table to the column labels shared by the loaded tables.
# np.intersect1d returns the sorted, unique common values, so every table
# ends up with the same column order. It compares exactly two arrays, so
# this relies on raw_data_df holding exactly two tables.
column_sets = [np.array(frame.columns) for frame in raw_data_df.values()]
common_genes = np.intersect1d(*column_sets)
raw_data_df = {
    origin: frame.loc[:, common_genes]
    for origin, frame in raw_data_df.items()
}

In [5]:
# Stack the per-origin tables row-wise, then attach the origin label of each
# row as an extra (last) index level named 'type'.
# type_index must list the labels in the same row order as the stacked table.
raw_data_df = (
    pd.concat(list(raw_data_df.values()), axis=0)
    .assign(type=type_index)
    .set_index('type', append=True)
)

In [8]:
# Save the combined table, both as CSV and as a gzip-compressed pickle.
output_folder = '../data/Kinker_Kim/'
# makedirs(exist_ok=True) also creates missing parent folders and does not
# fail if the folder already exists, unlike an isdir() check followed by
# mkdir().
os.makedirs(output_folder, exist_ok=True)
# os.path.join avoids the doubled separator that '%s/...' formatting yields
# for a folder name that already ends in '/'.
raw_data_df.to_csv(
    os.path.join(output_folder, 'combined_count.csv'),
    sep=','
)
raw_data_df.to_pickle(
    os.path.join(output_folder, 'combined_count.pkl'),
    compression='gzip'
)

In [9]:
# Write one CSV per value of the 'type' index level. xs() selects the rows
# carrying that label and drops the 'type' level from the result.
per_type_outputs = {
    'CELL_LINE': '%s/cell_line_count.csv',
    'TUMOR': '%s/tumor_count.csv',
}
for sample_type, path_template in per_type_outputs.items():
    type_counts = raw_data_df.xs(sample_type, level='type')
    type_counts.to_csv(path_template%(output_folder), sep=',')