# Transform Long Form Count Features to Sparse Matrices
### We want one feature matrix for each of
* Training Set
* Validation Set
* Training Set + Validation Set
* Test Set
* Similarly, we save one labels CSV file per matrix; its rows correspond one-to-one with the rows of the matching CSR matrix

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os 
from google.cloud import bigquery
from tqdm import tqdm
from scipy.sparse import csr_matrix, save_npz
# Tell the Google client libraries where to find application-default credentials.
# NOTE(review): this is a machine-specific absolute path; it must be edited to run elsewhere.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/Users/conorcorbin/.config/gcloud/application_default_credentials.json' 
# Project id exported for the Google Cloud client (presumably the default/billing project - confirm).
os.environ['GCLOUD_PROJECT'] = 'som-nero-phi-jonc101' 

# Module-level BigQuery client used by the query cells and helpers below.
client=bigquery.Client()

### Helper to read BigQuery data in chunks
Without this, queries can take too long to run and/or time out

In [None]:
def read_bq_to_pandas(query, nrows, chunksize=500000):
    """Run ``query`` against BigQuery page by page and return one DataFrame.

    A ``LIMIT {chunksize} OFFSET {offset}`` clause is appended to ``query``
    for every page, so ``query`` must not end in its own ``LIMIT`` clause.
    The combined string is passed through ``str.format``, which means any
    literal braces in ``query`` have to be doubled (``{{`` / ``}}``).

    Parameters
    ----------
    query : str
        SQL text to page through. It should contain a deterministic
        ``ORDER BY`` so that consecutive pages neither overlap nor skip rows.
    nrows : int
        Number of rows to page over; offsets run from 0 up to (but not
        including) ``nrows`` in steps of ``chunksize``. Rows beyond the last
        page are not fetched.
    chunksize : int, optional
        Maximum number of rows requested per page.

    Returns
    -------
    pandas.DataFrame
        The pages concatenated in offset order. Each page keeps its own
        index, so index values repeat across pages. An empty DataFrame is
        returned when there are no pages to fetch.
    """
    # Collect the pages and concatenate once at the end: concatenating inside
    # the loop re-copies everything fetched so far on every iteration.
    frames = []
    for offset in tqdm(range(0, nrows, chunksize)):
        query_str = (query + " LIMIT {chunksize} OFFSET {offset}").format(
            chunksize=chunksize, offset=offset)
        frames.append(client.query(query_str).result().to_dataframe())

    if not frames:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()

    return pd.concat(frames)

### Join long form feature matrix to cohort table
Read in chunks

In [None]:
# Name of the cohort table (substituted into the query below) whose encounters we join against.
cohort_table = 'final_ast_labels'
# Number of rows to page over; rows past this offset are never requested.
# NOTE(review): hard-coded - presumably the size of the join result when this was written;
# re-check it against the table before re-running, otherwise the tail is silently dropped.
nrows = 2540179
# Rows requested per page (overrides the helper's default).
chunksize = 600000
# NOTE(review): with a RIGHT JOIN and only f.* selected, a cohort encounter that has no
# feature rows would come back as a row of NULLs - confirm that this is intended.
query = """
SELECT 
    f.*, EXTRACT(YEAR from f.index_time) year
FROM 
    mining-clinical-decisions.abx.feature_counts_long f
RIGHT JOIN 
    mining-clinical-decisions.abx.{cohort_table} c
USING 
    (pat_enc_csn_id_coded)
ORDER BY
    pat_enc_csn_id_coded, feature_type, features -- necessary so that we don't get random ordering in our chunks
"""
# Fill in the cohort table name and fetch the result page by page into one DataFrame.
df = read_bq_to_pandas(query.format(cohort_table=cohort_table),
                       nrows=nrows,
                       chunksize=chunksize)

In [None]:
### Sanity check - the paged query must not have returned any row twice
### (a repeated row would mean the chunks overlapped)
assert not df.duplicated().any()

### Functions that create CSR matrices from long form data in memory efficient way

In [None]:
from scipy.sparse import csr_matrix, save_npz

def build_vocab(data):
    """
    Map every distinct term found in ``data`` to an integer id.

    ``data`` is an iterable of term sequences. Ids start at 0 and are handed
    out in order of first appearance, so the returned dict maps
    term -> position among the unique terms seen so far.
    """
    term_to_index = {}
    for terms in data:
        for term in terms:
            if term not in term_to_index:
                term_to_index[term] = len(term_to_index)
    return term_to_index

def create_sparse_feature_matrix(train_data, apply_data):
    """Creates sparse matrix efficiently from long form dataframe.

       1. Builds a vocabulary (feature name -> column index) from the
          training set,
       2. Then applies that vocabulary to ``apply_data``, producing one matrix
          row per ``pat_enc_csn_id_coded`` and one column per vocabulary term.

       We only want to include terms in our feature matrix that exist in
       the training set, so features of ``apply_data`` that are missing from
       the vocabulary are dropped.

       Parameters
       ----------
       train_data : long form pandas DataFrame
           Data to use to build vocabulary. Needs the columns
           ``pat_enc_csn_id_coded`` and ``features``.
       apply_data : long form pandas DataFrame
           Data to transform to sparse matrix for input to ML models. Needs
           the columns ``pat_enc_csn_id_coded``, ``features`` and ``value``.

       Returns
       -------
       csr_data : scipy csr_matrix
           Sparse matrix version of apply_data to feed into ML models. Its
           shape is always ``(len(apply_csns), len(vocabulary))``.
       apply_csns : list
           The ``pat_enc_csn_id_coded`` value of each matrix row, in row order
           (the order in which ``groupby`` emits the groups).
       vocabulary : dict
           Mapping of feature name to column index, built from ``train_data``.
    """

    # Build vocab from training set feature names. Only the feature names are
    # needed here; they are grouped by CSN (our unit of observation) so that
    # column indices are assigned in the same order as before.
    train_feature_names = (train_data
        .groupby('pat_enc_csn_id_coded')['features']
        .agg(list)
        .tolist()
    )
    vocabulary = build_vocab(train_feature_names)

    # Group long form apply data by CSN and get list of features and values for each
    apply_features = (apply_data
        .groupby('pat_enc_csn_id_coded')
        .agg({'features': list,
              'value': list})
        .reset_index()
    )
    apply_csns = list(apply_features['pat_enc_csn_id_coded'].values)

    # Build up csr matrix: indptr[i]:indptr[i + 1] delimits row i's entries
    indptr = [0]
    indices = []
    data = []
    for names, values in zip(apply_features['features'], apply_features['value']):
        for term, value in zip(names, values):
            column = vocabulary.get(term)
            if column is None:  # make sure term is in training set
                continue
            indices.append(column)
            data.append(value)
        indptr.append(len(indices))

    # Pass the shape explicitly instead of padding rows with a zero in the
    # last column. The padding was skipped whenever a row's first term was out
    # of vocabulary, so the column count could depend on the data, and it also
    # added explicit zeros and duplicate column indices.
    csr_data = csr_matrix(
        (data, indices, indptr),
        shape=(len(apply_csns), len(vocabulary)),
        dtype=float,
    )

    return csr_data, apply_csns, vocabulary


### Create and save sparse matrices and labels for each of the validation and testing rounds

In [None]:
# Per-round settings: the feature types dropped for that round and the
# calendar year held out for evaluation (all earlier years are training data).
round_settings = {
    "validation": {
        "excluded_feature_types": ['Lab_Results_test', 'Flowsheet_test'],
        "holdout_year": 2018,
    },
    "testing": {
        "excluded_feature_types": ['Lab_Results_val', 'Flowsheet_val'],
        "holdout_year": 2019,
    },
}

# Query cohort table for labels. The result does not depend on the round, so
# it is fetched once instead of once per iteration.
q_cohort = """
    SELECT 
        * 
    FROM 
        mining-clinical-decisions.abx.{cohort_table}
    ORDER BY
        pat_enc_csn_id_coded
    """
query_job = client.query(q_cohort.format(cohort_table=cohort_table))
df_cohort = query_job.result().to_dataframe()
cohort_years = df_cohort['index_time'].dt.year

# Create output path
path = '/Users/conorcorbin/repos/er_infection/data/ast_models_large_test_set/'
os.makedirs(path, exist_ok=True)

for r in ["validation", 'testing']:
    print(r)
    not_included = round_settings[r]["excluded_feature_types"]
    holdout_year = round_settings[r]["holdout_year"]

    # Get proper subset of data. The filter result goes into a per-round
    # frame: rebinding `df` here would make the 'testing' round start from a
    # frame that already lost the *_test feature types during 'validation'.
    round_df = df[~df['feature_type'].isin(not_included)]
    training_examples = round_df[round_df['year'] < holdout_year]
    test_examples = round_df[round_df['year'] == holdout_year]
    del round_df  # release the intermediate copy before building matrices

    # Get sparse matrices for each set (both use the training vocabulary)
    train_csr, train_csns, train_vocab = create_sparse_feature_matrix(training_examples, training_examples)
    test_csr, test_csns, test_vocab = create_sparse_feature_matrix(training_examples, test_examples)

    # Split the labels with the same year rule as the features
    train_labels = df_cohort[cohort_years < holdout_year]
    test_labels = df_cohort[cohort_years == holdout_year]

    # Sanity check - make sure CSNs from labels and features are identical and
    # in the same order. Lengths are compared first because zip() stops at the
    # shorter sequence and would hide missing rows at the end.
    for split, labels, csns in (('training', train_labels, train_csns),
                                ('test', test_labels, test_csns)):
        label_csns = labels['pat_enc_csn_id_coded'].values
        if len(label_csns) != len(csns):
            raise ValueError(
                "round {r}: {split} labels have {n_labels} rows but the feature "
                "matrix has {n_rows}".format(r=r, split=split,
                                             n_labels=len(label_csns),
                                             n_rows=len(csns)))
        for position, (a, b) in enumerate(zip(label_csns, csns)):
            if a != b:
                raise ValueError(
                    "round {r}: {split} row {position} is CSN {a} in the labels "
                    "but {b} in the feature matrix".format(r=r, split=split,
                                                           position=position,
                                                           a=a, b=b))

    # Save sparse matrices
    save_npz(os.path.join(path, 'training_examples_round_{r}.npz'.format(r=r)), train_csr)
    save_npz(os.path.join(path, 'test_examples_round_{r}.npz'.format(r=r)), test_csr)

    # Save labels
    train_labels.to_csv(os.path.join(path, 'training_labels_round_{r}.csv'.format(r=r)), index=None)
    test_labels.to_csv(os.path.join(path, 'test_labels_round_{r}.csv'.format(r=r)), index=None)

    # Save the vocabulary (feature name -> column index) of the testing round
    if r == 'testing':
        df_train_vocab = pd.DataFrame(data={
            'features' : [t for t in train_vocab],
            'indices' : [train_vocab[t] for t in train_vocab]
        })
        df_train_vocab.to_csv(os.path.join(path, 'feature_vocabulary.csv'))
    
    

### All code below is vestigial   

In [None]:
# NOTE(review): vestigial cell. Rebuilds the train/test matrices from whatever
# training_examples / test_examples were left behind by the last cell that ran.
train_csr, train_csns, train_vocab = create_sparse_feature_matrix(training_examples, training_examples)
test_csr, test_csns, test_and_val_vocab = create_sparse_feature_matrix(training_examples, test_examples)

In [None]:
# NOTE(review): vestigial cell. Reads final_cohort_table (a different table from the
# cohort_table variable used earlier), keeping only rows where label_unobserved = 0,
# and overwrites q_cohort / df_cohort.
q_cohort = """
SELECT * 
FROM mining-clinical-decisions.abx.final_cohort_table
WHERE label_unobserved = 0
ORDER BY pat_enc_csn_id_coded
"""
query_job = client.query(q_cohort)
df_cohort = query_job.result().to_dataframe()

In [None]:
# NOTE(review): vestigial cell. ROUND is not assigned anywhere in the visible notebook,
# so this raises NameError unless it is defined first.
# Split labels by index_time year: 2018 is held out for validation, otherwise 2019.
if ROUND == 'validation':
    train_labels = df_cohort[df_cohort['index_time'].dt.year < 2018]
    test_labels = df_cohort[df_cohort['index_time'].dt.year == 2018]
else:
    train_labels = df_cohort[df_cohort['index_time'].dt.year < 2019]
    test_labels = df_cohort[df_cohort['index_time'].dt.year == 2019]

# Check that label CSNs and feature-matrix CSNs agree position by position.
# zip() stops at the shorter sequence, so a length mismatch is not detected here.
for a, b in zip(train_labels['pat_enc_csn_id_coded'].values, train_csns):
    assert a == b
for a, b in zip(test_labels['pat_enc_csn_id_coded'].values, test_csns):
    assert a == b

In [None]:
from scipy.sparse import csr_matrix, save_npz

# NOTE(review): vestigial cell. Writes to a different, machine-specific directory than
# the round loop and uses file names without the round suffix.
path = '/home/ccorbin/er_infection/data/'
os.makedirs(path, exist_ok=True)

# Save feature matrix
save_npz(os.path.join(path, 'training_examples.npz'), train_csr)
save_npz(os.path.join(path, 'test_examples.npz'), test_csr)

# Save labels (index=None keeps the DataFrame index out of the CSV)
train_labels.to_csv(os.path.join(path, 'training_labels.csv'), index=None)
test_labels.to_csv(os.path.join(path, 'test_labels.csv'), index=None)