In [8]:
from os.path import join, split
import pandas as pd
import os, sys
# Put the parent of the current working directory on sys.path so that the
# project-local `utilities` package can be imported.
# NOTE(review): presumably the kernel is started from a folder directly under
# the project root — confirm, otherwise base_dir points somewhere else.
base_dir = os.path.dirname(os.getcwd())
if sys.path.count(base_dir) == 0:
    sys.path.append(base_dir)
import pickle
from utilities import utils

In [92]:
# Folder that holds every input and output table used in this notebook.
table_dir = os.path.join(base_dir, 'data', 'tables')

In [93]:
# Collection that is later matched against SeriesInstanceUID to tag 2019 scans.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by this project.
with open(join(table_dir, 'sids_2019.pkl'), 'rb') as f:
    sids_2019_ls = pickle.load(f)

# Read both scan tables, then discard rows in which every column is missing.
df_clean = utils.load_scan_csv(join(table_dir, 'neg_pos_clean.csv'))
df_clean = df_clean.dropna(how='all')
df_all = utils.load_scan_csv(join(table_dir, 'neg_pos.csv'))
df_all = df_all.dropna(how='all')

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


In [94]:
# Table with one days_since_test value per row; the cells below read its
# SeriesInstanceUID and PatientID columns.
df_days_since_test = pd.read_csv(
    filepath_or_buffer=join(table_dir, 'days_since_test2.csv'),
)

## Analyse some numbers

In [95]:
# Rows whose days_since_test is strictly greater than -4 are treated as
# potentially positive scans.
# NOTE(review): the -4 cut-off is a magic number — confirm its meaning and
# consider naming it as a constant.
pos_mask = df_days_since_test['days_since_test'].gt(-4)
pos_sids = df_days_since_test.loc[pos_mask, 'SeriesInstanceUID']
print('potential cases', df_days_since_test.loc[pos_mask, 'PatientID'].nunique())
# The number of True entries in the mask equals the number of selected rows.
print('potential pos. scans', int(pos_mask.sum()))

potential cases 339
potential pos. scans 4177


In [96]:
print('positive tests')
# Number of distinct PatientID values present in both df_days_since_test and df_all.
len(set(df_days_since_test.PatientID.unique()) & set(df_all.PatientID.unique()))

positive tests


1020

## Add positive tag

In [97]:
# Positive is 1 for scans whose SeriesInstanceUID is in pos_sids, 0 otherwise.
pos_sids_mask = df_all['SeriesInstanceUID'].isin(pos_sids)
df_all['Positive'] = pos_sids_mask.astype('int64')

## Add 2019 tag to df_all

In [98]:
# '2019' is 1 for scans whose SeriesInstanceUID is in sids_2019_ls, 0 otherwise.
mask_2019 = df_all['SeriesInstanceUID'].isin(sids_2019_ls)
df_all['2019'] = mask_2019.astype('int64')

## Drop obsolete columns

In [99]:
# Remove the two obsolete columns.
# Note: this rebinds df_all, so re-running the cell without reloading the data
# raises KeyError because the columns are already gone.
df_all = df_all.drop(['Sequence', 'TrueSequenceType'], axis='columns')

## Compare keys: what is missing?

In [100]:
# Columns that exist in df_clean but not in df_all.
print(set(df_clean.columns) - set(df_all.columns))

{'TrueSequenceType', 'ColumnSpacing', 'RowSpacing', 'DistanceBetweenSlices', 'Sequence', 'days_since_test', 'NumberOfSlices'}


## Add RowSpacing and ColumnSpacing to df_all

In [101]:
def extract_from_arr(arr, pos):
    """Return ``arr[pos]``, or ``None`` when that element cannot be read.

    Parameters
    ----------
    arr : object
        Value to index. Anything that does not support indexing (for example
        ``None`` or a float such as NaN) yields ``None``.
    pos : int or hashable
        Index or key passed to ``arr[pos]``.

    Returns
    -------
    object or None
        The indexed element, or ``None`` if indexing raised ``TypeError``
        (value is not subscriptable) or ``LookupError`` (``IndexError`` /
        ``KeyError``: position or key not present).
    """
    try:
        return arr[pos]
    # Only swallow the failures that mean "no such element"; a bare except
    # would also hide KeyboardInterrupt, SystemExit and genuine bugs.
    except (TypeError, LookupError):
        return None
# Split PixelSpacing into its element 0 (RowSpacing) and element 1
# (ColumnSpacing); entries that cannot be indexed become None.
df_all['RowSpacing'] = df_all['PixelSpacing'].map(lambda spacing: extract_from_arr(spacing, 0))
df_all['ColumnSpacing'] = df_all['PixelSpacing'].map(lambda spacing: extract_from_arr(spacing, 1))

## Add DistanceBetweenSlices and NumberOfSlices, and drop days_since_test (we can compute it later on CR)

In [102]:
# Left-join the extra columns from df_clean onto df_all by SeriesInstanceUID;
# scans without a match in df_clean keep their row with missing values.
# NOTE(review): the section heading says days_since_test is dropped, yet it is
# one of the columns pulled in here — confirm which is intended.
# NOTE(review): if df_clean contains repeated SeriesInstanceUID values, the
# join repeats the matching df_all rows — verify the key is unique.
df_final = df_all.merge(
    df_clean[['SeriesInstanceUID', 'DistanceBetweenSlices', 'NumberOfSlices', 'days_since_test']],
    on='SeriesInstanceUID',
    how='left',
)

In [103]:
# Persist the merged table as a pickle. The path is built with join(), like
# every other path in this notebook, instead of hand-concatenating with '/'.
df_final.to_pickle(join(table_dir, 'scan_final.pkl'))

In [104]:
# Persist the merged table as CSV with a header row and without the index.
# The path is built with join(), like every other path in this notebook,
# instead of hand-concatenating with '/'.
df_final.to_csv(join(table_dir, 'scan_final.csv'), index=False, header=True)