# Mount Drive

In [None]:
# Colab-only: mount Google Drive at /content/drive.
# force_remount=True requests a fresh mount even if Drive is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
import os
# Base folder for the input data; later cells also build their paths from it.
FILE_DIR = "example_dir/Data"
# NOTE: despite the "_dir" suffix this is the path of the CSV file itself,
# not a directory.
dataset_dir = os.path.join(FILE_DIR, "classes.csv")

In [None]:
import pandas as pd

# Load the raw scan table and give two awkwardly named columns
# identifier-friendly names. Attribute-style access (e.g. `df.Scan_Number`)
# cannot be used with names that contain '#', spaces or '/'.
column_aliases = {
    'Scan #': 'Scan_Number',
    'post con/non con': 'con',
}
df = pd.read_csv(dataset_dir).rename(columns=column_aliases)

In [None]:
# Quick overview: overall size, then how many rows each scan number has.
shape = df.shape
n_rows, n_cols = shape
print(f"The dataset consists of {n_rows} rows and {n_cols} columns")

# Last expression of the cell, so the notebook renders the counts.
df['Scan_Number'].value_counts()

# Generate Columns

## Remove invalid rows

In [None]:
# Remove once dataset is fixed
# Temporarily discard rows whose interval columns hold the literal string
# "#VALUE!" (apparently a spreadsheet error marker) instead of a number.
ERROR_MARKER = "#VALUE!"
for interval_col in ('date last imaging- diagnosis dx',
                     'CT surveillance interval [(AB-Y)/365]'):
    bad_rows = df.index[df[interval_col] == ERROR_MARKER]
    df.drop(bad_rows, inplace=True)


# Growth cannot be calculated from a single scan, so keep only rows whose
# "Subject ID" occurs more than once.
has_repeat_scan = df.duplicated(subset=["Subject ID"], keep=False)
df = df[has_repeat_scan]

## Create Unidimensional and Volumetric Growth Column


In [None]:
import numpy as np

# Work on an independent copy. `df` reaches this cell as a filtered slice of
# the loaded table, and adding columns to such a slice is what makes pandas
# emit SettingWithCopyWarning.
df = df.copy()

# Unidimensional
# Change: longest dimension at last imaging minus longest dimension at 1st imaging.
# Rate:   that change divided by the 'date last imaging- diagnosis dx' column.
first_dim = df['1D longest dimension (cm) 1st imaging'].astype(float)
last_dim = df['1D longest dimension (cm) last imaging'].astype(float)
df['Unidimensional Rate'] = (last_dim - first_dim) / df['date last imaging- diagnosis dx'].astype(float)
df['Unidimensional Change'] = last_dim - first_dim

# Volumetric
# Volume of the subject's next scan minus the volume of the current scan,
# stored on the current (earlier) row. The look-ahead is done per
# "Subject ID" so a row is never paired with another subject's scan; the last
# row of each subject has no successor and gets NaN.
volume = df['VOLUME_ML'].astype(float)
next_volume = volume.groupby(df['Subject ID'], sort=False).shift(-1)
df['Volumetric'] = next_volume - volume

# Rows with Scan_Number == 2 only served as the "next scan"; remove them.
# .copy() keeps the result independent so the assignment below is warning-free.
df = df[df['Scan_Number'] != 2].copy()

# Change in volume / time between scans.
df['Volumetric'] = df['Volumetric'] / df["CT surveillance interval [(AB-Y)/365]"].astype(float)

# Divisions by zero produce +/-inf: turn those into NaN, then drop every row
# that has no usable Volumetric value.
df = df.replace([np.inf, -np.inf], np.nan)
df = df.dropna(subset=["Volumetric"], how="all")


## Generate Categorical Data

In [None]:
# Cut-offs that turn the continuous unidimensional measures into binary labels.
CHANGE_THRESHOLD = .5
RATE_THRESHOLD = .2  # original note: "Threshold in years" - presumably a per-year rate; confirm units

# Label is 1 when the measure is strictly above its cut-off, otherwise 0.
exceeds_change = df['Unidimensional Change'].gt(CHANGE_THRESHOLD)
exceeds_rate = df['Unidimensional Rate'].gt(RATE_THRESHOLD)
df['Categorical Unidimensional Change'] = exceeds_change.astype(int)
df['Categorical Unidimensional Rate'] = exceeds_rate.astype(int)

# Number of positive rate labels, followed by the total number of rows.
rate_labels = df['Categorical Unidimensional Rate']
print(rate_labels.sum(), len(rate_labels))

# Process Dataset

## Encode Categorical Data

In [None]:
# transform categorical features
categorical = ['gender', 'race', 'tnum_aml', 'laterality', 'symptoms', 'hemorrhage', 'Treated Y/N', 'path_dx', 'ethnicity', 'ts']

# One-hot encode every listed column that is present, visiting them in the
# frame's own column order; each source column is replaced by its
# "<column>_<value>" indicator columns, appended at the end of the frame.
for col in [c for c in df.columns if c in categorical]:
    one_hot = pd.get_dummies(df[col], prefix=col)
    df = df.drop(col, axis=1).join(one_hot)

# Fold the combined 'flank pain, hematuria' indicator into the two single-symptom
# indicators, row by row, then remove the combined column.
# (Comparing the indicator values with the string '1' never matches, and a
# plain `df[col] = 1` would mark every row, so a boolean row mask is used.)
combined_col = 'symptoms_flank pain, hematuria'
if combined_col in df.columns:
    has_both = df[combined_col].astype(bool)
    for single_col in ('symptoms_flank pain', 'symptoms_hematuria'):
        if single_col in df.columns:
            # Keep the indicator's existing dtype (bool or uint8 depending on pandas version).
            merged = df[single_col].astype(bool) | has_both
            df[single_col] = merged.astype(df[single_col].dtype)
        else:
            # No row had this symptom on its own, so the indicator does not exist yet.
            df[single_col] = has_both.astype(df[combined_col].dtype)

    df = df.drop(combined_col, axis=1)

## Remove Columns

In [None]:
# Columns removed unconditionally. The ".1"/".2" names are the suffixes pandas
# adds when the CSV header repeats a column name.
# NOTE(review): the last-imaging dimension is dropped too - presumably because
# it would reveal the growth target; confirm.
redundant_columns = [
    'Scan_Number',
    'Subject ID.1',
    'Specialty.1',
    'Modality.1',
    'Modality.2',
    'comments.1',
    "1D longest dimension (cm) last imaging",
]
df = df.drop(columns=redundant_columns)

# Discard columns that contain no data at all.
df = df.dropna(axis='columns', how='all')

In [None]:
import io

# The decisions sheet lists one column per row; every column whose "Decision"
# is "REMOVE" is dropped from the dataset.
dataset_decisions = os.path.join(FILE_DIR, "Dataset_Decisions.csv")
decisions_df = pd.read_csv(dataset_decisions)

flagged_for_removal = decisions_df["Decision"] == "REMOVE"
to_drop = decisions_df.loc[flagged_for_removal, "Column Name"].tolist()

# errors="ignore": names that are not (or no longer) in df are skipped silently.
df.drop(columns=to_drop, inplace=True, errors="ignore")

## Remove Rows with Missing Data

In [None]:
# Specified rows that are missing data to drop, does not include 2nd scan rows


### This will need to be changed to accommodate the fact that we are removing columns before this
# rowsToDrop = [32, 132, 167,  205, 248, 270, 285, 307, 318, 408, 425,  497, 514, 436, 453, 497, 514,546, 631, 650] 
# df = df.drop(rowsToDrop)

# Visualize Dataset

In [None]:
# Render the processed frame with the notebook's rich display.
display(df)

## Find NaN Values

In [None]:
# Find Columns with Any missing Values
# `ind` maps each column that contains NaN to the index labels of its NaN rows.
ind = {
    name: df.index[df[name].isnull()].tolist()
    for name in df
    if df[name].isna().any()
}

# One report line per incomplete column.
for name, missing_rows in ind.items():
    print(f"{name} missing {len(missing_rows)} values at: {missing_rows}")

In [None]:
# Report (rows, columns) after the column-removal steps above.
# (Disabled experiments kept for reference: df=df.drop('^Unnamed'), list(df.columns))
print(df.shape)

In [None]:
# A column missing more than this many values is removed entirely; for the
# remaining columns the incomplete rows are removed instead.
MAX_MISSING_PER_COLUMN = 100


def missing_value_index(frame):
    """Map each column of `frame` that contains NaN to the index labels of its NaN rows.

    The labels are the frame's own index values, so they will not line up with
    spreadsheet row numbers once rows have been removed.
    """
    return {
        name: frame.index[frame[name].isnull()].tolist()
        for name in frame
        if frame[name].isna().any()
    }


rows = {}  # not used in this cell; kept so the name stays defined
ind = missing_value_index(df)

## Drop Column
drop = [name for name, missing in ind.items() if len(missing) > MAX_MISSING_PER_COLUMN]
for name in drop:
    print(f"Removing column {name}, missing {len(ind[name])}")

df = df.drop(drop, axis=1)

# Rebuild the index from the reduced frame so the report below no longer lists
# the columns that were just removed.
ind = missing_value_index(df)
for k, missing in ind.items():
    print(f"{k} missing {len(missing)} values at: {missing}")


## Drop Rows
# Every row that still has at least one missing value is removed.
df = df.dropna(how='any', axis=0)

In [None]:
# (rows, columns) after the sparse columns and incomplete rows were dropped.
df.shape

# Export Modified Dataset

In [None]:
# (rows, columns) of the frame that the export cells below write out.
df.shape

## Regression Datasets

In [None]:
# Set EXPORT_REGRESSION_DATASETS to True to write the regression CSVs.
# The flag replaces the former `%%script false` line: a cell magic is only
# recognised on the very first line of a cell, so placed below a comment it
# made the cell fail instead of skipping it.
EXPORT_REGRESSION_DATASETS = False

import datetime
import shutil

# The three continuous growth measures; each regression dataset keeps exactly one.
GROWTH_TARGETS = ("Volumetric", "Unidimensional Rate", "Unidimensional Change")


def select_growth_target(frame, con_label, target):
    """Return the rows of one contrast group with a single growth target left in.

    frame:     DataFrame holding a string column "con" and the GROWTH_TARGETS columns.
    con_label: substring looked up in "con" (the cell uses "post" and "non").
    target:    growth column to keep; the other GROWTH_TARGETS columns and "con"
               are removed. All remaining columns are kept in their original order.

    Returns a new DataFrame; `frame` itself is not modified.
    Raises KeyError if one of the columns to remove is missing.
    """
    in_group = frame["con"].str.contains(con_label)
    unwanted = [name for name in GROWTH_TARGETS if name != target] + ["con"]
    return frame[in_group].drop(columns=unwanted)


def save_and_copy(frame, filename, dest_dir):
    """Write `frame` to `filename` as CSV (without the index) and copy that file into `dest_dir`.

    shutil.copy is used instead of a shell `cp`, so no manual quoting of the
    spaces and colons in the timestamped file names is needed.
    """
    frame.to_csv(filename, index=False)
    shutil.copy(filename, dest_dir)


if EXPORT_REGRESSION_DATASETS:
    # Fixed UTC-5 offset for the file-name timestamp (does not follow DST).
    t = datetime.timezone(datetime.timedelta(hours=-5))
    current_time = datetime.datetime.now(tz=t).strftime("%Y-%m-%d %H:%M:%S")
    loc = os.path.join(FILE_DIR, "Datasets/")
    os.makedirs(loc, exist_ok=True)  # a missing target folder should not break the export

    # Full Processed csv
    save_and_copy(df, "original_processed_" + current_time + ".csv", loc)

    # One dataset per (contrast group, growth target) pair.
    # NOTE(review): the "Categorical ..." label columns are left in these
    # datasets, as before - confirm that is intended.
    post_vol = select_growth_target(df, "post", "Volumetric")
    non_vol = select_growth_target(df, "non", "Volumetric")
    post_uni_rate = select_growth_target(df, "post", "Unidimensional Rate")
    non_uni_rate = select_growth_target(df, "non", "Unidimensional Rate")
    post_uni_change = select_growth_target(df, "post", "Unidimensional Change")
    non_uni_change = select_growth_target(df, "non", "Unidimensional Change")

    regression_exports = [
        ("post_con_volumetric_", post_vol),
        ("non_con_volumetric_", non_vol),
        ("post_con_rate_unidimensional_", post_uni_rate),
        # Trailing underscore added so this name matches the other prefixes.
        ("non_con_rate_unidimensional_", non_uni_rate),
        ("post_con_change_unidimensional_", post_uni_change),
        ("non_con_change_unidimensional_", non_uni_change),
    ]
    for prefix, dataset in regression_exports:
        save_and_copy(dataset, prefix + current_time + ".csv", loc)

    print("Files Copied!")

## Classification Based Datasets (Includes regression as subcategory of classification)

In [None]:
# Set EXPORT_CLASSIFICATION_DATASETS to False to skip this cell.
# (Python flag instead of a `%%script false` toggle: a cell magic is only
# recognised on the very first line of a cell.)
EXPORT_CLASSIFICATION_DATASETS = True

import datetime
import shutil


def select_post_con(frame, drop_columns, require_positive=None):
    """Return the post-con rows of `frame` prepared for one export.

    frame:            DataFrame with a string column "con".
    drop_columns:     columns to remove in addition to "con".
    require_positive: optional name of a 0/1 label column; when given, only
                      rows where that column equals 1 are kept.

    Returns a new DataFrame; `frame` itself is not modified.
    Raises KeyError if a column to remove is missing.
    """
    subset = frame[frame["con"].str.contains("post")]
    if require_positive is not None:
        subset = subset[subset[require_positive] == 1]
    return subset.drop(columns=list(drop_columns) + ["con"])


def save_and_copy_csv(frame, filename, dest_dir):
    """Write `frame` to `filename` as CSV (without the index) and copy that file into `dest_dir`.

    shutil.copy is used instead of a shell `cp`, so no manual quoting of the
    spaces and colons in the timestamped file names is needed.
    """
    frame.to_csv(filename, index=False)
    shutil.copy(filename, dest_dir)


if EXPORT_CLASSIFICATION_DATASETS:
    # Fixed UTC-5 offset for the file-name timestamp (does not follow DST).
    t = datetime.timezone(datetime.timedelta(hours=-5))
    current_time = datetime.datetime.now(tz=t).strftime("%Y-%m-%d %H:%M:%S")
    loc = os.path.join(FILE_DIR, "Datasets/")
    os.makedirs(loc, exist_ok=True)  # a missing target folder should not break the export

    ## Datasets to output
    ### Post con absolute change classification: keeps "Categorical Unidimensional Change" as the label.
    post_uni_change_class = select_post_con(
        df,
        ["Volumetric", "Unidimensional Change", "Unidimensional Rate",
         "Categorical Unidimensional Rate"],
    )

    ### Post con rate classification: keeps "Categorical Unidimensional Rate" as the label.
    post_uni_rate_class = select_post_con(
        df,
        ["Volumetric", "Unidimensional Change", "Unidimensional Rate",
         "Categorical Unidimensional Change"],
    )

    ### New Regression Datasets (above threshold)
    ### Post con change threshold (regression): rows labelled 1 only;
    ### "Unidimensional Change" stays in as the value being predicted.
    # NOTE(review): "Categorical Unidimensional Rate" is still present in this
    # dataset, as before - confirm that is intended.
    post_uni_change_regression = select_post_con(
        df,
        ["Volumetric", "Unidimensional Rate", "Categorical Unidimensional Change"],
        require_positive="Categorical Unidimensional Change",
    )

    ### Post con rate threshold (regression): rows labelled 1 only;
    ### "Unidimensional Rate" stays in as the value being predicted.
    # NOTE(review): "Categorical Unidimensional Change" is still present in this
    # dataset, as before - confirm that is intended.
    post_uni_rate_regression = select_post_con(
        df,
        ["Volumetric", "Unidimensional Change", "Categorical Unidimensional Rate"],
        require_positive="Categorical Unidimensional Rate",
    )

    classification_exports = [
        ("post_con_change_classification_", post_uni_change_class),
        ("post_con_rate_classification_", post_uni_rate_class),
        ("post_con_change_regression_", post_uni_change_regression),
        ("post_con_rate_regression_", post_uni_rate_regression),
    ]
    for prefix, dataset in classification_exports:
        save_and_copy_csv(dataset, prefix + current_time + ".csv", loc)

    print("Files Copied!")