### Colorectal Cancer Dataset

In [1]:
# load modules
import os

import pandas as pd

from src import ai_module as ai

In [2]:
# define data folder path and data file names
#
# The raw-data folder can be overridden per machine by setting the
# COLORECTAL_RAW_DATA_DIR environment variable. When the variable is unset
# (or empty) the original hard-coded location is used, so existing setups
# keep working unchanged.
raw_data_folder = os.environ.get("COLORECTAL_RAW_DATA_DIR") or (
    r"C:\Users\35799\Desktop\cookiecutter-analytical-project\biolizard-internship-marios\data\raw"
)

# file names of the two raw tables inside raw_data_folder
gene_expression_data = r"Colorectal Cancer Gene Expression Data.csv"
patient_data = r"Colorectal Cancer Patient Data.csv"

In [3]:
# load data
# Both raw tables are read from the same folder with the project loader,
# gene expression first and patient data second.
gene_expression_df, patient_df = (
    ai.data_load(raw_data_folder, file_name)
    for file_name in (gene_expression_data, patient_data)
)

In [4]:
# combine data into one Pandas DataFrame

# Flip the expression table, then use the first row of the flipped table as
# its column labels.
# NOTE(review): the labels are taken positionally (iloc[0]) while the row is
# removed by label ("ID_REF") below — this assumes "ID_REF" is the first
# column of gene_expression_df; confirm.
gene_expression_df_transposed = gene_expression_df.T
gene_expression_df_transposed.columns = gene_expression_df_transposed.iloc[0]

# Remove the "ID_REF" row now that its values serve as the column labels.
gene_expression_df_transposed_index = gene_expression_df_transposed.drop(index="ID_REF")

# Cast every remaining value to float64.
gene_expression_df_transposed_index_final = gene_expression_df_transposed_index.astype(
    "float64"
)

# Join on patient_df["ID_REF"] against the row labels of the flipped table.
# No `how` is passed, so pandas' default join type applies.
df = patient_df.merge(
    gene_expression_df_transposed_index_final,
    left_on='ID_REF',
    right_index=True,
)

In [5]:
# Summarise the combined frame with the project helper; its three return
# values are unpacked as df_id, df_cat and df_cont and reused by later cells.
# Per-table variants of the same call, kept disabled:
# gene_id, gene_categorical, gene_continuous = ai.data_info(gene_expression_df, gene_expression_data, 20)
# patient_id, patient_categorical, patient_continuous = ai.data_info(patient_df, patient_data, 20)
df_id, df_cat, df_cont = ai.data_info(
    df,
    "Combined Colorectal Cancer Dataset",
    20,  # NOTE(review): meaning of this third positional argument is not visible here — confirm in ai.data_info
)

DATA FILE:
----------------------------------------------------------------------------------------------------
Combined Colorectal Cancer Dataset
----------------------------------------------------------------------------------------------------


DIMENSIONS:
----------------------------------------------------------------------------------------------------
Entries: 62
Features: 1944
----------------------------------------------------------------------------------------------------


CATEGORICAL FEATURES:
----------------------------------------------------------------------------------------------------
Features     Data Type    Categories & Counts
-----------  -----------  ---------------------------------------------------
Dukes Stage  object       {'A': 16, 'B': 14, 'C': 20, 'D': 12}
Gender       object       {'Female': 14, 'Male': 48}
Location     object       {'Colon': 2, 'Left': 20, 'Rectum': 18, 'Right': 22}
DFS event    float64      {0.0: 25, 1.0: 37}
Adj_Radio    float64 

In [6]:
# data split
# "Dukes Stage" is the target. The helper's eight return values are unpacked
# below, including X_val / y_val.
# NOTE(review): method="tt" is combined with a validation_proportion and the
# validation outputs are not used by the cells visible here — confirm what
# ai.data_split returns for them under this method.
X, y, X_train, y_train, X_test, y_test, X_val, y_val = ai.data_split(
    df,
    target="Dukes Stage",
    method="tt",
    random_state=0,
    train_proportion=0.8,
    test_proportion=0.2,
    validation_proportion=0.25,
    stratify="Yes",
)

In [7]:
# treat missing values
# The train and test partitions are passed to the helper together with the
# feature groupings returned by ai.data_info (df_id, df_cat, df_cont).
# NOTE(review): impute_cutoff=0.5 is presumably a missing-fraction threshold —
# confirm against ai.treat_nan.
train_treat_na, test_treat_na = ai.treat_nan(
    X_train,
    y_train,
    X_test,
    y_test,
    df_id,
    df_cat,
    df_cont,
    target="Dukes Stage",
    drop_nan_rows=False,
    impute_cutoff=0.5,
    categorical_imputer="mode",
    continuous_imputer="median",
)

In [8]:
# treat duplicate values
# Runs on the outputs of the missing-value step, train first and test second.
train_df_treat_duplicate, test_df_treat_duplicate = ai.treat_duplicate(
    train_treat_na,
    test_treat_na,
    keep_in="first",
)

#### Experimental code

In [8]:
# selection_list = []
# for gene in df.columns[9:]:
#     selection_list.append((gene, gene))

In [9]:
# import ipywidgets as widgets
# import plotly.express as px
# def boxplot_creator(selection):  
#     fig = px.box(df, x='Dukes Stage', y=selection, color="Gender", title=f"Gene: {selection}", labels={selection:selection})
#     fig.show()
# widgets.interact(boxplot_creator, selection=widgets.Dropdown(options=selection_list, description='Select Gene:'));

In [10]:
# from ipywidgets import Output, Dropdown, VBox
# from IPython.display import clear_output
# out = Output()

# dropdown = Dropdown(options=selection_list, description='Select Gene:')



# def on_selection_change(selection):
#     with out:
#         clear_output()
#         print(selection.new)

# dropdown.observe(on_selection_change, 'value')
# dropdown.value = selection_list[0][0]
# display(VBox([dropdown, out]))
# with out:
#     display(fig)

In [11]:
# dropdown_gene = Dropdown(options=selection_list, description='Select Gene:')
# dropdown_color = Dropdown(options=[("Dukes Stage","Dukes Stage"), ('Gender','Gender'), ('Location','Location')], description='Select Color:')
# dropdown_feature = Dropdown(options=[("Dukes Stage","Dukes Stage"), ('Gender','Gender'), ('Location','Location')], description='Select Feature:')
# input_widgets = VBox([dropdown_feature, dropdown_gene, dropdown_color])

# out = Output()

# def fig_creator(df, x="Dukes Stage", y="117_at", color="Gender"):
#     import plotly.express as px
#     import pandas as pd
#     import numpy as np
#     title = f"Gene: {y}"
#     ax = px.box(df, x, y, color, title=title)
#     ax.show()
        
# def dropdown_filter(feature, gene, color):
#     out.clear_output()
#     with out:
#         fig_creator(df, x=feature, y=gene, color=color)

# def dropdown_gene_eventhandler(change):
#     dropdown_filter(dropdown_feature.value, change.new, dropdown_color.value)

# dropdown_gene.observe(dropdown_gene_eventhandler, names='value')
# display(input_widgets)
# display(out)

# #initial plot
# with out:
#     fig_creator(df, x="Dukes Stage", y='117_at', color="Gender")

In [12]:
# dropdown = Dropdown(options=selection_list, description='Select Gene:')
# input_widgets = VBox([dropdown])

# out = Output()
# def com_filter(case, level, state, county):
#     out.clear_output()
#     if level == 'us':
#         with out:
#             mda.plot(us_timeseries, level="us", y=case)
#     elif level == "state":
#         with out:
#             display(dropdown_state)
#             mda.plot(state_timeseries, level="state", y=case, state=state)
#     elif level == "county":
#         with out:
#             display(dropdown_county)
#             mda.plot(counties_timeseries, level="county", y=case, county=county)

# def dropdown_case_eventhandler(change):
#     com_filter(change.new, dropdown_level.value, dropdown_state.value, dropdown_county.value)

# def dropdown_level_eventhandler(change):
#     com_filter(dropdown_case.value, change.new, dropdown_state.value, dropdown_county.value)    
    
# def dropdown_state_eventhandler(change):
#     com_filter(dropdown_case.value, dropdown_level.value, change.new, dropdown_county.value)
    
# def dropdown_county_eventhandler(change):
#     com_filter(dropdown_case.value, dropdown_level.value, dropdown_state.value, change.new)
    
# dropdown_case.observe(dropdown_case_eventhandler, names='value')
# dropdown_level.observe(dropdown_level_eventhandler, names='value')
# dropdown_state.observe(dropdown_state_eventhandler, names='value')
# dropdown_county.observe(dropdown_county_eventhandler, names='value')
# display(input_widgets)
# display(out)

# #initial plot
# with out:
#     mda.plot(us_timeseries, level="us", y='daily_cases')

In [13]:
# corr_matrix = ai.correlations(df, type="pearson", printout="matrix")

In [14]:
# corr_matrix = ai.correlations(df, type="pearson", printout="heatmap")

In [15]:
# # patient_df_treat_na = ai.treat_na(patient_df, patient_id, patient_categorical, patient_continuous, drop_na_rows=False, impute_value=0.5, categorical_imputer="mode", continuous_imputer="median")
# df_treat_na = ai.treat_na(df, df_id, df_cat, df_cont, drop_na_rows=False, impute_cutoff=0.5, categorical_imputer="mode", continuous_imputer="median")

In [16]:
# df_treat_duplicate = ai.treat_duplicate(df_treat_na, keep_in="first")