In [13]:
import pandas as pd
import numpy as np
import plotly.express as px


In [14]:
# Load the dataset
file_path = 'combined_2020-2022.csv'
df = pd.read_csv(file_path)

# Coerce the numeric analysis columns; values that cannot be parsed become NaN.
for numeric_col in ('TOTSLF22', 'FAMINC22', 'HOUR53'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

# Trim stray whitespace from the label columns. The `.str` accessor raises
# AttributeError on numeric dtypes, so a column that pandas already parsed as
# numbers is left untouched instead of aborting the whole cell.
for label_col in ('SEX', 'RACEV1X', 'REGION22'):
    if not pd.api.types.is_numeric_dtype(df[label_col]):
        df[label_col] = df[label_col].str.strip()

# Ensure the target column has no missing values (this also removes rows whose
# TOTSLF22 was coerced to NaN above).
df = df.dropna(subset=['TOTSLF22'])

# Define columns where 1 means "Yes" and 2 means "No"
binary_columns = ['PRVEV22', 'MCREV22', 'MCDEV22', 'UNINS22',
                  'CABLADDR', 'CABREAST', 'CACERVIX', 'CACOLON',
                  'CALUNG', 'CALYMPH', 'CAMELANO', 'CAOTHER',
                  'CAPROSTA', 'CASKINNM', 'CASKINDK', 'CAUTERUS']

# Replace 1 and 2 with "Yes" and "No"; any other value is left as-is, and
# columns absent from the file are skipped.
for col in binary_columns:
    if col in df.columns:
        df[col] = df[col].replace({1: "Yes", 2: "No"})


In [15]:
# Visualization

def plot_cost_distribution(df):
    """Build a 30-bin histogram of the TOTSLF22 column with a box-plot marginal.

    The x axis is labelled 'Out-of-Pocket Cost' and bars are drawn with a
    0.1 gap. Returns the Plotly figure.
    """
    histogram = px.histogram(
        df,
        x='TOTSLF22',
        nbins=30,
        marginal='box',
        labels={'TOTSLF22': 'Out-of-Pocket Cost'},
        title="Distribution of Out-of-Pocket Costs",
    )
    # update_layout mutates the figure in place and returns that same figure.
    return histogram.update_layout(bargap=0.1)

def plot_income_vs_cost(df):
    """Scatter FAMINC22 (x) against TOTSLF22 (y), coloured by RACEV1X.

    Every marker is drawn at size 7. Returns the Plotly figure.
    """
    axis_labels = {'FAMINC22': 'Family Income', 'TOTSLF22': 'Out-of-Pocket Cost'}
    scatter = px.scatter(
        df,
        x='FAMINC22',
        y='TOTSLF22',
        color='RACEV1X',
        labels=axis_labels,
        title="Family Income vs. Out-of-Pocket Costs",
    )
    scatter.update_traces(marker={'size': 7})
    return scatter

def plot_geographic_costs(df):
    """Bar chart of the mean TOTSLF22 value for each REGION22 group.

    Returns the Plotly figure; one bar per distinct REGION22 value.
    """
    # as_index=False keeps REGION22 as an ordinary column of the result.
    region_means = df.groupby('REGION22', as_index=False)['TOTSLF22'].mean()
    return px.bar(
        region_means,
        x='REGION22',
        y='TOTSLF22',
        labels={'REGION22': 'Region', 'TOTSLF22': 'Average Cost'},
        title="Average Out-of-Pocket Costs by Region",
    )

def plot_racial_disparities(df):
    """Bar chart of the mean TOTSLF22 value for each RACEV1X group.

    Returns the Plotly figure; one bar per distinct RACEV1X value.
    """
    # as_index=False keeps RACEV1X as an ordinary column of the result.
    race_means = df.groupby('RACEV1X', as_index=False)['TOTSLF22'].mean()
    return px.bar(
        race_means,
        x='RACEV1X',
        y='TOTSLF22',
        labels={'RACEV1X': 'Race/Ethnicity', 'TOTSLF22': 'Average Cost'},
        title="Average Out-of-Pocket Costs by Race",
    )

def plot_marital_status_costs(df):
    """Box plots of TOTSLF22 grouped by MARRY22X and coloured by RACEV1X.

    Returns the Plotly figure.
    """
    box_options = {
        'x': 'MARRY22X',
        'y': 'TOTSLF22',
        'color': 'RACEV1X',
        'title': "Out-of-Pocket Costs by Marital Status and Race",
        'labels': {'MARRY22X': 'Marital Status', 'TOTSLF22': 'Out-of-Pocket Cost'},
    }
    return px.box(df, **box_options)

def plot_parallel_coordinates(df):
    """Parallel-coordinates plot over TOTSLF22, FAMINC22, AGELAST and HOUR53.

    Lines are coloured by TOTSLF22 using the diverging Tealrose scale.
    Returns the Plotly figure.
    """
    axes = ['TOTSLF22', 'FAMINC22', 'AGELAST', 'HOUR53']
    colour_scale = px.colors.diverging.Tealrose
    return px.parallel_coordinates(
        df,
        dimensions=axes,
        color='TOTSLF22',
        color_continuous_scale=colour_scale,
        title="Parallel Coordinates Plot: Costs and Predictors",
    )

def plot_insurance_costs(df):
    """Box plots of TOTSLF22 split and coloured by the PRVEV22 column.

    The x-axis categories are pinned to the order "Yes", then "No".
    Returns the Plotly figure.
    """
    category_order = ["Yes", "No"]
    boxes = px.box(
        df,
        x='PRVEV22',
        y='TOTSLF22',
        color='PRVEV22',
        labels={'PRVEV22': 'Private Insurance', 'TOTSLF22': 'Out-of-Pocket Cost'},
        title="Out-of-Pocket Costs by Private Insurance Status",
    )
    # update_xaxes mutates the figure in place and returns that same figure.
    return boxes.update_xaxes(categoryorder="array", categoryarray=category_order)

def plot_insurance_bar(df):
    """Bar chart of how many rows carry each distinct PRVEV22 value.

    Returns the Plotly figure, with the value on x ('Private Insurance')
    and its frequency on y ('Count').
    """
    # Name the index and the counts explicitly so the resulting two columns
    # do not depend on pandas' default value_counts labels.
    status_counts = (
        df['PRVEV22']
        .value_counts()
        .rename_axis('Private Insurance')
        .reset_index(name='Count')
    )
    return px.bar(
        status_counts,
        x='Private Insurance',
        y='Count',
        title="Private Insurance Status Distribution",
    )

def plot_cancer_diagnoses(df, cancer_columns=('CABLADDR', 'CABREAST', 'CACERVIX')):
    """Bar chart of the number of "Yes" entries in each cancer-flag column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the cancer-flag columns, with values already mapped to
        "Yes"/"No" strings.
    cancer_columns : sequence of str, optional
        Columns to count. Defaults to the three columns this function has
        always plotted, so existing single-argument calls are unchanged.

    Returns
    -------
    plotly figure
        One bar per requested column; its height is the number of rows equal
        to "Yes". Columns with no "Yes" rows appear as zero-height bars.

    Raises
    ------
    KeyError
        If any requested column is missing from ``df``.
    """
    selected = list(cancer_columns)

    # Aggregate before plotting: px.bar called with only `x` emits one
    # unit-height stacked segment per input row, which bloats the figure and
    # produces per-row hover labels instead of a total per cancer type.
    yes_counts = df[selected].eq("Yes").sum()
    cancer_counts = yes_counts.rename_axis('Cancer Type').reset_index(name='Count')

    fig = px.bar(cancer_counts, x='Cancer Type', y='Count',
                 title="Counts of Cancer Diagnoses by Type")
    return fig


def plot_feature_importance(feature_importances, feature_names):
    """Bar chart pairing each entry of feature_names (x) with feature_importances (y).

    Axes are labelled 'Feature' and 'Importance'. Returns the Plotly figure.
    """
    axis_titles = {'x': 'Feature', 'y': 'Importance'}
    importance_chart = px.bar(
        x=feature_names,
        y=feature_importances,
        labels=axis_titles,
        title="Feature Importance",
    )
    return importance_chart


# plot_cost_distribution(df)
# plot_income_vs_cost(df)
# plot_geographic_costs(df)
# plot_racial_disparities(df)
# plot_marital_status_costs(df)
# plot_parallel_coordinates(df)
# plot_insurance_costs(df)


In [17]:
!pip install dash
from dash import Dash, dcc, html

app = Dash(__name__)

app.layout = html.Div([
    html.H1("Cancer Treatment Cost Analysis"),
    dcc.Graph(figure=plot_cost_distribution(df)),
    dcc.Graph(figure=plot_income_vs_cost(df)),
    dcc.Graph(figure=plot_geographic_costs(df)),
    dcc.Graph(figure=plot_racial_disparities(df)),
    dcc.Graph(figure=plot_marital_status_costs(df)),
    dcc.Graph(figure=plot_parallel_coordinates(df)),
    dcc.Graph(figure=plot_insurance_costs(df)),
    dcc.Graph(figure=plot_insurance_bar(df)),
    dcc.Graph(figure=plot_cancer_diagnoses(df))
])

if __name__ == '__main__':
    app.run_server(debug=True)




<IPython.core.display.Javascript object>