In [1]:
pip install dash plotly pandas

Note: you may need to restart the kernel to use updated packages.


# Required Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Loading the data

In [3]:
# Read the raw provisional cancer death counts (path is relative to the working directory)
df1 = pd.read_csv(
    "AH_Provisional_Cancer_Death_Counts_by_Month_and_Year__2020-2021.csv"
)

In [4]:
# Preview the first rows of the raw data
df1.head()

Unnamed: 0,Data As Of,Start Date,End Date,Country,Year,Month,Sex,Age Group,Race and Hispanic Origin,Malignant neoplasms (C00-C97),...,Malignant neoplasm of breast (C50),Malignant neoplasm of cervix uteri (C53),"Malignant neoplasms of corpus uteri and uterus, part unspecified (C54-C55)",Malignant neoplasm of ovary (C56),Malignant neoplasm of prostate (C61),Malignant neoplasms of kidney and renal pelvis (C64-C65),Malignant neoplasm of bladder (C67),"Malignant neoplasms of meninges, brain and other parts of central nervous system (C70-C72)","Malignant neoplasms of lymphoid, hematopoietic and related tissue (C81-C96)","All other and unspecified malignant neoplasms (C17,C23-C24,C26-C31,C37-C41,C44-C49,C51-C52,C57-C60,C62-C63,C66,C68-C69,C73-C80,C97)"
0,11/23/2021,03/01/2020,03/31/2020,United States,2020,3,Female (F),0-14 years,Unknown,0,...,0,0,0,0,0,0,0,0,0,0
1,11/23/2021,04/01/2020,04/30/2020,United States,2020,4,Female (F),0-14 years,Unknown,0,...,0,0,0,0,0,0,0,0,0,0
2,11/23/2021,05/01/2020,05/31/2020,United States,2020,5,Female (F),0-14 years,Unknown,0,...,0,0,0,0,0,0,0,0,0,0
3,11/23/2021,01/01/2020,01/31/2020,United States,2020,1,Female (F),0-14 years,Hispanic,14,...,0,0,0,0,0,0,0,5,4,5
4,11/23/2021,02/01/2020,02/29/2020,United States,2020,2,Female (F),0-14 years,Hispanic,17,...,0,0,0,0,0,1,0,8,3,5


# Data Preprocessing

In [5]:
# Count the missing entries in every column of the raw data
missing_values = df1.isna().sum()
missing_values

# No Null Values

Data As Of                                                                                                                             0
Start Date                                                                                                                             0
End Date                                                                                                                               0
Country                                                                                                                                0
Year                                                                                                                                   0
Month                                                                                                                                  0
Sex                                                                                                                                    0
Age Group                                

In [6]:
# Work on an independent copy named "data" so the raw frame `df1` stays untouched

data = df1.copy(deep=True)

In [7]:
# Shorten the "Sex" labels ("Female (F)" -> "F", "Male (M)" -> "M") for visualization purposes

sex_labels = {'Female (F)': 'F', 'Male (M)': 'M'}
data['Sex'] = data['Sex'].replace(sex_labels)
print(data['Sex'].unique())

['F' 'M']


In [8]:
# Drop the " years" suffix from the 'Age Group' labels for visualization purposes

data['Age Group'] = data['Age Group'].str.replace(pat=' years', repl='')
print(data['Age Group'].unique())

['0-14' '15-24' '25-34' '35-44' '55-64' '45-54' '65-74' '75 and over']


In [9]:
# Stripping the "Non-Hispanic " prefix from the race labels for readability and visualization purposes.
# Rows labelled exactly "Hispanic" (and "Unknown") do not contain the prefix and are left unchanged.

# regex=False treats the pattern as literal text regardless of the pandas version's default
data['Race and Hispanic Origin'] = data['Race and Hispanic Origin'].str.replace('Non-Hispanic ', '', regex=False)
print(data['Race and Hispanic Origin'].unique())

['Unknown' 'Hispanic' 'American Indian or Alaska Native' 'Asian' 'Black'
 'More than one race' 'Native Hawaiian or Other Pacific Islander' 'White']


In [10]:
# Give the "Race and Hispanic Origin" column the shorter name "Race"

data = data.rename(columns={'Race and Hispanic Origin': 'Race'})

In [11]:
# Use "Cancer" in place of "Malignant neoplasm(s)" in the column names for readability.
# The plural form is substituted first so no stray "s" is left behind.

data.columns = [
    col.replace('Malignant neoplasms', 'Cancer').replace('Malignant neoplasm', 'Cancer')
    for col in data.columns
]
print(data.columns)

Index(['Data As Of', 'Start Date', 'End Date', 'Country', 'Year', 'Month',
       'Sex', 'Age Group', 'Race', 'Cancer (C00-C97)',
       'Cancer of lip, oral cavity and pharynx (C00-C14)',
       'Cancer of esophagus (C15)', 'Cancer of stomach (C16)',
       'Cancer of colon, rectum and anus (C18-C21)',
       'Cancer of liver and intrahepatic bile ducts (C22)',
       'Cancer of pancreas (C25)', 'Cancer of larynx (C32)',
       'Cancer of trachea, bronchus and lung (C33-C34)',
       'Malignant melanoma of skin (C43)', 'Cancer of breast (C50)',
       'Cancer of cervix uteri (C53)',
       'Cancer of corpus uteri and uterus, part unspecified (C54-C55)',
       'Cancer of ovary (C56)', 'Cancer of prostate (C61)',
       'Cancer of kidney and renal pelvis (C64-C65)',
       'Cancer of bladder (C67)',
       'Cancer of meninges, brain and other parts of central nervous system (C70-C72)',
       'Cancer of lymphoid, hematopoietic and related tissue (C81-C96)',
       'All other and unspec

In [12]:
# Summarize the cleaned frame: column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2816 entries, 0 to 2815
Data columns (total 29 columns):
 #   Column                                                                                                                               Non-Null Count  Dtype 
---  ------                                                                                                                               --------------  ----- 
 0   Data As Of                                                                                                                           2816 non-null   object
 1   Start Date                                                                                                                           2816 non-null   object
 2   End Date                                                                                                                             2816 non-null   object
 3   Country                                                                            

In [13]:
# Shorten the longest column names so they fit on the dashboard

shorter_column_names = {
    'All other and unspecified malignant neoplasms (C17,C23-C24,C26-C31,C37-C41,C44-C49,C51-C52,C57-C60,C62-C63,C66,C68-C69,C73-C80,C97)':
        'All Other Malignant Neoplasms',
    'Cancer of meninges, brain and other parts of central nervous system (C70-C72)':
        'Cancer of brain/central nervous system (C70-C72)',
    'Cancer of lymphoid, hematopoietic and related tissue (C81-C96)':
        'Cancer of lymphoid/hematopoietic (C81-C96)',
}
data = data.rename(columns=shorter_column_names)

In [14]:
# Save the cleaned frame as a CSV file, leaving out the row index

data.to_csv(path_or_buf='cleaned_P4_2.csv', index=False)

# Data Visualization: Dashboard

In [15]:
# Libraries required for creating Dashboard

import dash
from dash import dcc, html
from dash.dependencies import Input, Output
from dash import html, dcc, Input, Output, dash_table
import plotly.express as px
import plotly.graph_objects as go
import plotly.graph_objs as go

In [16]:
# Create the Dash application object
app = dash.Dash(name=__name__)

In [17]:
# Read the cleaned data back from the CSV written earlier in the notebook

df = pd.read_csv(filepath_or_buffer='cleaned_P4_2.csv')

In [None]:
# Read the Excel sheet of cancer type definitions that the dashboard shows for the selected type

cancer_definitions = pd.read_excel(io='Cancer Types Definitions.XLSX')

In [None]:
# Key the definitions by cancer type so a lookup returns that row as a {column: value} dict

definitions_by_type = cancer_definitions.set_index('Cancer Type')
cancer_definitions_dict = definitions_by_type.to_dict(orient='index')

In [18]:
# Parse 'Start Date' as datetimes and derive a 'Month-Year' label (formatted YYYY-MM) from it

start_dates = pd.to_datetime(df['Start Date'])
df['Start Date'] = start_dates
df['Month-Year'] = start_dates.dt.strftime('%Y-%m')

In [25]:
# Heatmap data: total deaths per cancer type (one row each) and age group (one column each)

non_heatmap_columns = [
    'Data As Of', 'Start Date', 'End Date', 'Country', 'Year', 'Month',
    'Sex', 'Race', 'Cancer (C00-C97)', 'Month-Year',
]
df_dropped_updated = df.drop(columns=non_heatmap_columns)
df_grouped_updated = df_dropped_updated.groupby('Age Group').sum().reset_index()

# Transposing turns the cancer type columns into rows; reset_index exposes them as the 'index' column
df_transposed_updated = df_grouped_updated.set_index('Age Group').T.reset_index()
df_transposed_updated.columns.name = None


In [26]:
# Columns offered in the dropdown: int64/float64 columns whose name contains none of the
# date-related labels below
excluded_name_parts = ['Start Date', 'Month-Year', 'Year', 'Month']
columns_to_sum = []
for col in df.columns:
    if df[col].dtype not in ['int64', 'float64']:
        continue
    if any(part in col for part in excluded_name_parts):
        continue
    columns_to_sum.append(col)

# Totals per month (time series chart) and per age group (age group chart)
deaths_by_month = df.groupby('Month-Year')[columns_to_sum].sum()
monthly_cases = deaths_by_month.reset_index()
deaths_by_age_group = df.groupby('Age Group')[columns_to_sum].sum()
aggregated_data = deaths_by_age_group.reset_index()

# Heatmap of total deaths: age groups along the x axis, cancer types along the y axis
heatmap_trace = go.Heatmap(
    z=df_transposed_updated.iloc[:, 1:].values,
    x=df_transposed_updated.columns[1:],
    y=df_transposed_updated['index'],
    colorscale='YlOrRd',
    colorbar=dict(title='Total Cases'),
)
heatmap_fig = go.Figure(data=heatmap_trace)
heatmap_fig.update_layout(
    height=600,
    width=1400,
    title=dict(
        text="Relationship between 'Age Group' & 'Cancer Type'",
        x=0.5,
        font=dict(size=24),
    ),
    xaxis=dict(title=dict(text="Age Group")),
    yaxis=dict(title=dict(text="Cancer Type")),
)

# Initializing the Dash app
app = dash.Dash(__name__)
app.title = 'Dashboard for Deaths caused by Malignant Neoplasms (Cancer)'

# Shared style for the rows of charts: centred, wrapping flex containers.
# Dash style dictionaries use camelCased CSS property names (as the other styles in this
# layout already do), hence 'justifyContent' / 'flexWrap' rather than the hyphenated CSS spelling.
chart_row_style = {'display': 'flex', 'justifyContent': 'center', 'flexWrap': 'wrap'}

# Layout: page headings, the cancer type dropdown, the total-deaths banner, the cancer
# information panel, then two rows of charts (the second row also holds the static heatmap)
app.layout = html.Div(children=[
    html.H1(
        children='Dashboard for Deaths caused by Malignant Neoplasms (Cancer)',
        style={'textAlign': 'center', 'fontWeight': 'bold', 'textDecoration': 'underline', 'fontSize': '42px', 'color': '#000000', 'paddingTop': '30px'}
    ),
    html.H2(
        children='For the Year 2020-21',
        style={'textAlign': 'center', 'fontWeight': 'bold'}
    ),
    html.Div([
        html.Label('Select Cancer Type:', style={'textAlign': 'left', 'fontSize': 22}),
        dcc.Dropdown(
            id='cancer-type-dropdown',
            options=[{'label': i, 'value': i} for i in columns_to_sum],
            value=columns_to_sum[0],
            style={'width': '100%'}
        )
    ], style={'padding': '10px'}),
    # The next two containers are filled in by the update_charts callback
    html.Div(id='total-deaths', style={'textAlign': 'center', 'fontSize': 30, 'marginTop': 20, 'marginBottom': 20, 'color': '#DD3721'}),
    html.Div(id='cancer-info-table'),
    html.Div([
        dcc.Graph(id='time-series-chart'),
        dcc.Graph(id='pie-chart')
    ], style=chart_row_style),
    html.Div([
        dcc.Graph(id='age-group-bar-chart'),
        dcc.Graph(id='race-tree-chart'),
        dcc.Graph(figure=heatmap_fig)
    ], style=chart_row_style),
], style={'backgroundColor': '#9CE2F8'})

def _apply_chart_title(fig, title):
    """Give `fig` the centred title styling shared by the dashboard charts and return it."""
    fig.update_layout(
        title_text=title,
        title_x=0.5,
        title_font=dict(size=24, family='Verdana, bold', color='black')
    )
    return fig


# Callbacks for the charts, total deaths text, and cancer information display
@app.callback(
    [
        Output('time-series-chart', 'figure'),
        Output('pie-chart', 'figure'),
        Output('age-group-bar-chart', 'figure'),
        Output('race-tree-chart', 'figure'),
        Output('total-deaths', 'children'),
        Output('cancer-info-table', 'children')
    ],
    [Input('cancer-type-dropdown', 'value')]
)
def update_charts(selected_cancer_type):
    """Rebuild every dropdown-driven element for the selected cancer type.

    Parameters
    ----------
    selected_cancer_type : str or None
        Current value of the 'cancer-type-dropdown'; the name of one of the
        columns in `columns_to_sum`, or None when the dropdown has been cleared.

    Returns
    -------
    tuple
        (time series figure, sex pie figure, age group funnel figure,
        race treemap figure, total deaths text, cancer information panel),
        in the same order as the callback's Output list.
    """
    # A cleared dropdown sends None, which is not a column; leave the dashboard as it is
    if selected_cancer_type is None:
        return (dash.no_update,) * 6

    # Time Series Chart
    time_series_fig = px.line(monthly_cases, x='Month-Year', y=selected_cancer_type)
    _apply_chart_title(time_series_fig, 'Death Cases over Months')

    # Pie Chart for Sex Distribution.
    # The colours are assigned per category through Plotly Express; 'steelblue' is the valid
    # CSS colour name (the spelling with a space is not recognised as a colour).
    # Categories missing from the map get Plotly's default colours.
    pie_chart_colors = {'F': 'pink', 'M': 'steelblue'}
    pie_chart_fig = px.pie(
        df,
        values=selected_cancer_type,
        names='Sex',
        color='Sex',
        color_discrete_map=pie_chart_colors
    )
    pie_chart_fig.update_traces(textinfo='percent+label')
    _apply_chart_title(pie_chart_fig, 'Distribution by Sex')

    # Funnel Chart for Age Group Distribution
    age_group_fig = px.funnel(aggregated_data, x='Age Group', y=selected_cancer_type)
    age_group_fig.update_traces(
        marker_color='#FF8243',
        marker_line_color='black',
        marker_line_width=1.5
    )
    _apply_chart_title(age_group_fig, 'Deaths by Age Group')

    # Tree Map for Race Distribution
    race_tree_fig = px.treemap(df, path=['Race'], values=selected_cancer_type)
    _apply_chart_title(race_tree_fig, 'Deaths by Race')

    # Total deaths for the selected cancer type
    total_deaths = df[selected_cancer_type].sum()
    total_deaths_text = f"Total Deaths: {total_deaths:,.0f}"

    # Cancer type information; types missing from the definitions file show 'N/A'
    cancer_info = cancer_definitions_dict.get(selected_cancer_type, {})
    cancer_info_table = html.Div([
        html.H4(f"Cancer Type: {selected_cancer_type}"),
        html.P(f"Definition: {cancer_info.get('Definition', 'N/A')}"),
        html.P(f"Symptoms: {cancer_info.get('Symptoms', 'N/A')}")
    ], style={'padding': '20px', 'border': '1px solid #ddd', 'margin': '20px 0', 'borderRadius': '5px'})

    return time_series_fig, pie_chart_fig, age_group_fig, race_tree_fig, total_deaths_text, cancer_info_table

# Running the app
if __name__ == '__main__':
    # Newer Dash releases replace `app.run_server` with `app.run`; prefer `run` when the
    # installed version provides it and fall back to `run_server` on older versions.
    start_server = getattr(app, 'run', None) or app.run_server
    # use_reloader=False: the server is started without Dash's auto-reloader
    start_server(debug=True, use_reloader=False)


print('Dashboard is running at http://127.0.0.1:8050/')


Dashboard is running at http://127.0.0.1:8050/
