In [208]:
# importing libraries:

import numpy as np
import plotly.graph_objects as go # graph objects
import plotly.offline as pyo  # to save the visualization in HTML format

# Alternative of plotly.offline
import plotly.io as pio

## Loading the datasets

In [209]:
import pandas as pd

# Source CSV files; both are read straight from GitHub, so this cell needs network access.
url = "https://raw.githubusercontent.com/dspiegel29/ArtofStatistics/master/00-1-age-and-year-of-deathofharold-shipmans-victims/00-1-shipman-confirmed-victims-x.csv"
times_url = 'https://raw.githubusercontent.com/dspiegel29/ArtofStatistics/master/00-2-shipman-times/00-2-shipman-times-x.csv'

# Victims table: the records of all the victims.
shipman_victims = pd.read_csv(url)

# Hour-of-day table: per-hour figures for Shipman's patients and for the
# comparison group of other local family doctors.
shipman_times = pd.read_csv(times_url)


## Exploring the datasets

In [210]:
# Preview the first rows of the victims table (head() shows five rows by default).

shipman_victims.head()

Unnamed: 0,DateofDeath,Name,Age,PlaceofDeath,Decision,yearOfDeath,gender,fractionalDeathYear,ageBracket,gender2
0,17-Mar-75,Eva Lyons,70,Own home,Unlawful killing,1975,0,1974.71,70-74,Women
1,07-Aug-78,Sarah Hannah Marsland,86,Own home,Unlawful killing,1978,0,1978.1,85-89,Women
2,30-Aug-78,Mary Ellen Jordan,73,Own home,Unlawful killing,1978,0,1978.16,70-74,Women
3,07-Dec-78,Harold Bramwell,73,Own home,Unlawful killing,1978,1,1978.44,70-74,Men
4,20-Dec-78,Annie Campbell,88,Own home,Unlawful killing,1978,0,1978.47,85-89,Women


In [211]:
# Size of the victims table as (rows, columns).
shipman_victims.shape

(215, 10)

In [212]:
# Preview the first rows of the hour-of-day table.
shipman_times.head()

Unnamed: 0,Hour,Shipman,Comparison
0,0,2.6,1.1
1,1,1.0,3.0
2,2,2.6,3.1
3,3,3.0,3.8
4,4,0.3,4.0


In [213]:
# Size of the hour-of-day table as (rows, columns).

shipman_times.shape

(24, 3)

In [214]:
# Column names of the victims table.
shipman_victims.columns


Index(['DateofDeath', 'Name', 'Age', 'PlaceofDeath', 'Decision', 'yearOfDeath',
       'gender', 'fractionalDeathYear', 'ageBracket', 'gender2'],
      dtype='object')

In [215]:
# Column names of the hour-of-day table.
shipman_times.columns

Index(['Hour', 'Shipman', 'Comparison'], dtype='object')

In [216]:
# Dtypes, non-null counts and memory usage of the victims table.
shipman_victims.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Data columns (total 10 columns):
DateofDeath            215 non-null object
Name                   215 non-null object
Age                    215 non-null int64
PlaceofDeath           215 non-null object
Decision               215 non-null object
yearOfDeath            215 non-null int64
gender                 215 non-null int64
fractionalDeathYear    215 non-null float64
ageBracket             215 non-null object
gender2                215 non-null object
dtypes: float64(1), int64(3), object(6)
memory usage: 16.9+ KB


In [217]:
# Number of missing values in each column of the victims table.
shipman_victims.isnull().sum()

DateofDeath            0
Name                   0
Age                    0
PlaceofDeath           0
Decision               0
yearOfDeath            0
gender                 0
fractionalDeathYear    0
ageBracket             0
gender2                0
dtype: int64

In [218]:
# Dtypes, non-null counts and memory usage of the hour-of-day table.
shipman_times.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 3 columns):
Hour          24 non-null int64
Shipman       24 non-null float64
Comparison    24 non-null float64
dtypes: float64(2), int64(1)
memory usage: 656.0 bytes


In [219]:
# Number of missing values in each column of the hour-of-day table.
shipman_times.isnull().sum()

Hour          0
Shipman       0
Comparison    0
dtype: int64

In [220]:
# 'DateofDeath' is read from the CSV as text such as '17-Mar-75' (object dtype);
# convert it to datetime so it can be used as a time axis.
# The format is given explicitly (day-abbreviated month-two-digit year) so that
# the parsing does not rely on inference: without it, the century chosen for a
# two-digit year can depend on the parser's pivot rule. With %y the years
# 69-99 are always read as 1969-1999.
shipman_victims['DateofDeath'] = pd.to_datetime(shipman_victims['DateofDeath'], format='%d-%b-%y')


In [221]:
# Re-check the dtypes: 'DateofDeath' should now be datetime64 instead of object.
shipman_victims.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 215 entries, 0 to 214
Data columns (total 10 columns):
DateofDeath            215 non-null datetime64[ns]
Name                   215 non-null object
Age                    215 non-null int64
PlaceofDeath           215 non-null object
Decision               215 non-null object
yearOfDeath            215 non-null int64
gender                 215 non-null int64
fractionalDeathYear    215 non-null float64
ageBracket             215 non-null object
gender2                215 non-null object
dtypes: datetime64[ns](1), float64(1), int64(3), object(5)
memory usage: 16.9+ KB


In [222]:
# The hour-of-day table has not been modified, so this repeats the earlier summary.
shipman_times.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 3 columns):
Hour          24 non-null int64
Shipman       24 non-null float64
Comparison    24 non-null float64
dtypes: float64(2), int64(1)
memory usage: 656.0 bytes


In [9]:
# Unique values for Place of death.
# Note: 'Residential Home' and 'Residential home' differ only in capitalisation,
# so they show up as two separate categories.
shipman_victims['PlaceofDeath'].unique()

array(['Own home', 'Hospital', 'Residential Home', "Shipman's surgery",
       'Nursing home', 'Residential home'], dtype=object)

In [10]:
# Unique values for the Decision column (text labels, not numbers).
shipman_victims['Decision'].unique()

array(['Unlawful killing', 'Conviction'], dtype=object)

## Data Visualization

#### Scatter Plot:

In [160]:
## Scatter plot: victim age against date of death

# Marker appearance shared by every point.
marker_style = dict(
    size=12,
    color='rgb(51,51,153)',
    line=dict(width=.2),
)

# One marker per row of the victims table.
victims_trace = go.Scatter(
    x=shipman_victims['DateofDeath'],
    y=shipman_victims['Age'],
    mode='markers',
    marker=marker_style,
)
data = [victims_trace]

# Title, axis labels and hover behaviour.
layout = go.Layout(
    title='Age vs. Date of Death',
    xaxis=dict(title='Date Of Death'),
    yaxis=dict(title='Age'),
    hovermode='closest',  # hover on the nearest point, so markers sharing an x position stay distinguishable
)

fig = go.Figure(data=data, layout=layout)
fig.show()

# Save the figure as a standalone HTML file.
pyo.plot(fig, filename='ScatterPlot.html')



There are many occurrences from Dec 1993 to Jun 1998. The first victim, aged 70, was recorded in 1975. Three years later there are records of a few other victims, all older than 70. In 1984 a patient aged 51 was recorded as his victim, and the youngest recorded victim, aged 41, was also killed by Harold Shipman, in 1985. There are breaks in 1980, 1983 and 1991.

#### Nested Bar Plot:

In [224]:
# Nested (grouped) bar plot: number of victims at each place of death,
# with one bar per age bracket.
#
# Each trace is built from per-place counts so that x (places) and y (counts)
# always have the same length and refer to the same rows. Passing the full
# 'PlaceofDeath' column as x together with a filtered 'Age' column as y would
# pair places and ages by position rather than by victim.

# (trace name, lower age bound - exclusive or None, upper age bound - inclusive)
age_brackets = [
    ('Age less than or equal to 49 years', None, 49),
    ('Age in 50-59 bracket', 49, 59),
    ('Age in 60-69 bracket', 59, 69),
    ('Age in 70-79 bracket', 69, 79),
    ('Age in 80-89 bracket', 79, 89),
    ('Age in 90-99 bracket', 89, 99),
]

# The raw column spells one place two ways ('Residential Home' / 'Residential home').
# capitalize() lower-cases everything after the first letter, so both spellings
# fall into a single category. The dataframe itself is left untouched.
place_of_death = shipman_victims['PlaceofDeath'].str.capitalize()
places = list(place_of_death.unique())  # x categories, in order of first appearance
age = shipman_victims['Age']

data = []
for trace_name, lower, upper in age_brackets:
    in_bracket = age <= upper
    if lower is not None:
        in_bracket = in_bracket & (age > lower)
    # Victims of this bracket per place; places without any victim get 0.
    counts = place_of_death[in_bracket].value_counts().reindex(places, fill_value=0)
    data.append(go.Bar(x=places, y=counts.tolist(), name=trace_name))

### layout:

layout = go.Layout(title='Place of Death and Age',
                   xaxis=dict(title='Place of Death'),
                   yaxis=dict(title='Number of victims'),
                   barmode='group'  # bars of the different brackets side by side
                   )

fig = go.Figure(data=data, layout=layout)

fig.show()

# Save the figure as a standalone HTML file.
pyo.plot(fig, filename='NestedBarGraph.html')

'NestedBarGraph.html'

#### Histogram:

In [225]:
# Overlaid histograms of victim age, one per gender.
# 'gender' is a numeric code; in the rows previewed earlier, 0 goes with
# 'Women' and 1 with 'Men' in the 'gender2' column.
gender_labels = [(0, 'Female'), (1, 'Male')]

data = [
    go.Histogram(
        x=shipman_victims.loc[shipman_victims['gender'] == code, 'Age'],
        opacity=0.75,  # translucent so the overlapping bars of both traces stay visible
        name=label,
    )
    for code, label in gender_labels
]

layout = go.Layout(
    barmode='overlay',
    title="Victim's Age Comparison by Gender",
    # Axis titles so the figure can be read on its own, like the other plots.
    xaxis=dict(title='Age'),
    yaxis=dict(title='Number of victims'),
)
fig = go.Figure(data=data, layout=layout)
fig.show()
# pyo.plot saves the figure as a local HTML file:
pyo.plot(fig, filename='Histogram.html')

'Histogram.html'

Women in the 80-84 age bracket were targeted the most. Overall, there were more female victims than male victims.

#### Bubble Chart:

In [167]:
# Bubble chart: age against date of death, with the bubble size scaled by age
# and the bubble colour showing the value of the 'Decision' column.

# marker.color accepts numbers (mapped through the colorscale) or valid colour
# strings. The 'Decision' labels ('Unlawful killing', 'Conviction') are neither,
# so they are encoded as integer codes; the colour bar ticks map the codes back
# to the original labels.
decision_codes, decision_labels = pd.factorize(shipman_victims['Decision'])

data = [go.Scatter(x=shipman_victims['DateofDeath'],
                   y=shipman_victims['Age'],
                   text=shipman_victims['gender2'],  # shown on hover
                   mode='markers',
                   marker=dict(
                       size=shipman_victims['Age'] / 5,  # bubble size proportional to age
                       color=decision_codes,  # one integer code per decision label
                       colorscale='Viridis',
                       colorbar=dict(
                           title='Decision',
                           tickvals=list(range(len(decision_labels))),
                           ticktext=list(decision_labels)
                       )
                   )
                   )
        ]

layout = go.Layout(
    title='Bubble Chart for Age vs. YearOfDeath',
    xaxis=dict(title='Year of Death'),  # x-axis label
    yaxis=dict(title='Age'),            # y-axis label
    hovermode='closest'
)
fig = go.Figure(data=data, layout=layout)

fig.show()

# Save the figure as a standalone HTML file.
pyo.plot(fig, filename='Bubblechart.html')


'Bubblechart.html'

Harold was convicted in around 16 cases; all of those victims were women, the youngest aged 49 and the oldest aged 81.

 #### At what time of day did Harold Shipman's victims die?

In [227]:

# Line chart: share of deaths at each hour of the day, Shipman's patients
# against the comparison GPs.

hours = shipman_times["Hour"]

# Solid line with markers for Shipman, dashed line for the comparison group.
shipman_trace = go.Scatter(
    x=hours,
    y=shipman_times["Shipman"],
    mode="lines+markers",
    name="Shipman",
)
comp_trace = go.Scatter(
    x=hours,
    y=shipman_times["Comparison"],
    mode="lines",
    name="Comparison GPs",
    line=dict(dash="dash"),
)

layout = go.Layout(
    title="Deaths by Hour of Day",
    xaxis=dict(title="Hour of Day"),
    yaxis=dict(title="% of Deaths", range=[0, 16]),
)

fig = go.Figure(data=[shipman_trace, comp_trace], layout=layout)

# Horizontal legend with a larger font, anchored at the right edge just above the plot.
legend_style = dict(
    font=dict(size=16),
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="right",
    x=1,
)
fig.update_layout(legend=legend_style)

# Text labels placed next to each curve: (x, y, text, font colour).
curve_labels = [
    (12, 14, "Shipman", "blue"),
    (4, 7, "Comparison GPs", "red"),
]
for x_pos, y_pos, label, colour in curve_labels:
    fig.add_annotation(x=x_pos, y=y_pos, text=label, font=dict(size=14, color=colour), showarrow=False)

fig.show()

# Save the figure as a standalone HTML file.
pyo.plot(fig, filename='Linechart.html')

'Linechart.html'

Most of Harold's patients died in the afternoon.

#### Boxplots:

In [229]:
# Box plots comparing the distribution of the per-hour values in the
# 'Comparison' and 'Shipman' columns of the hour-of-day table.
# The dataframe is named shipman_times (with an underscore); 'shipmantimes'
# is not defined anywhere and would raise a NameError.
data = [
    go.Box(
        y=shipman_times["Comparison"],
        name='Comparison GPs'
    ),
    go.Box(
        y=shipman_times["Shipman"],
        name='Harold Shipman'
    )
]
layout = go.Layout(
    # Two-line title; written as one literal so that no source indentation
    # ends up inside the text after the <br>.
    title="Comparison of Harold and other practitioner<br>patient's death time"
)
fig = go.Figure(data=data, layout=layout)
fig.show()

# Save the figure as a standalone HTML file.
pyo.plot(fig, filename='BoxPlot.html')

'BoxPlot.html'

There are two outliers for Harold Shipman, at 14.1 and 13. These values are percentages of deaths falling in a single hour, not hours.

 _ _ _ _ _ _ _  _