<a href="https://colab.research.google.com/github/Nickguild1993/Gun_Violence_Exploration/blob/main/Gun_Violence_Exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook combines several datasets on gun violence and gun-safety measures, with the end goal of building interactive Plotly visualizations.

In [1]:
# import the regulars

import pandas as pd
import numpy as np
from scipy import stats

! pip install sidetable
import sidetable


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sidetable
  Downloading sidetable-0.9.0-py3-none-any.whl (17 kB)
Installing collected packages: sidetable
Successfully installed sidetable-0.9.0


In [10]:
# get the datetime libraries

from datetime import datetime as dt
import datetime

In [57]:
# Import visualization libraries

import matplotlib.pyplot as plt
import seaborn as sns

import plotly.graph_objects as go
import plotly.express as px

Load in mass shooting data set 

In [2]:
# Location of the mass-shooting CSV in the project's GitHub repository.
url = "https://raw.githubusercontent.com/Nickguild1993/Gun_Violence_Exploration/main/mass_shootings_2013_2022.csv"

# Read the CSV into `df` and preview the first three rows.
df = pd.read_csv(url)
df.head(3)

Unnamed: 0,Incident ID,Incident Date,State,City Or County,Address,# Killed,# Injured
0,271363,"December 29, 2014",Louisiana,New Orleans,Poydras and Bolivar,0,4
1,269679,"December 27, 2014",California,Los Angeles,8800 block of South Figueroa Street,1,3
2,270036,"December 27, 2014",California,Sacramento,4000 block of May Street,0,4


Load in state population dataset

In [13]:
# Location of the state population CSV in the project's GitHub repository.
state_url = "https://raw.githubusercontent.com/Nickguild1993/Gun_Violence_Exploration/main/State_population_data.csv"

# Read through the variable instead of repeating the URL literal, so the
# address only has to be maintained in one place.
state_df = pd.read_csv(state_url)
state_df.head(3)

Unnamed: 0,rank,State,Pop,Growth,Pop2021,Pop2010,growthSince2010,Percent,density
0,1,California,39664128,0.0013,39613493,37319502,0.0628,0.118,254.6179
1,2,Texas,30097526,0.0124,29730311,25241971,0.1924,0.0896,115.2138
2,3,Florida,22177997,0.0106,21944577,18845537,0.1768,0.066,413.5757


Load in gun safety dataset

In [12]:
# Location of the state gun-law rankings CSV in the project's GitHub repository.
safety_url = "https://raw.githubusercontent.com/Nickguild1993/Gun_Violence_Exploration/main/State_GunLaw_Rankings.csv"

# Read through the variable instead of repeating the URL literal, so the
# address only has to be maintained in one place.
safety_df = pd.read_csv(safety_url)
safety_df.head(3)

Unnamed: 0,State,lawsRank,grade2019,gunDeathRate
0,California,1,A,7.45
1,New Jersey,2,A,4.75
2,Connecticut,3,A-,4.91


In [61]:
# Count how many states hold each raw 2019 grade (plus/minus variants included).
safety_df["grade2019"].value_counts()

F     21
A-     6
C+     5
C      4
D      4
C-     3
A      2
B+     2
D-     2
B      1
Name: grade2019, dtype: int64

The plus/minus modifiers need to be collapsed into plain letter grades so the grade distribution is less noisy.

In [65]:
# Collapse each plus/minus grade onto its base letter within the grade2019 column.
grade_to_letter = {
    "A-": "A",
    "B+": "B",
    "B-": "B",
    "C+": "C",
    "C-": "C",
    "D-": "D",
    "F": "F",
}
safety_df = safety_df.replace(to_replace={"grade2019": grade_to_letter})

In [67]:
# Re-check the grade distribution after collapsing the plus/minus variants.
safety_df["grade2019"].value_counts()

F    21
C    12
A     8
D     6
B     3
Name: grade2019, dtype: int64

Rename the safety_df["grade2019"] column to "Safety Grade"

In [68]:
# Swap the terse source column name for a descriptive label, then preview.
column_labels = {"grade2019": "Safety Grade"}
safety_df = safety_df.rename(column_labels, axis="columns")
safety_df.head(3)

Unnamed: 0,State,lawsRank,Safety Grade,gunDeathRate
0,California,1,A,7.45
1,New Jersey,2,A,4.75
2,Connecticut,3,A,4.91


#### EDA on mass shooting dataset

Normal inspection of dataset 

In [3]:
# Quick structural check of the mass-shooting frame: dimensions, then column dtypes.
frame_shape = df.shape
column_types = df.dtypes

print("shape of df is:", frame_shape)
print("---------------------------")
print("data types: ", column_types)

shape of df is: (3609, 7)
---------------------------
data types:  Incident ID        int64
Incident Date     object
State             object
City Or County    object
Address           object
# Killed           int64
# Injured          int64
dtype: object


In [6]:
# Per-column missing-value summary via sidetable's `.stb` accessor.
df.stb.missing()
# Only Address has gaps (8 rows); that column is dropped later anyway.

Unnamed: 0,missing,total,percent
Address,8,3609,0.221668
Incident ID,0,3609,0.0
Incident Date,0,3609,0.0
State,0,3609,0.0
City Or County,0,3609,0.0
# Killed,0,3609,0.0
# Injured,0,3609,0.0


Adding a column for total victims - deaths plus injuries

In [16]:
# Total victims per incident = people killed + people injured.
killed = df["# Killed"]
injured = df["# Injured"]
df["# Victims"] = killed.add(injured)
df.head(3)

Unnamed: 0,Incident ID,Incident Date,State,City Or County,Address,# Killed,# Injured,# Victims
0,271363,2014-12-29,Louisiana,New Orleans,Poydras and Bolivar,0,4,4
1,269679,2014-12-27,California,Los Angeles,8800 block of South Figueroa Street,1,3,4
2,270036,2014-12-27,California,Sacramento,4000 block of May Street,0,4,4


In [17]:
# Confirm the new "# Victims" column is present.
df.columns

Index(['Incident ID', 'Incident Date', 'State', 'City Or County', 'Address',
       '# Killed', '# Injured', '# Victims'],
      dtype='object')

Need to change the "Incident Date" data type to a datetime object

In [15]:
# Convert the "Incident Date" strings (e.g. "December 29, 2014") to datetime64.
# `infer_datetime_format` is no longer passed: it is deprecated in pandas 2.x,
# where format inference is the default, so supplying it only emits a warning.
df["Incident Date"] = pd.to_datetime(df["Incident Date"])
df.head(3)

Unnamed: 0,Incident ID,Incident Date,State,City Or County,Address,# Killed,# Injured
0,271363,2014-12-29,Louisiana,New Orleans,Poydras and Bolivar,0,4
1,269679,2014-12-27,California,Los Angeles,8800 block of South Figueroa Street,1,3
2,270036,2014-12-27,California,Sacramento,4000 block of May Street,0,4


In [8]:
# Frequency table of incidents per state, built with sidetable's `.stb` accessor.
state_freq_style = df.stb.freq(["State"]).style

# Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0 in favour
# of Styler.hide(axis="index"). Use the new method when it exists and fall back
# to the old one on older pandas so the cell runs on either.
(
    state_freq_style.hide(axis="index")
    if hasattr(state_freq_style, "hide")
    else state_freq_style.hide_index()
)

State,count,percent,cumulative_count,cumulative_percent
Illinois,374,10.362981,374,10.362981
California,340,9.420892,714,19.783874
Texas,244,6.760876,958,26.544749
Florida,218,6.040454,1176,32.585204
Louisiana,168,4.655029,1344,37.240233
Pennsylvania,167,4.627321,1511,41.867553
New York,166,4.599612,1677,46.467165
Ohio,141,3.906899,1818,50.374065
Georgia,137,3.796065,1955,54.17013
Missouri,117,3.241895,2072,57.412025


In [18]:
# List the available columns before defining the aggregation specs below.
df.columns

Index(['Incident ID', 'Incident Date', 'State', 'City Or County', 'Address',
       '# Killed', '# Injured', '# Victims'],
      dtype='object')

Creating aggregate function dictionary for groupbys

In [45]:
# Statistics requested for each count column in the groupby aggregations.
_summary_stats = ("sum", "mean", "min", "max", "var")

# One aggregation spec per column; each spec gets its own list so the three
# dictionaries remain independent of one another.
agg_deaths = {"# Killed": list(_summary_stats)}
agg_injured = {"# Injured": list(_summary_stats)}
agg_victims = {"# Victims": list(_summary_stats)}

In [51]:
# Deaths by state: aggregate with agg_deaths, turn State back into a column,
# and round the statistics to two decimals.
deaths_by_state = df.groupby("State").agg(agg_deaths)
df_deaths = deaths_by_state.reset_index().round(2)
df_deaths.head(5)

Unnamed: 0_level_0,State,# Killed,# Killed,# Killed,# Killed,# Killed
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,min,max,var
0,Alabama,98,1.03,0,7,1.52
1,Alaska,6,1.2,0,4,2.7
2,Arizona,65,1.76,0,5,2.47
3,Arkansas,41,0.95,0,5,1.43
4,California,394,1.16,0,16,3.08


In [52]:
# The aggregation produces two-level column labels; inspect them to find the sort key.
df_deaths.columns

MultiIndex([(   'State',     ''),
            ('# Killed',  'sum'),
            ('# Killed', 'mean'),
            ('# Killed',  'min'),
            ('# Killed',  'max'),
            ('# Killed',  'var')],
           )

In [53]:
# Rank states by total deaths, highest first.
total_killed = ("# Killed", "sum")  # key of the "sum" column in the two-level header
df_deaths = df_deaths.sort_values(by=[total_killed], ascending=False)
# Note: `.style` returns a Styler rather than a DataFrame, so take .head()/.loc
# first and apply styling afterwards.
df_deaths.head(5)

Unnamed: 0_level_0,State,# Killed,# Killed,# Killed,# Killed,# Killed
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,min,max,var
41,Texas,419,1.72,0,27,9.27
4,California,394,1.16,0,16,3.08
9,Florida,285,1.31,0,50,13.85
12,Illinois,257,0.69,0,6,1.01
10,Georgia,149,1.09,0,8,1.95


Looking at state injuries

In [56]:
# Injuries by state: aggregate with agg_injured, restore State as a column,
# round to two decimals, then rank by total injuries (highest first).
injured_total = ("# Injured", "sum")
df_injured = (
    df.groupby(["State"])
    .agg(agg_injured)
    .reset_index()
    .round(2)
    .sort_values(by=[injured_total], ascending=False)
)
df_injured.head(5)

Unnamed: 0_level_0,State,# Injured,# Injured,# Injured,# Injured,# Injured
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,min,max,var
12,Illinois,1594,4.26,0,15,2.98
4,California,1350,3.97,0,19,4.18
41,Texas,1018,4.17,0,23,11.08
9,Florida,962,4.41,0,53,16.96
17,Louisiana,723,4.3,0,17,4.78


In [58]:
# Preview the incident frame again before joining the safety data onto it.
df.head(3)

Unnamed: 0,Incident ID,Incident Date,State,City Or County,Address,# Killed,# Injured,# Victims
0,271363,2014-12-29,Louisiana,New Orleans,Poydras and Bolivar,0,4,4
1,269679,2014-12-27,California,Los Angeles,8800 block of South Figueroa Street,1,3,4
2,270036,2014-12-27,California,Sacramento,4000 block of May Street,0,4,4


#### Joining safety_df (loaded and cleaned in the gun safety dataset section above) with df

In [74]:
# Attach each state's lawsRank, Safety Grade and gunDeathRate to every incident.
# A left merge on the shared "State" column keeps every incident row; states
# absent from safety_df get NaN in the added columns.
# validate="many_to_one" makes pandas raise a MergeError if safety_df ever
# contains duplicate State rows, which would otherwise silently duplicate incidents.
# Address and Incident ID are still present here; the following cell drops them.
combined_df = df.merge(safety_df, on="State", how="left", validate="many_to_one")
combined_df.head(3)

Unnamed: 0,Incident ID,Incident Date,State,City Or County,Address,# Killed,# Injured,# Victims,lawsRank,Safety Grade,gunDeathRate
0,271363,2014-12-29,Louisiana,New Orleans,Poydras and Bolivar,0,4,4,32.0,F,21.31
1,269679,2014-12-27,California,Los Angeles,8800 block of South Figueroa Street,1,3,4,1.0,A,7.45
2,270036,2014-12-27,California,Sacramento,4000 block of May Street,0,4,4,1.0,A,7.45


Remove excess columns

In [75]:
# Address and Incident ID are not needed for the analysis; drop them and
# confirm the remaining columns.
unused_columns = ["Address", "Incident ID"]
combined_df = combined_df.drop(columns=unused_columns)
combined_df.columns

Index(['Incident Date', 'State', 'City Or County', '# Killed', '# Injured',
       '# Victims', 'lawsRank', 'Safety Grade', 'gunDeathRate'],
      dtype='object')

In [None]:

# Plotly Express reference example on the bundled gapminder demo data, kept as
# a template for an animated scatter. It is stored under its own name: the
# original assigned it to `df`, which replaced the mass-shooting frame that the
# earlier cells rely on.
gapminder_df = px.data.gapminder()

# TODO: build the equivalent scatter from combined_df. The unfinished stub
# `px.scatter(df, x ="")` was removed: an empty string is not a column of the
# frame, so the call could not produce a plot, and as the last expression it
# kept the figure below from being displayed.
px.scatter(gapminder_df, x="gdpPercap", y="lifeExp", animation_frame="year", animation_group="country",
           size="pop", color="continent", hover_name="country",
           log_x=True, size_max=55, range_x=[100,100000], range_y=[25,90])