<a href="https://colab.research.google.com/github/Nickguild1993/Gun_Violence_Exploration/blob/main/Gun_Violence_Cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Trying to combine a couple different datasets of various gun violence/safety measures with the end goal being interactive plotly visualizations

In [1]:
# import the regulars

import pandas as pd
import numpy as np
from scipy import stats

! pip install sidetable
import sidetable


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# get the datetime libraries

from datetime import datetime as dt
import datetime

In [3]:
# Import visualization libraries

import matplotlib.pyplot as plt
import seaborn as sns

import plotly.graph_objects as go
import plotly.express as px

Load in mass shooting data set 

In [4]:
# Mass-shooting incidents CSV, read straight from the project's GitHub repo.
url = "https://raw.githubusercontent.com/Nickguild1993/Gun_Violence_Exploration/main/mass_shootings_2013_2022.csv"

df = pd.read_csv(url)
# Preview the first three rows.
df.head(3)

Unnamed: 0,Incident ID,Incident Date,State,City Or County,Address,# Killed,# Injured
0,271363,"December 29, 2014",Louisiana,New Orleans,Poydras and Bolivar,0,4
1,269679,"December 27, 2014",California,Los Angeles,8800 block of South Figueroa Street,1,3
2,270036,"December 27, 2014",California,Sacramento,4000 block of May Street,0,4


Load in state population dataset

In [5]:
# State population table (one column per year), read from the project's GitHub repo.
state_url = "https://raw.githubusercontent.com/Nickguild1993/Gun_Violence_Exploration/main/State_Populations_2013_2022.csv"
state_df = pd.read_csv(state_url)
# Preview the first three rows.
state_df.head(3)

Unnamed: 0,State,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,United States,315993715,318301008,320635163,322941311,324985539,326687501,328239523,331449281,332403650,332915073
1,.Alabama,4830081,4841799,4852347,4863525,4874486,4887681,4903185,5024279,4934193,4949697
2,.Alaska,737068,736283,737498,741456,739700,735139,731545,733391,724357,720763


Load in gun safety dataset

In [6]:
# State gun-law rankings CSV from the project's GitHub repo.
safety_url = "https://raw.githubusercontent.com/Nickguild1993/Gun_Violence_Exploration/main/State_GunLaw_Rankings.csv"
# Read through the variable so the URL is defined in exactly one place.
safety_df = pd.read_csv(safety_url)
# Preview the first three rows.
safety_df.head(3)

Unnamed: 0,State,lawsRank,grade2019,gunDeathRate
0,California,1,A,7.45
1,New Jersey,2,A,4.75
2,Connecticut,3,A-,4.91


In [7]:
# Tally how many rows carry each 2019 grade (plus/minus variants still separate).
safety_df["grade2019"].value_counts()

F     21
A-     6
C+     5
C      4
D      4
C-     3
A      2
B+     2
D-     2
B      1
Name: grade2019, dtype: int64

Need to collapse the grades to plain letters (dropping the plus/minus) so the categories are less noisy.

In [8]:
# Collapse plus/minus variants to the bare letter grade (e.g. "A-" -> "A", "C+" -> "C").
# Stripping any trailing "+"/"-" covers every variant, including ones such as "D+"
# that a hand-written mapping can miss; plain letters like "F" pass through unchanged.
safety_df = safety_df.assign(grade2019=safety_df["grade2019"].str.rstrip("+-"))

In [9]:
# Re-tally the grades to confirm only plain letters remain.
safety_df["grade2019"].value_counts()

F    21
C    12
A     8
D     6
B     3
Name: grade2019, dtype: int64

Rename the safety_df["grade2019"] column to "Safety Grade"

In [10]:
# Give the grade column a more descriptive label.
safety_df = safety_df.rename(columns={"grade2019": "Safety Grade"})
safety_df.head(3)

Unnamed: 0,State,lawsRank,Safety Grade,gunDeathRate
0,California,1,A,7.45
1,New Jersey,2,A,4.75
2,Connecticut,3,A,4.91


#### EDA on mass shooting dataset

Normal inspection of dataset 

In [11]:
# Basic inspection of the incidents frame: dimensions first, then per-column dtypes.
print("shape of df is:", df.shape)
print("---------------------------")
print("data types: ", df.dtypes)

shape of df is: (3609, 7)
---------------------------
data types:  Incident ID        int64
Incident Date     object
State             object
City Or County    object
Address           object
# Killed           int64
# Injured          int64
dtype: object


In [12]:
# sidetable summary of missing values per column.
df.stb.missing()
# Only "Address" has gaps (8 rows); that column is dropped later, so nothing to impute.

Unnamed: 0,missing,total,percent
Address,8,3609,0.221668
Incident ID,0,3609,0.0
Incident Date,0,3609,0.0
State,0,3609,0.0
City Or County,0,3609,0.0
# Killed,0,3609,0.0
# Injured,0,3609,0.0


Adding a column for total victims - deaths plus injuries

In [13]:
# Total victims per incident = killed + injured.
df["# Victims"] = df["# Killed"] + df["# Injured"]
df.head(3)

Unnamed: 0,Incident ID,Incident Date,State,City Or County,Address,# Killed,# Injured,# Victims
0,271363,"December 29, 2014",Louisiana,New Orleans,Poydras and Bolivar,0,4,4
1,269679,"December 27, 2014",California,Los Angeles,8800 block of South Figueroa Street,1,3,4
2,270036,"December 27, 2014",California,Sacramento,4000 block of May Street,0,4,4


In [14]:
# List the column labels (now including "# Victims").
df.columns

Index(['Incident ID', 'Incident Date', 'State', 'City Or County', 'Address',
       '# Killed', '# Injured', '# Victims'],
      dtype='object')

Need to change the "Incident Date" data type to a datetime object

In [15]:
# Parse "Incident Date" (e.g. "December 29, 2014") into datetime64.
# An explicit format replaces `infer_datetime_format`, which newer pandas deprecates,
# and makes any value that does not match the expected layout fail loudly
# instead of being guessed at.
df["Incident Date"] = pd.to_datetime(df["Incident Date"], format="%B %d, %Y")
df.head(3)

Unnamed: 0,Incident ID,Incident Date,State,City Or County,Address,# Killed,# Injured,# Victims
0,271363,2014-12-29,Louisiana,New Orleans,Poydras and Bolivar,0,4,4
1,269679,2014-12-27,California,Los Angeles,8800 block of South Figueroa Street,1,3,4
2,270036,2014-12-27,California,Sacramento,4000 block of May Street,0,4,4


In [16]:
# frequency tables with sidetable

# Incident count, share, and cumulative share per state; the index is hidden for display.
df.stb.freq(["State"]).style.hide_index()

State,count,percent,cumulative_count,cumulative_percent
Illinois,374,10.362981,374,10.362981
California,340,9.420892,714,19.783874
Texas,244,6.760876,958,26.544749
Florida,218,6.040454,1176,32.585204
Louisiana,168,4.655029,1344,37.240233
Pennsylvania,167,4.627321,1511,41.867553
New York,166,4.599612,1677,46.467165
Ohio,141,3.906899,1818,50.374065
Georgia,137,3.796065,1955,54.17013
Missouri,117,3.241895,2072,57.412025


In [17]:
# Column labels, as a reference for building the aggregation dictionaries.
df.columns

Index(['Incident ID', 'Incident Date', 'State', 'City Or County', 'Address',
       '# Killed', '# Injured', '# Victims'],
      dtype='object')

Creating aggregate function dictionary for groupbys

In [18]:
# Summary statistics computed for every casualty column.
summary_funcs = ("sum", "mean", "min", "max", "var")

# One aggregation spec per casualty column; each dict gets its own list object.
agg_deaths = {"# Killed": list(summary_funcs)}
agg_injured = {"# Injured": list(summary_funcs)}
agg_victims = {"# Victims": list(summary_funcs)}

In [19]:
# Per-state statistics on deaths, rounded to two decimals.
df_deaths = (
    df.groupby("State")
    .agg(agg_deaths)
    .reset_index()
    .round(2)
)
df_deaths.head(5)

Unnamed: 0_level_0,State,# Killed,# Killed,# Killed,# Killed,# Killed
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,min,max,var
0,Alabama,98,1.03,0,7,1.52
1,Alaska,6,1.2,0,4,2.7
2,Arizona,65,1.76,0,5,2.47
3,Arkansas,41,0.95,0,5,1.43
4,California,394,1.16,0,16,3.08


In [20]:
# The aggregation yields two-level (column, statistic) labels.
df_deaths.columns

MultiIndex([(   'State',     ''),
            ('# Killed',  'sum'),
            ('# Killed', 'mean'),
            ('# Killed',  'min'),
            ('# Killed',  'max'),
            ('# Killed',  'var')],
           )

In [21]:
# Rank states by total deaths, highest first. The sort key is the
# ("# Killed", "sum") pair because the columns are two-level.
deaths_total = ("# Killed", "sum")
df_deaths = df_deaths.sort_values(by=[deaths_total], ascending=False)
# state_deaths.style.hide_index() can't chain with either head or loc?
df_deaths.head(5)

Unnamed: 0_level_0,State,# Killed,# Killed,# Killed,# Killed,# Killed
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,min,max,var
41,Texas,419,1.72,0,27,9.27
4,California,394,1.16,0,16,3.08
9,Florida,285,1.31,0,50,13.85
12,Illinois,257,0.69,0,6,1.01
10,Georgia,149,1.09,0,8,1.95


Looking at state injuries

In [22]:
# Per-state statistics on injuries (agg_injured), rounded to two decimals
# and ordered by total injured, highest first.
df_injured = (
    df.groupby(["State"])
    .agg(agg_injured)
    .reset_index()
    .round(2)
    .sort_values(by=[("# Injured", "sum")], ascending=False)
)
df_injured.head(5)

Unnamed: 0_level_0,State,# Injured,# Injured,# Injured,# Injured,# Injured
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,min,max,var
12,Illinois,1594,4.26,0,15,2.98
4,California,1350,3.97,0,19,4.18
41,Texas,1018,4.17,0,23,11.08
9,Florida,962,4.41,0,53,16.96
17,Louisiana,723,4.3,0,17,4.78


In [23]:
# Re-display the incidents frame before joining in the gun-law data.
df.head(3)

Unnamed: 0,Incident ID,Incident Date,State,City Or County,Address,# Killed,# Injured,# Victims
0,271363,2014-12-29,Louisiana,New Orleans,Poydras and Bolivar,0,4,4
1,269679,2014-12-27,California,Los Angeles,8800 block of South Figueroa Street,1,3,4
2,270036,2014-12-27,California,Sacramento,4000 block of May Street,0,4,4


#### Joining the safety_df (refer to cell 10) with the df 

In [24]:
# Attach each state's gun-law rank, grade, and gun-death rate to every incident.
# validate="many_to_one" raises if a state appears more than once in safety_df,
# which would otherwise silently duplicate incident rows in the result.
# TODO (original author): streamline so the follow-up column drop is unnecessary.
combined_df = df.merge(safety_df, on="State", how="left", validate="many_to_one")
combined_df.head(3)

Unnamed: 0,Incident ID,Incident Date,State,City Or County,Address,# Killed,# Injured,# Victims,lawsRank,Safety Grade,gunDeathRate
0,271363,2014-12-29,Louisiana,New Orleans,Poydras and Bolivar,0,4,4,32.0,F,21.31
1,269679,2014-12-27,California,Los Angeles,8800 block of South Figueroa Street,1,3,4,1.0,A,7.45
2,270036,2014-12-27,California,Sacramento,4000 block of May Street,0,4,4,1.0,A,7.45


Remove excess columns

In [25]:
# Discard the two columns flagged as excess, then list what remains.
excess_columns = ["Address", "Incident ID"]
combined_df = combined_df.drop(columns=excess_columns)
combined_df.columns

Index(['Incident Date', 'State', 'City Or County', '# Killed', '# Injured',
       '# Victims', 'lawsRank', 'Safety Grade', 'gunDeathRate'],
      dtype='object')

#### Now to clean and subsequently join the state_population dataframe 

In [26]:
# Look at the raw population table: a national total row and "."-prefixed state names.
state_df.head(3)

Unnamed: 0,State,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,United States,315993715,318301008,320635163,322941311,324985539,326687501,328239523,331449281,332403650,332915073
1,.Alabama,4830081,4841799,4852347,4863525,4874486,4887681,4903185,5024279,4934193,4949697
2,.Alaska,737068,736283,737498,741456,739700,735139,731545,733391,724357,720763


going to pop the United States row off because it isn't joinable w/ current DataFrame

In [27]:
# Remove the national-total row, which has no counterpart in the state-level data.
# Filtering on the label (rather than dropping index label 0) keeps the cell safe
# to re-run and independent of where that row sits in the file.
# .copy() gives an independent frame so later column assignments do not warn.
state_df = state_df[state_df["State"] != "United States"].copy()
state_df[:2]

Unnamed: 0,State,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
1,.Alabama,4830081,4841799,4852347,4863525,4874486,4887681,4903185,5024279,4934193,4949697
2,.Alaska,737068,736283,737498,741456,739700,735139,731545,733391,724357,720763


Checking datatypes and taking care of the "." in front of all the State observations that we'll need to remove. 

In [28]:
# Inspect the population table: per-column dtypes first, then dimensions.
print("Data types are :", state_df.dtypes)
print("----------------------------------")
print("Dataframe Shape :", state_df.shape)

Data types are : State    object
2013      int64
2014      int64
2015      int64
2016      int64
2017      int64
2018      int64
2019      int64
2020      int64
2021      int64
2022      int64
dtype: object
----------------------------------
Dataframe Shape : (51, 11)


In [29]:
# using the sidetable missing check instead of isnull value counts b/c it's prettier
state_df.stb.missing()
# No missing values - very cool considering I concatenated the dataset myself

Unnamed: 0,missing,total,percent
State,0,51,0.0
2013,0,51,0.0
2014,0,51,0.0
2015,0,51,0.0
2016,0,51,0.0
2017,0,51,0.0
2018,0,51,0.0
2019,0,51,0.0
2020,0,51,0.0
2021,0,51,0.0


Removing the "." before each state | Going to use vectorised str (string) method to slice em 

In [30]:
# Remove the leading "." from each state name.
# lstrip(".") only removes dots, so re-running the cell is harmless; a positional
# slice would eat the first real letter on every additional run.
state_df["State"] = state_df["State"].str.lstrip(".")
state_df.head(1)

Unnamed: 0,State,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
1,Alabama,4830081,4841799,4852347,4863525,4874486,4887681,4903185,5024279,4934193,4949697


Now to join them - going to join on unique ID - > set_index("State")

In [31]:
# Left-join the per-year population columns onto every incident, matching on state name.
master_df = combined_df.join(state_df.set_index("State"), on = "State", how="left")


#### Cleaning up the master_df

Possibly make the datetime column the index, rename ["2013"] -> ["2013 Pop"] (and likewise for the other years), and make capitalization consistent

In [42]:
# Friendlier labels: spell out the gun-law columns, shorten the city column,
# and tag each yearly population column with a " Pop" suffix.
column_labels = {
    "lawsRank": "Gun Law Rank",
    "gunDeathRate": "Gun Deaths Per 100k",
    "City Or County": "City",
}
column_labels.update({str(year): f"{year} Pop" for year in range(2013, 2023)})
master_df = master_df.rename(columns=column_labels)
master_df.columns

Index(['Year', 'State', 'Population', 'City', '# Killed', '# Injured',
       '# Victims', 'Gun Law Rank', 'Safety Grade', 'Gun Deaths Per 100k',
       'Incident Date'],
      dtype='object')

Changing the datetime format from YYYY-MM-DD to MM-DD-YYYY (if this was a larger dataset I'd go ahead and make it the index to increase speed but not super necessary here, might do it anyways as a best practice)

In [33]:
# Check the dtype of the first column before reformatting the date.
master_df.dtypes[:1]

Incident Date    datetime64[ns]
dtype: object

In [34]:
# Render the date as an MM-DD-YYYY string.
# Routing through pd.to_datetime keeps the cell re-runnable: once the column holds
# strings it has no `.dt` accessor, so a bare `.dt.strftime` would fail on a second
# run. Datetime input passes through unchanged; string input is parsed with the
# same MM-DD-YYYY layout it was written in.
master_df["Incident Date"] = pd.to_datetime(
    master_df["Incident Date"], format="%m-%d-%Y"
).dt.strftime("%m-%d-%Y")

In [35]:
# Spot-check the reformatted dates (.loc slices by label, so rows 0 through 3 inclusive).
master_df.loc[:3, "Incident Date"]

0    12-29-2014
1    12-27-2014
2    12-27-2014
3    12-26-2014
Name: Incident Date, dtype: object

ADDING A YEAR COLUMN VIA DT.YEAR FROM DATETIME COLUMN

In [36]:
# "Incident Date" was converted to a string above, so parse it back to read off the year.
master_df["Year"] = pd.to_datetime(master_df["Incident Date"]).dt.year

#### Using np.select to create a population column based on the "20xx Pop" columns

In [37]:
# Pick, for each incident, its state's population in the year the incident occurred.
# - Every population column (2013-2022) is covered, so 2013 incidents get a value too.
# - Rows with no matching year/population become <NA>. A string default such as
#   "NONE" cannot share a dtype with the integer population columns, so NumPy
#   either coerces the whole column to text or refuses the mix.
pop_years = range(2013, 2023)
population = np.select(
    [master_df["Year"] == year for year in pop_years],
    [master_df[f"{year} Pop"] for year in pop_years],
    default=np.nan,
)
# Nullable integer dtype keeps whole-number populations while allowing missing values.
master_df["Population"] = pd.Series(population, index=master_df.index).astype("Int64")

Reordering columns 

In [43]:
# Keep only the analysis columns, in display order; the per-year population
# columns are not part of this selection.
column_order = [
    "Year", "State", "Population", "Gun Deaths Per 100k", "City",
    "# Killed", "# Injured", "# Victims",
    "Gun Law Rank", "Safety Grade",
    "Incident Date",
]
master_df = master_df[column_order]

In [44]:
# Checking the join, the new "Year" column, and the column reordering

master_df.head(3)

Unnamed: 0,Year,State,Population,Gun Deaths Per 100k,City,# Killed,# Injured,# Victims,Gun Law Rank,Safety Grade,Incident Date
0,2014,Louisiana,4644013,21.31,New Orleans,0,4,4,32.0,F,12-29-2014
1,2014,California,38596972,7.45,Los Angeles,1,3,4,1.0,A,12-27-2014
2,2014,California,38596972,7.45,Sacramento,0,4,4,1.0,A,12-27-2014


#### Using style.format to create a nice little DataFrame. 

In [45]:
# Format spec for Styler.format(): one entry per column label.
# A dict key cannot be a list of columns, so the yearly population labels are
# generated one by one and all share the thousands-separated integer format.
format_dict = {
    "Gun Law Rank": "{0:.0f}",
    "Gun Deaths Per 100k": "{0:.2f}",
    **{f"{year} Pop": "{0:,.0f}" for year in range(2013, 2023)},
}



Because you can't chain style.format with the background_gradient w/ gmap arg, going to create a named variable to hold the formatted df then chain the .style method

#### Creating a groupby to aggregate the data

In [46]:
# Aggregation spec for the groupbys: total each casualty column.
agg_dict = dict.fromkeys(["# Killed", "# Injured", "# Victims"], "sum")

In [47]:
# Yearly casualty totals per state, carrying the state-level gun-law attributes along.
# dropna=False keeps any state that has no gun-law record (NaN rank/grade/rate after
# the left join); by default groupby silently drops rows whose keys contain NaN.
group_keys = ["State", "Year", "Gun Law Rank", "Safety Grade", "Gun Deaths Per 100k"]
group_df = master_df.groupby(group_keys, dropna=False).agg(agg_dict).reset_index()
group_df[:5]

Unnamed: 0,State,Year,Gun Law Rank,Safety Grade,Gun Deaths Per 100k,# Killed,# Injured,# Victims
0,Alabama,2014,38.0,F,21.7,1,9,10
1,Alabama,2015,38.0,F,21.7,4,13,17
2,Alabama,2016,38.0,F,21.7,16,53,69
3,Alabama,2017,38.0,F,21.7,6,38,44
4,Alabama,2018,38.0,F,21.7,12,55,67


In [48]:
# Styler over master_df with format_dict applied and the index hidden.
# NOTE(review): Styler.hide_index() is deprecated in newer pandas in favor of
# Styler.hide(axis="index") — confirm the installed pandas version before switching.
format_progress = master_df.style.format(format_dict).hide_index()

In [49]:
# Casualty totals per state across all years (State becomes the index).
grouped_df = master_df.groupby("State").agg(agg_dict)
grouped_df[:5]

Unnamed: 0_level_0,# Killed,# Injured,# Victims
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alabama,98,368,466
Alaska,6,18,24
Arizona,65,133,198
Arkansas,41,212,253
California,394,1350,1744


******************** COME BACK TO THIS AND DO A GROUPBY SO IT'S NOT SO LONG*********************** LETS MOVE THE PRETTY DF TO THE BOTTOM

In [50]:
# Styled preview of master_df: number formatting plus colour gradients on the
# gun-law rank and gun-death-rate columns.
# Only the first rows are rendered, because styling every incident produces a very
# long output. vmin/vmax pin each gradient to the full column's range so the
# preview uses the same colour scale as the complete table would.
preview_rows = 10
rank_col = "Gun Law Rank"
rate_col = "Gun Deaths Per 100k"
(
    master_df.head(preview_rows)
    .style.format(format_dict)
    .background_gradient(
        subset=[rank_col],
        cmap="coolwarm",
        vmin=master_df[rank_col].min(),
        vmax=master_df[rank_col].max(),
    )
    .background_gradient(
        subset=[rate_col],
        cmap="bwr",
        vmin=master_df[rate_col].min(),
        vmax=master_df[rate_col].max(),
    )
)


Unnamed: 0,Year,State,Population,Gun Deaths Per 100k,City,# Killed,# Injured,# Victims,Gun Law Rank,Safety Grade,Incident Date
0,2014,Louisiana,4644013,21.31,New Orleans,0,4,4,32.0,F,12-29-2014
1,2014,California,38596972,7.45,Los Angeles,1,3,4,1.0,A,12-27-2014
2,2014,California,38596972,7.45,Sacramento,0,4,4,1.0,A,12-27-2014
3,2014,Illinois,12884493,10.78,East St. Louis,1,3,4,8.0,A,12-26-2014
4,2014,Missouri,6056202,21.34,Saint Louis,1,3,4,46.0,F,12-24-2014
5,2014,Kentucky,4414349,16.81,Winchester,1,3,4,46.0,F,12-23-2014
6,2014,Michigan,9929848,12.82,Detroit,1,3,4,19.0,C,12-22-2014
7,2014,New York,19651049,4.03,Webster,4,2,6,4.0,A,12-22-2014
8,2014,Illinois,12884493,10.78,Chicago,0,5,5,8.0,A,12-22-2014
9,2014,Florida,19845911,12.81,Sarasota,2,2,4,22.0,C,12-21-2014


In [46]:
# Quick look at the first row of master_df.
master_df.head(1)

Unnamed: 0,Year,State,City Or County,# Killed,# Injured,# Victims,Gun Law Rank,Safety Grade,Gun Deaths Per 100k,2013 Pop,2014 Pop,2015 Pop,2016 Pop,2017 Pop,2018 Pop,2019 Pop,2020 Pop,2021 Pop,2022 Pop,Incident Date
0,2014,Louisiana,New Orleans,0,4,4,32.0,F,21.31,4624527,4644013,4664628,4678135,4670560,4659690,4648794,4657757,4627002,4616106,12-29-2014


#### Using group_df for these

In [47]:
# Split the yearly state totals into one frame per state for plotting.
df_texas = group_df.loc[group_df["State"].eq("Texas")]
df_california = group_df.loc[group_df["State"].eq("California")]

In [48]:
# Yearly mass-shooting deaths, Texas vs. California, one line per state.
# Both traces are built from the same template so they stay consistent: each uses
# its state name as both legend entry and hover text.
state_traces = [
    ("Texas", df_texas, "crimson"),
    ("California", df_california, "seagreen"),
]
fig = go.Figure(
    data=[
        go.Scatter(
            x=state_frame["Year"],
            y=state_frame["# Killed"],
            line=dict(color=line_color, width=4),
            text=state_name,
            name=state_name,
        )
        for state_name, state_frame, line_color in state_traces
    ]
)
# A title and axis labels let the figure stand on its own when skimmed.
fig.update_layout(
    title="Mass-shooting deaths per year: Texas vs. California",
    xaxis_title="Year",
    yaxis_title="# Killed",
)

fig

In [49]:

# df = px.data.gapminder()
# px.scatter(df, x="gdpPercap", y="lifeExp", animation_frame="year", animation_group="country",
#            size="pop", color="continent", hover_name="country",
#            log_x=True, size_max=55, range_x=[100,100000], range_y=[25,90])


# px.scatter(master_df, x ="# Victims", y = "# Killed", animation_frame = "Incident Date", animation_group = "Safety Grade",
#            size = "Gun Law Rank", color = "Safety Grade", hover_name = "Safety Grade",
#            log_x = False, size_max=55, range_x = [0, 700], range_y = [0,75])