In [1]:
library(tidyverse)
library(lubridate)
library(ggplot2)
library(sf)

-- [1mAttaching core tidyverse packages[22m ------------------------ tidyverse 2.0.0 --
[32mv[39m [34mdplyr    [39m 1.1.4     [32mv[39m [34mreadr    [39m 2.1.5
[32mv[39m [34mforcats  [39m 1.0.0     [32mv[39m [34mstringr  [39m 1.5.1
[32mv[39m [34mggplot2  [39m 3.5.2     [32mv[39m [34mtibble   [39m 3.3.0
[32mv[39m [34mlubridate[39m 1.9.4     [32mv[39m [34mtidyr    [39m 1.3.1
[32mv[39m [34mpurrr    [39m 1.1.0     
-- [1mConflicts[22m ------------------------------------------ tidyverse_conflicts() --
[31mx[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31mx[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mi[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors
"package 'sf' was built under R version 4.5.2"
Linking to GEOS 3.13.1, GDAL 3.11.4, PROJ 9.7.0; sf_use_s2() is TRUE



In [2]:
# Load 2024 data
df_2024 <- read_csv("../data/offenses_known_csv_1960_2024_month/offenses_known_monthly_2024.csv")|>
    glimpse()

"[1m[22mOne or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)"
[1mRows: [22m[34m309168[39m [1mColumns: [22m[34m197[39m
[36m--[39m [1mColumn specification[22m [36m--------------------------------------------------------[39m
[1mDelimiter:[22m ","
[31mchr[39m   (35): ori, ori9, agency_name, state, state_abb, month, last_month_repo...
[32mdbl[39m  (153): year, number_of_months_reported, month_missing, longitude, latit...
[33mlgl[39m    (8): number_of_months_missing, covered_by_population_group, populatio...
[34mdate[39m   (1): date

[36mi[39m Use `spec()` to retrieve the full column specification for this data.
[36mi[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


Rows: 309,168
Columns: 197
$ ori                                     [3m[90m<chr>[39m[23m "AK00101"[90m, [39m"AK00101"[90m, [39m"AK00101~
$ ori9                                    [3m[90m<chr>[39m[23m "AK0010100"[90m, [39m"AK0010100"[90m, [39m"AK0~
$ agency_name                             [3m[90m<chr>[39m[23m "anchorage"[90m, [39m"anchorage"[90m, [39m"anc~
$ state                                   [3m[90m<chr>[39m[23m "alaska"[90m, [39m"alaska"[90m, [39m"alaska"[90m, [39m~
$ state_abb                               [3m[90m<chr>[39m[23m "AK"[90m, [39m"AK"[90m, [39m"AK"[90m, [39m"AK"[90m, [39m"AK"[90m, [39m~
$ year                                    [3m[90m<dbl>[39m[23m 2024[90m, [39m2024[90m, [39m2024[90m, [39m2024[90m, [39m2024[90m, [39m~
$ month                                   [3m[90m<chr>[39m[23m "january"[90m, [39m"february"[90m, [39m"march"~
$ date                                    [3m[90m<date>[39m[23m 20

In [3]:
# Identify all unique states in the dataset
unique_states <- df_2024 |> 
    distinct(state) |> 
    pull(state)

print(length(unique_states))

[1] 57


In [4]:
# identify all unique census regions in the 2024 data
unique_regions <- df_2024 |> 
    distinct(census_name) |> 
    pull(census_name)

# Print the unique census regions
print(length(unique_regions))

[1] 13070


In [5]:
# Trim the df
df_2024_trimmed <- df_2024 |>
    filter(!state_abb %in% c("AS", "GU", "MP", "PR", "VI"))|>
    filter(!is.na(census_name))|>
    select(state, state_abb, address_city, census_name, year, month, date, population_1, population, core_city_indication, 
            officers_killed_by_felony:actual_index_total)|>
    pivot_longer(cols = actual_murder:actual_assault_aggravated,
                 names_to = "offense_type",
                 values_to = "offense_count") |>
    mutate(crime_rate = offense_count/population_1 * 100000)|>
    glimpse()

Rows: 7,396,704
Columns: 19
$ state                       [3m[90m<chr>[39m[23m "alaska"[90m, [39m"alaska"[90m, [39m"alaska"[90m, [39m"alaska"[90m, [39m"a~
$ state_abb                   [3m[90m<chr>[39m[23m "AK"[90m, [39m"AK"[90m, [39m"AK"[90m, [39m"AK"[90m, [39m"AK"[90m, [39m"AK"[90m, [39m"AK"[90m, [39m~
$ address_city                [3m[90m<chr>[39m[23m "anchorage"[90m, [39m"anchorage"[90m, [39m"anchorage"[90m, [39m"an~
$ census_name                 [3m[90m<chr>[39m[23m "anchorage municipality"[90m, [39m"anchorage munic~
$ year                        [3m[90m<dbl>[39m[23m 2024[90m, [39m2024[90m, [39m2024[90m, [39m2024[90m, [39m2024[90m, [39m2024[90m, [39m2024[90m, [39m~
$ month                       [3m[90m<chr>[39m[23m "january"[90m, [39m"january"[90m, [39m"january"[90m, [39m"january"~
$ date                        [3m[90m<date>[39m[23m 2024-01-01[90m, [39m2024-01-01[90m, [39m2024-01-01[90m, [39m2024-~

In [6]:
# Summarize crime count by type
crime_type_summary <- df_2024_trimmed|>
    group_by(offense_type)|>
    summarize(count = sum(offense_count),
            mean = mean(offense_count),
            sd = sd(offense_count))|>
    arrange(by = count)

crime_type_summary

offense_type,count,mean,sd
<chr>,<dbl>,<dbl>,<dbl>
actual_theft_under50_dollar,0,0.0,0.0
actual_manslaughter,1936,0.00732867,0.1127764
actual_rape_attempted,4253,0.0160996,0.2137941
actual_murder,15816,0.05987099,0.6773848
actual_robbery_with_a_knife,17347,0.06566655,1.7450008
actual_robbery_other_weapon,23760,0.08994276,1.5630838
actual_burglary_attempted,49420,0.18707792,2.1923817
actual_motor_vehicle_theft_other,61211,0.2317124,2.2356503
actual_robbery_with_a_gun,63611,0.24079752,3.6943929
actual_robbery_unarmed,92816,0.35135217,7.5466935


In [None]:
# Get the stat for major US cities
us