<h1 style="font-size: 40px;">Introduction</h1>

<h2 style="font-size: 20px;
           line-height: 1.5;
           margin-bottom: 20px"
    > Crime activities in Cambridge from 2009 to 2023 are analyzed in terms of their overall trend, relative change, proportional make-up each year, and the hours at which crimes usually occur. Of all crimes, Larceny of Bicycle and Shoplifting appear to be growing more and more prevalent. Although Hit and Run is the most frequently reported crime, both Larceny of Bicycle and Shoplifting have increased far more sharply relative to their 2009 levels, and are growing much faster than Hit and Run. Most crimes occur around 3:00 PM, both in the neighborhoods most affected by crime and in other neighborhoods in general.
</h2>

In [None]:
library(tidyverse)
library(data.table)
library(patchwork)

## Shared ggplot2 theme for the charts in this notebook: theme_classic() plus a
## cream background, larger text in the "Econ Sans Cnd" font family, a top
## legend without a title, and horizontal major grid lines.
## NOTE(review): presumably the "Econ Sans Cnd" fonts must be available to the
## graphics device for the text to render as intended -- confirm.
theme_custom <- theme_classic() +
    theme(
        ## Background color (panel, whole plot and legend share one fill)
        plot.background = element_rect(fill = "#F3F0E0"),
        panel.background = element_rect(fill = "#F3F0E0"),
        legend.background = element_rect(fill = "#F3F0E0"),
        ## y-axis lines
        panel.grid.major.y = element_line(color = "#6F8793"),
        ## Axis labels
        axis.text.x = element_text(size = 18, family = "Econ Sans Cnd light"),
        axis.text.y = element_text(size = 18, family = "Econ Sans Cnd light"),
        ## Title, subtitle, caption
        plot.title = element_text(
            size = 22, face = "bold", family = "Econ Sans Cnd bold",
            hjust = 0, vjust = 1, margin = margin(b = 10)
        ),
        plot.subtitle = element_text(
            size = 18, family = "Econ Sans Cnd regular", hjust = 0
        ),
        plot.caption = element_text(
            size = 12, face = "bold", family = "Econ Sans Cnd light",
            hjust = 0, colour = "#6F8793"
        ),
        ## Legend
        legend.position = "top",
        legend.title = element_blank(),
        legend.text = element_text(size = 15, hjust = 0, vjust = .1)
    )

## Set the size used when rendering subsequent plots in the notebook.
##
## width:  value stored in the `repr.plot.width` option (default 22).
## heigth: value stored in the `repr.plot.height` option (default 8). The
##         misspelled parameter name is kept so calls that pass it by name
##         keep working.
##
## The options are set globally, so the size stays in effect until the
## function is called again.
fig_size <- function(width=22, heigth=8){
    plot_options <- list(repr.plot.width = width,
                         repr.plot.height = heigth)
    options(plot_options)
}
## Apply the default size to the plots that follow.
fig_size()

<h1 style="font-size: 40px;">Data Loading</h1>

<h2 style="font-size: 20px;
           line-height: 1.5;
           margin-bottom: 20px"
    > The data is loaded, then inspected for missing or duplicated values. No data processing is performed here; all data processing was done in this <a href="https://www.kaggle.com/code/khoatran311/cambridge-crime-data-feature-engineering">notebook</a>. 
</h2>

In [None]:
## Read the cleaned crime data from the Kaggle input directory.
data <- read_csv(
    file = "/kaggle/input/cambridge-crime-data-feature-engineering/cambridge_crime_clean.csv"
)

## Quick inspection: first 5 rows, a column overview, and the column
## specification recorded by read_csv().
## NOTE(review): glimpse() is given a width of 2, which looks unusually
## narrow -- confirm this is intended.
head(data, n = 5)
glimpse(data, width = 2)
spec(data)

In [None]:
## Number of missing values per column, largest first.
## is.na() on the data frame yields a logical matrix; summing each column
## (margin 2) counts the NAs in that column.
na_per_column <- apply(is.na(data), 2, sum)

data.frame(NA_count = na_per_column) %>%
    arrange(desc(NA_count))

In [None]:
## Number of rows that exactly repeat an earlier row.
duplicate_count <- sum(duplicated(data))

print(paste0("Duplicate count: ", duplicate_count), quote = FALSE)

In [None]:
## Convert the loaded data to a data.table, replacing the original object.
## NOTE(review): the cells below only use dplyr verbs on `data`; no data.table
## syntax is visible in this notebook, so confirm the conversion is still needed.
data <- as.data.table(data)

<h1 style="font-size: 40px;">Crime Analysis</h1>

<h2 style="font-size: 20px;
           line-height: 1.5;
           margin-bottom: 20px"
    > Various aspects of crime are analyzed: the prevalent crimes, the trends, the relative changes, the proportional make-up, and the activity hours. Of all crimes, it is determined that greater emphasis should be placed on Larceny of Bicycle and Shoplifting. Neither is reported as often as Hit and Run, but both have increased by a much greater percentage than Hit and Run.
</h2>

In [None]:
## Horizontal bar chart of the 20 most frequently reported crimes; the 5 most
## frequent are drawn in red, the rest in gray.
## fig_size() sets global plot-size options, so the taller canvas requested
## here stays in effect for later cells until fig_size() is called again.
fig_size(22, 10)
data %>%
    group_by(crime) %>%
    summarise("count"=n()) %>%
    arrange(desc(count)) %>%
    head(20) %>%
    mutate(crime=fct_reorder(crime,count),
           ## Rows are sorted by descending count, so the row index is the
           ## rank. Deriving the flag from the rank (instead of a fixed
           ## length-20 vector) keeps the cell working when fewer than 20
           ## distinct crimes are present.
           "color_code"=if_else(row_number() <= 5, "Yes", "No")) %>%

    ## Plot
    ggplot(aes(x=count, y=crime, fill=color_code)) +
    geom_col() +
    labs(title="Top 20 Crimes Prevalent in Cambridge City",
         y="",
         x="Frequency"
        ) +
    theme_custom +
    ## The y-axis text is hidden; the crime names are written on the bars
    ## by geom_text() below.
    theme(axis.line.y = element_blank(),
          axis.ticks.y = element_blank(),
          panel.grid.major.y = element_blank(),
          axis.ticks.x = element_blank(),
          axis.text.y = element_blank(),
          legend.position="none",
          plot.title = element_text(size=22, face="bold", family="Econ Sans Cnd bold", 
                                    hjust = .065, vjust = 1, margin = margin(b = 10)),
          axis.title.x = element_text(size=18, family="Econ Sans Cnd light")
         ) +
    ## Labels start at x = 0 and are nudged 50 units to the right.
    geom_text(aes(x=0, y=crime, label=crime),
              hjust=0,
              nudge_x=50,
              color="black",
              family = "Econ Sans Cnd regular",
              size = 6
              ) +
    scale_fill_manual(values=c("Yes"="#ED2939", "No"="#BFBFBF"))

In [None]:
## Horizontal bar chart of the number of reported crimes per neighborhood; the
## 2 neighborhoods with the most reports are drawn in red, the rest in gray.
data %>%
    group_by(neighborhood) %>%
    summarise("count"=n()) %>%
    arrange(desc(count)) %>%
    mutate(neighborhood=fct_reorder(neighborhood,count),
           ## Rows are sorted by descending count, so the row index is the
           ## rank. Deriving the flag from the rank (instead of a fixed
           ## length-13 vector) keeps the cell working for any number of
           ## neighborhoods.
           "color_code"=if_else(row_number() <= 2, "Yes", "No")) %>%
    ## Plot
    ggplot(aes(x=count, y=neighborhood, fill=color_code)) +
    geom_col() +
    labs(title="Neighborhoods Affected by Crimes in Cambridge City",
         y="",
         x="Frequency"
         ) +
    theme_custom +
    ## The y-axis text is hidden; the neighborhood names are written on the
    ## bars by geom_text() below.
    theme(axis.line.y = element_blank(),
          axis.ticks.y = element_blank(),
          panel.grid.major.y = element_blank(),
          axis.ticks.x = element_blank(),
          axis.text.y = element_blank(),
          legend.position="none",
          plot.title = element_text(size=22, face="bold", family="Econ Sans Cnd bold", 
                                    hjust = .075, vjust = 1, margin = margin(b = 10)),
          axis.title.x = element_text(size=18, family="Econ Sans Cnd light")
         ) +
    ## Labels start at x = 0 and are nudged 50 units to the right.
    geom_text(aes(x=0, y=neighborhood, label=neighborhood),
              hjust=0,
              nudge_x=50,
              color="black",
              family = "Econ Sans Cnd regular",
              size = 6
              ) +
    scale_fill_manual(values=c("Yes"="#ED2939", "No"="#BFBFBF"))

In [None]:
### Most frequent crime in each neighborhood
### (author's finding: Hit and Run is the dominant crime)
top_crime_df <-
    data %>%
    group_by(neighborhood, crime) %>%
    summarise("crime_count"=n(), .groups="drop") %>%
    arrange(neighborhood, desc(crime_count)) %>%
    group_by(neighborhood) %>%
    ## Keep exactly one row per neighborhood; with_ties=FALSE keeps the first
    ## row when several crimes share the highest count.
    slice_max(order_by=crime_count, n=1, with_ties=FALSE) %>%
    ## Drop the neighborhood grouping so the stored table is not left grouped.
    ungroup() %>%
    rename("popular_crime"="crime")

top_crime_df

### For each top crime: the number of neighborhoods where it ranks first and
### that number as a share of all neighborhoods
top_crime_df %>%
    group_by(popular_crime) %>%
    summarise("count"=n()) %>%
    mutate("prop"=count/sum(count))

In [None]:
### Most frequent crime in each year
data %>%
    group_by(crime_year, crime) %>%
    summarise("crime_count" = n(), .groups = "drop") %>%
    arrange(crime_year, desc(crime_count)) %>%
    ## One row per year: the crime with the highest count (first row on ties)
    slice_max(crime_count, n = 1, with_ties = FALSE, by = crime_year) %>%
    select(crime_year, "popular_crime" = crime)

In [None]:
### Relative change of each selected crime over the 2009-2023 window:
### (count in its last year - count in its first year) / count in its first year
crimes <- c("Hit and Run", "Larceny from MV", "Larceny of Bicycle",
            "Forgery", "Shoplifting", "Mal. Dest. Property")

popular_crimes_change <- 
    data %>%
    filter(crime %in% crimes,     ## Only the selected crimes are reported, so filter before aggregating
           crime_year>=2009,      ## Few crime occurrences before 2009, making the ratio too large
           crime_year<2024) %>%   ## Incomplete data in 2024 => deceiving appearance of lesser relative change ratio
    group_by(crime_year, crime) %>%
    summarise("crime_count"=n(), .groups="drop") %>%
    ## Sort by year within each crime so first/last are the earliest/latest year
    arrange(crime, crime_year) %>%
    group_by(crime) %>%
    ## The sum of year-over-year differences equals last - first, so it is
    ## computed directly. dplyr:: is spelled out because data.table (loaded
    ## after tidyverse) also exports first() and last().
    summarise("relative_change"=(dplyr::last(crime_count) - dplyr::first(crime_count))/dplyr::first(crime_count)) %>% 
    arrange(desc(relative_change))

popular_crimes_change

In [None]:
## Yearly counts of the selected crimes, one facet per crime, with a linear
## trend line. Facets follow the row order of popular_crimes_change.
crime_levels <- popular_crimes_change[["crime"]]
subtitle <- "Most activity trends are slightly increasing or decreasing; \nthe activity trend for Larceny of Bicycle is steeply increasing."

data %>%
    filter(crime %in% crimes,
           crime_year >= 2009,    ## Few crime occurrences before 2009
           crime_year < 2024      ## Incomplete data in 2024
          ) %>%
    group_by(crime_year, crime) %>%
    summarise("crime_count" = n(), .groups = "drop") %>%
    arrange(crime, crime_year) %>%
    mutate(crime = factor(crime, levels = crime_levels)) %>%
    ## Plot
    ggplot(aes(x = crime_year, y = crime_count)) +
    ## A small white point on top of a larger one gives hollow-looking markers
    geom_point(size = 2.5) +
    geom_point(size = 1, color = "white") +
    geom_line() +
    ## Linear trend per facet, without the confidence band
    geom_smooth(method = "lm", se = FALSE, colour = "#ED2939") +
    ## Dashed reference line at the year 2020
    geom_vline(xintercept = 2020, linetype = "dashed") +
    ## Each crime gets its own y scale
    facet_wrap(~crime, scales = "free_y") +
    labs(title = "Trend of Popular Crime Activities from 2009 to 2023",
         subtitle = subtitle,
         x = "",
         y = "Frequency"
        ) +
    theme_custom +
    theme(strip.text = element_text(size = 16, family = "Econ Sans Cnd light"),
          plot.margin = margin(t = 20, r = 20, b = 5, l = 20),
          panel.grid.major.y = element_blank()
         )

In [None]:
## Horizontal bar chart of the relative change of each selected crime; the
## crime with the largest relative change is drawn in red.
## NOTE(review): the figures quoted in the subtitle ("doubled", "about 19%")
## are hard-coded text -- re-check them against popular_crimes_change whenever
## the data changes.
subtitle <- "From 2009 to 2023, crime occurrences related to Larceny of Bicycle doubled,
while Hit and Run has increased only by about 19% despite being more popular."

popular_crimes_change %>%
    mutate(crime=fct_reorder(crime, relative_change),
           ## Flag the row holding the largest relative change. Computing the
           ## flag from the data (instead of a fixed length-6 vector with the
           ## first entry set) works for any number of rows and does not rely
           ## on the table being sorted.
           "color_code"=if_else(row_number() == which.max(relative_change), "Yes", "No")) %>%
    ## Plot
    ggplot(aes(x=relative_change, y=crime, fill=color_code)) +
    geom_col() +
    labs(title="Relative Change in Popular Crime Activities from 2009 to 2023",
         subtitle=subtitle,
         x="Relative Change in Activity",
         y=""
        ) +
    theme_custom +
    ## The y-axis text is hidden; the crime names are written next to the
    ## bars by geom_text() below.
    theme(axis.line.y = element_blank(),
          panel.grid.major.y = element_blank(),
          axis.ticks.y = element_blank(),
          legend.position="none",
          axis.text.y=element_blank(),
          axis.title.x=element_text(size=18, family="Econ Sans Cnd light")
         ) +
    ## Labels are right-aligned just left of x = 0.
    geom_text(aes(x=0, y=crime, label=crime),
              hjust=1,
              nudge_x=-.015,
              color="black",
              family = "Econ Sans Cnd regular",
              size = 6
              ) +
    scale_fill_manual(values=c("Yes"="#ED2939", "No"="#BFBFBF")) +
    scale_x_continuous(labels = scales::percent_format())

In [None]:
## Stacked bar chart of each year's crime make-up: the share of Larceny of
## Bicycle, Shoplifting, Hit and Run, and all other crimes combined.
## NOTE(review): the "25%" quoted in the subtitle is hard-coded text (matching
## the dashed reference line) -- re-check it whenever the data changes.
subtitle <- "Larceny of Bicycle and Shoplifting are increasing in their overall contribution to crimes;
Hit and Run has remained relatively stable. All 3 crimes contributed to 25% of total crimes in 2023."

data %>%
    filter(crime_year>=2009,
           crime_year<2024
          ) %>%
    ## Collapse every crime other than the three of interest into one bucket.
    ## NOTE(review): the labels carry three trailing spaces, presumably to
    ## space out the legend entries -- confirm before changing them.
    mutate(crime = case_when(crime=="Larceny of Bicycle" ~ "Larceny of Bicycle   ",
                             crime=="Hit and Run" ~ "Hit and Run   ",
                             crime=="Shoplifting" ~ "Shoplifting   ",
                             .default="Other Crimes   "
                            )
           ) %>%
    group_by(crime_year, crime) %>%
    ## Keep the crime_year grouping explicitly so the proportion below is
    ## computed within each year.
    summarise("count"=n(), .groups="drop_last") %>%
    mutate("proportion"=count/sum(count)) %>%
    ## Ungroup before recoding crime_year, which was the grouping column.
    ungroup() %>%
    mutate(crime_year=factor(crime_year)) %>%
    mutate(crime=factor(crime, levels=c("Other Crimes   ", "Larceny of Bicycle   ", "Shoplifting   ", "Hit and Run   "))) %>%

    ggplot(aes(x=crime_year, y=proportion, fill=crime)) +
    geom_col() +
    ## Dashed reference line at 25%
    geom_hline(yintercept=.25, linetype="dashed") +
    labs(title="Changes in Popular Crimes' Make-up from 2009 to 2023",
         subtitle=subtitle,
         x="",
         y=""
        ) +
    theme_custom + 
    theme(axis.line.x = element_blank(),
          axis.ticks.x = element_blank(),
          axis.line.y = element_blank(),
          panel.grid.major.y = element_blank(),
          axis.ticks.y = element_blank(),
          legend.text = element_text(size=20, hjust=0, vjust=.5)
         ) +
    scale_y_continuous(labels = scales::percent_format()) +
    scale_fill_manual(values=c("Other Crimes   "="gray",
                               "Larceny of Bicycle   "="#ED2939",
                               "Shoplifting   "="#E89C31",
                               "Hit and Run   "="#FED100"
                               )
                     )

In [None]:
# crimes <- c("Larceny of Bicycle", "Hit and Run", "Shoplifting")

# data %>%
#     filter(crime_year>=2009,
#            crime_year<2024,
#            crime %in% crimes
#           ) %>%
#     group_by(crime_year, crime) %>%
#     summarise("count"=n()) %>%
#     mutate("proportion"=count/sum(count)) %>%
#     mutate(crime=factor(crime, levels=crimes),
#            crime_year=factor(crime_year)) %>%
#     ggplot(aes(x=crime_year, y=proportion, fill=crime)) +
#     geom_bar(stat="identity") +
#     labs(title="Changes in Proportions of Popular Crimes from 2009 to 2023",
#          x="",
#          y=""
#         ) +
#     theme_custom + 
#     theme(axis.line.x = element_blank(),
#           axis.ticks.x = element_blank(),
#           axis.line.y = element_blank(),
#           panel.grid.major.y = element_blank(),
#           axis.ticks.y = element_blank(),
#          ) +
#     scale_y_continuous(labels = scales::percent_format())

In [None]:
## Table of crime category and crime_time for the activity-hours charts.
## Every crime other than the three of interest is collapsed into
## "Other Crimes"; crime_levels fixes the order of the factor levels.
crime_levels <- c( "Hit and Run", "Shoplifting",
                  "Larceny of Bicycle", "Other Crimes")
crimes_hours_df <- 
    data %>%
    mutate(crime = case_when(crime=="Larceny of Bicycle" ~ "Larceny of Bicycle",
                             crime=="Hit and Run" ~ "Hit and Run",
                             crime=="Shoplifting" ~ "Shoplifting",
                             .default="Other Crimes"
                            )
           ) %>%
    ## `levels` is spelled out in full; the previous `level=` only worked
    ## through partial argument matching.
    mutate(crime = factor(crime, levels=crime_levels)) %>%
    select(crime, crime_time)

In [None]:
## Box plot of crime_time for each crime category, with a dashed reference
## line at hour 15 (3:00 PM).
## NOTE(review): the subtitle says "On average", while a box plot's centre
## line is the median -- confirm the wording.
ggplot(data=crimes_hours_df, aes(x=crime_time, y=crime, fill=crime)) +
    geom_boxplot() +
    labs(title="Comparison of Popular Crime Activities' Hours",
         subtitle="On average, crimes occur around 3:00 PM.",
         x="Hour of Day",
         y=""
        ) +
    geom_vline(xintercept=15, linetype="dashed", linewidth=1) +
    theme_custom +
    theme(axis.line.y = element_blank(),
          panel.grid.major.y = element_blank(),
          axis.ticks.y = element_blank(),
          axis.ticks.x = element_blank(),
          legend.position="none",
          axis.title.x=element_text(size=18, family="Econ Sans Cnd light")
         ) +
    scale_fill_manual(values=c("Other Crimes"="gray",
                               "Larceny of Bicycle"="#ED2939",
                               "Shoplifting"="#E89C31",
                               "Hit and Run"="#FED100"
                               )
                     )

In [None]:
## Violin plots with overlaid box plots of crime_time per crime category,
## for four selected neighborhoods, with a dashed reference line at hour 15.
neighborhoods <- c("Cambridgeport", "East Cambridge",
                   "North Cambridge", "Mid-Cambridge")
## NOTE(review): the labels carry three trailing spaces, presumably to space
## out the legend entries -- confirm before changing them.
crime_levels <- c( "Hit and Run   ", "Shoplifting   ",
                  "Larceny of Bicycle   ", "Other Crimes   ")

data %>%
    filter(neighborhood %in% neighborhoods) %>%
    ## Collapse every crime other than the three of interest into one bucket.
    mutate(crime = case_when(crime=="Larceny of Bicycle" ~ "Larceny of Bicycle   ",
                             crime=="Hit and Run" ~ "Hit and Run   ",
                             crime=="Shoplifting" ~ "Shoplifting   ",
                             .default="Other Crimes   "
                            )
           ) %>%
    ## `levels` is spelled out in full; the previous `level=` only worked
    ## through partial argument matching. Crime levels are reversed here.
    mutate(crime = factor(crime, levels=rev(crime_levels)),
           neighborhood = factor(neighborhood, levels=neighborhoods)
          ) %>%
    ggplot(aes(x=neighborhood, y=crime_time, fill=crime)) +
    geom_violin(alpha=.8, width=.7) +
    geom_boxplot(width=.7, alpha=.3, coef=1) +
    geom_hline(yintercept=15, linetype="dashed") +
    labs(title="Comparison of Popular Crime Activities' Hours across Top 4 Neighborhoods",
         x="",
         y="Hour of Day"
         ) +
    theme_custom +
    theme(axis.ticks.x=element_blank(),
          axis.ticks.y=element_blank(),
          panel.grid.major.y=element_blank(),
          axis.title.y=element_text(size=18, family="Econ Sans Cnd light"),
          legend.text = element_text(size=20, hjust=0, vjust=.5)
         ) +
    scale_fill_manual(values=c("Other Crimes   "="gray",
                               "Larceny of Bicycle   "="#ED2939",
                               "Shoplifting   "="#E89C31",
                               "Hit and Run   "="#FED100"
                               )
                     )