In [185]:
import pandas as pd
import seaborn as sns
import plotly.express as px

import folium
from folium.plugins import HeatMap
from folium.plugins import HeatMapWithTime
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio



# Load the dataset to understand its structure
data_path = 'NYPD_Shooting_Incident_Data__Historic__20240410.csv'
data = pd.read_csv(data_path)

In [176]:
# Continuous color scale: light gray to black
continuous_colorscale = [
    [0.0, 'rgb(220,220,220)'],  # light gray
    [1.0, 'rgb(0,0,0)']        # black
]

# Discrete color palette: Adding more colors if needed, depending on the number of years
discrete_colorscale = [
    'rgb(143,188,143)',  # Sea green
    'rgb(119,136,153)',  # Light slate gray
    'rgb(192,192,192)',  # Silver
    'rgb(47,79,79)',     # Dark slate gray
    'rgb(105,105,105)',  # Dim gray
]

year_to_color_2020_2021 = {
    2006: 'rgb(105,105,105)',  # Dim gray
    2007: 'rgb(105,105,105)',  # Dim gray
    2008: 'rgb(105,105,105)',  # Dim gray
    2009: 'rgb(105,105,105)',  # Dim gray
    2010: 'rgb(105,105,105)',  # Dim gray
    2011: 'rgb(105,105,105)',  # Dim gray
    2012: 'rgb(105,105,105)',  # Dim gray
    2013: 'rgb(105,105,105)',  # Dim gray
    2014: 'rgb(105,105,105)',  # Dim gray
    2015: 'rgb(105,105,105)',  # Dim gray
    2016: 'rgb(105,105,105)',  # Dim gray
    2017: 'rgb(105,105,105)',  # Dim gray
    2018: 'rgb(105,105,105)',  # Dim gray
    2019: 'rgb(105,105,105)',  # Dim gray
    2020: 'rgb(93,214,145)', # Bright green
    2021: 'rgb(143,188,143)', # Sea green
    2022: 'rgb(105,105,105)',  # Dim gray
}

year_to_color = {
    2018: 'rgb(119,136,153)',  # Light slate gray
    2019: 'rgb(119,136,192)', # Silver
    2020: 'rgb(93,214,145)', # Bright green
    2021: 'rgb(143,188,143)', # Sea green
    2022: 'rgb(47,79,79)'} # Dark slate gray


emphasis_color = 'rgb(255,165,0)'
extra_color = '#D4F3CC'


Clean data

In [177]:
# Dropping rows where JURISDICTION_CODE, Latitude, Longitude, or Lon_Lat have missing values
cleaned_data = data.dropna(subset=['JURISDICTION_CODE', 'Latitude', 'Longitude', 'Lon_Lat'])

# Check the number of remaining missing values in these columns to confirm the operation
remaining_missing = cleaned_data.isnull().sum()

# Convert the 'OCCUR_DATE' to datetime format and extract the year
cleaned_data['OCCUR_DATE'] = pd.to_datetime(cleaned_data['OCCUR_DATE'])
cleaned_data['YEAR'] = cleaned_data['OCCUR_DATE'].dt.year
cleaned_data['OCCUR_TIME'] = pd.to_datetime(cleaned_data['OCCUR_TIME'], format='%H:%M:%S').dt.hour

# Filter out rows containing any of these unrealistic age groups
unrealistic_ages = ['1020', '224', '940', '(null)']
cleaned_data = cleaned_data[~cleaned_data['PERP_AGE_GROUP'].isin(unrealistic_ages)]
cleaned_data = cleaned_data[cleaned_data['VIC_AGE_GROUP'] != '1022']

# Assuming the column name is confirmed to be 'PERP_SEX'
cleaned_data = cleaned_data[cleaned_data['PERP_SEX'] != '(null)']



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [178]:
cleaned_data.columns

Index(['INCIDENT_KEY', 'OCCUR_DATE', 'OCCUR_TIME', 'BORO', 'LOC_OF_OCCUR_DESC',
       'PRECINCT', 'JURISDICTION_CODE', 'LOC_CLASSFCTN_DESC', 'LOCATION_DESC',
       'STATISTICAL_MURDER_FLAG', 'PERP_AGE_GROUP', 'PERP_SEX', 'PERP_RACE',
       'VIC_AGE_GROUP', 'VIC_SEX', 'VIC_RACE', 'X_COORD_CD', 'Y_COORD_CD',
       'Latitude', 'Longitude', 'Lon_Lat', 'YEAR'],
      dtype='object')

Over Time Analysis

In [179]:
incidents_per_year = cleaned_data.groupby('YEAR').size().reset_index(name='Incidents')


In [218]:
# Group data by year and count incidents
incidents_per_year = cleaned_data.groupby('YEAR').size().reset_index(name='Incidents')
incidents_per_year['color'] =incidents_per_year['YEAR'].map(year_to_color_2020_2021)

# Calculate the trend line
x = incidents_per_year['YEAR']  # Years
y = incidents_per_year['Incidents']  # Incident counts
coefficients = np.polyfit(x, y, 1)
trend_line = np.poly1d(coefficients)

# Create a DataFrame for the trend line
trend_df = pd.DataFrame({
    'YEAR': x,
    'Trend': trend_line(x)
})


# Plotting with Plotly Express
fig = px.bar(incidents_per_year, x='YEAR', y='Incidents', text='Incidents',
             labels={'YEAR': 'Year', 'Incidents': 'Number of Shootings'},
             title='Number of Shooting Incidents in NYC per Year (2006-2024)',
             color_discrete_sequence=['skyblue'])
fig.update_traces(texttemplate='%{text}', textposition='outside', textfont=dict(size=10))
fig.add_scatter(x=trend_df['YEAR'], y=trend_df['Trend'], mode='lines',
                name='Trend Line', line=dict(color=emphasis_color))


# Calculate the trend line
x = incidents_per_year['YEAR']  # Years
y = incidents_per_year['Incidents']  # Incident counts
coefficients = np.polyfit(x, y, 1)
trend_line = np.poly1d(coefficients)

# Create a DataFrame for the trend line
trend_df = pd.DataFrame({
    'YEAR': x,
    'Trend': trend_line(x)
})

# Create the bar chart
fig = go.Figure()

# Add bars
for idx, row in incidents_per_year.iterrows():
    fig.add_trace(go.Bar(x=[row['YEAR']], y=[row['Incidents']], 
                         marker_color=row['color'], name=str(row['YEAR'])))

# Add trend line
fig.add_trace(go.Scatter(x=trend_df['YEAR'], y=trend_df['Trend'], mode='lines',
                         name='Trend Line', line=dict(color=emphasis_color)))

# Enhance the plot
fig.update_layout(
    title='Number of Shooting Incidents in NYC per Year (2006-2024)',
    title_font_color='white',  # Set title color to white for visibility
    xaxis=dict(
        title='Year',
        tickangle=-45,
        # title_font_color='white',  # Set x-axis title color to white
        # tickfont_color='white',  # Set x-axis tick labels to white
        gridcolor='gray',  # Set grid color for better visibility against black
    ),
    yaxis=dict(
        title='Number of Shootings',
        title_font_color='white',  # Set y-axis title color to white
        tickfont_color='white',  # Set y-axis tick labels to white
        gridcolor='gray',  # Set grid color for better visibility against black
    ),
    plot_bgcolor='black',  # Set the plotting area background to black
    paper_bgcolor='black',  # Set the entire chart background to black
    showlegend=False,
    margin=dict(l=30, r=20, t=60, b=20),
    font=dict(
        color='white'  # Ensure all default text (like legend, annotations) is white
    )
)


fig.show()
pio.write_html(fig, file='shootings_by_year.html', auto_open=False)


In [221]:
# Convert 'OCCUR_DATE' to datetime if it's not already
cleaned_data['OCCUR_DATE'] = pd.to_datetime(cleaned_data['OCCUR_DATE'])


# Extract year and month
cleaned_data['YEAR'] = cleaned_data['OCCUR_DATE'].dt.year
cleaned_data['MONTH'] = cleaned_data['OCCUR_DATE'].dt.month

# Group by year and month and count the occurrences
monthly_crime_data = cleaned_data.groupby(['YEAR', 'MONTH']).size().reset_index(name='INCIDENTS')


# keep only last 5 years
monthly_crime_data = monthly_crime_data[monthly_crime_data['YEAR'] >=2018]

# Calculate maximum incidents per year for color scaling
max_incidents_per_year = monthly_crime_data.groupby('YEAR')['INCIDENTS'].max()

# Normalize these values to a 0-1 range for color mapping, set a minimum opacity
min_opacity = 0.2  # Adjust minimum opacity here
norm = (max_incidents_per_year - max_incidents_per_year.min()) / (max_incidents_per_year.max() - max_incidents_per_year.min()) * (1 - min_opacity) + min_opacity

# Map these normalized values back to the main DataFrame
monthly_crime_data['COLOR'] = monthly_crime_data['YEAR'].map(norm)

# Make sure we have enough colors for the number of years, repeat if needed
unique_years = monthly_crime_data['YEAR'].unique()
if len(discrete_colorscale) < len(unique_years):
    num_repeats = -(-len(unique_years) // len(discrete_colorscale))  # Ceiling division
    discrete_colorscale = (discrete_colorscale * num_repeats)[:len(unique_years)]

# Map years to colors
# year_to_color = {year: color for year, color in zip(unique_years, discrete_colorscale)}
# year_to_color[2020] = emphasis_color  # Assign the emphasis color to 2020

# Create the plot using Graph Objects for more control
fig = go.Figure()

# Add each year as a separate trace
for year in unique_years:
    year_data = monthly_crime_data[monthly_crime_data['YEAR'] == year]
    fig.add_trace(go.Scatter(
        x=year_data['MONTH'], 
        y=year_data['INCIDENTS'], 
        mode='lines',
        name=str(year),
        line=dict(color=year_to_color[year])  # Apply color based on the year
    ))

# Update the layout to add customizations
fig.update_layout(
    title='Monthly Shooting Incidents by Year',
    xaxis=dict(
        title='Month',
        tickmode='array',
        tickvals=list(range(1, 13)),
        ticktext=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
        gridcolor='gray',
    ),
    yaxis=dict(
        title='Number of Incidents',
        gridcolor='gray',
    ),
    legend_title_text='Year',
    plot_bgcolor='black',  # Set the plotting area background to black
    paper_bgcolor='black',  # Set the entire chart background to black
    font=dict(
        color='white'  # Ensure all default text (like legend, annotations) is white
    )
    # margin=dict(l=30, r=20, t=60, b=20)
)

fig.show()
pio.write_html(fig, file='monthly_shootings_per_year.html', auto_open=False)




Layoff Interest

In [224]:
layoffs = pd.read_csv("NYC_layoff_interest.csv")
# Ensure the 'Week' column is datetime type
layoffs['Week'] = pd.to_datetime(layoffs['Week'])

# Extract the month from the 'Week' column
layoffs['Month'] = layoffs['Week'].dt.month

monthly_layoffs = layoffs.groupby('Month')['Layoff Google Searches in New York'].sum().reset_index()

max_val = monthly_layoffs['Layoff Google Searches in New York'].max()
monthly_layoffs['Layoff Google Interest in New York for 2020'] = monthly_layoffs['Layoff Google Searches in New York'] * 100/max_val


# Create the line graph with Plotly Express
fig = px.line(
    monthly_layoffs, 
    x='Month', 
    y='Layoff Google Interest in New York for 2020', 
    title='Monthly Layoff Interest in New York for 2020',
    labels={'Month': 'Month', 'Layoff Google Interest in New York for 2020': 'Layoffs Interest'},
    color_discrete_sequence=['green']
)

# Add layout customization
fig.update_layout(
    xaxis=dict(
        title='Month',
        tickmode='array',
        tickvals=list(range(1, 13)),
        ticktext=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
         gridcolor='gray',
    ),
    yaxis=dict(
        title='Layoffs',
        gridcolor='gray',
    ),
    plot_bgcolor='black',  # Set the plotting area background to black
    paper_bgcolor='black',  # Set the entire chart background to black
    font=dict(
        color='white'  # Ensure all default text (like legend, annotations) is white
    )
)

fig.show()
pio.write_html(fig, file='monthly_layoffs.html', auto_open=False)



Race of Involed

In [233]:
discrete_colorscale = [
    'rgb(143,188,143)',  # Sea green
    'rgb(119,136,153)',  # Light slate gray
    'rgb(192,192,192)',  # Silver
    'rgb(47,79,79)',     # Dark slate gray
    'rgb(105,105,105)',  # Dim gray
    'rgb(255,165,0)'     # Orange for emphasis
]

# Replace 'nan' and any other identifier for unknown data with 'UNKNOWN'
cleaned_data['VIC_RACE'] = cleaned_data['VIC_RACE'].fillna('UNKNOWN').replace(['', ' ', 'N/A'], 'UNKNOWN')

# Group by year and victim race, count the occurrences
vic_race_yearly = cleaned_data.groupby(['YEAR', 'VIC_RACE']).size().unstack(fill_value=0)

# Combine 'UNKNOWN' and any other nan categories into one
vic_race_yearly['UNKNOWN'] = vic_race_yearly.get('UNKNOWN', 0) + vic_race_yearly.get('', 0)


# Calculate total incidents per year
total_incidents_per_year = vic_race_yearly.sum(axis=1)

# Calculate percentages
percentages = vic_race_yearly.divide(total_incidents_per_year, axis=0) * 100

# Assuming 'vic_race_yearly' and 'percentages' are already calculated as per your previous code

# Reset the index to turn 'YEAR' into a column and prepare data for plotting
vic_race_yearly_reset = vic_race_yearly.reset_index()
percentages_reset = percentages.reset_index()
percentages_reset = percentages_reset.round(0)

# Melt the data to long format, which Plotly can use to plot stacked area charts
vic_race_yearly_melted = vic_race_yearly_reset.melt(id_vars='YEAR', var_name='Victim Race', value_name='Number of Incidents')
percentages_melted = percentages_reset.melt(id_vars='YEAR', var_name='Victim Race', value_name='Percentage')

# Merge the counts and percentages into a single DataFrame
area_chart_data = vic_race_yearly_melted.merge(percentages_melted, on=['YEAR', 'Victim Race'])
# Filter out 'UNKNOWN' from visualization, not from calculation
area_chart_data = area_chart_data[area_chart_data['Victim Race'] != 'UNKNOWN']
area_chart_data.sort_values(by=['YEAR', 'Percentage'], inplace=True)


# Create a stacked area chart with hover data showing percentages
# Assuming you know the races involved or you extract them from the DataFrame
races = area_chart_data['Victim Race'].unique()
color_map = {race: color for race, color in zip(races, discrete_colorscale)}

# Apply the discrete color palette to the area chart
fig_area = px.area(area_chart_data, x='YEAR', y='Number of Incidents', color='Victim Race',
                   title="Victim Race Distribution by Year",
                   labels={'YEAR': 'Year', 'Number of Incidents': 'Number of Incidents', 'Victim Race': 'Victim Race'},
                   hover_data={'Number of Incidents': True, 'Percentage': True},
                   color_discrete_map=color_map)  # Using the color mapping

# Update layout
fig_area.update_layout(
    xaxis=dict(
        title='Year',
        gridcolor='gray',
    ),
    yaxis=dict(
        title='Number of Incidents',
        gridcolor='gray',
    ),
    legend_title_text='Victim Race',
    plot_bgcolor='black',  # Set the plotting area background to black
    paper_bgcolor='black',  # Set the entire chart background to black
    font=dict(
        color='white'  # Ensure all default text (like legend, annotations) is white
    )
)

fig_area.show()
pio.write_html(fig, file='victim_race_distribution.html', auto_open=False)


The distridutions remain the same even after the spike

In [234]:
# Filter out unknown and nan races for perpetrators and victims
filtered_data = cleaned_data[
    (cleaned_data['PERP_RACE'].notna()) & (cleaned_data['PERP_RACE'] != 'UNKNOWN') &
    (cleaned_data['VIC_RACE'].notna()) & (cleaned_data['VIC_RACE'] != 'UNKNOWN') &
    (cleaned_data['VIC_RACE'] != 'AMERICAN INDIAN/ALASKAN NATIVE') &
    (cleaned_data['PERP_RACE'] != 'AMERICAN INDIAN/ALASKAN NATIVE')
]

# Create a crosstab of perpetrator and victim races
# Ensure PERP_RACE are rows and VIC_RACE are columns
race_on_race = pd.crosstab(filtered_data['PERP_RACE'], filtered_data['VIC_RACE'])

for i in range(0,5):
    s = race_on_race.iloc[i].sum(axis=0)
    race_on_race.iloc[i] = race_on_race.iloc[i]/s
    race_on_race.iloc[i] = (race_on_race.iloc[i] * 100).round(2)


# Plotting a density heatmap using the grayscale color scale
fig = px.imshow(
    race_on_race,
    labels=dict(x="Victim Race", y="Perpetrator Race", color="Incident Count"),
    title='Heatmap of Race-on-Race Crime Percentages',
    aspect='auto',  # Adjust the aspect ratio to auto if needed
    color_continuous_scale=continuous_colorscale  # Apply the continuous color scale
)

# Ensure axes labels are correct and on the appropriate side
fig.update_xaxes(side="bottom", tickmode='array', tickvals=list(range(len(race_on_race.columns))), ticktext=race_on_race.columns)
fig.update_yaxes(tickmode='array', tickvals=list(range(len(race_on_race.index))), ticktext=race_on_race.index)

# Optionally, you can add a color axis title
fig.update_layout(coloraxis_colorbar=dict(
    title="",  # Adding two line breaks for spacing
    titleside="top"
    ),
    plot_bgcolor='black',  # Set the plotting area background to black
    paper_bgcolor='black',  # Set the entire chart background to black
    font=dict(
        color='white'  # Ensure all default text (like legend, annotations) is white
    )
    
    )
fig.show()
pio.write_html(fig, file='confusion_matrix_interracial_shootings.html', auto_open=False)



Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '0.33766233766233766' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '0.34415584415584416' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '0.08441558441558442' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '0.07792207792207792' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pand