# Crime Data

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Unemployment Data

Load the data from the Excel file.

## Per Borough

In [None]:
# Load the first sheet of the unemployment workbook. The path uses the
# lowercase 'data' directory, matching the other loaders in this notebook, so
# the cell also works on case-sensitive file systems.
unemployed = pd.read_excel('./data/Unemployment_NYC.xlsx', sheet_name=0)

In [None]:
# Normalise the header row: lowercase every column name, then trim
# leading/trailing whitespace.
unemployed.columns = unemployed.columns.str.lower().str.strip()

In [None]:
unemployed.head()

In [None]:
unemployed.info()

### Filter the data
Keep only the five boroughs. Their official county names are Bronx, Kings, New York, Queens and Richmond.

In [None]:
unemployed['areatype'].unique()

In [None]:
unemployed['area'].unique()

In [None]:
# County names of the five boroughs as they appear in the 'area' column.
borough = ['Bronx County', 'Kings County', 'New York County', 'Queens County', 'Richmond County']
# .copy() yields an independent frame, so columns added to it later do not
# trigger SettingWithCopyWarning or depend on view/copy semantics.
borough_unem = unemployed.loc[unemployed['area'].isin(borough)].copy()
borough_unem.head()

In [None]:
borough_unem.info()

In [None]:
# Flag the rows with month == 0, which this notebook treats as the yearly
# average. assign() returns a new frame, so no column is written into a slice
# of `unemployed` (avoids SettingWithCopyWarning).
borough_unem = borough_unem.assign(avg_year=borough_unem['month'] == 0)
borough_unem.head()

In [None]:
borough_unem['avg_year'].unique()

### Creating df with only avg yearly and with only monthly data.

In [None]:
# Yearly-average rows only. The boolean column is used directly as the mask
# (no `== True`), and .copy() makes the result safe to modify afterwards.
bo_unem_yearly = borough_unem.loc[borough_unem['avg_year']].copy()
bo_unem_yearly.head()

In [None]:
bo_unem_yearly.info()

In [None]:
# Monthly rows only: the negated boolean column is the mask (no `== False`),
# and .copy() makes the result safe to modify afterwards.
bo_unem_monthly = borough_unem.loc[~borough_unem['avg_year']].copy()
bo_unem_monthly.head()

In [None]:
# Renumber the rows from 0. drop=True discards the old index directly instead
# of first materialising it as an 'index' column and dropping that afterwards;
# re-binding avoids mutating a slice in place.
bo_unem_yearly = bo_unem_yearly.reset_index(drop=True)
bo_unem_yearly.head()

In [None]:
# Renumber the monthly rows from 0. drop=True discards the old index directly
# instead of first materialising it as an 'index' column; re-binding avoids
# mutating a slice in place.
bo_unem_monthly = bo_unem_monthly.reset_index(drop=True)
bo_unem_monthly.head()

In [None]:
bo_unem_monthly.info()

In [None]:
# Add a constant 'day' column so that year/month/day can be assembled into a
# datetime in the next step.
bo_unem_monthly = bo_unem_monthly.assign(day=1)
# Take a real copy: a plain `bo_unem_monthly_d = bo_unem_monthly` only creates
# a second name for the same object, so every later change to the "_d" frame
# would silently alter bo_unem_monthly as well.
bo_unem_monthly_d = bo_unem_monthly.copy()
bo_unem_monthly_d.head()

In [None]:
# Assemble a datetime from the year/month/day component columns of the frame
# being extended (not from its sibling name).
# No `format` is passed: the components are separate numeric columns, so the
# previous "%y/%m" string did not describe the input.
bo_unem_monthly_d['date'] = pd.to_datetime(bo_unem_monthly_d[['year', 'month', 'day']])
bo_unem_monthly_d.head()

In [None]:
# Drop the helper columns that are no longer needed. Re-binding the result
# (instead of inplace=True) together with errors='ignore' keeps this cell safe
# to run more than once.
bo_unem_monthly_d = bo_unem_monthly_d.drop(columns=['areatype', 'avg_year', 'day'], errors='ignore')
bo_unem_monthly_d.head()

In [None]:
bo_unem_monthly_d.head()

In [None]:
bo_unem_monthly_d.info()

### Looking at the Data

In [None]:
# Monthly unemployment rate over time, one line per borough.
fig, ax = plt.subplots(figsize=(12, 6))

# `hue='area'` assigns one colour per area, so no single `color` is passed
# (it has no visible effect once hue is mapped).
sns.lineplot(x='date', y='unemprate', hue='area', data=bo_unem_monthly_d, linewidth=1, ax=ax)

# Axis label and title
ax.set_ylabel('unemploymentrate')
ax.set_title('Rate of Unemployment')

# Show the legend
ax.legend()

# Show the plot
plt.show()

## Data for whole NYC

In [None]:
# Load the second sheet (sheet_name=1) of the unemployment workbook and
# preview the first rows.
area = pd.read_excel('./data/Unemployment_NYC.xlsx',sheet_name=1)
area.head()

In [None]:
# Normalise the header row: lowercase every column name, then trim
# leading/trailing whitespace.
area.columns = area.columns.str.lower().str.strip()
area.head()

In [None]:
area['area'].unique()

In [None]:
# Rows describing the city as a whole.
is_nyc = area['area'] == 'New York City'
nyc = area[is_nyc]
nyc.head()

In [None]:
nyc.info()

### Creating DF with monthly and yearly

In [None]:
# Rows with month 0 (treated as the yearly figures in this notebook).
nyc_yearly = nyc[nyc['month'].eq(0)]
nyc_yearly.head()

In [None]:
# Monthly rows (month != 0). .copy() makes the result an independent frame,
# because it is modified in the following cells.
nyc_monthly = nyc.loc[nyc['month'] != 0].copy()
nyc_monthly.head()

In [None]:
# Renumber the monthly rows from 0. drop=True discards the old index directly
# instead of first materialising it as an 'index' column; re-binding avoids
# mutating a slice in place.
nyc_monthly = nyc_monthly.reset_index(drop=True)
nyc_monthly.head()

In [None]:
# Add a constant 'day' column so that year/month/day can be assembled into a
# datetime. assign() returns a new frame, so nyc_monthly_d is an independent
# object rather than a second name for nyc_monthly.
nyc_monthly_d = nyc_monthly.assign(day=1)

# Build the date from the component columns. No `format` is passed: the
# components are separate numeric columns, so the previous "%y/%m" string did
# not describe the input.
nyc_monthly_d['date'] = pd.to_datetime(nyc_monthly_d[['year', 'month', 'day']])

nyc_monthly_d.head()

In [None]:
# Drop the helper columns that are no longer needed. Re-binding the result
# (instead of inplace=True) together with errors='ignore' keeps this cell safe
# to run more than once.
nyc_monthly_d = nyc_monthly_d.drop(columns=['areatype', 'day'], errors='ignore')
nyc_monthly_d.head()

In [None]:
nyc_monthly_d.info()

## Concatenate NYC and Borough Data

In [None]:
nyc_monthly_d.info()

In [None]:
bo_unem_monthly_d.info()

In [None]:
# Stack the borough rows and the city-wide rows. Both inputs were re-indexed
# from 0, so ignore_index=True is needed to give the combined frame a unique
# index (duplicate labels break label-based alignment downstream).
nyc_unemployment = pd.concat([bo_unem_monthly_d, nyc_monthly_d], ignore_index=True)
nyc_unemployment.head()

In [None]:
nyc_unemployment.info()

In [None]:
# Restrict the series to observations from the year 2000 onwards.
nyc_unemployment_filtered = nyc_unemployment[nyc_unemployment['date'].dt.year >= 2000]

# Monthly unemployment rate over time, one line per area (boroughs and city).
fig, ax = plt.subplots(figsize=(12, 6))

# `hue='area'` assigns one colour per area, so no single `color` is passed
# (it has no visible effect once hue is mapped).
sns.lineplot(x='date', y='unemprate', hue='area', data=nyc_unemployment_filtered, linewidth=1, ax=ax)

# Axis label and title
ax.set_ylabel('unemploymentrate')
ax.set_title('Rate of Unemployment')

# Show the legend
ax.legend()

# Show the plot
plt.show()

## Uploading Data to Postgres

In [None]:
from sql_functions import get_engine

In [None]:
table_name = 'unemployment_nyc'
schema = 'capstone_crime_nerds' # UPDATE 'TABLE_SCHEMA' based on schema used in class
engine = get_engine() # assign engine to be able to query against the database
# If the specified table doesn't exist yet, it will be created.
# With 'replace', your data will be replaced if the table already exists.
# This may take some time ...

# Write records stored in a dataframe to SQL database
if engine is not None:
    try:
        nyc_unemployment.to_sql(name=table_name, # Name of SQL table
                        con=engine, # Engine or connection
                        if_exists='replace', # Drop the table before inserting new values
                        schema=schema, # Use the schema that was defined earlier
                        index=False, # Do NOT write the DataFrame index as a column
                        chunksize=5000, # Number of rows in each batch written at a time
                        method='multi') # Pass multiple values in a single INSERT clause
        print(f"The {table_name} table was imported successfully.")
    # Error handling: `Exception` is caught on its own. The previous clause
    # also named psycopg2.DatabaseError, but psycopg2 is not imported in the
    # visible part of this notebook, so a failing upload raised NameError
    # while evaluating the except clause instead of printing the real error.
    except Exception as error:
        print(error)
        engine = None

In [None]:
# Sanity check: the number of rows in the database table must equal the
# number of rows in the uploaded DataFrame.
table_name_sql = f'''SELECT count(*) 
                    FROM {schema}.{table_name}
                    '''
# The query runs through pandas, which accepts the engine directly;
# Engine.execute() is no longer available in SQLAlchemy 2.0.
pd.read_sql_query(table_name_sql, con=engine).iloc[0, 0] == nyc_unemployment.shape[0]

# Police Stations

## Try Nr. 1

In [None]:
# Load the agency performance mapping CSV and preview the first rows.
agencies = pd.read_csv('./data/Agency_Performance_Mapping_NYC.csv')
agencies.head()

In [None]:
agencies.info()

In [None]:
agencies.shape

In [None]:
agencies['Agency'].unique()

In [None]:
# Clean the column names in one pass: lowercase, trim surrounding whitespace,
# then replace the remaining spaces with underscores.
agencies.columns = (
    agencies.columns
    .str.lower()
    .str.strip()
    .str.replace(" ", "_")
)
agencies.head()

In [None]:
# Keep only the rows whose agency is the NYPD.
is_nypd = agencies['agency'].eq('NYPD')
nypd = agencies[is_nypd]
nypd.info()

In [None]:
nypd.head()

In [None]:
# The monthly columns to remove run from 'jul-10' through 'jun-19'. Build the
# labels for the years 10..19 and trim the unused months at both ends instead
# of typing all 108 names out.
month_abbr = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
              'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
all_months = [f'{mon}-{yr}' for yr in range(10, 20) for mon in month_abbr]
# Cut 'jan-10'..'jun-10' at the start and 'jul-19'..'dec-19' at the end.
columns1 = all_months[6:-6]
nypd_s = nypd.drop(columns=columns1)
nypd_s.head()

In [None]:
nypd_s['geographic_identifier'].unique()

## Try Nr. 2 - CSV from Website (created on my own)

In [None]:
# Load the precinct-per-borough CSV and preview the first rows.
# NOTE(review): the file name is spelled 'Precicts'; presumably this matches
# the file on disk. Verify before renaming.
police_st = pd.read_csv('./data/NYC_Precicts_per_borough.csv')
police_st.head()

In [None]:
# Normalise the header row: lowercase every column name, then trim
# leading/trailing whitespace.
police_st.columns = police_st.columns.str.lower().str.strip()
police_st.head()

In [None]:
# Count the non-null 'precinct' entries for each borough.
precincts_by_borough = police_st.groupby(by='borough')['precinct']
police_pb = precincts_by_borough.count()
police_pb

In [None]:
police_pb.info()

In [None]:
# Extra entry for the city as a whole.
# NOTE(review): 77 is hard-coded; presumably the total number of precincts.
# Confirm against the sum of the per-borough counts.
new_entry_dict = {'New York City': 77}

# Convert the dictionary into a Series
new_entry_series = pd.Series(new_entry_dict, name='borough')

# Concatenate the original Series with the new Series
police_series = pd.concat([police_pb, new_entry_series])

police_series

In [None]:
# Turn the Series into a two-column frame: the index becomes 'borough' and the
# values become 'no_policestation'.
police_df = police_series.rename_axis('borough').reset_index(name='no_policestation')
police_df

In [None]:
police_st

# Size per borough and population (2022)

In [None]:
# Load the borough size CSV and preview the first rows.
size = pd.read_csv('./data/Borough_Size.csv')
size.head()

In [None]:
size

In [None]:
size.info()

In [None]:
# Convert square miles to square kilometres (factor 2.59), rounded to two
# decimals.
sq_km = size['size_sq_miles'].mul(2.59)
size['size_sq_km'] = sq_km.round(2)
size

In [None]:
police_df

In [None]:
# Outer join of borough sizes and station counts on the borough name, so
# boroughs present in only one of the two frames are kept.
police_size = pd.merge(size, police_df, on='borough', how='outer')
police_size

In [None]:
# Police stations per square kilometre.
police_size['no_ps_sq_km'] = police_size['no_policestation'].div(police_size['size_sq_km'])
police_size

In [None]:
# Police stations per 100,000 inhabitants (2022 population).
population_per_100k = police_size['population_2022'].div(100000)
police_size['no_ps_pop'] = police_size['no_policestation'].div(population_per_100k)
police_size

In [None]:
# Police stations per square mile.
police_size['no_ps_sq_miles'] = police_size['no_policestation'].div(police_size['size_sq_miles'])
police_size

# Data on Salary of Police Officers etc.

In [2]:
# Load the citywide payroll CSV and preview the first rows.
# NOTE(review): the saved output of this cell echoes the read_csv line the way
# a pandas warning does; possibly a mixed-dtype warning. Confirm, and consider
# passing explicit dtypes.
city_pay = pd.read_csv('./data/Citywide_Payroll_Data_Yearly.csv')
city_pay.head()

# Notes:
# OT = overtime.
# Only the employee's final base and gross salary at the end of the fiscal
# year is captured. In very limited cases, a check replacement and subsequent
# refund may reflect both the original check and the re-issued check in the
# employee pay totals.

  city_pay = pd.read_csv('./Data/Citywide_Payroll_Data_Yearly.csv')


Unnamed: 0,Fiscal Year,Payroll Number,Agency Name,Last Name,First Name,Mid Init,Agency Start Date,Work Location Borough,Title Description,Leave Status as of June 30,Base Salary,Pay Basis,Regular Hours,Regular Gross Paid,OT Hours,Total OT Paid,Total Other Pay
0,2020,17.0,OFFICE OF EMERGENCY MANAGEMENT,BEREZIN,MIKHAIL,,08/10/2015,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,per Annum,1820.0,84698.21,0.0,0.0,0.0
1,2020,17.0,OFFICE OF EMERGENCY MANAGEMENT,GEAGER,VERONICA,M,09/12/2016,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,per Annum,1820.0,84698.21,0.0,0.0,0.0
2,2020,17.0,OFFICE OF EMERGENCY MANAGEMENT,RAMANI,SHRADDHA,,02/22/2016,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,per Annum,1820.0,84698.21,0.0,0.0,0.0
3,2020,17.0,OFFICE OF EMERGENCY MANAGEMENT,ROTTA,JONATHAN,D,09/16/2013,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,per Annum,1820.0,84698.21,0.0,0.0,0.0
4,2020,17.0,OFFICE OF EMERGENCY MANAGEMENT,WILSON II,ROBERT,P,04/30/2018,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,per Annum,1820.0,84698.21,0.0,0.0,0.0


In [3]:
# Clean the column names in one pass: lowercase, trim surrounding whitespace,
# then replace the remaining spaces with underscores.
city_pay.columns = (
    city_pay.columns
    .str.lower()
    .str.strip()
    .str.replace(" ", "_")
)
city_pay.head()

Unnamed: 0,fiscal_year,payroll_number,agency_name,last_name,first_name,mid_init,agency_start_date,work_location_borough,title_description,leave_status_as_of_june_30,base_salary,pay_basis,regular_hours,regular_gross_paid,ot_hours,total_ot_paid,total_other_pay
0,2020,17.0,OFFICE OF EMERGENCY MANAGEMENT,BEREZIN,MIKHAIL,,08/10/2015,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,per Annum,1820.0,84698.21,0.0,0.0,0.0
1,2020,17.0,OFFICE OF EMERGENCY MANAGEMENT,GEAGER,VERONICA,M,09/12/2016,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,per Annum,1820.0,84698.21,0.0,0.0,0.0
2,2020,17.0,OFFICE OF EMERGENCY MANAGEMENT,RAMANI,SHRADDHA,,02/22/2016,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,per Annum,1820.0,84698.21,0.0,0.0,0.0
3,2020,17.0,OFFICE OF EMERGENCY MANAGEMENT,ROTTA,JONATHAN,D,09/16/2013,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,per Annum,1820.0,84698.21,0.0,0.0,0.0
4,2020,17.0,OFFICE OF EMERGENCY MANAGEMENT,WILSON II,ROBERT,P,04/30/2018,BROOKLYN,EMERGENCY PREPAREDNESS MANAGER,ACTIVE,86005.0,per Annum,1820.0,84698.21,0.0,0.0,0.0


In [4]:
# Drop the employee name columns, which are not needed for the analysis.
# Re-binding the result (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to run more than once.
city_pay = city_pay.drop(columns=['last_name', 'first_name', 'mid_init'], errors='ignore')
city_pay.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5662713 entries, 0 to 5662712
Data columns (total 14 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   fiscal_year                 int64  
 1   payroll_number              float64
 2   agency_name                 object 
 3   agency_start_date           object 
 4   work_location_borough       object 
 5   title_description           object 
 6   leave_status_as_of_june_30  object 
 7   base_salary                 float64
 8   pay_basis                   object 
 9   regular_hours               float64
 10  regular_gross_paid          float64
 11  ot_hours                    float64
 12  total_ot_paid               float64
 13  total_other_pay             float64
dtypes: float64(7), int64(1), object(6)
memory usage: 604.8+ MB


In [5]:
city_pay['agency_name'].unique()

array(['OFFICE OF EMERGENCY MANAGEMENT', 'OFFICE OF MANAGEMENT & BUDGET',
       'PERSONNEL MONITORS', 'TAX COMMISSION', 'LAW DEPARTMENT',
       'DEPT OF ED PEDAGOGICAL', 'DEPARTMENT OF CITY PLANNING',
       'DEPARTMENT OF INVESTIGATION', 'TEACHERS RETIREMENT SYSTEM',
       'CIVILIAN COMPLAINT REVIEW BD', 'POLICE DEPARTMENT',
       'NYC HOUSING AUTHORITY', 'BOARD OF ELECTION POLL WORKERS',
       'FIRE DEPARTMENT', "NYC DEPT OF VETERANS' SERVICES",
       "ADMIN FOR CHILDREN'S SVCS", 'HRA/DEPT OF SOCIAL SERVICES',
       'DEPT. OF HOMELESS SERVICES', 'DEPARTMENT OF CORRECTION',
       'BOARD OF CORRECTION', 'MAYORS OFFICE OF CONTRACT SVCS',
       'PUBLIC ADVOCATE', 'CITY COUNCIL', 'CITY CLERK',
       'DEPARTMENT FOR THE AGING', 'CULTURAL AFFAIRS',
       'FINANCIAL INFO SVCS AGENCY', 'DEPARTMENT OF JUVENILE JUSTICE',
       'OFF OF PAYROLL ADMINISTRATION', 'INDEPENDENT BUDGET OFFICE',
       'EQUAL EMPLOY PRACTICES COMM', 'CIVIL SERVICE COMMISSION',
       'LANDMARKS PRESERVATION

In [6]:
# Keep only the police-related agencies. The department name is listed in both
# the upper-case and the title-case spelling so that rows using either are kept.
police_agencies = ['POLICE DEPARTMENT', 'NYC POLICE PENSION FUND', 'Police Department']

# .copy() makes the result an independent frame, because it is modified in
# the following cells.
police_pay = city_pay.loc[city_pay['agency_name'].isin(police_agencies)].copy()
police_pay.head()

Unnamed: 0,fiscal_year,payroll_number,agency_name,agency_start_date,work_location_borough,title_description,leave_status_as_of_june_30,base_salary,pay_basis,regular_hours,regular_gross_paid,ot_hours,total_ot_paid,total_other_pay
4789,2020,56.0,POLICE DEPARTMENT,12/20/1998,MANHATTAN,SUPERVISOR OF SCHOOL SECURITY,CEASED,57813.0,per Annum,0.0,0.0,0.0,0.0,650000.0
4790,2020,56.0,POLICE DEPARTMENT,04/25/2016,MANHATTAN,STATIONARY ENGINEER,ACTIVE,508.8,per Day,2080.0,140146.17,1125.25,117728.07,40696.41
4791,2020,56.0,POLICE DEPARTMENT,04/30/1995,MANHATTAN,LIEUTENANT D/A SPECIAL ASSIGNMENT,CEASED,141196.0,per Annum,665.18,48008.86,2451.57,162000.47,78132.88
4792,2020,56.0,POLICE DEPARTMENT,12/16/2013,MANHATTAN,STATIONARY ENGINEER,ACTIVE,508.8,per Day,2080.0,140146.17,1007.0,103226.97,27980.42
4793,2020,56.0,POLICE DEPARTMENT,07/11/2012,MANHATTAN,POLICE OFFICER,CEASED,85292.0,per Annum,22.87,227914.67,0.0,0.0,30289.83


In [7]:
police_pay['agency_name'].unique()

array(['POLICE DEPARTMENT', 'NYC POLICE PENSION FUND',
       'Police Department'], dtype=object)

In [8]:
# Renumber the rows from 0 and discard the old index. Re-binding the result
# avoids mutating a filtered slice of city_pay in place.
police_pay = police_pay.reset_index(drop=True)
police_pay

Unnamed: 0,fiscal_year,payroll_number,agency_name,agency_start_date,work_location_borough,title_description,leave_status_as_of_june_30,base_salary,pay_basis,regular_hours,regular_gross_paid,ot_hours,total_ot_paid,total_other_pay
0,2020,56.0,POLICE DEPARTMENT,12/20/1998,MANHATTAN,SUPERVISOR OF SCHOOL SECURITY,CEASED,57813.0,per Annum,0.00,0.00,0.00,0.00,650000.00
1,2020,56.0,POLICE DEPARTMENT,04/25/2016,MANHATTAN,STATIONARY ENGINEER,ACTIVE,508.8,per Day,2080.00,140146.17,1125.25,117728.07,40696.41
2,2020,56.0,POLICE DEPARTMENT,04/30/1995,MANHATTAN,LIEUTENANT D/A SPECIAL ASSIGNMENT,CEASED,141196.0,per Annum,665.18,48008.86,2451.57,162000.47,78132.88
3,2020,56.0,POLICE DEPARTMENT,12/16/2013,MANHATTAN,STATIONARY ENGINEER,ACTIVE,508.8,per Day,2080.00,140146.17,1007.00,103226.97,27980.42
4,2020,56.0,POLICE DEPARTMENT,07/11/2012,MANHATTAN,POLICE OFFICER,CEASED,85292.0,per Annum,22.87,227914.67,0.00,0.00,30289.83
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
603544,2023,256.0,NYC POLICE PENSION FUND,08/29/2022,MANHATTAN,COMPUTER SYSTEMS MANAGER,ACTIVE,151830.0,per Annum,1470.00,117171.69,1.00,78.49,3033.00
603545,2023,256.0,NYC POLICE PENSION FUND,02/17/2005,MANHATTAN,COMMUNITY COORDINATOR,ACTIVE,89110.0,per Annum,1820.00,40874.90,0.00,0.00,3312.45
603546,2023,256.0,NYC POLICE PENSION FUND,12/03/2018,MANHATTAN,ASSOCIATE RETIREMENT BENEFITS EXAMINER,ACTIVE,68558.0,per Annum,1820.00,68651.40,0.00,0.00,12709.93
603547,2023,256.0,NYC POLICE PENSION FUND,06/14/2004,MANHATTAN,CUSTOMER INFORMATION REPRESENTATIVE MA L 1549,ACTIVE,91622.0,per Annum,1820.00,90137.89,344.75,20875.95,7015.34


In [9]:
police_pay['leave_status_as_of_june_30'].unique()

array(['CEASED', 'ACTIVE', 'ON LEAVE', 'ON SEPARATION LEAVE', 'SEASONAL'],
      dtype=object)