# Crime Data

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Unemployment Data

Load the data from the Excel file.

## Per Borough

In [None]:
# Load the first worksheet (sheet_name=0) of the unemployment workbook.
# NOTE(review): presumably this sheet holds the per-county figures filtered below — confirm against the workbook.
unemployed = pd.read_excel('./Data/Unemployment_NYC.xlsx',sheet_name=0)

In [None]:
# Normalise the header names in one pass: lowercase first, then trim
# leading/trailing whitespace.
unemployed.columns = unemployed.columns.str.lower().str.strip()

In [None]:
# Preview the first rows as a quick sanity check of the loaded sheet.
unemployed.head()

In [None]:
# Print column dtypes and non-null counts.
unemployed.info()

### Filter the data
Keep only the five NYC boroughs. Their official county names are Bronx, Kings, New York, Queens and Richmond.

In [None]:
# List the distinct values found in the 'areatype' column.
unemployed['areatype'].unique()

In [None]:
# List the distinct 'area' labels; the borough filter matches on these exact strings.
unemployed['area'].unique()

In [None]:
# County names under which the five NYC boroughs appear in the 'area' column.
borough = ['Bronx County', 'Kings County', 'New York County', 'Queens County', 'Richmond County']
# .copy() makes the filtered rows an independent DataFrame, so adding columns
# to it afterwards does not raise SettingWithCopyWarning.
borough_unem = unemployed.loc[unemployed['area'].isin(borough)].copy()
borough_unem.head()

In [None]:
# Print column dtypes and non-null counts of the borough subset.
borough_unem.info()

In [None]:
# Flag the rows whose 'month' is 0; the notebook treats those as the
# yearly-average records and all other rows as monthly records.
# .assign returns a new DataFrame instead of writing a column into a filtered
# slice, which avoids SettingWithCopyWarning.
borough_unem = borough_unem.assign(avg_year=borough_unem['month'].eq(0))
borough_unem.head()

In [None]:
# Check which values the 'avg_year' flag takes.
borough_unem['avg_year'].unique()

### Creating one DataFrame with only the yearly averages and one with only the monthly data

In [None]:
# Keep only the rows flagged as yearly averages. The boolean column is used
# directly as the mask (no `== True`), and .copy() detaches the result so it
# can be modified later without SettingWithCopyWarning.
bo_unem_yearly = borough_unem.loc[borough_unem['avg_year']].copy()
bo_unem_yearly.head()

In [None]:
# Print column dtypes and non-null counts of the yearly subset.
bo_unem_yearly.info()

In [None]:
# Keep only the monthly rows, i.e. those not flagged as yearly averages.
# `~` negates the boolean mask (no `== False`), and .copy() detaches the
# result so it can be modified later without SettingWithCopyWarning.
bo_unem_monthly = borough_unem.loc[~borough_unem['avg_year']].copy()
bo_unem_monthly.head()

In [None]:
# Resetting Index:
# drop=True discards the old index instead of turning it into an 'index'
# column that then has to be removed again; rebinding the name avoids
# in-place mutation of a filtered frame.
bo_unem_yearly = bo_unem_yearly.reset_index(drop=True)
bo_unem_yearly.head()

In [None]:
# Resetting Index monthly:
# drop=True discards the old index instead of turning it into an 'index'
# column that then has to be removed again; rebinding the name avoids
# in-place mutation of a filtered frame.
bo_unem_monthly = bo_unem_monthly.reset_index(drop=True)
bo_unem_monthly.head()

In [None]:
# Print column dtypes and non-null counts of the monthly subset.
bo_unem_monthly.info()

In [None]:
# Add a constant 'day' column so that year/month/day can be combined into a
# datetime.
# NOTE: this is a plain assignment, not a copy: bo_unem_monthly_d and
# bo_unem_monthly refer to the same DataFrame object, so the 'day' column
# added here is visible under both names.
bo_unem_monthly_d = bo_unem_monthly
bo_unem_monthly_d['day'] = 1
bo_unem_monthly_d.head()

In [None]:
# Build a datetime column from the year/month/day parts.
# pd.to_datetime assembles the date from a DataFrame that has 'year', 'month'
# and 'day' columns; `format` is not applied on that path, so the misleading
# format="%y/%m" argument was removed. The parts are now read from the same
# frame that receives the result.
bo_unem_monthly_d['date'] = pd.to_datetime(bo_unem_monthly_d[['year', 'month', 'day']])
bo_unem_monthly_d.head()

In [None]:
# dropping columns we don't need
# A non-in-place drop rebinds the name to a new frame; errors='ignore' keeps
# the cell re-runnable (a second run would otherwise raise KeyError because
# the columns are already gone).
bo_unem_monthly_d = bo_unem_monthly_d.drop(columns=['areatype', 'avg_year', 'day'], errors='ignore')
bo_unem_monthly_d.head()

In [None]:
# Preview the first rows of the cleaned monthly borough data.
bo_unem_monthly_d.head()

In [None]:
# Print column dtypes and non-null counts of the cleaned monthly borough data.
bo_unem_monthly_d.info()

### Looking at the Data

In [None]:
# Line chart of the monthly unemployment rate, one line per 'area'.
fig, ax = plt.subplots(figsize=(12, 6))

# `hue` gives every area its own colour, so the former color='palevioletred'
# argument had no visible effect and was dropped.
sns.lineplot(x='date', y='unemprate', hue='area', data=bo_unem_monthly_d, linewidth=1, ax=ax)

# Set labels and title
ax.set_ylabel('unemploymentrate')
ax.set_title('Rate of Unemployment')

# Show the legend
ax.legend()

# Show the plot
plt.show()

## Data for whole NYC

In [None]:
# Load the second worksheet (sheet_name=1) of the same workbook and preview it.
# NOTE(review): presumably this sheet holds the city-level figures — confirm against the workbook.
area = pd.read_excel('./Data/Unemployment_NYC.xlsx',sheet_name=1)
area.head()

In [None]:
# Normalise the header names in one pass: lowercase first, then trim
# leading/trailing whitespace.
area.columns = area.columns.str.lower().str.strip()
area.head()

In [None]:
# List the distinct 'area' labels available on this sheet.
area['area'].unique()

In [None]:
# Keep only the rows labelled 'New York City'. A plain equality test replaces
# the single-element isin(), and .copy() detaches the result so frames derived
# from it can be modified without SettingWithCopyWarning.
nyc = area.loc[area['area'] == 'New York City'].copy()
nyc.head()

In [None]:
# Print column dtypes and non-null counts of the city-level subset.
nyc.info()

### Creating DF with monthly and yearly

In [None]:
# Rows with month equal to 0 go into the yearly frame.
nyc_yearly = nyc[nyc['month'].eq(0)]
nyc_yearly.head()

In [None]:
# Rows with a non-zero month go into the monthly frame. .copy() detaches the
# result so it can be modified later without SettingWithCopyWarning.
nyc_monthly = nyc.loc[nyc['month'] != 0].copy()
nyc_monthly.head()

In [None]:
# Resetting Index monthly:
# drop=True discards the old index instead of turning it into an 'index'
# column that then has to be removed again; rebinding the name avoids
# in-place mutation of a filtered frame.
nyc_monthly = nyc_monthly.reset_index(drop=True)
nyc_monthly.head()

In [None]:
# Work on an explicit copy so that nyc_monthly itself is left untouched
# (a plain assignment would only create a second name for the same frame).
nyc_monthly_d = nyc_monthly.copy()

# Constant 'day' column so that year/month/day can be combined into a datetime.
nyc_monthly_d['day'] = 1

# pd.to_datetime assembles the date from a DataFrame that has 'year', 'month'
# and 'day' columns; `format` is not applied on that path, so the misleading
# format="%y/%m" argument was removed.
nyc_monthly_d['date'] = pd.to_datetime(nyc_monthly_d[['year', 'month', 'day']])

nyc_monthly_d.head()

In [None]:
# dropping columns we don't need
# A non-in-place drop rebinds the name to a new frame; errors='ignore' keeps
# the cell re-runnable (a second run would otherwise raise KeyError because
# the columns are already gone).
nyc_monthly_d = nyc_monthly_d.drop(columns=['areatype', 'day'], errors='ignore')
nyc_monthly_d.head()

In [None]:
# Print column dtypes and non-null counts of the cleaned city-level monthly data.
nyc_monthly_d.info()

## Concatenate NYC and Borough Data

In [None]:
# Inspect the city-level frame's columns and dtypes before concatenating.
nyc_monthly_d.info()

In [None]:
# Inspect the borough-level frame's columns and dtypes before concatenating.
bo_unem_monthly_d.info()

In [None]:
# Stack the borough rows and the city-wide rows into one frame.
# ignore_index=True renumbers the rows 0..n-1; without it both inputs keep
# their own 0-based index, so the result carries duplicate index labels.
nyc_unemployment = pd.concat([bo_unem_monthly_d, nyc_monthly_d], ignore_index=True)
nyc_unemployment.head()

In [None]:
# Print column dtypes and non-null counts of the combined frame.
nyc_unemployment.info()

In [None]:
# Keep only the observations dated in 2000 or later.
nyc_unemployment_filtered = nyc_unemployment[nyc_unemployment['date'].dt.year >= 2000]

# Line chart of the monthly unemployment rate, one line per 'area'.
fig, ax = plt.subplots(figsize=(12, 6))

# `hue` gives every area its own colour, so the former color='palevioletred'
# argument had no visible effect and was dropped.
sns.lineplot(x='date', y='unemprate', hue='area', data=nyc_unemployment_filtered, linewidth=1, ax=ax)

# Set labels and title
ax.set_ylabel('unemploymentrate')
ax.set_title('Rate of Unemployment')

# Show the legend
ax.legend()

# Show the plot
plt.show()

## Uploading Data to Postgres

In [None]:
from sql_functions import get_engine

In [None]:
table_name = 'unemployment_nyc'
schema = 'capstone_crime_nerds' # UPDATE 'TABLE_SCHEMA' based on schema used in class
engine = get_engine() # assign engine to be able to query against the database
# If the specified table doesn't exist yet, it will be created
# With 'replace', your data will be replaced if the table already exists.
# This may take some time ...

# Write records stored in a dataframe to SQL database
if engine is not None:
    try:
        nyc_unemployment.to_sql(name=table_name, # Name of SQL table
                        con=engine, # Engine or connection
                        if_exists='replace', # Drop the table before inserting new values
                        schema=schema, # Use schema that was defined earlier
                        index=False, # Do NOT write the DataFrame index as a column
                        chunksize=5000, # Specify the number of rows in each batch to be written at a time
                        method='multi') # Pass multiple values in a single INSERT clause
        print(f"The {table_name} table was imported successfully.")
    # Error handling
    # The original clause also named psycopg2.DatabaseError, but psycopg2 is
    # not imported in this notebook, so any failure surfaced as a NameError
    # instead of the real error. Exception already covers database errors.
    except Exception as error:
        print(error)
        engine = None

In [None]:
# Sanity check: the number of rows stored in the database table must equal
# the number of rows in the DataFrame that was uploaded.
table_name_sql = f'''SELECT count(*) 
                    FROM {schema}.{table_name}
                    '''
# Engine.execute() no longer exists in SQLAlchemy 2.0, so the count is read
# through pandas instead, which accepts the engine directly.
# NOTE(review): assumes get_engine() returns a SQLAlchemy engine — confirm in sql_functions.
int(pd.read_sql_query(table_name_sql, con=engine).iloc[0, 0]) == nyc_unemployment.shape[0]

# Police Stations

## Try Nr. 1

In [2]:
# Load the agency performance mapping CSV and preview it.
# NOTE(review): explored here as a possible source of police precinct data; the
# preview shows it also contains other agencies (e.g. DSNY).
agencies = pd.read_csv('./Data/Agency_Performance_Mapping_NYC.csv')
agencies.head()

Unnamed: 0,Agency,Geographic Unit,Geographic Identifier,Indicator,Jul-10,Aug-10,Sep-10,Oct-10,Nov-10,Dec-10,...,Sep-18,Oct-18,Nov-18,Dec-18,Jan-19,Feb-19,Mar-19,Apr-19,May-19,Jun-19
0,DSNY,Community District,Staten Island 3,Streets rated acceptably clean (%),100.0,100.0,100.0,100.0,99.3,100.0,...,100.0,100.0,,,,,,,,
1,DSNY,Community District,Staten Island 2,Streets rated acceptably clean (%),96.1,99.1,100.0,98.8,98.1,98.2,...,95.72,99.0,,,,,,,,
2,DSNY,Community District,Staten Island 1,Streets rated acceptably clean (%),92.1,94.4,94.1,96.4,95.1,96.9,...,99.17,95.84,,,,,,,,
3,DSNY,Community District,Queens 14,Streets rated acceptably clean (%),97.0,97.8,97.5,95.7,96.6,99.1,...,97.29,98.19,,,,,,,,
4,DSNY,Community District,Queens 13,Streets rated acceptably clean (%),95.6,95.9,97.2,99.7,99.4,97.9,...,95.45,94.88,,,,,,,,


In [3]:
# Print a summary of the frame (index range, column count, dtypes, memory usage).
agencies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1167 entries, 0 to 1166
Columns: 112 entries, Agency to Jun-19
dtypes: float64(8), object(104)
memory usage: 1021.2+ KB


In [4]:
# (rows, columns) of the raw agencies frame.
agencies.shape

(1167, 112)

In [5]:
# List the distinct agencies contained in the file.
agencies['Agency'].unique()

array(['DSNY', 'NYPD', 'FDNY'], dtype=object)

In [6]:
# Header clean-up in a single chain: lowercase, trim leading/trailing
# whitespace, then turn the remaining spaces into underscores.
agencies.columns = (
    agencies.columns
    .str.lower()
    .str.strip()
    .str.replace(" ", "_")
)
agencies.head()

Unnamed: 0,agency,geographic_unit,geographic_identifier,indicator,jul-10,aug-10,sep-10,oct-10,nov-10,dec-10,...,sep-18,oct-18,nov-18,dec-18,jan-19,feb-19,mar-19,apr-19,may-19,jun-19
0,DSNY,Community District,Staten Island 3,Streets rated acceptably clean (%),100.0,100.0,100.0,100.0,99.3,100.0,...,100.0,100.0,,,,,,,,
1,DSNY,Community District,Staten Island 2,Streets rated acceptably clean (%),96.1,99.1,100.0,98.8,98.1,98.2,...,95.72,99.0,,,,,,,,
2,DSNY,Community District,Staten Island 1,Streets rated acceptably clean (%),92.1,94.4,94.1,96.4,95.1,96.9,...,99.17,95.84,,,,,,,,
3,DSNY,Community District,Queens 14,Streets rated acceptably clean (%),97.0,97.8,97.5,95.7,96.6,99.1,...,97.29,98.19,,,,,,,,
4,DSNY,Community District,Queens 13,Streets rated acceptably clean (%),95.6,95.9,97.2,99.7,99.4,97.9,...,95.45,94.88,,,,,,,,


In [21]:
# Restrict the frame to the rows reported by the NYPD and summarise them.
nypd = agencies.loc[agencies['agency'].eq('NYPD')]
nypd.info()

<class 'pandas.core.frame.DataFrame'>
Index: 616 entries, 42 to 1146
Columns: 112 entries, agency to jun-19
dtypes: float64(8), object(104)
memory usage: 543.8+ KB


In [22]:
# Preview the first NYPD rows.
nypd.head()

Unnamed: 0,agency,geographic_unit,geographic_identifier,indicator,jul-10,aug-10,sep-10,oct-10,nov-10,dec-10,...,sep-18,oct-18,nov-18,dec-18,jan-19,feb-19,mar-19,apr-19,may-19,jun-19
42,NYPD,Precinct,106,Murder and non-negligent manslaughter,2,1,0,0,0,1,...,2,0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
64,NYPD,Precinct,105,Murder and non-negligent manslaughter,0,1,1,2,2,0,...,1,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
314,NYPD,Precinct,104,Murder and non-negligent manslaughter,0,0,1,2,0,0,...,1,0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
403,NYPD,Precinct,103,Murder and non-negligent manslaughter,2,1,3,0,1,3,...,0,0,1.0,0.0,2.0,1.0,0.0,1.0,1.0,0.0
445,NYPD,Precinct,94,Major felony crime,94,94,81,84,58,86,...,86,96,76.0,68.0,73.0,60.0,42.0,61.0,65.0,56.0


In [23]:
# Names of the monthly value columns, 'jul-10' through 'jun-19' (108 months).
# Generated rather than typed out by hand: build every '<mon>-<yy>' label for
# the years 10..19, then trim the six months before Jul-10 and the six months
# after Jun-19.
columns1 = [
    f"{month}-{year:02d}"
    for year in range(10, 20)
    for month in ("jan", "feb", "mar", "apr", "may", "jun",
                  "jul", "aug", "sep", "oct", "nov", "dec")
][6:-6]
# Remove every monthly value column so that only the remaining (non-monthly)
# columns are left.
nypd_s = nypd.drop(columns=columns1)
nypd_s.head()

Unnamed: 0,agency,geographic_unit,geographic_identifier,indicator
42,NYPD,Precinct,106,Murder and non-negligent manslaughter
64,NYPD,Precinct,105,Murder and non-negligent manslaughter
314,NYPD,Precinct,104,Murder and non-negligent manslaughter
403,NYPD,Precinct,103,Murder and non-negligent manslaughter
445,NYPD,Precinct,94,Major felony crime


In [28]:
# List the distinct geographic identifiers present in the NYPD rows.
nypd_s['geographic_identifier'].unique()

array(['106', '105', '104', '103', '94', '120', '108', '123', '122',
       '121', '115', '114', '113', '112', '111', '110', '109', '107',
       '102', '101', '100', '90', '88', '84', '83', '81', '79', '78',
       '77', '76', '75', '73', '72', '71', '70', '69', '68', '67', '66',
       '63', '62', '61', '60', '52', '50', '49', '48', '47', '46', '45',
       '44', '43', '42', '41', '40', '34', '33', '32', '30', '28', '26',
       '25', '24', '23', '22', '20', '19', '18', '17', '14', '13', '10',
       '9', '7', '6', '5', '1'], dtype=object)

## Try Nr. 2 - CSV from Website (created on my own)

In [29]:
# Load the CSV of precincts per borough and preview it.
police_st = pd.read_csv('./Data/NYC_Precicts_per_borough.csv')
police_st.head()

Unnamed: 0,Borough,Precinct,Adress
0,Manhattan,1st Precinct,16 Ericsson Place
1,Manhattan,5th Precinct,19 Elizabeth Street
2,Manhattan,6th Precinct,233 West 10 Street
3,Manhattan,7th Precinct,19 1/2 Pitt Street
4,Manhattan,9th Precinct,321 East 5 Street


In [32]:
# Count the precinct entries listed for each borough.
police_st.groupby('Borough')[['Precinct']].agg(['count'])

Unnamed: 0_level_0,Precinct
Unnamed: 0_level_1,count
Borough,Unnamed: 1_level_2
Bronx,12
Brooklyn,23
Manhattan,22
Queens,16
Staten Island,4


In [33]:
# Display the precinct table (pandas truncates the middle rows of long frames).
police_st

Unnamed: 0,Borough,Precinct,Adress
0,Manhattan,1st Precinct,16 Ericsson Place
1,Manhattan,5th Precinct,19 Elizabeth Street
2,Manhattan,6th Precinct,233 West 10 Street
3,Manhattan,7th Precinct,19 1/2 Pitt Street
4,Manhattan,9th Precinct,321 East 5 Street
...,...,...,...
72,Queens,115th Precinct,92-15 Northern Boulevard
73,Staten Island,120th Precinct,78 Richmond Terrace
74,Staten Island,121st Precinct,970 Richmond Avenue
75,Staten Island,122nd Precinct,2320 Hylan Boulevard
