In [16]:
import pandas as pd
import numpy as np
import math
import datetime
pd.set_option('display.max_columns', 40)
pd.set_option('display.max_rows', 4)
den_total_pop = 619.968

In [17]:
df = pd.read_csv("Denver.csv")

In [18]:
df.head()

Unnamed: 0,INCIDENT_ID,OFFENSE_ID,OFFENSE_CODE,OFFENSE_CODE_EXTENSION,OFFENSE_TYPE_ID,OFFENSE_CATEGORY_ID,FIRST_OCCURRENCE_DATE,LAST_OCCURRENCE_DATE,REPORTED_DATE,INCIDENT_ADDRESS,GEO_X,GEO_Y,GEO_LON,GEO_LAT,DISTRICT_ID,PRECINCT_ID,NEIGHBORHOOD_ID,IS_CRIME,IS_TRAFFIC
0,2016376978,2016376978521300,5213,0,weapon-unlawful-discharge-of,all-other-crimes,6/15/2016 11:31:00 PM,,6/15/2016 11:31:00 PM,,3193983.0,1707251.0,-104.809881,39.773188,5,521,montbello,1,0
1,20186000994,20186000994239900,2399,0,theft-other,larceny,10/11/2017 12:30:00 PM,10/11/2017 4:55:00 PM,1/29/2018 5:53:00 PM,,3201943.0,1711852.0,-104.781434,39.785649,5,522,gateway-green-valley-ranch,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3,201872333,201872333239900,2399,0,theft-other,larceny,1/30/2018 7:20:00 PM,,1/30/2018 10:29:00 PM,705 S COLORADO BLVD,3157162.0,1681320.0,-104.941440,39.702698,3,312,belcaro,1,0
4,2017411405,2017411405230300,2303,0,theft-shoplift,larceny,6/22/2017 8:53:00 PM,,6/23/2017 4:09:00 PM,2810 E 1ST AVE,3153211.0,1686545.0,-104.955370,39.717107,3,311,cherry-creek,1,0


In [19]:
# Organizing columns
df.rename(columns={'INCIDENT_ADDRESS': 'Location-name', 'GEO_LON': 'Longitude', 'GEO_LAT': 'Latitude',
                   'NEIGHBORHOOD_ID': 'Neighborhood'}, inplace=True) # Renaming wrong names
df['City'] = 'Denver' # Adding the city column
df.drop(['INCIDENT_ID', 'OFFENSE_ID', 'OFFENSE_CODE', 'OFFENSE_CODE_EXTENSION', 'OFFENSE_TYPE_ID',
         'OFFENSE_CATEGORY_ID', 'FIRST_OCCURRENCE_DATE', 'LAST_OCCURRENCE_DATE', 'REPORTED_DATE',
         'GEO_X', 'GEO_Y', 'DISTRICT_ID', 'PRECINCT_ID', 'IS_TRAFFIC']
         , axis=1, inplace=True) # Dropping useless columns to this table
df['Location-key'] = df.index # Creates Date_Key column based on index
df["Location-key"] += 193578 # Adds 100 to every row in the column
cols = ["Location-key", "Location-name", "Longitude", "Latitude", 
        "Neighborhood", "City", 'IS_CRIME']

crime_rate = df[df.IS_CRIME == 1].shape[0] / den_total_pop * 100000 # Creating the crime_rate list of values
df = df[cols] # Reorganize the columns
df['Crime-rate'] = crime_rate # Create the crime-rate column

In [20]:
df.head()

Unnamed: 0,Location-key,Location-name,Longitude,Latitude,Neighborhood,City,IS_CRIME,Crime-rate
0,193578,,-104.809881,39.773188,montbello,Denver,1,5.514236e+07
1,193579,,-104.781434,39.785649,gateway-green-valley-ranch,Denver,1,5.514236e+07
...,...,...,...,...,...,...,...,...
3,193581,705 S COLORADO BLVD,-104.941440,39.702698,belcaro,Denver,1,5.514236e+07
4,193582,2810 E 1ST AVE,-104.955370,39.717107,cherry-creek,Denver,1,5.514236e+07


In [21]:
# Formatting neighborhood names
formatted_neigh = []
for neigh in df['Neighborhood'].unique().tolist():
    temp_neigh = neigh.replace('-', ' ')
    temp_neigh = temp_neigh.title()
    formatted_neigh.append(temp_neigh)

for i in range(len(formatted_neigh)):
    if formatted_neigh[i] == 'Cbd':
        formatted_neigh[i] = 'CBD'
    if formatted_neigh[i] == 'College View South Platte':
        formatted_neigh[i] = 'College View / South Platte'
    if formatted_neigh[i] == 'Cory Merrill':
        formatted_neigh[i] = 'Cory - Merrill'
    if formatted_neigh[i] == 'Dia':
        formatted_neigh[i] = 'DIA'
    if formatted_neigh[i] == 'Gateway Green Valley Ranch':
        formatted_neigh[i] = 'Gateway / Green Valley Ranch'

In [22]:
census_df = pd.read_csv("Census_Denver_Changed.csv")

In [23]:
# Properly formats the df neighborhood names 
unformatted_formatted = {} # Key is unformmated and value is formatted neighborhood 
for i in range(len(df['Neighborhood'].unique().tolist())):
    unformatted_formatted[df['Neighborhood'].unique().tolist()[i]] = formatted_neigh[i]
df.replace({"Neighborhood": unformatted_formatted}, inplace=True)

In [24]:
df

Unnamed: 0,Location-key,Location-name,Longitude,Latitude,Neighborhood,City,IS_CRIME,Crime-rate
0,193578,,-104.809881,39.773188,Montbello,Denver,1,5.514236e+07
1,193579,,-104.781434,39.785649,Gateway / Green Valley Ranch,Denver,1,5.514236e+07
...,...,...,...,...,...,...,...,...
462403,655981,1717 CHAMPA ST,-104.992694,39.747876,CBD,Denver,1,5.514236e+07
462404,655982,22ND ST / ARAPAHOE ST,-104.988424,39.753477,Five Points,Denver,0,5.514236e+07


In [25]:
data_neigh = {} # Dictonary of key neighborhood and value as census following the order of the dataframe
for column in census_df:
    data_neigh[column] = census_df[column].tolist()
del data_neigh['Unnamed: 0'] # Delete uncessary column

In [26]:
table = [] # Table containing a list of lists, where list[0] is the values of data_neigh correspoding to df original order
n_columns = len(census_df.iloc[:, 0].tolist())
for i in range(df.shape[0]):
    neigh = df['Neighborhood'][i]
    if neigh in data_neigh.keys():
        table.append(data_neigh[neigh])
    else:
        table.append(['Nan'] * n_columns)
neigh_df = pd.DataFrame(table, columns=census_df.iloc[:, 0].tolist())

In [27]:
neigh_df

Unnamed: 0,Total-neighborhood-population,Years-0-to-4,Years-5-to-9,Years-10-to-14,Years-15-to-19,Years-20-to-24,Years-25-to-29,Years-30-to-34,Years-35-to-39,Years-40-to-44,Years-45-to-49,Years-50-to-54,Years-55-to-59,Years-60-to-64,Years-65-to-69,Years-70-to-74,Years-75-to-79,Years-80-to-84,Years-85-plus
0,30348.0,3096.0,3356.0,2987.0,2699.0,2099.0,2243.0,2399.0,2290.0,1938.0,1662.0,1514.0,1210.0,947.0,721.0,525.0,388.0,165.0,109.0
1,29201.0,3071.0,3005.0,2620.0,2059.0,1735.0,2527.0,2835.0,2730.0,2218.0,1860.0,1501.0,1270.0,826.0,477.0,243.0,115.0,70.0,39.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
462403,3648.0,50.0,19.0,6.0,449.0,876.0,640.0,332.0,197.0,178.0,161.0,201.0,189.0,132.0,83.0,58.0,35.0,26.0,16.0
462404,12712.0,578.0,397.0,280.0,360.0,1624.0,2643.0,1525.0,1086.0,909.0,805.0,773.0,672.0,415.0,254.0,141.0,117.0,71.0,62.0


In [28]:
df = pd.concat([df, neigh_df], axis=1) # Concatenates side-by-side the df and neigh_df dataframes 

In [29]:
df

Unnamed: 0,Location-key,Location-name,Longitude,Latitude,Neighborhood,City,IS_CRIME,Crime-rate,Total-neighborhood-population,Years-0-to-4,Years-5-to-9,Years-10-to-14,Years-15-to-19,Years-20-to-24,Years-25-to-29,Years-30-to-34,Years-35-to-39,Years-40-to-44,Years-45-to-49,Years-50-to-54,Years-55-to-59,Years-60-to-64,Years-65-to-69,Years-70-to-74,Years-75-to-79,Years-80-to-84,Years-85-plus
0,193578,,-104.809881,39.773188,Montbello,Denver,1,5.514236e+07,30348.0,3096.0,3356.0,2987.0,2699.0,2099.0,2243.0,2399.0,2290.0,1938.0,1662.0,1514.0,1210.0,947.0,721.0,525.0,388.0,165.0,109.0
1,193579,,-104.781434,39.785649,Gateway / Green Valley Ranch,Denver,1,5.514236e+07,29201.0,3071.0,3005.0,2620.0,2059.0,1735.0,2527.0,2835.0,2730.0,2218.0,1860.0,1501.0,1270.0,826.0,477.0,243.0,115.0,70.0,39.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
462403,655981,1717 CHAMPA ST,-104.992694,39.747876,CBD,Denver,1,5.514236e+07,3648.0,50.0,19.0,6.0,449.0,876.0,640.0,332.0,197.0,178.0,161.0,201.0,189.0,132.0,83.0,58.0,35.0,26.0,16.0
462404,655982,22ND ST / ARAPAHOE ST,-104.988424,39.753477,Five Points,Denver,0,5.514236e+07,12712.0,578.0,397.0,280.0,360.0,1624.0,2643.0,1525.0,1086.0,909.0,805.0,773.0,672.0,415.0,254.0,141.0,117.0,71.0,62.0


In [30]:
df.to_csv("Denver_Location_Final.csv", index=False) # Creates the csv file