## Import Libraries for Analysis

In [167]:
# Import Core Libraries
import os
import sys
import time
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter


# Set Theme for graphs
sns.set_theme(palette='bright')

### Self-Made Functions

In [168]:
def infoOut(data):
    '''
    Title: Display Dataframe of df.info
    Description: Display a pandas dataframe of df.info

    data: Pandas dataframe.
    '''
    dfInfo = data.columns.to_frame(name='Column')   # Create dataframe         
    dfInfo['Non-Null Count'] = data.notna().sum()   # Add non-null counts to dataframe
    dfInfo['NULL Count'] = data.isnull().sum()      # Add NULL counts to dataframe        
    dfInfo['Dtype'] = data.dtypes                   # add dtype to dataframe
    dfInfo.reset_index(drop=True,inplace=True)      # Reset index        
    return dfInfo                                   # display info dataframe

def nullValues(data):
    '''
    Title: Display Null values in Pandas Dataframe
    Description: Display a pandas dataframe of Null values for each column
    
    data: Pandas dataframe.
    '''
    display(data.isnull().sum().to_frame().rename(columns = {0:'NULL Amounts'}))

## Data Overview

In [169]:
# Create DataFrame to Hold data
stl_county_df = pd.read_csv('STL-County-2023.csv', low_memory=False)
stl_city_df = pd.read_csv('STL-City-2023.csv', low_memory=False)
stl_census_df = pd.read_csv('STL-Census-2023.csv', low_memory=False)

# Show DataFrame Head
print('St.Louis County')
print('-' * 20)
display(stl_county_df.head(3))

# Show DataFrame Head
print('\n\nSt. Louis City')
print('-' * 20)
display(stl_city_df.head(3))

# Show Census Data
print('\n\nSt.Louis Census')
print('-'*20)
display(stl_census_df.head(3))

St.Louis County
--------------------


Unnamed: 0,OffenseName,OffenseCategory,Report Number,reportingJuris,forJuris,address,latitude,longitude,dtCalledIntoCad,occurred,OccDOW,OccMonth,premise,zone,district
0,Destruction/Damage/Vandalism Of Property,Property,3-431,RICHMOND HEIGHTS,RICHMOND HEIGHTS,1107 EAST LINDEN,38.6336,-90.3423,2/27/2023 18:08,2/26/2003,Wed,2,CHURCH,3212,DISTRICT 3
1,Destruction/Damage/Vandalism Of Property,Property,23-1,MOLINE ACRES,MOLINE ACRES,2352 GARDNER DR,38.749906,-90.247051,1/1/2023 0:04,1/1/2023,Sun,1,RESIDENCE,1133,
2,Destruction/Damage/Vandalism Of Property,Property,23-1,LAKESHIRE,LAKESHIRE,10080 SQUIRE MEADOWS DR,38.536756,-90.341975,1/2/2023 11:31,1/2/2023,Mon,1,APARTMENT PARKING LOT,3422,




St. Louis City
--------------------


Unnamed: 0,inci_id,date_inc,time_inc,offense,nibrs_grp,nibrs_code,beat,district,neighborhood,latitude,longitude,incidentlocation,locationStreet2_Apt,city,state,zip
0,23000004,2023-01-01,21:00.0,UNLAWFUL USE OF WEAPON - SUBSECTION 3 - DISCHA...,A,520,524,5.0,38,38.646434,-90.26485,2 PORTLAND PL,,Saint Louis,MO,
1,23000004,2023-01-01,21:00.0,PROPERTY DAMAGE - 2ND DEGREE,A,290,524,5.0,38,38.646434,-90.26485,2 PORTLAND PL,,Saint Louis,MO,
2,23000007,2023-01-01,10:00.0,ASSAULT 1ST DEGREE OR ATTEMPT,A,13A,425,4.0,60,38.652984,-90.20108,3001 N FLORISSANT AVE,,Saint Louis,MO,




St.Louis Census
--------------------


Unnamed: 0,Fact,St.Louis County,St.Louis City
0,"Population estimates, July 1, 2023, (V2023)",,
1,"Population Estimates, July 1, 2022, (V2022)",990414.0,286578.0
2,"Population estimates base, April 1, 2020, (V2023)",,


### Data Information

In [170]:
### Data Information
# Show St.Louis County
print('St.Louis County')
print('-' * 20)
display(infoOut(stl_county_df))

# Show St.Louis City
print('\n\n\nSt. Louis City')
print('-' * 20)
display(infoOut(stl_city_df))

# Show St.Louis Census Data
print('\n\nSt.Louis Census')
print('-'*20)
display(infoOut(stl_census_df))

St.Louis County
--------------------


Unnamed: 0,Column,Non-Null Count,NULL Count,Dtype
0,OffenseName,35140,0,object
1,OffenseCategory,35140,0,object
2,Report Number,35140,0,object
3,reportingJuris,35140,0,object
4,forJuris,35140,0,object
5,address,34681,459,object
6,latitude,34681,459,float64
7,longitude,34681,459,float64
8,dtCalledIntoCad,35140,0,object
9,occurred,35140,0,object





St. Louis City
--------------------


Unnamed: 0,Column,Non-Null Count,NULL Count,Dtype
0,inci_id,56532,0,int64
1,date_inc,56532,0,object
2,time_inc,56532,0,object
3,offense,56532,0,object
4,nibrs_grp,56531,1,object
5,nibrs_code,56532,0,object
6,beat,56532,0,object
7,district,56532,0,float64
8,neighborhood,56532,0,object
9,latitude,56266,266,float64




St.Louis Census
--------------------


Unnamed: 0,Column,Non-Null Count,NULL Count,Dtype
0,Fact,66,0,object
1,St.Louis County,63,3,object
2,St.Louis City,63,3,object


## Data Preprocessing

### St. Louis City 

#### Map NIBRS Data to DataFrames

##### Mapping Objectives-
* First we need to map the NIBRS codes to the actual Offense description. As we saw above in the Data Overview section that we have different names for Offenses. So by using the NIBRS-Offense-Codes document we can map the NIBRS codes that the STL City dataset has to the actual Description that is used in the documentation! :-O
* We also need to map the Offense Category that is found in the NIBRS-Offense-Codes document as well! 
* One last thing that we can map is the district! I found a way to map the district number to the actual district by refrencing this number with the Patrol District map on the St. Louis City Police webiste! I have the image below! 

<img src=STL-Districts.png alt='STL City Districts' style="width: 500px; border-radius: 2%; margin: auto; display: block;"/>

In [171]:
# Create NIBRS Code for Offense Map
nibrs_code_maps = {'200': 'Arson', '13A': 'Aggravated Assault','13B': 'Simple Assault','13C': 'Intimidation',
                   '510': 'Bribery','220': 'Burglary/Breaking & Entering','250': 'Counterfeiting/Forgery',
                   '290': 'Destruction/Damage/Vandalism of Property','35A': 'Drug/Narcotic Violations',
                   '35B': 'Drug Equipment Violations','270': 'Embezzlement','210': 'Extortion/Blackmail',
                   '26A': 'False Pretenses/Swindle/Confidence Game','26B': 'Credit Card/Automated Teller Machine Fraud',
                   '26C': 'Impersonation','26D': 'Welfare Fraud','26E': 'Wire Fraud','39A': 'Betting/Wagering',
                   '39B': 'Operating/Promoting/Assisting Gambling','39C': 'Gambling Equipment Violations','39D': 'Sports Tampering',
                   '09A': 'Murder & Nonnegligent Manslaughter','09B': 'Negligent Manslaughter','09C': 'Justifiable Homicide',
                   '100': 'Kidnapping/Abduction','23A': 'Pocket-picking','23B': 'Purse-snatching','23C': 'Shoplifting',
                   '23D': 'Theft From Building','23E': 'Theft From Coin-Operated Machine or Device','23F': 'Theft From Motor Vehicle',
                   '23G': 'Theft of Motor Vehicle Parts or Accessories','23H': 'All Other Larceny','240': 'Motor Vehicle Theft',
                   '370': 'Pornography/Obscene Material', '40A': 'Prostitution','40B': 'Assisting or Promoting Prostitution',
                   '120': 'Robbery','11A': 'Forcible Rape','11B': 'Forcible Sodomy','11C': 'Sexual Assault With An Object',
                   '11D': 'Forcible Fondling','36A': 'Incest','36B': 'Statutory Rape','280': 'Stolen Property Offenses',
                   '520': 'Weapon Law Violations','90A': 'Bad Checks','90B': 'Curfew/Loitering/Vagrancy Violations',
                   '90C': 'Disorderly Conduct','90D': 'Driving Under the Influence','90E': 'Drunkenness',
                   '90F': 'Family Offenses, Nonviolent','90G': 'Liquor Law Violations','90H': 'Peeping Tom',
                   '90I': 'Runaway','90J': 'Trespass of Real Property','90Z': 'All Other Offenses'}

# Create NIBRS Category Map
nibrs_category_maps = {'Arson': 'Property', 'Aggravated Assault': 'Person','Simple Assault': 'Person','Intimidation': 'Person',
                       'Bribery': 'Property','Burglary/Breaking & Entering': 'Property','Counterfeiting/Forgery': 'Property',
                       'Destruction/Damage/Vandalism of Property': 'Property','Drug/Narcotic Violations': 'Society',
                       'Drug Equipment Violations': 'Society','Embezzlement': 'Property','Extortion/Blackmail': 'Property',
                       'False Pretenses/Swindle/Confidence Game': 'Property','Credit Card/Automated Teller Machine Fraud': 'Property',
                       'Impersonation': 'Property','Welfare Fraud': 'Property','Wire Fraud': 'Property','Betting/Wagering': 'Society',
                       'Operating/Promoting/Assisting Gambling': 'Society','Gambling Equipment Violations': 'Society',
                       'Sports Tampering': 'Society','Murder & Nonnegligent Manslaughter': 'Person','Negligent Manslaughter': 'Person',
                       'Justifiable Homicide': 'Person/ Not a Crime','Kidnapping/Abduction': 'Person','Pocket-picking': 'Property',
                       'Purse-snatching': 'Property','Shoplifting': 'Property','Theft From Building': 'Property',
                       'Theft From Coin-Operated Machine or Device': 'Property','Theft From Motor Vehicle': 'Property',
                       'Theft of Motor Vehicle Parts or Accessories': 'Property','All Other Larceny': 'Property',
                       'Motor Vehicle Theft': 'Property','Pornography/Obscene Material': 'Society', 
                       'Prostitution': 'Society','Assisting or Promoting Prostitution': 'Society','Robbery': 'Property',
                       'Forcible Rape': 'Person','Forcible Sodomy': 'Person','Sexual Assault With An Object': 'Person',
                       'Forcible Fondling': 'Person','Incest': 'Person','Statutory Rape': 'Person','Stolen Property Offenses': 'Property',
                       'Weapon Law Violations': 'Society','Bad Checks': 'Property','Curfew/Loitering/Vagrancy Violations': 'Society',
                       'Disorderly Conduct': 'Society','Driving Under the Influence': 'Society','Drunkenness': 'Society',
                       'Family Offenses, Nonviolent': 'Society','Liquor Law Violations': 'Society','Peeping Tom': 'Society',
                       'Runaway': 'Not a Crime','Trespass of Real Property': 'Society','All Other Offenses': 'Person, Property, or Society'}

stl_city_districts_maps = { 1.0: 'South Patrol Division', 2.0: 'South Patrol Division',
                            3.0: 'Central Patrol Division',4.0: 'Central Patrol Division',
                            5.0: 'North Patrol Division', 6.0: 'North Patrol Division'}


# Map Codes to Values for Offenses & Offense Categories
stl_city_df['NIBRS-OffenseName'] = stl_city_df['nibrs_code'].map(nibrs_code_maps)
stl_city_df['NIBRS-OffenseCategory'] = stl_city_df['NIBRS-OffenseName'].map(nibrs_category_maps)
stl_city_df['Patrol District'] = stl_city_df['district'].map(stl_city_districts_maps)

# Show Updated DataFrame
display(stl_city_df.head(3))

Unnamed: 0,inci_id,date_inc,time_inc,offense,nibrs_grp,nibrs_code,beat,district,neighborhood,latitude,longitude,incidentlocation,locationStreet2_Apt,city,state,zip,NIBRS-OffenseName,NIBRS-OffenseCategory,Patrol District
0,23000004,2023-01-01,21:00.0,UNLAWFUL USE OF WEAPON - SUBSECTION 3 - DISCHA...,A,520,524,5.0,38,38.646434,-90.26485,2 PORTLAND PL,,Saint Louis,MO,,Weapon Law Violations,Society,North Patrol Division
1,23000004,2023-01-01,21:00.0,PROPERTY DAMAGE - 2ND DEGREE,A,290,524,5.0,38,38.646434,-90.26485,2 PORTLAND PL,,Saint Louis,MO,,Destruction/Damage/Vandalism of Property,Property,North Patrol Division
2,23000007,2023-01-01,10:00.0,ASSAULT 1ST DEGREE OR ATTEMPT,A,13A,425,4.0,60,38.652984,-90.20108,3001 N FLORISSANT AVE,,Saint Louis,MO,,Aggravated Assault,Person,Central Patrol Division


#### Clean time_inc

##### Clean time_inc Obejctives-
* As we see can see from the output for the cell below, that there are times that are over 24:00 and there are seconds with .0 format. Since this is the case we are going to have to convert theses to a time format that Pandas and Python can understand. By taking the first 4 characters and then adding :00 after to convert to a datatime format that Pandas and Python can understand!  
* To combat the times over 24:00 we will be using a timedelta technique! This will take the time over 24:00 and then add that time to the next day. For example, if you have 1/1/2023 36:00. By using the timedelta we subtract 36:00-24:00= 12:00 and then increase the date by 1 day and add the remaining time. So we would have 1/2/2023 12:00.

In [172]:
# Show Unique Times
display(stl_city_df['time_inc'].unique())

array(['21:00.0', '10:00.0', '15:00.0', '00:00.0', '20:00.0', '45:00.0',
       '30:00.0', '35:00.0', '09:00.0', '25:00.0', '08:00.0', '36:00.0',
       '33:00.0', '40:00.0', '50:00.0', '05:00.0', '47:00.0', '32:00.0',
       '01:00.0', '51:00.0', '17:00.0', '19:00.0', '44:00.0', '38:00.0',
       '55:00.0', '37:00.0', '49:00.0', '54:00.0', '11:00.0', '41:00.0',
       '02:00.0', '29:00.0', '48:00.0', '43:00.0', '16:00.0', '59:00.0',
       '18:00.0', '23:00.0', '07:00.0', '53:00.0', '34:00.0', '14:00.0',
       '04:00.0', '27:00.0', '58:00.0', '12:00.0', '46:00.0', '24:00.0',
       '56:00.0', '57:00.0', '22:00.0', '06:00.0', '03:00.0', '42:00.0',
       '26:00.0', '28:00.0', '31:00.0', '13:00.0', '39:00.0', '52:00.0'],
      dtype=object)

In [173]:
# Only accept Hours and Minutes Ex: 12:00
stl_city_df['time_inc'] = stl_city_df['time_inc'].astype(str).str[:5]

# Add Second format to the end of time
stl_city_df['time_inc'] = stl_city_df['time_inc'] + ':00'

#### Convert To DateTime Format

##### Convert To Datetime Objectives-
* As stated above, we need to convert the dates from the date_inc and time_inc columns to the datetime. We are doing so by using the timedelt and datetime functions in pandas. First we will loop through the dataframe and convert the date_inc value to a datetime format and then add the timedelta, which converts hours to days and hours if the value is greater than 24:00, and add that to get our actual datetime value!

In [174]:
# Make Datetime the date plus the time difference in hours if the hours are over 24.
stl_city_df['Datetime'] = (pd.to_datetime(stl_city_df['date_inc'], format='%Y-%m-%d') + pd.to_timedelta(stl_city_df['time_inc']))

# Show Correct Time of Crimes
stl_city_df['Datetime'].astype(str).str[10:16].unique()

array([' 21:00', ' 10:00', ' 15:00', ' 00:00', ' 20:00', ' 06:00',
       ' 11:00', ' 09:00', ' 01:00', ' 08:00', ' 12:00', ' 16:00',
       ' 02:00', ' 05:00', ' 23:00', ' 03:00', ' 17:00', ' 19:00',
       ' 14:00', ' 07:00', ' 13:00', ' 18:00', ' 04:00', ' 22:00'],
      dtype=object)

#### Geographical Cleansing

##### Geographical Cleansing Objectives-
* If you look at the map below you can probably notice that there are crimes that were in the "City" dataset that are for sure not in St. Louis CITY. So we are going to have to fix this!
* When looking at the data set, we can see that there are addresses. Those addresses are great! But we have a 2 better pieces of information where we can verify the location. We can do that with the Latitude and Longitude! 
* We are going to use a amazing set of tools from GeoPy to do so! We are going to input our Latitude and Longitude values and gather all the information so we can accurately determine exactly where is crime occured!

In [175]:

# Create Ploty Geographical Map
stlCityUNCLEASNED_map = px.scatter_mapbox(stl_city_df, 
                        lat="latitude", 
                        lon="longitude", 
                        zoom=10, 
                        height=900,
                        width=900)


# Different MapBox Styles
# carto-darkmatter
# open-street-map
# carto-positron

# Choose what Style to use for Map
stlCityUNCLEASNED_map.update_layout(mapbox_style="open-street-map")

# Select Margins
stlCityUNCLEASNED_map.update_layout(margin={"r":0,"t":0,"l":0,"b":0})

# Display Map
stlCityUNCLEASNED_map.show()

In [176]:
# Set GeoLocator
geolocator = Nominatim(user_agent="Geolocation")

# Set Latitude & Longitude
Latitude = str(stl_city_df['latitude'][0])
Longitude = str(stl_city_df['longitude'][0])

# Show Output of first row of data
display(stl_city_df.head(1))
 
 # Set Location Variable
location = geolocator.reverse(Latitude+","+Longitude)
print(location)

address = location.raw['address']
print(address)

Unnamed: 0,inci_id,date_inc,time_inc,offense,nibrs_grp,nibrs_code,beat,district,neighborhood,latitude,longitude,incidentlocation,locationStreet2_Apt,city,state,zip,NIBRS-OffenseName,NIBRS-OffenseCategory,Patrol District,Datetime
0,23000004,2023-01-01,21:00:00,UNLAWFUL USE OF WEAPON - SUBSECTION 3 - DISCHA...,A,520,524,5.0,38,38.646434,-90.26485,2 PORTLAND PL,,Saint Louis,MO,,Weapon Law Violations,Society,North Patrol Division,2023-01-01 21:00:00


2, Portland Place, Central West End, Saint Louis, Missouri, 63108, United States
{'house_number': '2', 'road': 'Portland Place', 'neighbourhood': 'Central West End', 'city': 'Saint Louis', 'state': 'Missouri', 'ISO3166-2-lvl4': 'US-MO', 'postcode': '63108', 'country': 'United States', 'country_code': 'us'}


In [177]:
# Fill Null values with a dummy value
stl_city_df['latitude'].fillna(0.0, inplace=True)
stl_city_df['longitude'].fillna(0.0, inplace=True)

# Create Coordinate format 
stl_city_df['Geo-Coordinates'] = stl_city_df['latitude'].astype(str) + ',' + stl_city_df['longitude'].astype(str)

In [178]:
# Create DataFrame to Store Geolocation Data
Geo_STL_City_df = pd.DataFrame()

# Create a list of all the Unqiue Coordinates
Geo_STL_City_df['Geo-Coordinates'] = stl_city_df['Geo-Coordinates'].unique()

# Display Sample of DataFrame
display(Geo_STL_City_df)

Unnamed: 0,Geo-Coordinates
0,"38.646434,-90.26485"
1,"38.652984,-90.20108"
2,"38.607878,-90.2221"
3,"38.627714,-90.249994"
4,"38.655824,-90.22954"
...,...
24451,"38.674771,-90.2311"
24452,"38.587915,-90.25287"
24453,"38.677708,-90.21716"
24454,"38.569796,-90.25449"


In [179]:
# Geo_STL_City_df = Geo_STL_City_df.head(100)

# Create Geolocator with information
geolocator = Nominatim(user_agent="Geolocation", timeout=1000)

# Create RateLimiter
rgeocode = RateLimiter(geolocator.reverse, min_delay_seconds=0.1)

# Apply Coordinates to get Location Data
Geo_STL_City_df['Geo-Location'] = Geo_STL_City_df['Geo-Coordinates'].apply(rgeocode)

# Display Finished DataFrame
display(Geo_STL_City_df['Geo-Location'])

RateLimiter caught an error, retrying (0/2 tries). Called with (*('38.654927,-90.28187',), **{}).
Traceback (most recent call last):
  File "/opt/homebrew/lib/python3.11/site-packages/geopy/adapters.py", line 298, in get_text
    page = self.urlopen(req, timeout=timeout)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/Cellar/python@3.11/3.11.7/Frameworks/Python.framework/Versions/3.11/lib/python3.11/urllib/request.py", line 519, in open
    response = self._open(req, data)
               ^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/Cellar/python@3.11/3.11.7/Frameworks/Python.framework/Versions/3.11/lib/python3.11/urllib/request.py", line 536, in _open
    result = self._call_chain(self.handle_open, protocol, protocol +
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/Cellar/python@3.11/3.11.7/Frameworks/Python.framework/Versions/3.11/lib/python3.11/urllib/request.py", line 496, in _call_chain
    result = func(*args)
        

0        (2, Portland Place, Central West End, Saint Lo...
1        (1919, Dodier Street, Old North Saint Louis, S...
2        (2272, South Jefferson Avenue, McKinley Height...
3        (Rehab Bar & Grill, South Sarah Street, Forest...
4        (3925, Cottage Avenue, Greater Ville, Saint Lo...
                               ...                        
24451    (4389, Penrose Street West, O'Fallon, Saint Lo...
24452    (3936, Dunnica Avenue, Dutchtown, Saint Louis,...
24453    (5500, Algernon Street, O'Fallon, Saint Louis,...
24454    (5544, Grace Avenue, Holly Hills, Saint Louis,...
24455    (1321, Shawmut Place, Hamilton Heights, Saint ...
Name: Geo-Location, Length: 24456, dtype: object

In [180]:
from numpy import nan

# Create Functions to extract information
def get_neighborhood(row):
    location = row['Geo-Location']
    try: 
        return location.raw['address']['neighbourhood']
    except KeyError:
        return np.nan

def get_city(row):
    location = row['Geo-Location']
    try: 
        return location.raw['address']['city']
    except KeyError:
        return np.nan

def get_zipcode(row):
    location = row['Geo-Location']
    try: 
        return location.raw['address']['postcode']
    except KeyError:
        return np.nan
    
def get_village(row):
    location = row['Geo-Location']
    try: 
        return location.raw['address']['village']
    except KeyError:
        return np.nan
    
def get_county(row):
    location = row['Geo-Location']
    try: 
        return location.raw['address']['county']
    except KeyError:
        return np.nan

def get_state(row):
    location = row['Geo-Location']
    try: 
        return location.raw['address']['state']
    except KeyError:
        return np.nan
    
def get_road(row):
    location = row['Geo-Location']
    try: 
        return location.raw['address']['road']
    except KeyError:
        return np.nan
    
def get_country(row):
    location = row['Geo-Location']
    try: 
        return location.raw['address']['country']
    except KeyError:
        return np.nan
    
def get_houseNumber(row):
    location = row['Geo-Location']
    try: 
        return location.raw['address']['house_number']
    except KeyError:
        return np.nan

# Create Function to check whether Address is in the county or not.
def checkCounty(row):
    if row['Geo-City'] == 'Saint Louis' and pd.isna(row['Geo-County']):
        return 'Saint-Louis City'
    else: 
        return row['Geo-County']

# Extract Information and put it into Columns
Geo_STL_City_df['Geo-City'] = Geo_STL_City_df.apply(get_city, axis=1)
Geo_STL_City_df['Geo-Road'] = Geo_STL_City_df.apply(get_road, axis=1)
Geo_STL_City_df['Geo-State'] = Geo_STL_City_df.apply(get_state, axis=1)
Geo_STL_City_df['Geo-County'] = Geo_STL_City_df.apply(get_county, axis=1)
Geo_STL_City_df['Geo-Village'] = Geo_STL_City_df.apply(get_village, axis=1)
Geo_STL_City_df['Geo-Country'] = Geo_STL_City_df.apply(get_country, axis=1)
Geo_STL_City_df['Geo-ZipCode'] = Geo_STL_City_df.apply(get_zipcode, axis=1)
Geo_STL_City_df['Geo-HouseNumber'] = Geo_STL_City_df.apply(get_houseNumber, axis=1)
Geo_STL_City_df['Geo-Neighborhood'] = Geo_STL_City_df.apply(get_neighborhood, axis=1)

# If Saint Louis is the City, then Saint Louis is the County as well.
Geo_STL_City_df['Geo-County'] = Geo_STL_City_df.apply(checkCounty, axis=1)

# checkCounty(Geo_STL_City_df)
# Geo_STL_City_df = Geo_STL_City_df.apply(checkCounty)



# Restructure Format of Columns
Geo_STL_City_df = Geo_STL_City_df.loc[:,['Geo-Coordinates','Geo-Location','Geo-HouseNumber','Geo-Road',
                                     'Geo-Neighborhood','Geo-Village','Geo-City','Geo-ZipCode', 
                                     'Geo-County' , 'Geo-State','Geo-Country']]



# Save DataFrame for further use
Geo_STL_City_df.to_csv('Geographics_StlCity.csv', index=False)

In [181]:
stl_GeoData = pd.read_csv('Geographics_StlCity.csv')
infoOut(stl_GeoData)

Unnamed: 0,Column,Non-Null Count,NULL Count,Dtype
0,Geo-Coordinates,24456,0,object
1,Geo-Location,24456,0,object
2,Geo-HouseNumber,20961,3495,object
3,Geo-Road,24407,49,object
4,Geo-Neighborhood,22804,1652,object
5,Geo-Village,104,24352,object
6,Geo-City,24205,251,object
7,Geo-ZipCode,24424,32,float64
8,Geo-County,24453,3,object
9,Geo-State,24455,1,object


In [199]:
stl_GeoData['Geo-City'].value_counts()

tester = stl_GeoData.loc[stl_GeoData['Geo-City'].isnull()]
display(tester.loc[tester['Geo-Village'].isnull()])



Unnamed: 0,Geo-Coordinates,Geo-Location,Geo-HouseNumber,Geo-Road,Geo-Neighborhood,Geo-Village,Geo-City,Geo-ZipCode,Geo-County,Geo-State,Geo-Country
10,"0.0,0.0",Soul Buoy,,,,,,,,,
77,"38.71876,-90.260387","8342, Jennings Station Road, Jennings, Saint L...",8342,Jennings Station Road,,,,63136.0,Saint Louis County,Missouri,United States
359,"38.718146,-90.252413","1922, D Amato Court, Goodfellow Park, Jennings...",1922,D Amato Court,,,,63136.0,Saint Louis County,Missouri,United States
521,"38.540407,-90.303952","I 55, Affton, Saint Louis County, Missouri, 63...",,I 55,,,,63125.0,Saint Louis County,Missouri,United States
760,"38.654933,-90.328615","7348, Norwood Avenue, University Hills, Univer...",7348,Norwood Avenue,,,,63130.0,Saint Louis County,Missouri,United States
...,...,...,...,...,...,...,...,...,...,...,...
22798,"38.70198,-90.26502","6366, Lillian Avenue, West Walnut Manor, Jenni...",6366,Lillian Avenue,,,,63136.0,Saint Louis County,Missouri,United States
22905,"38.713066,-90.256056","5739, Acme Avenue, Acme Heights, Jennings, Sai...",5739,Acme Avenue,,,,63136.0,Saint Louis County,Missouri,United States
23055,"38.545622,-90.27175","River City Boulevard, Boulevard Heights, Luxem...",,River City Boulevard,Boulevard Heights,,,63111.0,Saint Louis County,Missouri,United States
23235,"38.713884,-90.25577","6400, West Florissant Avenue, Jennings, Saint ...",6400,West Florissant Avenue,,,,63136.0,Saint Louis County,Missouri,United States


In [201]:
 # Set Location Variable
location2 = geolocator.reverse("38.71876,-90.260387")
print(location2)

address2 = location2.raw['address']
print(address2)

8342, Jennings Station Road, Jennings, Saint Louis County, Missouri, 63136, United States
{'house_number': '8342', 'road': 'Jennings Station Road', 'town': 'Jennings', 'county': 'Saint Louis County', 'state': 'Missouri', 'ISO3166-2-lvl4': 'US-MO', 'postcode': '63136', 'country': 'United States', 'country_code': 'us'}


In [None]:
# # Create DataFrames to hold new information



# from geopy.extra.rate_limiter import RateLimiter

# geolocator = Nominatim(user_agent="Geolocation", timeout=10)
# rgeocode = RateLimiter(geolocator.reverse, min_delay_seconds=0.1)

# stl_city_df["location"] = stl_city_df["coordinates"].apply(rgeocode)

# comma = ','

# for i in range(0,len(stl_city_df)):
#     if stl_city_df['latitude']
# stl_city_df['Geo-Lat-Long'] = stl_city_df['latitude'].astype(str) + comma + stl_city_df['longitude'].astype(str)
# # stl_city_df['Geo-City'] = 
# # stl_city_df['Geo-Road'] = 'UNKOWN'
# # stl_city_df['Geo-State'] = 'UNKOWN'
# # stl_city_df['Geo-County'] = 'UNKOWN'
# # stl_city_df['Geo-Village'] = 'UNKOWN'
# # stl_city_df['Geo-ZipCode'] = 'UNKOWN'
# stl_city_df['Geo-NeighbourHood'] = stl_city_df['Geo-Lat-Long'].apply(get_neighborhood, axis=1)

# display(stl_city_df.head(1))

In [None]:
display(stl_city_df.loc[stl_city_df['latitude'].isnull()])

In [None]:
# # Create DataFrames to hold new information
# stl_city_df['Geo-Lat-Long'] = str(stl_city_df['latitude']) + ',' + str(stl_city_df['longitude'])

# stl_city_df['Geo-City'] = stl_city_df.[]
# stl_city_df['Geo-Road'] = 'UNKOWN'
# stl_city_df['Geo-State'] = 'UNKOWN'
# stl_city_df['Geo-County'] = 'UNKOWN'
# stl_city_df['Geo-Village'] = 'UNKOWN'
# stl_city_df['Geo-ZipCode'] = 'UNKOWN'
# stl_city_df['Geo-NeighbourHood'] = 'UNKOWN'





In [None]:
# def checkAddKeys(address, key):
#     if key in address.keys():
#         return True
#     else:
#         return False

# Create DataFrames to hold new information
stl_city_df['Geo-City'] = 'UNKOWN'
stl_city_df['Geo-Road'] = 'UNKOWN'
stl_city_df['Geo-State'] = 'UNKOWN'
stl_city_df['Geo-County'] = 'UNKOWN'
stl_city_df['Geo-Village'] = 'UNKOWN'
stl_city_df['Geo-ZipCode'] = 'UNKOWN'
stl_city_df['Geo-NeighbourHood'] = 'UNKOWN'


for i in range(0,len(stl_city_df)):
    if stl_city_df['latitude'].isnull().iloc[i] or stl_city_df['longitude'].isnull().iloc[i]:
        pass
    else: 
        # Set Values for Latitude & Longitude
        Latitude = str(stl_city_df['latitude'][i])
        Longitude = str(stl_city_df['longitude'][i])

        # Set Location Variable
        location = geolocator.reverse(Latitude+","+Longitude)

        # We want the Address Information
        address = location.raw['address']

        if checkAddKeys(address, 'village') == True:
            stl_city_df['Geo-City'][i] = np.NaN
            stl_city_df['Geo-Road'][i] = np.NaN
            stl_city_df['Geo-State'][i] = address['state']
            stl_city_df['Geo-County'][i] = address['county']
            stl_city_df['Geo-Village'][i] = address['village']
            stl_city_df['Geo-ZipCode'][i] = address['postcode']
            stl_city_df['Geo-NeighbourHood'][i] = np.NaN

        elif checkAddKeys(address, 'city') == True:
            stl_city_df['Geo-City'][i] = address['city']
            stl_city_df['Geo-Road'][i] = address['road']
            stl_city_df['Geo-State'][i] = address['state']
            stl_city_df['Geo-County'][i] = np.NaN
            stl_city_df['Geo-Village'][i] = np.NaN
            stl_city_df['Geo-ZipCode'][i] = address['postcode']
            stl_city_df['Geo-NeighbourHood'][i] = address['neighbourhood']

display(stl_city_df)

In [None]:
# Set Latitude & Longitude
Latitude = str(stl_county_df['latitude'][0])
Longitude = str(stl_county_df['longitude'][0])

# Show Output of first row of data
display(stl_county_df.head(1))
 
 # Set Location Variable
location = geolocator.reverse(Latitude+","+Longitude)
print(location)

address = location.raw['address']
print(address)

#### Consolidate Columns 

In [None]:
# Rename Columns
stl_city_df['City'] = stl_city_df['city']
stl_city_df['State'] = stl_city_df['state']
stl_city_df['District'] = stl_city_df['district']
stl_city_df['Latitude'] = stl_city_df['latitude']
stl_city_df['Longitude'] = stl_city_df['longitude']
stl_city_df['Address'] = stl_city_df['incidentlocation']
stl_city_df['ORIG-OffenseName'] = stl_city_df['offense']


# Remove unnecessary columns
stl_city_df = stl_city_df.drop(columns=['inci_id','date_inc','time_inc', 
                                        'offense','nibrs_grp', 'nibrs_code',
                                        'beat','neighborhood', 'city', 'state',
                                        'locationStreet2_Apt', 'zip', 'longitude', 
                                        'latitude', 'district', 'incidentlocation'])

# Restructure Format of Columns
stl_city_df = stl_city_df.loc[:,['Datetime','NIBRS-OffenseName', 'NIBRS-OffenseCategory', 'ORIG-OffenseName', 'Latitude', 
                                 'Longitude' ,'Address', 'City', 'State', 'District', 'Patrol District']]

# Sort by Datetime and Reset Index
stl_city_df = stl_city_df.sort_values(by='Datetime', ascending=True).reset_index(drop=True)

# Show new Clean DataFrame
display(stl_city_df)

### St. Louis County

In [None]:
stl_county_df

#### Map NIBRS Data to DataFrame

In [None]:
# NIBR Mappings for St. Louis County
nibrs_maps_county = { 'Assault, Simple/Other': 'Simple Assault',
                      'Aggravated Assault, Hands/Feet/Fists': 'Aggravated Assault',
                      'Aggravated Assault, Firearm': 'Aggravated Assault',
                      'Aggravated Assault, Other Weapon': 'Aggravated Assault',
                      'Aggravated Assault, Knife/Cutting Instrument': 'Aggravated Assault',
                      'Assault, Intimidation': 'Intimidation',
                      'Burglary': 'Burglary/Breaking & Entering',
                      'Destruction/Damage/Vandalism Of Property': 'Destruction/Damage/Vandalism of Property',
                      'Drug Equipment/Paraphernalia Violations': 'Drug Equipment Violations',
                      'Fraud, Wire': 'Wire Fraud',
                      'Fraud, Welfare': 'Wire Fraud',
                      'Fraud, Credit Card/Automatic Teller Machine': 'Credit Card/Automated Teller Machine Fraud',
                      'Fraud, Identity Theft': 'Impersonation',
                      'Fraud, False Pretenses/Swindle/Confidence Game': 'False Pretenses/Swindle/Confidence Game',
                      'Fraud, Impersonation': 'Impersonation',
                      'Fraud, Hacking/Computer Invasion': 'Impersonation',
                      'Homicide, Murder And Nonnegligent Manslaughter': 'Murder & Nonnegligent Manslaughter',
                      'Homicide, Negligent Manslaughter': 'Negligent Manslaughter',
                      'Abduction/Kidnapping': 'Kidnapping/Abduction',
                      'Larceny': 'All Other Larceny',
                      'Pornography/Obscene Literature And Objects': 'Pornography/Obscene Material',
                      'Prostitution Offenses': 'Prostitution',
                      'Robbery, Other Dangerous Weapon': 'Robbery',
                      'Robbery, Firearm': 'Robbery',
                      'Robbery, Strongarm': 'Robbery',
                      'Robbery, Knife/Cutting Instrument': 'Robbery',
                      'Stolen Property (Buying, Receiving, Possessing)': 'Stolen Property Offenses',
                      'Weapon Law Violations': 'Weapon Law Violations',
                      'Motor Vehicle Theft': 'Motor Vehicle Theft',
                      'Extortion/Blackmail': 'Extortion/Blackmail',
                      'Bribery': 'Bribery',
                      'Drug/Narcotic Violations': 'Drug/Narcotic Violations',
                      'Counterfeiting/Forgery': 'Counterfeiting/Forgery',
                      'Arson': 'Arson',
                      'Embezzlement': 'Embezzlement',
                      'Human Trafficking': 'Kidnapping/Abduction'}

# Map Codes to Values for Offenses & Offense Categories
stl_county_df['NIBRS-OffenseName'] = stl_county_df['OffenseName'].map(nibrs_maps_county)
stl_county_df['NIBRS-OffenseCategory'] = stl_county_df['NIBRS-OffenseName'].map(nibrs_category_maps)

# Show Updated DataFrame
display(stl_county_df.head(3))

#### Convert to DateTime Format

In [None]:
# Convert the data it was called in to DateTime Format
stl_county_df['Datetime'] = pd.to_datetime(stl_county_df['dtCalledIntoCad'])

# Show the times
stl_county_df['Datetime'].head(3)

#### Consolidate Columns 

In [None]:
stl_county_df.columns

In [None]:
# Rename Columns
stl_county_df['State'] = 'MO'
stl_county_df['City'] = stl_county_df['forJuris']
stl_county_df['Address'] = stl_county_df['address']
stl_county_df['Patrol District'] = stl_county_df['district']
stl_county_df['Latitude'] = stl_county_df['latitude']
stl_county_df['Longitude'] = stl_county_df['longitude']
stl_county_df['ORIG-OffenseName'] = stl_county_df['OffenseName']

# Drop Unnecessary Columns
stl_county_df = stl_county_df.drop(columns=['Report Number', 'reportingJuris', 'forJuris',
                                            'address', 'latitude','longitude', 'dtCalledIntoCad',
                                            'occurred','OccDOW','OccMonth','premise', 'zone', 'district',
                                            'OffenseName'])

# ReStructure Columns
stl_county_df = stl_county_df.loc[:,['Datetime','NIBRS-OffenseName', 'NIBRS-OffenseCategory', 'ORIG-OffenseName', 'Latitude', 
                                     'Longitude' ,'Address', 'City', 'State', 'Patrol District']]

# Sort DataFrame by Datetime
stl_county_df = stl_county_df.sort_values(by='Datetime', ascending=True).reset_index(drop=True)

# Show Cleaned DataFrame
display(stl_county_df)
infoOut(stl_county_df)

## Exploritory Data Analysis

### Cleaned DataFrames

In [None]:
# Show St. Louis City 
print('St. Louis City')
print('-' * 20)
display(stl_city_df)

# Show St. Louis County
print('\n\nSt. Louis County')
print('-' * 20)
display(stl_county_df)

### Concatenate DataFrames

In [None]:
# Concatenate DataFrames
stl_crime_df = pd.concat([stl_city_df, stl_county_df])

# Sort DataFrame by Datetime
stl_crime_df = stl_crime_df.sort_values(by='Datetime', ascending=True).reset_index(drop=True)

# Display Merged DataFrames
display(stl_crime_df)

In [None]:
infoOut(stl_crime_df)

### Visualizations

In [None]:
# Create DataFrame of OffenseNames
stl_offense_df = pd.DataFrame(stl_crime_df['NIBRS-OffenseName'].value_counts().reset_index())

sns.barplot(data=stl_offense_df.head(20),
            y='NIBRS-OffenseName',
            x='count', orient='h')

plt.title('Top 10 Offenses')

In [None]:
stl_offenseCat_df = pd.DataFrame(stl_crime_df['NIBRS-OffenseCategory'].value_counts().reset_index())

display(stl_offenseCat_df)

sns.barplot(data=stl_offenseCat_df.head(20),
            y='NIBRS-OffenseCategory',
            x='count', orient='h')

In [None]:
display(stl_crime_df[stl_crime_df['OffenseName'].str.contains(offenseName, na=False)]['OffenseName'].value_counts().reset_index())

sns.barplot(data=stl_crime_df[stl_crime_df['OffenseName'].str.contains(offenseName, na=False)]['OffenseName'].value_counts().reset_index(),
            y='OffenseName',
            x='count', orient='h')

In [None]:
stl_crime_city_df = pd.DataFrame(stl_crime_df['City'].value_counts().reset_index())
stl_noCounty_df = stl_crime_city_df.loc[stl_crime_city_df['City'] != 'SAINT LOUIS COUNTY']
stl_noCityCounty_df = stl_noCounty_df.loc[stl_noCounty_df['City'] != 'Saint Louis']

display(stl_noCityCounty_df)

sns.barplot(data=stl_noCityCounty_df.head(10),
            y='City',
            x='count', orient='h')

In [None]:
stl_crime_district_df = pd.DataFrame(stl_crime_df['Patrol District'].value_counts().reset_index())
display(stl_crime_district_df)

sns.barplot(data=stl_crime_district_df.head(10),
            y='Patrol District',
            x='count', orient='h')

## Geographical Distribution

In [None]:
stl_crime_df

In [None]:
fig = px.scatter_mapbox(stl_crime_df, 
                        lat="Latitude", 
                        lon="Longitude", 
                        hover_name="Address",
                        hover_data=['NIBRS-OffenseName', 'Address'],
                        color='NIBRS-OffenseName',
                        #color_continuous_scale=px.colors.qualitative,
                        zoom=10, 
                        height=900,
                        width=1600)


# Different MapBox Styles
# carto-darkmatter
# open-street-map
# carto-positron

fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()