In [None]:
#Question 1, week 1. Introduction:
#Services vs. Safety in San Francisco
#I intend to investigate whether neighborhood crime rates in San Francisco are linked to the 
#various venues that are located in that neighborhood: for example, if there are more 
#bars or nightclubs in a neighborhood, will there be more reported incidents? 
#This is relevant information for landlords, sellers of real estate and families 
#with young kids who are comparing different neighborhoods when relocating.

In [None]:
#Question 2, week 1:
#I will be using Foursquare location data for San Francisco for the venue information, in combination with the 
#San Francisco Police Department data (https://data.sfgov.org/Public-Safety/Map-of-Police-Department-Incident-Reports-2018-to-/jq29-s5wp). 
#Both of these datasets should be up-to-date and are interlinked by latitude and longitude coordinates; both datasets also have a 'neighborhood' field. As the SFPD database
#is huge, I will be limiting the dataset to incidents reported between 1st Jan 2020 and 25th May 2020, or a shorter period if necessary (the Jan-May 2020
#period is already big with 46,382 incidents). The Foursquare data is accessed via their developer site, and the SFPD data is available in CSV format 
#from the SFPD page mentioned above. 
#I intend to combine the datasets and cluster the data per neighborhood or per location, depending on which one is more practical - the amount of data is large, which I expect to be 
#a problem in practice. Thereafter I will compare the results: are there some neighborhoods with a higher number of crime incidents, and if so, are there more venues of some type
#in those neighborhoods compared to other neighborhoods?

In [None]:
import numpy as np  # useful for many scientific computing in Python
import pandas as pd # primary data structure library
import folium
!conda install -c conda-forge folium=0.5.0 --yes
print('Folium installed and imported!')

In [None]:
import matplotlib.pyplot as plt #Other tools if need be
import pylab as pl
%matplotlib inline

In [None]:
#San Francisco Police Department data downloaded from their site
#df_data_0 holds the raw incident reports; the incident tables built later start from it
# NOTE(review): the path is relative, so the CSV is expected in the notebook's working directory
df_data_0 = pd.read_csv('Police_Department_Incident_Reports__2018_to_Present.csv')
df_data_0.head()


In [None]:
#Cleaning
# Build the working table as a NEW frame derived from the raw data. Assigning
# df_data_0 directly only creates an alias, so the former in-place drop/rename
# also modified the raw frame and made this cell fail (KeyError) when re-run.
# The earlier bare `dropna()` call discarded its result and has been removed;
# rows are filtered explicitly in the next cleaning step instead.
# NOTE(review): the introduction mentions limiting the data to Jan-May 2020,
# but no date filter is applied anywhere in the visible cells - TODO confirm.
df_incidents = (
    df_data_0
    .drop(columns=['Incident Datetime', 'Report Datetime', 'Incident Number',
                   'Row ID', 'Incident ID', 'CAD Number', 'Resolution', 'CNN',
                   'Supervisor District'])
    .rename(columns={'Latitude': 'Y',
                     'Longitude': 'X',
                     'Incident Category': 'IncidentCategory'})
)

In [None]:
#Further cleaning, dropping incidents that have no coordinates or category
# Both coordinates are required: the map cells below place a marker at [Y, X],
# so a row with only one of them present cannot be plotted. Reassigning (rather
# than dropping in place) keeps the cell safe to re-run.
df_incidents = df_incidents.dropna(subset=['X', 'Y', 'IncidentCategory'])
df_incidents.describe()

In [None]:
# Summary statistics restricted to the object-dtype (text) columns
df_incidents.describe(include=['object'])

In [None]:
#basic statistics for future use
#looking at which neighborhoods have the most incidents
# Name the index and the count column explicitly instead of renaming whatever
# value_counts() happens to call them: older pandas names the column after the
# source column, newer pandas names it 'count', so the former rename could
# silently do nothing. The frame stays indexed by 'Neighborhood' because the
# merge cell joins on that index level.
df_freq = (
    df_incidents['Analysis Neighborhood']
    .value_counts()
    .rename_axis('Neighborhood')
    .to_frame('Incidents')
)
df_freq.reset_index()


In [None]:
#giving the neighborhoods lat & long coordinates by taking the mean of the incident coordinates -> more realistic point when added to Foursquare data later on
# One groupby yields both mean coordinates; the index is named 'Neighborhood'
# so the merge cell can join on it. The earlier stand-alone reset_index() calls
# discarded their results and are gone.
neighborhood_coords = (
    df_incidents
    .groupby('Analysis Neighborhood')[['X', 'Y']]
    .mean()
    .rename_axis('Neighborhood')
)
coordx = neighborhood_coords['X']  # mean longitude per neighborhood
coordy = neighborhood_coords['Y']  # mean latitude per neighborhood
neighborhood_coords.reset_index()

In [None]:
# Pair up mean longitude and latitude per neighborhood, then attach the
# incident counts; both joins match on the 'Neighborhood' index level.
dfmerged = pd.merge(coordx, coordy, how='inner', on='Neighborhood')
dfmerged1 = df_freq.merge(dfmerged, how='inner', on='Neighborhood')
dfmerged1.reset_index()


In [None]:
# Turn the 'Neighborhood' index into a regular column; later cells read rows by
# integer label (e.g. dfmerged2.loc[0, ...]) and by the 'Neighborhood' column.
dfmerged2=dfmerged1.reset_index()
dfmerged2.info()

In [None]:
df_incidents['IncidentCategory'].value_counts().to_frame() #which incident type is the most common

In [None]:
#incidents on map
# Base map centred on San Francisco (approximate city-centre coordinates).
latitude, longitude = 37.77, -122.42
sanfran_map = folium.Map(zoom_start=12, location=[latitude, longitude])
sanfran_map


In [None]:
# Feature group that collects one circle marker per incident.
incidents = folium.map.FeatureGroup()

# Appearance shared by every incident marker.
marker_style = dict(
    radius=5,  # size of the circle markers
    color='yellow',
    fill=True,
    fill_color='blue',
    fill_opacity=0.6,
)

# Walk the (latitude, longitude) pairs and attach a marker for each to the group.
for lat, lng in zip(df_incidents.Y, df_incidents.X):
    folium.features.CircleMarker([lat, lng], **marker_style).add_to(incidents)

sanfran_map.add_child(incidents)


In [None]:
from folium import plugins

#the first map has too many incidents to be of use, clustering necessary for usability
# Start again from a fresh base map and hang a marker cluster on it.
sanfran_map = folium.Map(zoom_start=12, location=[latitude, longitude])
incidents = plugins.MarkerCluster()
incidents.add_to(sanfran_map)

# One marker per incident; the popup shows the incident category.
for lat, lng, label in zip(df_incidents.Y, df_incidents.X, df_incidents.IncidentCategory):
    incidents.add_child(folium.Marker(location=[lat, lng], popup=label, icon=None))

# display map
sanfran_map   #better, to be used in the presentation later on

In [None]:
#moving on to Foursquare stuff - need a large dataset of venues, but if it proves too difficul to handle, then a smaller set will have to do. 
#more libraries
import json 
!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans

In [None]:
# Foursquare API credentials. They are read from the environment (or prompted
# for) instead of being stored in the notebook, and the secret is never printed,
# so neither the source nor the saved output leaks it.
# NOTE(review): the key pair that was previously committed in this cell should
# be treated as exposed and rotated.
import os
from getpass import getpass

CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID') or getpass('Foursquare client ID: ')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET') or getpass('Foursquare client secret: ')
VERSION = '20180604'  # value sent as the API's `v` parameter
LIMIT = 30            # value sent as the API's `limit` parameter
print('Foursquare credentials loaded.')

In [None]:
address = '3350 Mission St, San Francisco, USA'     #Safeway at Mission Street, seems to be quite an active place
geolocator = Nominatim(user_agent="SanFran_agent")
location = geolocator.geocode(address)
# geocode() yields None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Nominatim could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print(latitude, longitude)

In [None]:
# Hard-coded map centre. Assigning latitude/longitude here replaces whatever
# values those names held before this cell ran.
latitude = 37.743217349999995 #might be unnecessary
longitude = -122.42247150547838
# Second base map, used below for the per-neighborhood markers
SanFrancisco_map = folium.Map(location=[latitude, longitude], zoom_start=12)
SanFrancisco_map

In [None]:
#the neighborhoods on a map. Might be unnecessary
# One circle marker per neighborhood at its mean incident coordinates; the
# popup carries the neighborhood name.
neighborhood_points = zip(dfmerged2['Y'], dfmerged2['X'], dfmerged2['Neighborhood'])
for lat, lng, neighborhood in neighborhood_points:
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup('{}'.format(neighborhood), parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(SanFrancisco_map)
SanFrancisco_map

In [None]:
# Pick out the first row of dfmerged2 (row label 0): its name and coordinates.
neighborhood_name = dfmerged2.at[0, 'Neighborhood']
neighborhood_latitude = dfmerged2.at[0, 'Y']
neighborhood_longitude = dfmerged2.at[0, 'X']

message = 'Latitude and longitude values of {} are {}, {}.'
print(message.format(neighborhood_name, neighborhood_latitude, neighborhood_longitude))

In [None]:
# Request parameters for a single-neighborhood "explore" call.
LIMIT=200   # replaces any earlier LIMIT value from here on
radius=750  # passed as the API's `radius` parameter
explore_url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'
url = explore_url.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
# Show the request for inspection, but with the secret masked so it does not
# end up in the saved notebook output. `url` itself still holds the real value.
explore_url.format(
    CLIENT_ID, 
    '<hidden>', 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)

In [None]:
# Bound the wait with a timeout and surface HTTP errors (bad credentials, quota
# exceeded, ...) here rather than as a confusing KeyError when the JSON is
# unpacked in a later cell.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()
results

In [None]:
def get_category_type(row):
    """Return the name of the first category attached to a venue row.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or dict)
        Must provide a list of category dicts under either 'categories' or
        'venue.categories'; each dict needs a 'name' entry.

    Returns
    -------
    The 'name' of the first category, or None when the list is empty.

    Raises
    ------
    KeyError
        If neither 'categories' nor 'venue.categories' is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error propagates
        # instead of being swallowed by a bare `except`.
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

In [None]:
# Venue items of the first result group returned by the explore call.
venues = results['response']['groups'][0]['items']

# Flatten the JSON and keep only the fields of interest.
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = json_normalize(venues).loc[:, filtered_columns]

# Collapse each category list to the name of its first entry.
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# Keep only the last component of each dotted column label.
nearby_venues = nearby_venues.rename(columns=lambda col: col.split(".")[-1])

nearby_venues.head()

In [None]:
# Report how many venue rows the single-neighborhood query produced.
venue_count = len(nearby_venues)
print('{} venues were returned by Foursquare.'.format(venue_count))

In [None]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare "explore" endpoint around each given point.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Label and coordinates of each point to search around.
    radius : number, default 500
        Sent as the `radius` query parameter of every request.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame has these columns even when no venue was found.

    Raises
    ------
    requests.HTTPError
        If a request returns an HTTP error status.

    Notes
    -----
    Reads the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.
    """
    columns = ['Neighborhood', 
               'Neighborhood Latitude', 
               'Neighborhood Longitude', 
               'Venue', 
               'Venue Latitude', 
               'Venue Longitude', 
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)  # progress indicator, one line per point

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)

        # make the GET request; bound the wait and surface HTTP errors early
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        items = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue['categories']
            rows.append((
                name, 
                lat, 
                lng, 
                venue['name'], 
                venue['location']['lat'], 
                venue['location']['lng'],  
                # a venue with an empty category list gets None instead of
                # aborting the whole collection with an IndexError
                categories[0]['name'] if categories else None))

    # Passing the column labels to the constructor keeps the schema intact even
    # when `rows` is empty (assigning 7 labels to a 0-column frame would raise).
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return(nearby_venues)

# Collect venues around every neighborhood listed in dfmerged2 (not only the
# Mission, despite the variable name). `radius` is not passed, so the
# function's default of 500 applies here.
Mission_Venues = getNearbyVenues(names=dfmerged2['Neighborhood'],
                                   latitudes=dfmerged2['Y'],
                                   longitudes=dfmerged2['X']
                                  )



In [None]:
# Per-neighborhood count of non-null entries in each venue column
Mission_Venues.groupby('Neighborhood').count()

In [None]:
# Number of distinct venue categories (missing values, if any, count as one).
category_count = Mission_Venues['Venue Category'].nunique(dropna=False)
print('There are {} unique categories.'.format(category_count))

In [None]:
# one hot encoding of the venue category (one indicator column per category)
Mission_onehot = pd.get_dummies(Mission_Venues[['Venue Category']], prefix="", prefix_sep="")

# attach the neighborhood each venue row belongs to
# NOTE(review): if a venue category is itself called 'Neighborhood', this
# assignment overwrites that indicator column - verify against the data.
Mission_onehot['Neighborhood'] = Mission_Venues['Neighborhood']

# column order with 'Neighborhood' first, all other columns in their existing order
cols = ['Neighborhood'] + [c for c in Mission_onehot.columns if c != 'Neighborhood']
Mission_onehot = Mission_onehot.reindex(columns=cols)

Mission_onehot.head(85)

In [None]:
# Mean of each one-hot column per neighborhood, i.e. the share of that
# neighborhood's venue rows falling into each category
Sanfran_grouped = Mission_onehot.groupby('Neighborhood').mean().reset_index()
Sanfran_grouped

In [None]:
num_top_venues = 20

# For every neighborhood, print its venue categories ranked by relative frequency.
for hood in Sanfran_grouped['Neighborhood']:
    print("----"+hood+"----")
    is_hood = Sanfran_grouped['Neighborhood'] == hood
    # transpose the single matching row so that categories become rows
    temp = Sanfran_grouped[is_hood].T.reset_index()
    temp.columns = ['venue', 'freq']
    temp = temp.iloc[1:]  # first row holds the neighborhood name, not a frequency
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    ranked = temp.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')

In [None]:
# Sum of each one-hot column per neighborhood, i.e. the number of venue rows
# in each category
Sanfran_groupedcount = Mission_onehot.groupby('Neighborhood').sum().reset_index()
Sanfran_groupedcount

In [None]:
num_top_venues = 20

# For every neighborhood, print its venue categories ranked by absolute count.
for hood in Sanfran_groupedcount['Neighborhood']:
    print("----"+hood+"----")
    is_hood = Sanfran_groupedcount['Neighborhood'] == hood
    # transpose the single matching row so that categories become rows
    temp = Sanfran_groupedcount[is_hood].T.reset_index()
    temp.columns = ['venue', 'freq']
    temp = temp.iloc[1:]  # first row holds the neighborhood name, not a count
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    ranked = temp.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')

In [None]:
#it looks like the statistics do not point to a larger number of bars in Mission or Tenderloin. A closer look at the crime types in those areas might be useful

In [None]:
# Keep only the neighborhood and the incident category for the per-neighborhood look at crime types
df_neighborhoods_incidents=df_incidents[['Analysis Neighborhood', 'IncidentCategory']]
df_neighborhoods_incidents.head()

In [None]:
# Summary of the incident rows whose neighborhood is 'Mission'
tool=df_neighborhoods_incidents.groupby('Analysis Neighborhood')
tool1=tool.get_group('Mission')
tool1.describe()

In [None]:
# Summary of the incident rows whose neighborhood is 'Tenderloin'
tool=df_neighborhoods_incidents.groupby('Analysis Neighborhood')
tool1=tool.get_group('Tenderloin')
tool1.describe()

In [None]:
# Summary of the incident rows whose neighborhood is 'Western Addition'
tool=df_neighborhoods_incidents.groupby('Analysis Neighborhood')
tool1=tool.get_group('Western Addition')
tool1.describe()

In [None]:
#looking closer at which incidents happens where with onehot

In [None]:
# one hot encoding of the incident category (one indicator column per category)
Incident_onehot = pd.get_dummies(df_incidents[['IncidentCategory']], prefix="", prefix_sep="")

# attach the neighborhood each incident row belongs to
Incident_onehot['Analysis Neighborhood'] = df_incidents['Analysis Neighborhood']

# column order with 'Analysis Neighborhood' first, the rest in their existing order
cols1 = ['Analysis Neighborhood'] + [
    c for c in Incident_onehot.columns if c != 'Analysis Neighborhood'
]
Incident_onehot = Incident_onehot.reindex(columns=cols1)

Incident_onehot.head(85)

In [None]:
# Mean of each one-hot column per neighborhood, i.e. the share of that
# neighborhood's incidents falling into each category
Incident_groupedmean = Incident_onehot.groupby('Analysis Neighborhood').mean().reset_index()
Incident_groupedmean

In [None]:
# Sum of each one-hot column per neighborhood, i.e. the number of incidents
# in each category
Incident_groupedcount = Incident_onehot.groupby('Analysis Neighborhood').sum().reset_index()
Incident_groupedcount

In [None]:
num_top_incidents = 20

# For every neighborhood, print its incident categories ranked by relative
# frequency. The row limit is this cell's own num_top_incidents; the previous
# version read num_top_venues, a variable set by an unrelated venue cell.
for hood in Incident_groupedmean['Analysis Neighborhood']:
    print("----"+hood+"----")
    # transpose the single matching row so that categories become rows
    temp = Incident_groupedmean[Incident_groupedmean['Analysis Neighborhood'] == hood].T.reset_index()
    temp.columns = ['type','freq']
    temp = temp.iloc[1:]  # first row holds the neighborhood name, not a frequency
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_incidents))
    print('\n')

In [None]:
num_top_incidents = 20

# For every neighborhood, print its incident categories ranked by absolute count.
for hood in Incident_groupedcount['Analysis Neighborhood']:
    print("----"+hood+"----")
    is_hood = Incident_groupedcount['Analysis Neighborhood'] == hood
    # transpose the single matching row so that categories become rows
    temp = Incident_groupedcount[is_hood].T.reset_index()
    temp.columns = ['type', 'freq']
    temp = temp.iloc[1:]  # first row holds the neighborhood name, not a count
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    ranked = temp.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_incidents))
    print('\n')

In [None]:
# Bar chart: number of 'Other Offenses' incidents per neighborhood
Incident_groupedcount.plot(kind='bar',x='Analysis Neighborhood',y='Other Offenses', figsize=(10,5))

In [None]:
# Bar chart: number of 'Larceny Theft' incidents per neighborhood
Incident_groupedcount.plot(kind='bar',x='Analysis Neighborhood',y='Larceny Theft', figsize=(10,5))

In [None]:
# Bar chart: total number of incidents per neighborhood.
# No visible cell creates a 'Total' column, so it is computed here as the
# row-wise sum of the per-category counts. assign() returns a new frame, so
# Incident_groupedcount is left untouched and the cell is safe to re-run; an
# already existing 'Total' column is excluded from the sum to avoid double counting.
incident_totals = Incident_groupedcount.assign(
    Total=Incident_groupedcount
    .drop(columns=['Analysis Neighborhood', 'Total'], errors='ignore')
    .sum(axis=1)
)
incident_totals.plot(kind='bar',x='Analysis Neighborhood',y='Total', figsize=(10,5))