In [None]:
import os

from sqlalchemy import create_engine, text
import geopandas as gpd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import folium
from folium.plugins import MarkerCluster
from folium.plugins import FastMarkerCluster

First, we need to create a connection string. The format is

 ```<dialect(+driver)>://<username>:<password>@<hostname>:<port>/<database>```

In [None]:
# Connection settings. Each value can be overridden through an environment
# variable so real credentials never have to be committed with the notebook;
# the fallbacks are the local development defaults.
database_name = os.environ.get('SCOOTERS_DB_NAME', 'scooters')  # Fill this in with your database name
db_user = os.environ.get('SCOOTERS_DB_USER', 'postgres')
db_password = os.environ.get('SCOOTERS_DB_PASSWORD', 'postgres')
db_host = os.environ.get('SCOOTERS_DB_HOST', 'localhost')
db_port = os.environ.get('SCOOTERS_DB_PORT', '5433')

# Format: <dialect(+driver)>://<username>:<password>@<hostname>:<port>/<database>
connection_string = f"postgresql://{db_user}:{db_password}@{db_host}:{db_port}/{database_name}"

In [None]:
# Echo the assembled URL to check it; note that the output contains the password in plain text.
connection_string 

Now, we need to create an engine and use it to connect.

In [None]:
# Build the SQLAlchemy engine from the connection string; the query cells open connections from it.
engine = create_engine(connection_string)

In [None]:
# Display the engine object to confirm it was created.
engine

Now, we can create our query, wrap it in `text()`, and pass it into the connection's `.execute()` method.

In [None]:
# Preview query: the first 10 rows of the scooters table.
query = '''
SELECT *
FROM scooters 
LIMIT 10;
'''

# Run the statement on a short-lived connection; the context manager releases
# the connection when the block exits.
with engine.connect() as conn:
    statement = text(query)
    result = conn.execute(statement)

In [None]:
# A result should be consumed while its connection is still open: once the
# `with engine.connect()` block has released the connection, reading from the
# result is not reliable (depending on the driver it can raise
# ResourceClosedError). Re-run the preview query and fetch the first row
# inside the block instead.
with engine.connect() as connection:
    first_row = connection.execute(text(query)).fetchone()

first_row

In [None]:
# Load a 100-row sample of the scooters table straight into a DataFrame.
query = '''
SELECT * 
FROM scooters
LIMIT 100;
'''

with engine.connect() as conn:
    scooters_query = pd.read_sql(sql = text(query), con = conn)

scooters_query.head()

## Uploading Using SQLAlchemy
#use this format to directly load the dataframes from SQL (do this 7 more times - filter scooters by company name): 

#pull the whole trips table into a dataframe
query = '''
SELECT *
FROM trips;
'''
with engine.connect() as connection:
    tripsdf = pd.read_sql(text(query), con = connection)
tripsdf.head()

#then turn each query into a csv. The dataframe is named tripsdf (there is no `trips`
#variable), and the file goes into data/ because that is where the read_csv cells load it from:
tripsdf.to_csv("data/tripsdf.csv", index=False)

## Reading all files as csv's (smallest to largest): 

In [None]:
# Load the saved export of the trips table.
tripsdf = pd.read_csv('data/tripsdf.csv')

In [None]:
# Load the per-company export of the scooters table: Bolt.
boltdf = pd.read_csv('data/boltdf.csv')

In [None]:
# Load the per-company export of the scooters table: Gotcha.
gotchadf = pd.read_csv('data/gotchadf.csv')

In [None]:
# Load the per-company export of the scooters table: Spin.
spindf = pd.read_csv('data/spindf.csv')

In [None]:
# Load the per-company export of the scooters table: Lyft.
lyftdf = pd.read_csv('data/lyftdf.csv')

In [None]:
# Load the per-company export of the scooters table: Bird.
birddf = pd.read_csv('data/birddf.csv')

In [None]:
# Load the per-company export of the scooters table: Lime.
limedf = pd.read_csv('data/limedf.csv')

In [None]:
# Load the per-company export of the scooters table: Jump.
jumpdf = pd.read_csv('data/jumpdf.csv')

## Now combine the dataframes to recreate the scooters table as a dataframe: 

In [None]:
#Create a list with all the company data frames, then concatenate them into "scootersdf"
scooters_list = [birddf, boltdf, gotchadf, jumpdf, limedf, lyftdf, spindf]
#ignore_index=True gives the combined frame a fresh 0..n-1 index; without it each
#company file keeps its own 0-based row labels and the combined index contains duplicates.
scootersdf = pd.concat(scooters_list, ignore_index=True)

In [None]:
# Preview the first rows of the combined scooters dataframe.
scootersdf.head()

Metro Scooters Analysis
In May of 2018, Bird dropped hundreds of scooters on the streets of Nashville with no permission. In response, Metro sued, which caused Bird to remove and wait for permits. Metro began developing regulations for scooters and other shared urban mobility devices (SUMDs). In 2019, the Metro Council passed legislation enacting a one-year pilot program for scooters. For this project, you have been provided with the data for 3 months of this pilot program with the goal of reporting on usage trends and generating recommendations for quantity and distribution of scooters in Nashville.

Metro would like to know what the ideal density of available scooters is, which balances the objectives of enabling scooters to serve transportation goals, discouraging scooters from piling up on sidewalks, keeping it economically viable for companies to operate equitably in the city.

This data for this project can be downloaded as a Postgres backup from https://drive.google.com/file/d/1BXAfByFvHCwX0G1BvTCQ373qKm7wE4Y-/view?usp=share_link.

Some notes about the data:

When not in use, each scooter will report its location every five minutes. This data is contained in the scooters table.
WARNING: Both tables contain a large number of records, so think carefully about what data you need to pull in a given query. If you try and pull in all rows from the scooters table, there is a very good chance that you will crash your notebook!

# Exploratory Analysis Questions:

## Are there any null values in any columns in either table?

In [None]:
#Check for null values in the scooters table/DF:
#the result has one True/False per column, True meaning the column holds at least one missing value
scootersdf.isna().any()

In [None]:
#Same check for the trips dataframe: True marks a column with at least one missing value
tripsdf.isna().any()

#### Answer: The only NULL values are in the charge level from the trips table

## What date range is represented in each of the date columns? Investigate any values that seem odd.

In [None]:
#Convert the trips date column to datetime. The matching conversion for the scooters
#dataframe is left commented out, so scootersdf['pubdatetime'] keeps the type it was loaded with.
#scootersdf['pubdatetime'] = pd.to_datetime(scootersdf['pubdatetime'], format = 'mixed')
tripsdf['pubtimestamp'] = pd.to_datetime(tripsdf['pubtimestamp'], format = 'mixed')

In [None]:
#scootersdf['pubdatetime'] is never converted to datetime (the conversion cell has that line
#commented out) and the .dt accessor only works on datetime values, so parse the column here
#into a separate Series and leave scootersdf itself untouched.
scooters_pubdatetime = pd.to_datetime(scootersdf['pubdatetime'], format = 'mixed')
#Take min/max on the timestamps first and only then reduce to a date, which avoids building
#a Python date object for every row.
scooters_first_day = scooters_pubdatetime.min().date()
scooters_last_day = scooters_pubdatetime.max().date()
scooters_table_date_range = f"Date range for scooters table: {scooters_first_day} to {scooters_last_day}"
print(scooters_table_date_range)

In [None]:
#Earliest and latest calendar day found in the trips pubtimestamp column
trips_pub_dates = tripsdf['pubtimestamp'].dt.date
trips_table_date_range = f"Date range for trips table: {trips_pub_dates.min()} to {trips_pub_dates.max()}"
print(trips_table_date_range)

#### Answer: The date range for both tables is May-July (trips table includes one day of August)

## Is time represented with am/pm or using 24 hour values in each of the columns that include time?

In [None]:
#Show the first rows to inspect how the time part of pubdatetime is written
#(observation: the scooters table uses a 24 hour timestamp in the pubdatetime column)
scootersdf.head()

In [None]:
#Show the last rows of the trips dataframe to inspect its time columns
#(observation: the trips table has more columns, and they are all in 24 hour time format as well)
tripsdf.tail()

##### Answer: both are in 24 hour time

## What values are there in the sumdgroup column? Are there any that are not of interest for this project?

In [None]:
#Count how many rows carry each distinct sumdgroup value
scootersdf.value_counts(subset='sumdgroup')

#### Answer: There is no need to have the bicycle group

## What are the minimum and maximum values for all the latitude and longitude columns? Do these ranges make sense, or is there anything surprising?

In [None]:
#Smallest and largest longitude/latitude reported anywhere in the scooters table
query = '''
SELECT min(longitude) as min_long, min(latitude) as min_lat, max(longitude) as max_long, max(latitude) as max_lat
FROM scooters;
'''

#The aggregation runs in the database, so only the one summary row comes back
with engine.connect() as conn:
    scooters_long_lat_range = pd.read_sql(sql = text(query), con = conn)

scooters_long_lat_range.head()

In [None]:
#Smallest and largest trip *start* longitude/latitude in the trips table
query = '''
SELECT min(startlongitude) as min_long, min(startlatitude) as min_lat, max(startlongitude) as max_long, max(startlatitude) as max_lat
FROM trips;
'''

with engine.connect() as conn:
    trips_start_long_lat_range = pd.read_sql(sql = text(query), con = conn)

trips_start_long_lat_range.head()

In [None]:
#Smallest and largest trip *end* longitude/latitude in the trips table
query = '''
SELECT min(endlongitude) as min_long, min(endlatitude) as min_lat, max(endlongitude) as max_long, max(endlatitude) as max_lat
FROM trips;
'''

with engine.connect() as conn:
    trips_end_long_lat_range = pd.read_sql(sql = text(query), con = conn)

trips_end_long_lat_range.head()

#### Answer: some values in the scooters table are 0, which are outside the range of Nashville. Also in the trips table the end longitude range is much wider than it should be. I could test this with geopandas but I don't have time

## What is the range of values for trip duration and trip distance? Do these values make sense? Explore values that might seem questionable.

In [None]:
#Extremes of trip duration and trip distance across the whole trips table
query = '''
SELECT min(tripduration) as min_duration, max(tripduration) as max_duration, min(tripdistance) as min_distance, max(tripdistance) as max_distance
FROM trips;
'''

with engine.connect() as conn:
    trips_duration_and_distance_range = pd.read_sql(sql = text(query), con = conn)

trips_duration_and_distance_range.head()

#### Answer: Some of the numbers are negative

## Check out how the values for the company name column in the scooters table compare to those of the trips table. What do you notice?

#### Answer: some of the names have alternative spellings on each table

distinct company names in both tables: 

"Bird"

"Bolt"
"Bolt Mobility"

"Gotcha"

"Jump"
"JUMP"

"Lime"

"Lyft"

"Spin"
"SPIN"

In [None]:
#Rewrite the alternative company spellings in the trips dataframe to the spelling
#used in the scooters table, one replacement after the other
company_aliases = {'Bolt Mobility': 'Bolt', 'SPIN': 'Spin', 'JUMP': 'Jump'}
cleaned_names = tripsdf['companyname']
for alias, canonical in company_aliases.items():
    cleaned_names = cleaned_names.str.replace(alias, canonical)
tripsdf['companyname'] = cleaned_names

# Focus Questions


## 1. During this period, seven companies offered scooters. 

### How many scooters did each company have in this time frame? 

### Did the number for each company change over time? 

### Did scooter usage vary by company? 

In [None]:
#Number of distinct scooter ids (sumdid) per company, restricted to rows whose
#sumdgroup matches 'scooter' case-insensitively
query = '''
SELECT
    companyname AS company,
    COUNT(DISTINCT sumdid) AS units
FROM
    scooters
WHERE
    sumdgroup ILIKE 'scooter'
GROUP BY
    companyname;
'''

with engine.connect() as conn:
    company_scooters = pd.read_sql(sql = text(query), con = conn)

company_scooters.head()

In [None]:
#The same query in pandas should be (group first, then count distinct ids; nunique is a method and must be called):
#scootersdf[scootersdf['sumdgroup'].str.lower() == 'scooter'].groupby('companyname')['sumdid'].nunique()

In [None]:
#just do one company at a time
#convert Bolt's pubdatetime column to datetime so the .dt accessors can be used on it
boltdf['pubdatetime'] = pd.to_datetime(boltdf['pubdatetime'], format = 'mixed')

In [None]:
#add the month name (e.g. 'May') of each record as its own column
boltdf['month'] = boltdf['pubdatetime'].dt.month_name()

In [None]:
#Count the distinct scooters (sumdid) that reported in each month. value_counts()
#would instead return how many records each individual scooter has per month,
#which does not answer how many scooters the company had.
bolt_by_month = boltdf.groupby('month')['sumdid'].nunique()
bolt_by_month.head()

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Distinct Bolt scooters reporting in each calendar month. Plotting the raw sumdid
# column against month draws one point per record with scooter ids on the y axis,
# so aggregate first. Grouping on the monthly period of pubdatetime (rather than
# the month-name column) keeps the months in chronological, not alphabetical, order.
bolt_units_per_month = boltdf.groupby(boltdf['pubdatetime'].dt.to_period('M'))['sumdid'].nunique()

fig, ax = plt.subplots()
ax.plot(bolt_units_per_month.index.strftime('%B'), bolt_units_per_month.to_numpy(), marker = 'o')
# Add axis labels
ax.set_xlabel('Month')
ax.set_ylabel('Units')

# Add title
ax.set_title('Bolt Scooters Per Month')
plt.show()

#### I am confused?

## 2. According to Second Substitute Bill BL2018-1202 (as amended) (https://web.archive.org/web/20181019234657/https://www.nashville.gov/Metro-Clerk/Legislative/Ordinances/Details/7d2cf076-b12c-4645-a118-b530577c5ee8/2015-2019/BL2018-1202.aspx), all permitted operators will first clean data before providing or reporting data to Metro. 

### Data processing and cleaning shall include:  
#### * Removal of staff servicing and test trips  
#### * Removal of trips below one minute  
#### * Trip lengths are capped at 24 hours  

## Are the scooter companies in compliance with the second and third part of this rule?

In [None]:
# Compliance flag per trip: True when the duration is at least 1 and at most 1440.
# Both bounds are inclusive because the bill removes trips *below* one minute and
# *caps* trips at 24 hours, so a trip of exactly 1 or exactly 1440 is compliant.
# Missing durations evaluate to False.
# assumes tripduration is recorded in minutes (1440 = 24 h) — TODO confirm
filtered_trips = tripsdf['tripduration'].between(1.0, 1440.0)

In [None]:
# Turn the boolean mask into a one-column DataFrame. pd.DataFrame() accepts both a
# Series and a DataFrame, so re-running this cell is harmless, whereas .to_frame()
# fails on the second run because the name is already bound to a DataFrame.
filtered_trips = pd.DataFrame(filtered_trips)

In [None]:
# Count how many trips fall inside (True) and outside (False) the allowed duration range.
filtered_trips.value_counts()

#### Answer: 23,514 of the trips were not within the parameters (4%)

## 3. The goal of Metro Nashville is to have each scooter used a minimum of 3 times per day. 

### Based on the data, what is the average number of trips per scooter per day? Make sure to consider the days that a scooter was available. 

### How does this vary by company?

In [None]:
# Preview the trips dataframe; the trips-per-scooter-per-day analysis for this question is not implemented yet (TODO).
tripsdf.head()

# 4. Metro would like to know how many scooters are needed, and something that could help with this is knowing peak demand. 

## Estimate the highest count of scooters being used at the same time. 

## When were the highest volume times? 

## Does this vary by zip code or other geographic region?

In [None]:
#Combine the separate startdate and starttime columns into one datetime column
start_stamp_text = tripsdf['startdate'].astype('str') + ' ' + tripsdf['starttime'].astype('str')
tripsdf['startdatetime'] = pd.to_datetime(start_stamp_text, format ='mixed')

#Count the trips that started in each hour of the day over the whole dataset
trip_start_hours = tripsdf['startdatetime'].dt.hour
scooters_per_hour = (
    tripsdf.assign(hour = trip_start_hours)
    .groupby(['hour'])['startdatetime']
    .count()
    .reset_index()
)
scooters_per_hour.head()

In [None]:
# Plot trips started per hour of day. Passing the whole DataFrame to plt.plot draws
# every column against the row index (including the hour column itself), so pick
# the x and y columns explicitly and label the figure.
fig, ax = plt.subplots()
ax.plot(scooters_per_hour['hour'], scooters_per_hour['startdatetime'])
ax.set_xlabel('Hour of day')
ax.set_ylabel('Trips started')
ax.set_title('Trips Started per Hour of Day')
plt.show()

#This gives us the count of scooters used per day_of_week for the duration of the dataset
#day_name is a method and has to be called: handing the bare method to assign() makes
#pandas treat it as a callable to invoke with the DataFrame, which fails instead of
#producing the weekday names.
scooters_per_day = tripsdf.assign(day_of_week = tripsdf['startdatetime'].dt.day_name()).groupby(['day_of_week'])['startdatetime'].count().reset_index()
scooters_per_day.head()

In [None]:
#Next, lets look at the concentration of scooters per zipcode:

#Load the zip code polygons and keep only the columns used below
#optional checks: print(zipcodes.crs), zipcodes.head()
zipcodes = gpd.read_file('data/zipcodes.geojson')[['zip', 'po_name', 'geometry']]

#Turn the trip coordinates into point geometries (x = longitude, y = latitude)
#NOTE(review): the points are labelled with the zip layer's CRS without any reprojection;
#this presumes that CRS is plain longitude/latitude - confirm with zipcodes.crs
start_point_geometry = gpd.points_from_xy(x = tripsdf['startlongitude'], y = tripsdf['startlatitude'])
trips_start_geo = gpd.GeoDataFrame(tripsdf, geometry = start_point_geometry, crs = zipcodes.crs)

end_point_geometry = gpd.points_from_xy(x = tripsdf['endlongitude'], y = tripsdf['endlatitude'])
trips_end_geo = gpd.GeoDataFrame(tripsdf, geometry = end_point_geometry, crs = zipcodes.crs)

#Spatial joins: one row per trip start / end point that lies within a zip code polygon
start_points_by_zip = trips_start_geo.sjoin(zipcodes, predicate = 'within')
end_points_by_zip = trips_end_geo.sjoin(zipcodes, predicate = 'within')

In [None]:
#number of trips that start in each zip code, as a two-column dataframe (zip, count)
start_points = start_points_by_zip['zip'].value_counts().rename_axis('zip').reset_index(name = 'count')

In [None]:
#number of trips that end in each zip code, as a two-column dataframe (zip, count)
end_points = end_points_by_zip['zip'].value_counts().rename_axis('zip').reset_index(name = 'count')

In [None]:
#attach the trip-start counts to the zip code polygons ("start_zips"), matching rows on the shared zip column
start_zips = zipcodes.merge(right = start_points, on = 'zip')

In [None]:
#attach the trip-end counts to the zip code polygons ("end_zips"), matching rows on the shared zip column
end_zips = zipcodes.merge(right = end_points, on = 'zip')

In [None]:
#Choropleth of trip starts per zip code, shaded by the count column
fig, ax = plt.subplots(figsize=(10,10))
choropleth_options = dict(column = 'count', cmap = 'RdPu', legend = True, edgecolor = 'black')
start_zips.plot(ax = ax, **choropleth_options)
plt.title('Scooter Usage by Zipcode', fontsize = 14)
ax.axis('off')

#### Answer: Top 3 zip codes by trip volume are 37203, 37201, and 37219

In [None]:
#Lets look at the scooters in the three highest density zipcodes: 37203, 37201 and 37219
#(the variable keeps the name start_zips_37203 even though it covers all three)
        #Note: You can use this code to check the counts per zip: 
        #start_points.head()

#First, filter the full dataframe (start_points_by_zip) to get the scooters that started trips in those zip codes. 
#You have to use that specific variable bc its the one right after the geopandas sjoin that still includes all the individual rows
#.copy() makes the selection an independent dataframe, so the columns added to it in the
#following cells are not assignments into a slice of start_points_by_zip (SettingWithCopyWarning).
start_zips_37203 = start_points_by_zip.loc[start_points_by_zip['zip'].isin(['37203', '37201', '37219'])].copy()
start_zips_37203.info()

In [None]:
#rebuild startdatetime as a datetime column from the separate start date and start time
start_text_37203 = start_zips_37203['startdate'].astype('str') + ' ' + start_zips_37203['starttime'].astype('str')
start_zips_37203['startdatetime'] = pd.to_datetime(start_text_37203, format ='mixed')


In [None]:
#check the column types again after the startdatetime conversion
start_zips_37203.info()

In [None]:
#add an 'hour' column holding the hour of day in which each trip started
trip_hours_37203 = start_zips_37203['startdatetime'].dt.hour
start_zips_37203['hour'] = trip_hours_37203

In [None]:
#add a 'week_day' column holding the weekday name on which each trip started
trip_week_days_37203 = start_zips_37203['startdatetime'].dt.day_name()
start_zips_37203['week_day'] = trip_week_days_37203

In [None]:
#notice the dataframe now has hour and week_day as columns (added in the two cells above)
start_zips_37203.head()

In [None]:
#Number of distinct scooters (sumdid) that started a trip on each day of the week, busiest day first
week_day_ids = start_zips_37203[['sumdid', 'week_day']]
scooters_by_week_day = week_day_ids.groupby(['week_day'])['sumdid'].nunique()
scooters_by_week_day.reset_index(name = 'count').sort_values(['count'], ascending = False)

In [None]:
#keep only the zip code polygons (rows) for 37203, 37201 and 37219
selected_zips = ['37203', '37201', '37219']
is_selected_zip = zipcodes['zip'].isin(selected_zips)
polygon37203 = zipcodes.loc[is_selected_zip]
polygon37203.shape

In [None]:
#Draw the selected zip code polygons in light green, then overlay the trip start
#points coloured by the hour of day in which the trip started
fig, ax = plt.subplots(figsize = (8, 10))
polygon37203.plot(ax = ax, color = 'lightgreen')
start_zips_37203.plot(ax = ax, column = 'hour')
plt.show()


### Folium maps require a center point for the street map. 

In [None]:
#Make use of a GeoSeries attribute (.centroid) which gives the center of a polygon.
#The filtered polygons are first dissolved into a single shape so the center describes the
#whole selected area; looking the centroid up by a fixed index label (35) only works if a
#row with that exact label happens to be part of the filtered dataframe.
center = polygon37203.dissolve().geometry.centroid.iloc[0]
print(center)

### Folium requires a location point as an array 
- with *latitude* first
- but shapely Points have *longitude* first 

In [None]:
# folium wants [latitude, longitude]; the point stores longitude as x and latitude as y
center_lat, center_lng = center.y, center.x
area_center = [center_lat, center_lng]

# confirm the order of the folium map location
print(area_center)

### To create a folium map just call the folium Map() constructor 
- location is the only argument required by the constructor
- Set area_center as the location of the folium map.
- zoom_start is an optional argument
- A setting of 12 should get us nice and close

In [None]:
#build the base street map centered on the selected zip code area and display it
map_options = {'location': area_center, 'zoom_start': 12}
map_37203 = folium.Map(**map_options)
map_37203

#### Next, let's add our 37203 polygon and scooter trip start markers.

#### `iterrows()` is a generator that iterates through the rows of a DataFrame and returns a tuple with the row id and row values. 
- Below, we are printing the row values for the first 3 rows as we iterate through the GeoDataFrame of scooter trips started in 37203. 
- This idea will be helpful for creating our markers!

In [None]:
# Print the index label and the values of the first three rows only. head(3) limits
# the iteration up front, so the loop no longer walks through every remaining row of
# the dataframe just to skip it.
for row_index, row_values in start_zips_37203.head(3).iterrows():
    print('index is', row_index)
    print('values are:')
    print(' ')
    print(row_values)
    print('------------------------- ')

#### Create map_37203 again with additional features:
- add the zipcode area outline (polygon_37203)
- iterate through the start_zips_37203 to
    - create location from each lat and lng
    - create a popup from the hour column
    - create a custom icon if you want with [font-awesome](https://fontawesome.com/v4.7.0/icons/)
    - build a marker from each location and popup and (optionally) your custom icon 
    - add the marker to  `map_37203`
- display `map_37203`

In [None]:
#There are over 25k markers in the start_zips_37203 dataframe so we may need to filter it:
    #check number of rows: start_zips_37203.shape
#keep only the trips that started in hour 12 or hour 13
midday_hours = [12, 13]
is_midday_trip = start_zips_37203['hour'].isin(midday_hours)
start_zips_37203_hours_12_to_1pm = start_zips_37203[is_midday_trip]
start_zips_37203_hours_12_to_1pm.head()

In [None]:
#number of (rows, columns) left after the 12-1pm filter
start_zips_37203_hours_12_to_1pm.shape

In [None]:
#trips that started in hour 18, 19 or 23 on a Friday, Saturday or Sunday
in_peak_hour = start_zips_37203['hour'].isin([18, 19, 23])
on_peak_day = start_zips_37203['week_day'].isin(['Saturday', 'Sunday', 'Friday'])
start_zips_37203_peak_hours = start_zips_37203[in_peak_hour & on_peak_day]
start_zips_37203_peak_hours.shape

### I attempted to first plot out each individual coordinate, but there are way too many. I created a clustered marker map instead (see code below this one)

#base map plus the outline of the selected zip code area
map_37203 = folium.Map(location =  area_center, zoom_start = 12)
folium.GeoJson(polygon37203).add_to(map_37203)

#one marker per trip that started between 12 and 1pm; folium locations are
#[latitude, longitude] and the popup shows the start hour
midday_trips = start_zips_37203_hours_12_to_1pm
for lat, lng, hour in zip(midday_trips['startlatitude'], midday_trips['startlongitude'], midday_trips['hour']):
    folium.Marker(
        location = [lat, lng],
        popup = str(hour),
        icon = folium.Icon(color="blue",icon="fa-dot-circle-o", prefix='fa'),
    ).add_to(map_37203)
#map_37203.save('maps/map37203.html')



#base map, a marker cluster layer, and the outline of the selected zip code area
cluster_map_37203 = folium.Map(location =  area_center, zoom_start = 12)
marker_cluster = MarkerCluster().add_to(cluster_map_37203)
folium.GeoJson(polygon37203).add_to(cluster_map_37203)

#one marker per trip that started between 12 and 1pm, added to the cluster rather than
#to the map itself; folium locations are [latitude, longitude]
midday_trips = start_zips_37203_hours_12_to_1pm
for lat, lng, hour in zip(midday_trips['startlatitude'], midday_trips['startlongitude'], midday_trips['hour']):
    folium.Marker(
        location = [lat, lng],
        popup = str(hour),
        icon = folium.Icon(color="blue",icon="fa-dot-circle-o", prefix='fa'),
    ).add_to(marker_cluster)

#save an interactive HTML map by calling .save()
#cluster_map_37203.save('../maps/cluster37203.html')

#### If you have a lot of markers, you might want to use marker clusters
- Folium has `MarkerCluster()` and `FastMarkerCluster()`
- Construct a `MarkerCluster()` 
    - construct a `MarkerCluster()` and `.add_to()` the map
    - when you loop through the data and create markers add those to the marker cluster
- Construct a `FastMarkerCluster()` by passing a list of locations

In [None]:
#base map, a marker cluster layer, and the outline of the selected zip code area
cluster_map_37203 = folium.Map(location =  area_center, zoom_start = 12)
marker_cluster = MarkerCluster().add_to(cluster_map_37203)
folium.GeoJson(polygon37203).add_to(cluster_map_37203)

#one marker per peak-hour trip start, added to the cluster rather than to the map itself;
#folium locations are [latitude, longitude] and the popup shows the start hour
peak_trips = start_zips_37203_peak_hours
for lat, lng, hour in zip(peak_trips['startlatitude'], peak_trips['startlongitude'], peak_trips['hour']):
    folium.Marker(
        location = [lat, lng],
        popup = str(hour),
        icon = folium.Icon(color="blue",icon="fa-dot-circle-o", prefix='fa'),
    ).add_to(marker_cluster)

#save an interactive HTML map by calling .save()
#cluster_map_37203.save('../maps/cluster37203.html')

In [None]:
#display our clustered map (leaving the map object as the cell's last expression renders it inline)
cluster_map_37203

In [1]:
#push to Github