# Table of Contents
### - Data wrangling and cleaning
### - Choropleth maps
### - Findings analysis

# Setting up the Notebook

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import os
import folium
import json
from folium.plugins import MarkerCluster

In [None]:
# Have matplotlib visuals appear in notebook
# (IPython magic: figures are rendered inline, beneath the cell that draws them)
%matplotlib inline

In [None]:
# Root data folder for the project; the CSV import below is built from it.
# NOTE(review): absolute, machine-specific location - update before running elsewhere.
path = r'C:\Users\mmreg\OneDrive\Desktop\Data Analytics Course Work\Data Immersion\Tasks\08-2022 Exploratory Analytics Project\02 Data'

In [None]:
# Load the cleaned Citi Bike data from the 'Prepared' folder under the project data path
clean_csv_file = os.path.join(path, 'Prepared', 'citibike_clean.csv')
df = pd.read_csv(clean_csv_file, index_col=False)

In [None]:
# Preview the first five rows of the freshly imported data
df.head(n=5)

In [None]:
# Remove the leftover 'Unnamed: 0' column from the imported CSV.
# errors='ignore' keeps this cell safe to re-run: once the column is gone,
# a second execution would otherwise raise a KeyError.
df = df.drop(columns = ['Unnamed: 0'], errors = 'ignore')

In [None]:
# Preview the first five rows again to confirm the column was removed
df.head(n=5)

In [None]:
# Location of the NYC GeoJSON file intended for mapping (only the path is stored here;
# the file itself is not read in this cell).
# Built from the shared project data folder, the same way the CSV import is,
# so the root location only has to be changed in one place.
nyc_geo = os.path.join(path, 'Original', '6.3 NYC Geoinfo.geojson')

In [None]:
# Echo the stored GeoJSON path to check it
nyc_geo

# Question 5
## Wrangle your project data to meet the needs of your analysis

In [None]:
# Columns needed for the mapping analysis, collected into a list
columns = [
    'weekday',
    'start_hour',
    'start_station_latitude',
    'start_station_longitude',
    'end_station_latitude',
    'end_station_longitude',
    'trip_duration',
    'start_station_name',
    'end_station_name',
]

In [None]:
# Build the working subset: every row, restricted to the selected columns
df_map = df.loc[:, columns]

In [None]:
# Preview the first five rows of the subset
df_map.head(n=5)

# Question 6
## Clean your data set — watch for missing and extreme values.

In [None]:
# Count the missing values in each column of the subset
df_map.isna().sum()

In [None]:
# No missing values present
# Inspect the distribution of each variable for extreme values, starting with weekday
weekday_values = df_map['weekday']
sns.histplot(weekday_values, bins=20, kde=True)

In [None]:
# Distribution of trip start hours, 24 bins
start_hour_values = df_map['start_hour']
sns.histplot(start_hour_values, bins=24, kde=True)

In [None]:
# Distribution of start-station longitudes
start_lng_values = df_map['start_station_longitude']
sns.histplot(start_lng_values, bins=20, kde=True)

In [None]:
# Distribution of start-station latitudes
start_lat_values = df_map['start_station_latitude']
sns.histplot(start_lat_values, bins=20, kde=True)

In [None]:
# Distribution of end-station longitudes
end_lng_values = df_map['end_station_longitude']
sns.histplot(end_lng_values, bins=20, kde=True)

In [None]:
# Distribution of end-station latitudes
end_lat_values = df_map['end_station_latitude']
sns.histplot(end_lat_values, bins=20, kde=True)

### All variables seem to be within normal parameters

# Question 7
## Create a choropleth map using a variable of your choice.

In [None]:
# Subset holding only the start hour plus the start-station coordinates and name
start_columns = [
    'start_hour',
    'start_station_latitude',
    'start_station_longitude',
    'start_station_name',
]
plot_data = df_map[start_columns]
plot_data.head(n=5)

In [None]:
# Create map of starting locations
# Base map: OpenStreetMap tiles at the centre point and zoom level given here
m = folium.Map(location = [40.712772, -74.006058], tiles = 'OpenStreetMap', zoom_start = 13)

# All markers are added to a cluster layer rather than directly to the map
markerCluster = MarkerCluster().add_to(m)

# Walk the three needed columns in lockstep - one marker per row of plot_data.
# zip replaces iterrows(), whose per-row Series was never used, and the
# per-cell .at lookups, which also misbehave if index labels repeat.
start_points = zip(plot_data['start_station_latitude'],
                   plot_data['start_station_longitude'],
                   plot_data['start_station_name'])

for lat, lng, station_name in start_points:
    # Popup text attached to the marker
    popup = 'Station: ' + str(station_name)

    folium.Marker(location = [lat, lng], popup = popup,
                  icon = folium.Icon(color = 'blue')).add_to(markerCluster)

# Write the finished map to an HTML file in the current working directory
m.save('citibike_start_locations.html')

In [None]:
# Subset holding only the end-station coordinates and name
end_columns = [
    'end_station_latitude',
    'end_station_longitude',
    'end_station_name',
]
plot_data_2 = df_map[end_columns]
plot_data_2.head(n=5)

In [None]:
# Create map of end locations
# Base map: OpenStreetMap tiles at the centre point and zoom level given here
m = folium.Map(location = [40.712772, -74.006058], tiles = 'OpenStreetMap', zoom_start = 13)

# All markers are added to a cluster layer rather than directly to the map
markerCluster = MarkerCluster().add_to(m)

# Walk the three needed columns in lockstep - one marker per row of plot_data_2.
# zip replaces iterrows(), whose per-row Series was never used, and the
# per-cell .at lookups, which also misbehave if index labels repeat.
end_points = zip(plot_data_2['end_station_latitude'],
                 plot_data_2['end_station_longitude'],
                 plot_data_2['end_station_name'])

for lat, lng, station_name in end_points:
    # Popup text attached to the marker
    popup = 'Station: ' + str(station_name)

    folium.Marker(location = [lat, lng], popup = popup,
                  icon = folium.Icon(color = 'blue')).add_to(markerCluster)

# Write the finished map to an HTML file in the current working directory
m.save('citibike_end_locations.html')

# Question 8
## Discuss the results and what they mean in a markdown section.

### Using the two maps, we can see where most bike trips start and where they end. The company's stations are located mainly in Manhattan, with a little overlap into the borough of Brooklyn. Looking in more detail, the majority of bikes appear to be rented inland and ridden to the shoreline. The color of each cluster also shows which stations see the most rental activity. The next question I would ask of this data is whether my assumption, that bikes are rented in the interior of the island and taken to the shore, can be confirmed empirically.