## Shared Micromobility Vehicle Trips Data Analysis
---

### Data Source and API
- AODP Dataset Access: https://data.austintexas.gov/Transportation-and-Mobility/Shared-Micromobility-Vehicle-Trips/7d8e-dm7r
- API Endpoint: https://data.austintexas.gov/resource/7d8e-dm7r.json
- API Documentation: https://dev.socrata.com/foundry/data.austintexas.gov/7d8e-dm7r

To access the dataset through the Socrata Open Data API (SODA), first install the `sodapy` Python client:
    `pip install sodapy`

### Data Extraction

- Data Provided
    - trip_id
    - device_id
    - modified_date
    - council_district_start
    - council_district_end
    - vehicle_type
    - trip_duration
    - trip_distance
    - start_time
    - end_time
    - month
    - hour
    - day_of_week
    - year
    - census_geoid_start
    - census_geoid_end

In [1]:
# Dependencies and packages
import calendar
import datetime as dt
import math as math
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
#import geopandas as gpd
#import seaborn as sns
from sodapy import Socrata

In [2]:
# Data Extraction
# Endpoint behind this dataset id: https://data.austintexas.gov/resource/7d8e-dm7r.json
# NOTE(review): the second argument is None, i.e. no app token is supplied — confirm
# that anonymous access is acceptable for a download of this size.
client = Socrata("data.austintexas.gov", None)

# Only rows with year = 2019 are requested; the limit is set far above the
# expected row count so the result is not truncated.
results = client.get(
    "7d8e-dm7r",
    where="year=2019",
    limit=9000000,
)

# One DataFrame row per returned record
results_df = pd.DataFrame.from_records(results)



In [3]:
# Preview the first rows of the raw API result.
results_df.head()

Unnamed: 0,trip_id,device_id,vehicle_type,trip_duration,trip_distance,start_time,end_time,modified_date,month,hour,day_of_week,council_district_start,council_district_end,year,census_geoid_start,census_geoid_end
0,aa27d854-9f27-456e-ae49-f268a9b2b533,6b90a827-60a1-47ad-94a4-d33654dbd50d,scooter,170,0,2019-04-29T17:30:00.000,2019-04-29T17:30:00.000,2019-04-30T06:45:17.000,4,17,1,9,9,2019,48453000601,48453000601
1,b7c9f133-966b-46f1-91ce-2682ce7fcf50,1e5234a3-e86b-41e1-a1ad-e98310f3a71c,scooter,90,0,2019-04-29T17:30:00.000,2019-04-29T17:30:00.000,2019-04-30T06:45:17.000,4,17,1,9,9,2019,48453000601,48453000601
2,1d4f5d89-c044-46cb-8674-80d37fa0371f,caa0325c-7c0b-4909-a573-d83126ecc953,scooter,570,826,2019-04-29T17:15:00.000,2019-04-29T17:15:00.000,2019-04-30T06:45:17.000,4,17,1,9,9,2019,48453001100,48453001100
3,cd442fb9-058f-46a7-af90-b4adb56fd163,1653cf10-75f5-4934-a9d9-c8d43ade67a9,scooter,438,1815,2019-04-29T17:30:00.000,2019-04-29T17:45:00.000,2019-04-30T06:45:17.000,4,17,1,9,9,2019,48453000601,48453000500
4,e1ac0e42-ab2d-4eac-b019-f085169c1d38,1d89f9e2-d01a-4736-b7d8-258cb9a2108c,scooter,391,1519,2019-04-29T17:30:00.000,2019-04-29T17:30:00.000,2019-04-30T06:45:17.000,4,17,1,9,9,2019,48453000401,48453000307


In [4]:
# Check for missing values:
# count() reports the non-null entries per column, so a column whose count is
# lower than the others contains nulls.
results_df.count()

trip_id                   6023690
device_id                 6023690
vehicle_type              6023690
trip_duration             6023690
trip_distance             6023690
start_time                6023690
end_time                  6023690
modified_date             6023690
month                     6023690
hour                      6023690
day_of_week               6023690
council_district_start    6023556
council_district_end      6023556
year                      6023690
census_geoid_start        6023556
census_geoid_end          6023556
dtype: int64

In [5]:
# Cross-check the gap seen in the counts by tallying the nulls in
# census_geoid_start directly.
missing_census_geoid_start = results_df["census_geoid_start"].isna().sum()
print(f"There are {missing_census_geoid_start} missing census_geoid_start.")

There are 134 missing census_geoid_start.


In [6]:
# Number of null entries in every column of the raw result.
results_df.isnull().sum()

trip_id                     0
device_id                   0
vehicle_type                0
trip_duration               0
trip_distance               0
start_time                  0
end_time                    0
modified_date               0
month                       0
hour                        0
day_of_week                 0
council_district_start    134
council_district_end      134
year                        0
census_geoid_start        134
census_geoid_end          134
dtype: int64

### Data Cleaning

In [7]:
# Clean a copy so the raw API result in results_df stays untouched.
clean_df = results_df.copy()

In [8]:
# Turn snake_case API field names into display headers,
# e.g. "trip_duration" -> "Trip Duration".
clean_df.columns = [
    field.replace('_', ' ').title()
    for field in clean_df.columns
]
clean_df

Unnamed: 0,Trip Id,Device Id,Vehicle Type,Trip Duration,Trip Distance,Start Time,End Time,Modified Date,Month,Hour,Day Of Week,Council District Start,Council District End,Year,Census Geoid Start,Census Geoid End
0,aa27d854-9f27-456e-ae49-f268a9b2b533,6b90a827-60a1-47ad-94a4-d33654dbd50d,scooter,170,0,2019-04-29T17:30:00.000,2019-04-29T17:30:00.000,2019-04-30T06:45:17.000,4,17,1,9,9,2019,48453000601,48453000601
1,b7c9f133-966b-46f1-91ce-2682ce7fcf50,1e5234a3-e86b-41e1-a1ad-e98310f3a71c,scooter,90,0,2019-04-29T17:30:00.000,2019-04-29T17:30:00.000,2019-04-30T06:45:17.000,4,17,1,9,9,2019,48453000601,48453000601
2,1d4f5d89-c044-46cb-8674-80d37fa0371f,caa0325c-7c0b-4909-a573-d83126ecc953,scooter,570,826,2019-04-29T17:15:00.000,2019-04-29T17:15:00.000,2019-04-30T06:45:17.000,4,17,1,9,9,2019,48453001100,48453001100
3,cd442fb9-058f-46a7-af90-b4adb56fd163,1653cf10-75f5-4934-a9d9-c8d43ade67a9,scooter,438,1815,2019-04-29T17:30:00.000,2019-04-29T17:45:00.000,2019-04-30T06:45:17.000,4,17,1,9,9,2019,48453000601,48453000500
4,e1ac0e42-ab2d-4eac-b019-f085169c1d38,1d89f9e2-d01a-4736-b7d8-258cb9a2108c,scooter,391,1519,2019-04-29T17:30:00.000,2019-04-29T17:30:00.000,2019-04-30T06:45:17.000,4,17,1,9,9,2019,48453000401,48453000307
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6023685,3aae15d8-4d69-494e-a10f-e3ae2af9fca1,0a3c7884-954c-40e4-a82c-db52fcec4e8e,scooter,250,520,2019-12-28T14:30:00.000,2019-12-28T14:30:00.000,2019-12-29T04:30:11.000,12,14,6,1,3,2019,48453000901,48453000902
6023686,d3501cbb-adac-4609-bd4d-8c90200d4cca,854d10c1-6bb5-4fb9-be21-ec71515939c1,scooter,390,1374,2019-12-28T15:00:00.000,2019-12-28T15:15:00.000,2019-12-29T04:30:11.000,12,15,6,9,9,2019,48453001200,48453001100
6023687,9979fa36-1a0c-417c-abc6-ccb0a773bf15,57a92e65-4f1a-44de-8a70-9cc6f13639c4,bicycle,291,5,2019-02-19T17:45:00.000,2019-02-19T17:45:00.000,2020-01-16T01:39:39.000,2,17,2,9,9,2019,48453001100,48453001100
6023688,26399130-24de-40ac-8a4a-c498419e2587,00c131ac-40d4-4f04-b9dc-82d40bd7a0c7,bicycle,120,0,2019-02-19T13:30:00.000,2019-02-19T13:30:00.000,2020-01-16T01:39:39.000,2,13,2,9,9,2019,48453001200,48453001200


In [9]:
# Renaming some column names:
# title-casing left the acronyms as "Id" / "Geoid"; restore them to "ID" / "GEOID".
clean_df = clean_df.rename(columns = {
    "Trip Id": "Trip ID",
    "Device Id": "Device ID",
    "Census Geoid Start": "Census GEOID Start",
    "Census Geoid End": "Census GEOID End",
})
clean_df.head()

Unnamed: 0,Trip ID,Device ID,Vehicle Type,Trip Duration,Trip Distance,Start Time,End Time,Modified Date,Month,Hour,Day Of Week,Council District Start,Council District End,Year,Census GEOID Start,Census GEOID End
0,aa27d854-9f27-456e-ae49-f268a9b2b533,6b90a827-60a1-47ad-94a4-d33654dbd50d,scooter,170,0,2019-04-29T17:30:00.000,2019-04-29T17:30:00.000,2019-04-30T06:45:17.000,4,17,1,9,9,2019,48453000601,48453000601
1,b7c9f133-966b-46f1-91ce-2682ce7fcf50,1e5234a3-e86b-41e1-a1ad-e98310f3a71c,scooter,90,0,2019-04-29T17:30:00.000,2019-04-29T17:30:00.000,2019-04-30T06:45:17.000,4,17,1,9,9,2019,48453000601,48453000601
2,1d4f5d89-c044-46cb-8674-80d37fa0371f,caa0325c-7c0b-4909-a573-d83126ecc953,scooter,570,826,2019-04-29T17:15:00.000,2019-04-29T17:15:00.000,2019-04-30T06:45:17.000,4,17,1,9,9,2019,48453001100,48453001100
3,cd442fb9-058f-46a7-af90-b4adb56fd163,1653cf10-75f5-4934-a9d9-c8d43ade67a9,scooter,438,1815,2019-04-29T17:30:00.000,2019-04-29T17:45:00.000,2019-04-30T06:45:17.000,4,17,1,9,9,2019,48453000601,48453000500
4,e1ac0e42-ab2d-4eac-b019-f085169c1d38,1d89f9e2-d01a-4736-b7d8-258cb9a2108c,scooter,391,1519,2019-04-29T17:30:00.000,2019-04-29T17:30:00.000,2019-04-30T06:45:17.000,4,17,1,9,9,2019,48453000401,48453000307


In [16]:
# Remove every row that has a null in at least one column.
clean_df = clean_df.dropna(axis="index", how="any")
clean_df.head()

Unnamed: 0,Trip ID,Device ID,Vehicle Type,Trip Duration,Trip Distance,Start Time,End Time,Modified Date,Month,Hour,Day Of Week,Council District Start,Council District End,Year,Census GEOID Start,Census GEOID End
0,aa27d854-9f27-456e-ae49-f268a9b2b533,6b90a827-60a1-47ad-94a4-d33654dbd50d,scooter,170,0,2019-04-29 17:30:00,2019-04-29 17:30:00,2019-04-30 06:45:17,4,17,1,9,9,2019,48453000601,48453000601
1,b7c9f133-966b-46f1-91ce-2682ce7fcf50,1e5234a3-e86b-41e1-a1ad-e98310f3a71c,scooter,90,0,2019-04-29 17:30:00,2019-04-29 17:30:00,2019-04-30 06:45:17,4,17,1,9,9,2019,48453000601,48453000601
2,1d4f5d89-c044-46cb-8674-80d37fa0371f,caa0325c-7c0b-4909-a573-d83126ecc953,scooter,570,826,2019-04-29 17:15:00,2019-04-29 17:15:00,2019-04-30 06:45:17,4,17,1,9,9,2019,48453001100,48453001100
3,cd442fb9-058f-46a7-af90-b4adb56fd163,1653cf10-75f5-4934-a9d9-c8d43ade67a9,scooter,438,1815,2019-04-29 17:30:00,2019-04-29 17:45:00,2019-04-30 06:45:17,4,17,1,9,9,2019,48453000601,48453000500
4,e1ac0e42-ab2d-4eac-b019-f085169c1d38,1d89f9e2-d01a-4736-b7d8-258cb9a2108c,scooter,391,1519,2019-04-29 17:30:00,2019-04-29 17:30:00,2019-04-30 06:45:17,4,17,1,9,9,2019,48453000401,48453000307


In [None]:
# Parse the three timestamp columns ('Start Time', 'End Time', 'Modified Date')
# into pandas datetimes.
clean_df[['Start Time', 'End Time', 'Modified Date']] = (
    clean_df[['Start Time', 'End Time', 'Modified Date']].apply(pd.to_datetime)
)
clean_df.head()

In [None]:
# Find the total number of scooter rides:
scooters = clean_df[clean_df["Vehicle Type"] == "scooter"]
# Count the scooter rows themselves. Subtracting them from len(clean_df), as
# this cell previously did, yields the number of rides that are NOT scooters.
total_scooters = len(scooters)
total_scooters

In [None]:
# Find the total number of bicycle rides:
bicycles = clean_df[clean_df["Vehicle Type"] == "bicycle"]
# Count the bicycle rows themselves. Subtracting them from len(clean_df), as
# this cell previously did, yields the number of rides that are NOT bicycles.
total_bicycles = len(bicycles)
total_bicycles

In [14]:
# Check to tally the total rides:
# the sum equals the number of cleaned rows only when "scooter" and "bicycle"
# are the only vehicle types present — TODO confirm against the data.
total_rides = total_bicycles + total_scooters
total_rides

6023556

In [17]:
# TODO (open question):
# Does the distance of the trip impact the method of transportation (scooter vs. bicycle)?
    # Compare trip distance per vehicle type (average for each type)
    # Plot this

#total_rides = 999994
#total_bicycles = 931841
#total_scooters = 68153

#scooter_distance = cleandf[scooters["Trip Distance"]].mean()
#scooter_distance
#scooter_distance


#avg_scooter_distance = 

#bike_distance = 
#avg_bike_distance =

In [None]:
# Find the number of unique devices
#device_id_list =  clean_df["Device ID"].value_counts()
#device_id_list

In [None]:
# Count the trips that started in each census tract
# (one entry per distinct start GEOID; the length of the result is the number of distinct GEOIDs)
start_geoid = clean_df["Census GEOID Start"].value_counts()
start_geoid

In [None]:
# Count the trips that ended in each census tract
# (one entry per distinct end GEOID; the length of the result is the number of distinct GEOIDs)
end_geoid = clean_df["Census GEOID End"].value_counts()
end_geoid

In [None]:
# Convert dataframe to CSV
# NOTE(review): absolute, machine-specific path — this cell only works where
# this directory exists; consider a path relative to the repository.
# The row index is written too (no index=False), so it comes back as an
# 'Unnamed: 0' column when the file is read again.
output_data_file = "/Users/sheetalbongale/Scooters_In_Austin_Data_Analysis/resources/shared_mobility_data.csv"
clean_df.to_csv(output_data_file)

### Data Merging

In [None]:
# Input files for the merge: the ZIP <-> census-tract crosswalk and the cleaned
# trips exported earlier.
# NOTE(review): absolute, machine-specific paths.
csvpath1 = "/Users/sheetalbongale/Scooters_In_Austin_Data_Analysis/resources/zip_tract_092019.csv"
csvpath2 = "/Users/sheetalbongale/Scooters_In_Austin_Data_Analysis/resources/shared_mobility_data.csv"

# Two independent copies of the crosswalk: one is keyed on the start tract,
# the other on the end tract.
zip_data_start = pd.read_csv(csvpath1)
zip_data_end = zip_data_start.copy()

clean_df = pd.read_csv(csvpath2, low_memory=False)

In [None]:
# Preview the trips as re-loaded from the CSV file.
clean_df.head(10)

In [None]:
# Drop the index column that the CSV round trip added.
# errors='ignore' keeps the cell idempotent: re-running it after the column is
# already gone no longer raises a KeyError.
clean_df = clean_df.drop(columns=['Unnamed: 0'], errors='ignore')
clean_df

In [None]:
# Label the crosswalk columns so 'tract' lines up with the trips' start-tract key.
zip_data_start = zip_data_start.rename(
    columns={
        'zip': 'Zipcode',
        'tract': 'Census GEOID Start',
    }
)
zip_data_start

In [None]:
# Label the crosswalk columns so 'tract' lines up with the trips' end-tract key.
zip_data_end = zip_data_end.rename(
    columns={
        'zip': 'Zipcode',
        'tract': 'Census GEOID End',
    }
)
zip_data_end

In [None]:
# Normalise ZIP codes and tract GEOIDs to plain digit strings so they can be
# used as merge keys.
# "int64" is spelled out on purpose: 11-digit tract GEOIDs do not fit in 32
# bits, and a bare `int` is a 32-bit integer on some platforms.
# NOTE(review): going through an integer drops leading zeros (e.g. ZIP 00601
# becomes "601") — confirm no such codes are needed here.
zip_data_start[["Zipcode", "Census GEOID Start"]] = zip_data_start[["Zipcode", "Census GEOID Start"]].astype("int64").astype(str)
zip_data_end[["Zipcode", "Census GEOID End"]] = zip_data_end[["Zipcode", "Census GEOID End"]].astype("int64").astype(str)


In [None]:
# Attach the crosswalk ZIP code of the start tract to every trip.
# The key is normalised to a digit string on both sides first: the trips were
# re-read from CSV, so their GEOIDs may come back as integers or floats, and
# pandas refuses to merge a numeric key with a string key. Values that are
# already strings pass through unchanged.
# NOTE(review): how="right" keeps crosswalk tracts without any trip as
# all-NaN trip rows; if the crosswalk lists several ZIP codes for one tract,
# each trip is repeated once per ZIP — confirm this is intended.
df_start = pd.merge(
    clean_df.assign(**{
        "Census GEOID Start": lambda trips: (
            trips["Census GEOID Start"].astype(str).str.replace(r"\.0$", "", regex=True)
        )
    }),
    zip_data_start.astype({"Census GEOID Start": str}),
    on="Census GEOID Start",
    how="right",
)
df_start

In [None]:
# The ZIP code matched through the start tract becomes 'Zipcode Start'.
df_start = df_start.rename(columns={'Zipcode': 'Zipcode Start'})

In [None]:
# Discard rows with any missing value, e.g. crosswalk tracts that matched no
# trip (such rows are introduced by the right join).
df_start = df_start.dropna()

In [None]:
# Inspect the column dtypes of the start-merged trips.
df_start.dtypes

In [None]:
# Inspect the column dtypes of the start-tract crosswalk.
zip_data_start.dtypes

In [None]:
# Attach the crosswalk ZIP code of the end tract to every trip.
# The key is normalised to a digit string on both sides first: after the CSV
# round trip and the earlier right join the trips' end GEOIDs may be integers
# or floats (e.g. 48453000601.0), and pandas refuses to merge a numeric key
# with a string key. Values that are already strings pass through unchanged.
# NOTE(review): how="right" keeps crosswalk tracts without any trip as
# all-NaN trip rows; if the crosswalk lists several ZIP codes for one tract,
# each trip is repeated once per ZIP — confirm this is intended.
df_end = pd.merge(
    df_start.assign(**{
        "Census GEOID End": lambda trips: (
            trips["Census GEOID End"].astype(str).str.replace(r"\.0$", "", regex=True)
        )
    }),
    zip_data_end.astype({"Census GEOID End": str}),
    on="Census GEOID End",
    how="right",
)
df_end

In [None]:
# The ZIP code matched through the end tract becomes 'Zipcode End'.
df_end = df_end.rename(columns={'Zipcode': 'Zipcode End'})

In [None]:
# Discard rows with any missing value, e.g. crosswalk tracts that matched no
# trip (such rows are introduced by the right join).
df_end = df_end.dropna()
df_end

In [None]:
# Final merged data set used by the cells below (independent copy of df_end).
df = df_end.copy()

In [None]:
# Number of null entries per column in the merged data set.
df.isnull().sum()

In [None]:
#Commenting this out because we already have the csv file after running this once
#output_data_file = "/Users/sheetalbongale/Scooters_In_Austin_Data_Analysis/resources/merged_sharedmobility.csv"
#df.to_csv(output_data_file)

In [None]:
# csvpath = os.path.join("/Users/sheetalbongale/Scooters_In_Austin_Data_Analysis/resources/merged_sharedmobility.csv")
# df = pd.read_csv(csvpath, index_col=0, low_memory=False)

In [None]:
# Preview the first 30 rows of the merged data set.
df.head(30)

### Visualization of the Clean Data Set

In [None]:
# Count trips per day of week, ordered by day number.
# The count column and the index are named explicitly so the frame has the
# same layout on every pandas version (pandas 2.x calls the value_counts
# result "count" and names its index after the source column).
daily_total = (
    df['Day Of Week']
    .value_counts()
    .sort_index()
    .rename('Day Of Week')
    .rename_axis(None)
    .to_frame()
)

# Map each day number to its name (0 = Sunday ... 6 = Saturday).
# The lookup goes through the day number itself rather than the row position,
# so labels stay correct even when a day has no trips at all.
days = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
daily_total['Day'] = [days[int(day_number)] for day_number in daily_total.index]

# savefig does not create missing directories.
os.makedirs("Plots", exist_ok=True)

# Plot glyph:
daily_total.plot(kind='bar', x='Day', y='Day Of Week', title='Total Trip Counts by Day of week', figsize = (10,5), rot= 30, legend=False)
plt.ylabel("Number of Trips")
plt.savefig("Plots/trips_per_week.png")
plt.show()

In [None]:
# Count trips per hour of day, ordered by hour.
# The count column and the index are named explicitly so that reset_index()
# below yields the columns 'index' and 'Hour' on every pandas version
# (pandas 2.x would otherwise produce 'Hour' and 'count').
hourly_total = (
    df['Hour']
    .value_counts()
    .sort_index()
    .rename('Hour')
    .rename_axis(None)
    .to_frame()
)

# savefig does not create missing directories.
os.makedirs("Plots", exist_ok=True)

hourly_total.reset_index().plot(kind='bar', x='index', y='Hour', title='Total Trip Counts by Hour', figsize = (10,5), legend=False)
plt.xlabel('Hours')
plt.ylabel('Number of Trips')
plt.savefig("Plots/trips_per_hour.png")
plt.show()
#now shows correct plot with sorted xticks

In [None]:
# Count trips per month, ordered by month number.
monthly_total = pd.DataFrame(df['Month'].value_counts()).sort_index()

# Month names for the tick labels, looked up from the month numbers actually
# present (`calendar` is imported in the dependencies cell at the top).
mn = [calendar.month_name[int(x)] for x in monthly_total.index.values.tolist()]

# savefig does not create missing directories.
os.makedirs("Plots", exist_ok=True)

monthly_chart = monthly_total.plot.bar(title = "Total Trips per Month ",width = 0.75,figsize = (10,5),rot = 30, legend = False)
monthly_chart.set_xticklabels(mn)
monthly_chart.set_xlabel("Trip Months")
monthly_chart.set_ylabel("Total Trip Count")
plt.savefig("Plots/trips_per_month1.png")
plt.show()
#now shows correct plot with sorted months

In [None]:
# Alternative monthly chart that carries the month names as a column.
month_list = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']

# Look the names up by month number instead of assigning the list by position,
# so the labels stay correct (and the assignment does not fail) when a month
# has no trips.
monthly_total['Months'] = [month_list[int(month_number) - 1] for month_number in monthly_total.index]

# The counts are the first column of monthly_total; its name depends on the
# pandas version ('Month' before 2.0, 'count' from 2.0 on), so it is looked up
# instead of hard-coded. With a hard-coded 'Month' on pandas 2.x, the reset
# index (the month numbers) would be plotted instead of the counts.
trip_count_column = monthly_total.columns[0]

# savefig does not create missing directories.
os.makedirs("Plots", exist_ok=True)

monthly_total.plot(kind='bar', x='Months', y=trip_count_column, title='Total Trip Counts by months', figsize = (10,5), rot = 30, legend=False)
plt.ylabel('Number of Trips')
plt.savefig("Plots/trips_per_month2.png")
plt.show()
#now shows correct plot with sorted xticks

In [None]:
# Count how many trips started in each census GEOID tract
census_trip_start = pd.DataFrame(df['Census GEOID Start'].value_counts())

# Count how many trips ended in each census tract
census_trip_end = pd.DataFrame(df['Census GEOID End'].value_counts())

# One bar per start tract. x and height both come from the aggregated counts
# so they have the same length (passing the raw per-trip column as x does
# not), and the figure size is set on the figure because bar() does not accept
# a figsize argument.
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(census_trip_start.index.astype(str), census_trip_start.iloc[:, 0])
ax.set_title("Total Count of trips starting per Census Tract")
ax.set_xlabel("GEOID ID")
ax.set_ylabel("Total Trips")
# Tract identifiers are long; rotate and shrink them to keep them legible.
ax.tick_params(axis="x", labelrotation=90, labelsize=6)
plt.show()