In [1]:
%matplotlib notebook

# Bike Trippin

For this assignment, you will be taking "Cycle Share" data from Seattle and creating charts to determine which gender borrows and uses bikes more often.

* Import your dependencies and then import your data into a pandas data frame from the CSV within the 'Data' folder
* Check for null or NaN values and remove them
* Split up your data into groups based upon the gender column
    * NOTE: There will be a garbage row with a gender of 'stoptime' which you will have to remove!
* Chart your data using a bar graph, giving it both a title and labels for the axes

In [2]:
# Import Dependencies
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [3]:
# Import our data into pandas from CSV
bike_trip_data_path = ("Resources/trip.csv")

#file to read
bike_trips_df = pd.read_csv(bike_trip_data_path, low_memory=False)
bike_trips_df
 

Unnamed: 0,stoptime,bikeid,tripduration,from_station_name,to_station_name,from_station_id,to_station_id,usertype,gender,birthyear
0,10/13/2014 10:48,SEA00298,985.935,2nd Ave & Spring St,Occidental Park / Occidental Ave S & S Washing...,CBD-06,PS-04,Member,Male,1960
1,10/13/2014 10:48,SEA00195,926.375,2nd Ave & Spring St,Occidental Park / Occidental Ave S & S Washing...,CBD-06,PS-04,Member,Male,1970
2,10/13/2014 10:48,SEA00486,883.831,2nd Ave & Spring St,Occidental Park / Occidental Ave S & S Washing...,CBD-06,PS-04,Member,Female,1988
3,10/13/2014 10:48,SEA00333,865.937,2nd Ave & Spring St,Occidental Park / Occidental Ave S & S Washing...,CBD-06,PS-04,Member,Female,1977
4,10/13/2014 10:49,SEA00202,923.923,2nd Ave & Spring St,Occidental Park / Occidental Ave S & S Washing...,CBD-06,PS-04,Member,Male,1971
...,...,...,...,...,...,...,...,...,...,...
286853,8/31/2016 23:45,SEA00201,679.532,Harvard Ave & E Pine St,2nd Ave & Spring St,CH-09,CBD-06,Short-Term Pass Holder,,
286854,9/1/2016 0:20,SEA00247,1965.418,Cal Anderson Park / 11th Ave & Pine St,6th Ave S & S King St,CH-08,ID-04,Short-Term Pass Holder,,
286855,9/1/2016 0:20,SEA00300,1951.173,Cal Anderson Park / 11th Ave & Pine St,6th Ave S & S King St,CH-08,ID-04,Short-Term Pass Holder,,
286856,9/1/2016 0:20,SEA00047,1883.299,Cal Anderson Park / 11th Ave & Pine St,6th Ave S & S King St,CH-08,ID-04,Short-Term Pass Holder,,


In [4]:
# Get the last 5 rows 
bike_trips_df.tail(5)

Unnamed: 0,stoptime,bikeid,tripduration,from_station_name,to_station_name,from_station_id,to_station_id,usertype,gender,birthyear
286853,8/31/2016 23:45,SEA00201,679.532,Harvard Ave & E Pine St,2nd Ave & Spring St,CH-09,CBD-06,Short-Term Pass Holder,,
286854,9/1/2016 0:20,SEA00247,1965.418,Cal Anderson Park / 11th Ave & Pine St,6th Ave S & S King St,CH-08,ID-04,Short-Term Pass Holder,,
286855,9/1/2016 0:20,SEA00300,1951.173,Cal Anderson Park / 11th Ave & Pine St,6th Ave S & S King St,CH-08,ID-04,Short-Term Pass Holder,,
286856,9/1/2016 0:20,SEA00047,1883.299,Cal Anderson Park / 11th Ave & Pine St,6th Ave S & S King St,CH-08,ID-04,Short-Term Pass Holder,,
286857,9/1/2016 0:20,SEA00442,1896.031,Cal Anderson Park / 11th Ave & Pine St,6th Ave S & S King St,CH-08,ID-04,Short-Term Pass Holder,,


In [8]:
# Create a clean DataFrame after dropping the null values.
CleanDF = bike_trips_df.dropna(inplace=True)


In [10]:
# Check for null values again.
bike_trips_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 181554 entries, 0 to 286849
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   stoptime           181554 non-null  object 
 1   bikeid             181554 non-null  object 
 2   tripduration       181554 non-null  float64
 3   from_station_name  181554 non-null  object 
 4   to_station_name    181554 non-null  object 
 5   from_station_id    181554 non-null  object 
 6   to_station_id      181554 non-null  object 
 7   usertype           181554 non-null  object 
 8   gender             181554 non-null  object 
 9   birthyear          181554 non-null  object 
dtypes: float64(1), object(9)
memory usage: 15.2+ MB


In [14]:
# Check for null or NaNs.
bike_trips_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 181554 entries, 0 to 286849
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   stoptime           181554 non-null  object 
 1   bikeid             181554 non-null  object 
 2   tripduration       181554 non-null  float64
 3   from_station_name  181554 non-null  object 
 4   to_station_name    181554 non-null  object 
 5   from_station_id    181554 non-null  object 
 6   to_station_id      181554 non-null  object 
 7   usertype           181554 non-null  object 
 8   gender             181554 non-null  object 
 9   birthyear          181554 non-null  object 
dtypes: float64(1), object(9)
memory usage: 15.2+ MB


In [15]:
# Split up the data into groups based upon 'gender' and 'stoptime' Note to self things to use groupby / date / columns type / index date.  
# And, find out how many bike trips each gender took.
Group_Gender_Stop = bike_trips_df.groupby(['gender', 'stoptime']).count()['tripduration']
Group_Gender_Stop

gender    stoptime       
Female    1/1/2015 11:01     2
          1/1/2015 11:54     2
          1/1/2015 12:22     2
          1/1/2015 13:16     4
          1/1/2015 13:32     2
                            ..
Other     9/9/2015 21:41     1
          9/9/2015 7:44      1
          9/9/2015 8:35      1
          9/9/2015 9:44      1
stoptime  4/17/2015 19:21    1
Name: tripduration, Length: 131215, dtype: int64

In [16]:
# Reset the index of the Pandas Series to convert to a DataFrame.
Group_Gender_Stop = Group_Gender_Stop.reset_index()
Group_Gender_Stop
Group_Gender_Stop.head()

Unnamed: 0,gender,stoptime,tripduration
0,Female,1/1/2015 11:01,2
1,Female,1/1/2015 11:54,2
2,Female,1/1/2015 12:22,2
3,Female,1/1/2015 13:16,4
4,Female,1/1/2015 13:32,2


In [17]:
# Get the datatypes for the DataFrame columns.
Group_Gender_Stop.dtypes

gender          object
stoptime        object
tripduration     int64
dtype: object

# Bonus!

You will now take the same base data frame before and create some code that will allow you to create individual pie charts for each bike. For this part of the activity, we want you to chart the total 'Trip Duration' of each bike, sorted by gender. Bonus points if you can come up with a method to do this without using loc or iloc to filter the original data frame! You can use loc to filter group data though.

In [None]:
# Group our data based upon 'bikeid' and 'gender'


# Create a new variable that holds the sum of our groups


In [None]:
# Make a variable called bike_id and store a 'bikeid' in it


# Collect the trips of the 'bikeid' above


In [None]:
# Create a pie chart based upon the trip duration of that single bike
