In [1]:
# Importing packages and functions
import pandas as pd
import datetime # operations to parse dates

import calendar
import csv

In [2]:
pd.__version__

'0.23.4'

In [3]:
# supported cities, months, days
city_data = {'chicago': 'chicago.csv',
          'new york': 'new_york_city.csv',
          'washington': 'washington.csv'}
months = {v.lower(): k for k, v in enumerate(calendar.month_name)}
#months = ('january', 'february', 'march', 'april', 'may', 'june')
days = {v.lower(): k for k, v in enumerate(calendar.day_name)}
#days = ('monday','tuesday','wednesday','thursday','friday','saturday','sunday')

In [4]:
#get user input for city
def get_city():
    '''Asks the user for a city and returns the specified filter.
    Args:
        none.
    Returns:
        (str) City filter for the bikeshare data.
    '''
    while True:
        try:
            city = input('Hello! Let\'s explore some US bikeshare data!\nWould you like to see data for Chicago, New York, or Washington?\n')
        except ValueError:
            print('That is not a valid answer. Please try again.')
        if city.lower() in city_data.keys():
            return city.lower()
        else:
            print('That is not a valid answer. Please try again.')


In [5]:
# 1) pick a city   
city = get_city()
print('Great! We\'ll use %s.' % city)

Hello! Let's explore some US bikeshare data!
Would you like to see data for Chicago, New York, or Washington?
chicago
Great! We'll use chicago.


In [6]:
def get_raw_city_data(city):
    '''Read CSV (comma-separated) file into DataFrame
    Args:
        city filter from get_city()
    Returns:
        raw_city_data df for the specified city's bikeshare data.
    '''
    raw_city_data = pd.read_csv(city_data[city])
    return raw_city_data

In [7]:
 # 3) load data
raw_city_data = get_raw_city_data(city)

In [8]:
raw_city_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 9 columns):
Unnamed: 0       300000 non-null int64
Start Time       300000 non-null object
End Time         300000 non-null object
Trip Duration    300000 non-null int64
Start Station    300000 non-null object
End Station      300000 non-null object
User Type        300000 non-null object
Gender           238948 non-null object
Birth Year       238981 non-null float64
dtypes: float64(1), int64(2), object(6)
memory usage: 20.6+ MB


In [9]:
raw_city_data

Unnamed: 0.1,Unnamed: 0,Start Time,End Time,Trip Duration,Start Station,End Station,User Type,Gender,Birth Year
0,1423854,2017-06-23 15:09:32,2017-06-23 15:14:53,321,Wood St & Hubbard St,Damen Ave & Chicago Ave,Subscriber,Male,1992.0
1,955915,2017-05-25 18:19:03,2017-05-25 18:45:53,1610,Theater on the Lake,Sheffield Ave & Waveland Ave,Subscriber,Female,1992.0
2,9031,2017-01-04 08:27:49,2017-01-04 08:34:45,416,May St & Taylor St,Wood St & Taylor St,Subscriber,Male,1981.0
3,304487,2017-03-06 13:49:38,2017-03-06 13:55:28,350,Christiana Ave & Lawrence Ave,St. Louis Ave & Balmoral Ave,Subscriber,Male,1986.0
4,45207,2017-01-17 14:53:07,2017-01-17 15:02:01,534,Clark St & Randolph St,Desplaines St & Jackson Blvd,Subscriber,Male,1975.0
5,1473887,2017-06-26 09:01:20,2017-06-26 09:11:06,586,Clinton St & Washington Blvd,Canal St & Taylor St,Subscriber,Male,1990.0
6,961916,2017-05-26 09:41:44,2017-05-26 09:46:25,281,Ashland Ave & Lake St,Wood St & Hubbard St,Subscriber,Female,1983.0
7,65924,2017-01-21 14:28:38,2017-01-21 14:40:41,723,Larrabee St & Kingsbury St,Larrabee St & Armitage Ave,Customer,,
8,606841,2017-04-20 16:08:51,2017-04-20 16:20:20,689,Sedgwick St & Huron St,Halsted St & Blackhawk St (*),Subscriber,Male,1984.0
9,135470,2017-02-06 18:00:47,2017-02-06 18:09:00,493,Stetson Ave & South Water St,Clinton St & Washington Blvd,Subscriber,Male,1979.0


In [10]:
def clean_data(raw_city_data):
    '''Read CSV (comma-separated) file into DataFrame
    Args:
        (obj) raw_city_data from get_raw_city_data(city)
    Returns:
        (obj) parsed raw_city_data   
    '''

    
    #change Birth_Year from float to int datatype and fill in NA values to zero
    #raw_city_data['Birth Year']=raw_city_data['Birth Year'].fillna(0.0).astype(int)
    # create 'journey' column that concatenates start_station, end_station 
    raw_city_data['Journey'] = raw_city_data['Start Station'].str.cat(raw_city_data['End Station'], sep=' to ')
    
    #format column names
    raw_city_data.columns = [x.strip().replace(' ', '_') for x in raw_city_data.columns]
    #make all headers lowercase
    raw_city_data.columns=map(str.lower, raw_city_data.columns)
    
    return raw_city_data

In [11]:
clean_data(raw_city_data)

Unnamed: 0,unnamed:_0,start_time,end_time,trip_duration,start_station,end_station,user_type,gender,birth_year,journey
0,1423854,2017-06-23 15:09:32,2017-06-23 15:14:53,321,Wood St & Hubbard St,Damen Ave & Chicago Ave,Subscriber,Male,1992.0,Wood St & Hubbard St to Damen Ave & Chicago Ave
1,955915,2017-05-25 18:19:03,2017-05-25 18:45:53,1610,Theater on the Lake,Sheffield Ave & Waveland Ave,Subscriber,Female,1992.0,Theater on the Lake to Sheffield Ave & Wavelan...
2,9031,2017-01-04 08:27:49,2017-01-04 08:34:45,416,May St & Taylor St,Wood St & Taylor St,Subscriber,Male,1981.0,May St & Taylor St to Wood St & Taylor St
3,304487,2017-03-06 13:49:38,2017-03-06 13:55:28,350,Christiana Ave & Lawrence Ave,St. Louis Ave & Balmoral Ave,Subscriber,Male,1986.0,Christiana Ave & Lawrence Ave to St. Louis Ave...
4,45207,2017-01-17 14:53:07,2017-01-17 15:02:01,534,Clark St & Randolph St,Desplaines St & Jackson Blvd,Subscriber,Male,1975.0,Clark St & Randolph St to Desplaines St & Jack...
5,1473887,2017-06-26 09:01:20,2017-06-26 09:11:06,586,Clinton St & Washington Blvd,Canal St & Taylor St,Subscriber,Male,1990.0,Clinton St & Washington Blvd to Canal St & Tay...
6,961916,2017-05-26 09:41:44,2017-05-26 09:46:25,281,Ashland Ave & Lake St,Wood St & Hubbard St,Subscriber,Female,1983.0,Ashland Ave & Lake St to Wood St & Hubbard St
7,65924,2017-01-21 14:28:38,2017-01-21 14:40:41,723,Larrabee St & Kingsbury St,Larrabee St & Armitage Ave,Customer,,,Larrabee St & Kingsbury St to Larrabee St & Ar...
8,606841,2017-04-20 16:08:51,2017-04-20 16:20:20,689,Sedgwick St & Huron St,Halsted St & Blackhawk St (*),Subscriber,Male,1984.0,Sedgwick St & Huron St to Halsted St & Blackha...
9,135470,2017-02-06 18:00:47,2017-02-06 18:09:00,493,Stetson Ave & South Water St,Clinton St & Washington Blvd,Subscriber,Male,1979.0,Stetson Ave & South Water St to Clinton St & W...


In [12]:
raw_city_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 10 columns):
unnamed:_0       300000 non-null int64
start_time       300000 non-null object
end_time         300000 non-null object
trip_duration    300000 non-null int64
start_station    300000 non-null object
end_station      300000 non-null object
user_type        300000 non-null object
gender           238948 non-null object
birth_year       238981 non-null float64
journey          300000 non-null object
dtypes: float64(1), int64(2), object(7)
memory usage: 22.9+ MB


In [13]:
def parse_data(raw_city_data):
    '''Read CSV (comma-separated) file into DataFrame
    Args:
        (obj) raw_city_data from get_raw_city_data(city)
    Returns:
        (obj) parsed raw_city_data   
    ''' 
    # parse datetime 
    raw_city_data['start_time'] = pd.to_datetime(raw_city_data['start_time'])
    raw_city_data['end_time'] = pd.to_datetime(raw_city_data['end_time'])
    
    # extract month and hour from the Start Time column to create month, hour columns
    raw_city_data['month'] = raw_city_data['start_time'].dt.month
    raw_city_data['day_of_week'] = raw_city_data['start_time'].dt.weekday_name
    raw_city_data['hour'] = raw_city_data['start_time'].dt.hour 
    
    return raw_city_data

In [14]:
parse_data(raw_city_data)

Unnamed: 0,unnamed:_0,start_time,end_time,trip_duration,start_station,end_station,user_type,gender,birth_year,journey,month,day_of_week,hour
0,1423854,2017-06-23 15:09:32,2017-06-23 15:14:53,321,Wood St & Hubbard St,Damen Ave & Chicago Ave,Subscriber,Male,1992.0,Wood St & Hubbard St to Damen Ave & Chicago Ave,6,Friday,15
1,955915,2017-05-25 18:19:03,2017-05-25 18:45:53,1610,Theater on the Lake,Sheffield Ave & Waveland Ave,Subscriber,Female,1992.0,Theater on the Lake to Sheffield Ave & Wavelan...,5,Thursday,18
2,9031,2017-01-04 08:27:49,2017-01-04 08:34:45,416,May St & Taylor St,Wood St & Taylor St,Subscriber,Male,1981.0,May St & Taylor St to Wood St & Taylor St,1,Wednesday,8
3,304487,2017-03-06 13:49:38,2017-03-06 13:55:28,350,Christiana Ave & Lawrence Ave,St. Louis Ave & Balmoral Ave,Subscriber,Male,1986.0,Christiana Ave & Lawrence Ave to St. Louis Ave...,3,Monday,13
4,45207,2017-01-17 14:53:07,2017-01-17 15:02:01,534,Clark St & Randolph St,Desplaines St & Jackson Blvd,Subscriber,Male,1975.0,Clark St & Randolph St to Desplaines St & Jack...,1,Tuesday,14
5,1473887,2017-06-26 09:01:20,2017-06-26 09:11:06,586,Clinton St & Washington Blvd,Canal St & Taylor St,Subscriber,Male,1990.0,Clinton St & Washington Blvd to Canal St & Tay...,6,Monday,9
6,961916,2017-05-26 09:41:44,2017-05-26 09:46:25,281,Ashland Ave & Lake St,Wood St & Hubbard St,Subscriber,Female,1983.0,Ashland Ave & Lake St to Wood St & Hubbard St,5,Friday,9
7,65924,2017-01-21 14:28:38,2017-01-21 14:40:41,723,Larrabee St & Kingsbury St,Larrabee St & Armitage Ave,Customer,,,Larrabee St & Kingsbury St to Larrabee St & Ar...,1,Saturday,14
8,606841,2017-04-20 16:08:51,2017-04-20 16:20:20,689,Sedgwick St & Huron St,Halsted St & Blackhawk St (*),Subscriber,Male,1984.0,Sedgwick St & Huron St to Halsted St & Blackha...,4,Thursday,16
9,135470,2017-02-06 18:00:47,2017-02-06 18:09:00,493,Stetson Ave & South Water St,Clinton St & Washington Blvd,Subscriber,Male,1979.0,Stetson Ave & South Water St to Clinton St & W...,2,Monday,18


In [15]:
raw_city_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 13 columns):
unnamed:_0       300000 non-null int64
start_time       300000 non-null datetime64[ns]
end_time         300000 non-null datetime64[ns]
trip_duration    300000 non-null int64
start_station    300000 non-null object
end_station      300000 non-null object
user_type        300000 non-null object
gender           238948 non-null object
birth_year       238981 non-null float64
journey          300000 non-null object
month            300000 non-null int64
day_of_week      300000 non-null object
hour             300000 non-null int64
dtypes: datetime64[ns](2), float64(1), int64(4), object(6)
memory usage: 29.8+ MB


In [16]:
def filter_data(raw_city_data):
    '''Asks the user for a time period and filter the basic processed data according 
        to the specified filter and returns the filtered data and the filter name.
    Args:
        (obj) basic processed data
    Returns:
        (obj) filtered data
    '''

    # loop for handling invalid entries
    while True: 
        time_period = input('Would you like to filter the data by month, day, or not at all? Type "none" for no time filter.\n').lower()
        print('Great! Time period selected: %s' % time_period)
        if time_period in ('month', 'day', 'none'):
            break
        print('Enter a valid input provided in the options')

    if time_period =='month':
    #ask for the month of choice
        while True:
            month_selection = input('Select a month, January - June \n')
            if month_selection.lower() in months:
                print('Great! We\'ll use %s.' % month_selection)
                month_selection = months.get(month_selection)
                filtered_city_data = raw_city_data[raw_city_data['start_time'].dt.month==month_selection]
                
                break

            print('That is not a valid answer. Please try again.')
            
    elif time_period =='day':
        while True:
            day_selection = input('Which day of the week? \n')
            if day_selection.lower() in days:
                print('Great! We\'ll use %s.' % day_selection)
                day_selection = days.get(day_selection)
                filtered_city_data = raw_city_data[raw_city_data['start_time'].dt.dayofweek==day_selection]
                break

            print('That is not a valid answer. Please try again.')
                
    else:
        filtered_city_data = raw_city_data # for none option

    return filtered_city_data


In [17]:
    #5) filter data
filtered_city_data = filter_data(raw_city_data)

Would you like to filter the data by month, day, or not at all? Type "none" for no time filter.
month
Great! Time period selected: month
Select a month, January - June 
january
Great! We'll use january.


In [18]:
filtered_city_data

Unnamed: 0,unnamed:_0,start_time,end_time,trip_duration,start_station,end_station,user_type,gender,birth_year,journey,month,day_of_week,hour
2,9031,2017-01-04 08:27:49,2017-01-04 08:34:45,416,May St & Taylor St,Wood St & Taylor St,Subscriber,Male,1981.0,May St & Taylor St to Wood St & Taylor St,1,Wednesday,8
4,45207,2017-01-17 14:53:07,2017-01-17 15:02:01,534,Clark St & Randolph St,Desplaines St & Jackson Blvd,Subscriber,Male,1975.0,Clark St & Randolph St to Desplaines St & Jack...,1,Tuesday,14
7,65924,2017-01-21 14:28:38,2017-01-21 14:40:41,723,Larrabee St & Kingsbury St,Larrabee St & Armitage Ave,Customer,,,Larrabee St & Kingsbury St to Larrabee St & Ar...,1,Saturday,14
11,71678,2017-01-22 15:15:45,2017-01-22 15:31:02,917,Southport Ave & Wellington Ave,Clark St & Schiller St,Subscriber,Male,1964.0,Southport Ave & Wellington Ave to Clark St & S...,1,Sunday,15
12,19061,2017-01-08 16:03:00,2017-01-08 16:07:37,277,Green St & Madison St,Ada St & Washington Blvd,Subscriber,Male,1961.0,Green St & Madison St to Ada St & Washington Blvd,1,Sunday,16
39,93723,2017-01-26 19:45:41,2017-01-26 19:59:31,830,McClurg Ct & Erie St,Clinton St & Washington Blvd,Subscriber,Male,1977.0,McClurg Ct & Erie St to Clinton St & Washingto...,1,Thursday,19
91,80745,2017-01-24 10:51:56,2017-01-24 11:12:04,1208,Streeter Dr & Grand Ave,Canal St & Monroe St (*),Subscriber,Male,1985.0,Streeter Dr & Grand Ave to Canal St & Monroe S...,1,Tuesday,10
117,7325,2017-01-03 17:57:21,2017-01-03 18:06:04,523,Artesian Ave & Hubbard St,Damen Ave & Augusta Blvd,Subscriber,Male,1989.0,Artesian Ave & Hubbard St to Damen Ave & Augus...,1,Tuesday,17
120,1647,2017-01-01 21:06:09,2017-01-01 21:10:37,268,Sedgwick St & Webster Ave,Halsted St & Wrightwood Ave,Subscriber,Male,1984.0,Sedgwick St & Webster Ave to Halsted St & Wrig...,1,Sunday,21
121,42280,2017-01-16 14:54:38,2017-01-16 15:09:03,865,Field Museum,Millennium Park,Subscriber,Male,1992.0,Field Museum to Millennium Park,1,Monday,14


In [19]:
def display_statistics(filtered_city_data):
    '''Displays city data statistics (trip, user) on the specified filters
    Args:
        (ob) filtered_city_data
    Returns:
        statistics
    '''
    #Print heading that specifies selected city, filters
    print('\n')
    print('-------------------------------------')    


    """Display statistics on the most popular stations and trip."""
    print('\nStation Info:')
    # display most commonly used start station & end station
    popular_start_station = filtered_city_data['start_station'].mode().to_string(index = False)
    popular_end_station = filtered_city_data['end_station'].mode().to_string(index = False)
    print('Popular Start Station: ', popular_start_station)
    print('Popular End Station: ', popular_end_station)

    # display most frequent combination of start station and end station trip
    popular_journey = filtered_city_data['journey'].mode().to_string(index = False)
    print('Popular Journey: ', popular_journey)

    """Displays statistics on the total and average trip duration."""
    print('\nOther Ridership Data:')
    # display total travel time
    total_travel_time = filtered_city_data['trip_duration'].sum()
    print('Total Time Travel:', total_travel_time)
    # display mean travel time
    mean_travel_time = filtered_city_data['trip_duration'].mean()
    print('Mean Time Travel:', mean_travel_time)

    """Displays statistics on bikeshare users."""
    print('\nUser Info:')
    #Display counts of user types
    user_types=filtered_city_data['user_type'].value_counts()
    print(user_types)
    print('\n')
    
    if city == 'chicago' or city == 'new york': 
        user_statistics(filtered_city_data)

In [20]:
def display_month_day_hour_statistics(filtered_city_data):
    '''Displays city data statistics (month, day, hour) on the specified filters.
       Example: Finds and prints the most popular day of week (Monday, Tuesday, etc.) for start time.
    Args:
        (ob) filtered_city_data
    Returns:
        statistics
    '''
    #Print heading that specifies selected city, filters
    print('\n')
    print('-------------------------------------')
    # display total number of trips for this city and filter
    #print('Total trips: ', (filtered_city_data['Start_Time'].count()))
    
    """Display statistics on the most frequent times of travel."""
    print('\nTrip Info:')
    # display the most common month
    popular_month = filtered_city_data['month'].mode()[0]
    print(popular_month, 'is the month with the highest ridership')

    # display the most common day of week
    popular_day = filtered_city_data['day_of_week'].mode()[0]
    print(popular_day, 'is the day of the week with the highest ridership')
        
    # display the most common hour (from 0 to 23)
    popular_hour = filtered_city_data['hour'].mode()[0]
    print(popular_hour, 'is the most common trip start hour')

In [29]:
def user_statistics(filtered_city_data):
    '''Displays city data statistics on the specified filters, for specified cities
    Args:
        (obj) filtered_city_data
    Returns:
        user statistics for chicago and nyc data only 
    '''
    #Display counts of gender
    #Display earliest, most recent, and most common year of birth
    #gender_count = city_data.groupby('Gender')['Gender'].count()
    gender_count = filtered_city_data['gender'].value_counts()
    print(gender_count)
    earliest = int(filtered_city_data['birth_year'].min(skipna=True))
    recent = int(filtered_city_data['birth_year'].max(skipna=True))
    mode = int(filtered_city_data['birth_year'].mode())
    print('The oldest birth year in the dataset is listed as {}.\nThe most recent birth year in the dataset is {}.'
          '\nThe most common birth year in the dataset is {}.'.format(earliest, recent, mode))
    print('\n')
 


    #finding min without including zeros with pandas and numpy (instead of skipna...if float was updated to int and na filled with zeros)
    #https://stackoverflow.com/questions/51481884/min-value-in-each-column-of-a-data-frame-excluding-zeros
    #https://stackoverflow.com/questions/51594489/find-minimum-without-zero-and-nan-in-pandas-dataframe
    #https://stackoverflow.com/questions/27733431/finding-an-average-but-ignoring-any-zero-in-a-list-python


a = filtered_city_data[filtered_city_data.gt(birth_year)].min(0)
print(a)

In [30]:
display_statistics(filtered_city_data)



-------------------------------------

Station Info:
Popular Start Station:  Clinton St & Washington Blvd
Popular End Station:  Clinton St & Washington Blvd
Popular Journey:  LaSalle St & Jackson Blvd to Canal St & Madiso...

Other Ridership Data:
Total Time Travel: 14826150
Mean Time Travel: 679.8179650602962

User Info:
Subscriber    20794
Customer       1015
Name: user_type, dtype: int64


Male      16745
Female     4059
Name: gender, dtype: int64
The oldest birth year in the dataset is listed as 1899.
The most recent birth year in the dataset is 2000.
The most common birth year in the dataset is 1989.




In [31]:
def display_data(filtered_city_data, row):
    """
    Asks the user if they would you like to view individual trip data and loads the raw data 
    Args:
        (obj) filtered city_data
        ilocs
    Returns:
        data in detail
    """
    display = input('\nWould you like to view individual trip data?'
                    ' Type \'yes\' or \'no\'.\n').lower()
    if display == 'yes' or display == 'y':
        print(filtered_city_data.iloc[row:row+5])
        row += 5
        return display_data(filtered_city_data, row)
    if display == 'no' or display == 'n':
        return
    else:
        print('That is not a valid answer. Please try again.')
        return display_data(filtered_city_data, row)
#https://stackoverflow.com/questions/43772362/how-to-print-a-specific-row-of-a-pandas-dataframe
#https://pandas.pydata.org/pandas-docs/stable/indexing.html


In [32]:
def main():
    """
    Loads analysis and data for the specified city and filters.
    """
    # 1) pick a city   
    city = get_city()
    print('Great! We\'ll use %s.' % city)
    
    # 2) load data
    raw_city_data = get_raw_city_data(city)
    
    #3) clean data
    clean_data(raw_city_data)
 
    #4) parse data
    parse_data(raw_city_data)
    
    #5) filter data
    filtered_city_data = filter_data(raw_city_data)
    
    #6) display statistics
    display_statistics(filtered_city_data)
    
    #7) display statistics on most popular month and day overall
    display_month_day_hour_statistics(filtered_city_data)
    
    #8) see data details
    see_data = display_data(filtered_city_data, row=76)

    #9) restart if you wish
    restart = input('\nWould you like to restart? Enter yes or no.\n')
    if restart.lower() == 'yes' or restart.lower() == 'y':
        main()
    elif restart.lower() == 'no' or restart.lower() == 'n':
        return
    else:
        print("\nThat is not a valid answer. Please try again.")
        return restart()

In [33]:
if __name__ == "__main__":
	main()

Hello! Let's explore some US bikeshare data!
Would you like to see data for Chicago, New York, or Washington?
chicago
Great! We'll use chicago.
Would you like to filter the data by month, day, or not at all? Type "none" for no time filter.
month
Great! Time period selected: month
Select a month, January - June 
may
Great! We'll use may.


-------------------------------------

Station Info:
Popular Start Station:  Streeter Dr & Grand Ave
Popular End Station:  Streeter Dr & Grand Ave
Popular Journey:  Lake Shore Dr & Monroe St to Streeter Dr & Gra...

Other Ridership Data:
Total Time Travel: 65175363
Mean Time Travel: 976.3367987416673

User Info:
Subscriber    51020
Customer      15735
Name: user_type, dtype: int64


Male      38284
Female    12750
Name: gender, dtype: int64
The oldest birth year in the dataset is listed as 1899.
The most recent birth year in the dataset is 2016.
The most common birth year in the dataset is 1989.




-------------------------------------

Trip Info:
5 