In [15]:
import time
import pandas as pd
import numpy as np
import calendar


CITY_DATA = { 'chicago': 'chicago.csv',
              'new york city': 'new_york_city.csv',
              'washington': 'washington.csv' }

def get_filters():
    
    months = ['january','february', 'march','april','may','june','all']
    days = ['all','monday','tuesday','wednesday','thursday','friday','saturday','sunday']
    """
    Asks user to specify a city, month, and day to analyze.
    Returns:
        (str) city - name of the city to analyze
        (str) month - name of the month to filter by, or "all" to apply no month filter
        (str) day - name of the day of week to filter by, or "all" to apply no day filter
    """
    print('Hello! Let\'s explore some US bikeshare data!')
    # get user input for city (chicago, new york city, washington). HINT: Use a while loop to handle invalid inputs
    
    while True:
        city = input("Which city would you like to select: chicago, new york city, washington?\n").lower()
        if city not in CITY_DATA:
            print("City input not recognized, please try again!")
            continue
        else:
            break
        
    while True:
        month = input("Please select one of the first six months in the year, or all: \n").lower()
        if month not in months:
            print ("Month input not recognized, please try again!")
            continue
        else:
            break
            
    while True:
        day = input("Please select a day of the week or type all\n").lower()
        if day not in days:
            print("You may have misspelled the name of the day, please try again!", end=' ')
            continue
        else:
            break
        
    print('-'*40)
    return city, month, day

In [16]:
def load_data(city, month, day):
    """
    Loads data for the specified city and filters by month and day if applicable.
    Args:
        (str) city - name of the city to analyze
        (str) month - name of the month to filter by, or "all" to apply no month filter
        (str) day - name of the day of week to filter by, or "all" to apply no day filter
    Returns:
        df - Pandas DataFrame containing city data filtered by month and day
    """

    # load data file into a dataframe
    df = pd.read_csv(CITY_DATA[city])

    #convert start and end time into datetime
    df['Start Time'] = pd.to_datetime(df['Start Time'])
    df['End Time'] = pd.to_datetime(df['End Time'])

    # extract month and day of week from Start Time to create new columns
    df['month'] = df['Start Time'].dt.month
    
    # apply calendar function so that month returns as a name rather than an int, see: https://stackoverflow.com/questions/37625334/python-pandas-convert-month-int-to-month-name
    df['month_name'] = df['Start Time'].dt.month.apply(lambda x: calendar.month_name[x])
    
    df['day_of_week'] = df['Start Time'].dt.weekday_name
    df['Start hour'] = df['Start Time'].dt.hour

    # filter by month if applicable
    if month != 'all':
        # use the index of the months list to get the corresponding int
        months = ['january', 'february', 'march', 'april', 'may', 'june']
        month = months.index(month) + 1

        # filter by month to create the new dataframe
        df = df[df['month'] == month]

    # filter by day of week if applicable
    if day != 'all':
        # filter by day of week to create the new dataframe
        df = df[df['day_of_week'] == day.title()]
        
        print('You have selected the following filters: \nCity: {}\nMonth: {}\nDay: {}'.format(city, month, day))
    
    return df


In [17]:
def time_stats(df):
    """Displays statistics on the most frequent times of travel."""

    print('\nCalculating The Most Frequent Times of Travel...\n')
    start_time = time.time()

    # display the most common month
    most_common_month = df['month_name'].mode()[0]
    
    print('The month with the most rides was',most_common_month)

    # TO DO: display the most common day of week
    most_common_week = df['day_of_week'].mode()[0]
    
    print('The most popular day of the week to ride was',most_common_week)

    # TO DO: display the most common start hour
    most_common_hour = df['Start hour'].mode()[0]
    
    print('The most popular hour to take a ride was',most_common_hour,':00')

    print("\nThis took %s seconds." % (time.time() - start_time))
    print('-'*40)

In [18]:
def station_stats(df):
    """Displays statistics on the most popular stations and trip."""

    print('\nCalculating The Most Popular Stations and Trip...\n')
    start_time = time.time()

    #: display most commonly used start station
    most_common_start_station = df['Start Station'].mode()
    
    print('The most common start station is',most_common_start_station)

    #: display most commonly used end station
    most_common_end_station = df['End Station'].mode()
    
    print('The most common end station is',most_common_end_station)

    #: display most frequent combination of start station and end station trip
    station_combination = df.groupby(['Start Station', 'End Station']).size().idxmax()
    
    #: compute number of occurences
    station_combo_number = df.groupby(['Start Station', 'End Station']).size().sort_values(ascending=False)[0]
    
    print('The most frequent combination of start station and end station is',station_combination,'this combination occured',station_combo_number,'times')

    print("\nThis took %s seconds." % (time.time() - start_time))
    print('-'*40)

In [19]:
def trip_duration_stats(df):
    """Displays statistics on the total and average trip duration."""

    print('\nCalculating Trip Duration...\n')
    start_time = time.time()   

    # display total travel time
    total_duration = df['Trip Duration'].sum()/3600
    
    # rounding info found here: https://stackoverflow.com/questions/4518641/how-to-round-a-floating-point-number-up-to-a-certain-decimal-place/4519044
    print('The total time traveled in hours was',round(total_duration,2))
    
    # display mean travel time
    mean_travel_time_minutes = df['Trip Duration'].mean()/60
    
    print('The average duration of a trip in minutes was',round(mean_travel_time_minutes,2))
    
    # display 

    print("\nThis took %s seconds." % (time.time() - start_time))
    print('-'*40)
    

In [20]:
def user_stats(df):
    """Displays statistics on bikeshare users."""

    print('\nCalculating User Stats...\n')
    start_time = time.time()

    # Display counts of user types
    user_types = df['User Type'].value_counts()
    print('This is the distribution of different user types:\n',user_types)

    # Display counts of gender: follwed advice here https://knowledge.udacity.com/questions/55524
    if 'Gender' in df.columns:
        gender_counts = df['Gender'].value_counts()
    
        print('This is the gender distribution',gender_counts)
    
    else: "Unfortunately we have no gender data in this city"

    # TO DO: Display earliest, most recent, and most common year of birth
    if 'Birth Year' in df.columns:
        earliest_birthyear = df['Birth Year'].min()
        most_recent_birthyear = df['Birth Year'].max()
        most_common_birthyear = df['Birth Year'].mode()
    
        print('The oldest user was born in',earliest_birthyear,'\nThe youngest user was born in',most_recent_birthyear,'\nThe most common birth year among users is',most_common_birthyear)

    else: print('Unforunately we have no data on birth year for this city')
    
    print("\nThis took %s seconds." % (time.time() - start_time))
    print('-'*40)

In [21]:
def main():
    while True:
        city, month, day = get_filters()
        df = load_data(city, month, day)
        
        time_stats(df)
        station_stats(df)
        trip_duration_stats(df)
        user_stats(df)

        input_data = input('\nWould you like to see the first 5 rows of the file? Please enter yes or no:').lower()
        if input_data == 'yes':
            i = 0
            while True:
                print(df.iloc[i:i+5])
                i += 5
                add_data = input('Would you like to see more data? Please enter yes or no: ').lower()
                if add_data not in ('yes'):
                    break
        
        restart = input('\nWould you like to restart? Enter yes or no.\n')
        if restart.lower() != 'yes':
            break

if __name__ == "__main__":
    main()

Hello! Let's explore some US bikeshare data!
Which city would you like to select: chicago, new york city, washington?
chicago
Please select one of the first six months in the year, or all: 
march
Please select a day of the week or type all
tuesday
----------------------------------------
You have selected the following filters: 
City: chicago
Month: 3
Day: tuesday

Calculating The Most Frequent Times of Travel...

The month with the most rides was March
The most popular day of the week to ride was Tuesday
The most popular hour to take a ride was 17 :00

This took 0.0035059452056884766 seconds.
----------------------------------------

Calculating The Most Popular Stations and Trip...

The most common start station is 0    Clinton St & Washington Blvd
dtype: object
The most common end station is 0    Clinton St & Madison St
dtype: object
The most frequent combination of start station and end station is ('Morgan St & Lake St', 'Clinton St & Washington Blvd') this combination occured 9 ti