<h1 align="right">Capital Bike Rental Analysis</h1>
<h2>Ki Mau</h2>

## <center>Python Library Imports</center>

In [114]:
#Library imports for our project
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display
from sklearn import metrics
import seaborn as sb
import sklearn as sk
import datetime
import glob

## <center>Importing and Combining Data</center>

In [115]:
#Combines all csv datasets from the 'data' folder into one DataFrame

#Get file names from the data folder.
#glob returns matches in arbitrary order, so sort them: the row order, the
#rebuilt index and the order in which months first appear then stay the same
#on every run and every machine.
path =r'data/'
filenames = sorted(glob.glob(path + "*.csv"))

#Fail early with a clear message instead of pandas' "No objects to concatenate"
if not filenames:
    raise FileNotFoundError("No csv files found in " + repr(path))

#Rows read per chunk; the same for every file, so defined once
chunksize = 10 ** 6

#Collected chunks of every file, in file order
dataset = []

#Read each file in chunks and add every chunk to the list
for file in filenames:
    for chunk in pd.read_csv(file, chunksize=chunksize):
        dataset.append(chunk)

# Concatenates all csv datasets into one (very large) DataFrame
full_data = pd.concat(dataset, ignore_index=True)

In [116]:
#Verifying that the load worked: show the combined DataFrame
#(the bare last expression is rendered as the cell output)
full_data

Unnamed: 0,Duration,Start date,End date,Start station number,Start station,End station number,End station,Bike number,Member type
0,230,2019-01-01 00:04:48,2019-01-01 00:08:39,31203,14th & Rhode Island Ave NW,31200,Massachusetts Ave & Dupont Circle NW,E00141,Member
1,1549,2019-01-01 00:06:37,2019-01-01 00:32:27,31321,15th St & Constitution Ave NW,31114,18th St & Wyoming Ave NW,W24067,Casual
2,177,2019-01-01 00:08:46,2019-01-01 00:11:44,31104,Adams Mill & Columbia Rd NW,31323,Woodley Park Metro / Calvert St & Connecticut ...,W22654,Casual
3,228,2019-01-01 00:08:47,2019-01-01 00:12:35,31281,8th & O St NW,31280,11th & S St NW,W22336,Member
4,1300,2019-01-01 00:12:29,2019-01-01 00:34:10,31014,Lynn & 19th St North,31923,Columbia Pike & S Taylor St,70004,Member
...,...,...,...,...,...,...,...,...,...
3398412,130,2019-12-31 23:57:36,2019-12-31 23:59:46,31011,Crystal Dr & 23rd St S,31009,Crystal Dr & 27th St S,W21285,Member
3398413,664,2019-12-31 23:57:47,2020-01-01 00:08:51,31125,15th & W St NW,31281,8th & O St NW,W24197,Member
3398414,389,2019-12-31 23:59:37,2020-01-01 00:06:06,31047,Braddock Rd Metro,31085,Mount Vernon Ave & E Nelson Ave,W21281,Member
3398415,962,2019-12-31 23:59:38,2020-01-01 00:15:40,31236,37th & O St NW / Georgetown University,31214,17th & Corcoran St NW,W00534,Member


## <center>Data Cleaning</center>

In [117]:
#Collect every row that holds at least one Null/NaN cell and show them
print("\n\nNull/NaN values")
null_data = full_data.loc[full_data.isna().any(axis="columns")]
null_data



Null/NaN values


Unnamed: 0,Duration,Start date,End date,Start station number,Start station,End station number,End station,Bike number,Member type
1377112,788,2019-06-12 06:40:39,2019-06-12 06:53:47,31306,39th & Calvert St NW / Stoddert,31200,Massachusetts Ave & Dupont Circle NW,,Member
1378926,1452,2019-06-12 08:32:20,2019-06-12 08:56:33,31200,Massachusetts Ave & Dupont Circle NW,31219,10th St & Constitution Ave NW,,Member
1383217,615,2019-06-12 15:45:35,2019-06-12 15:55:51,31219,10th St & Constitution Ave NW,31623,Columbus Circle / Union Station,,Member
1392193,1703,2019-06-13 11:34:37,2019-06-13 12:03:00,31507,1st & Washington Hospital Center NW,31312,Wisconsin Ave & O St NW,,Casual
1396358,1075,2019-06-13 18:30:28,2019-06-13 18:48:24,31305,Connecticut Ave & Newark St NW / Cleveland Park,31201,15th & P St NW,,Member
1397045,888,2019-06-13 19:18:37,2019-06-13 19:33:26,31121,Calvert St & Woodley Pl NW,31401,14th St & Spring Rd NW,,Member
1401235,641,2019-06-14 08:57:17,2019-06-14 09:07:59,31324,18th & New Hampshire Ave NW,31260,23rd & E St NW,,Member
1403209,1092,2019-06-14 12:14:22,2019-06-14 12:32:34,31260,23rd & E St NW,31101,14th & V St NW,,Member
1403573,811,2019-06-14 12:46:04,2019-06-14 12:59:35,31101,14th & V St NW,31246,M St & Pennsylvania Ave NW,,Member
1404951,470,2019-06-14 14:48:48,2019-06-14 14:56:39,31246,M St & Pennsylvania Ave NW,31292,22nd St & Constitution Ave NW,,Member


In [118]:
#The rows listed by the null search are missing only their Bike number.
#Since we're not going to do anything with the individual Bike Numbers for this project,
#these NaN values are acceptable to leave in place.
#NOTE: the message below is fixed text; it does not inspect null_data.
print("\nDoes not affect our current process, leaving as-is.")


Does not affect our current process, leaving as-is.


In [119]:
#Searching for cells that hold an empty string (these are not NaN, so the
#null search does not report them)
print("\n\nSearching for missing values")

#Vectorised element-wise comparison. The previous applymap(lambda x: x == '')
#called a Python function once per cell and relies on an API that newer
#pandas versions deprecate; eq('') yields the same boolean frame.
#np.where returns a (row positions, column positions) tuple of the matches.
missing_data = np.where(full_data.eq(''))

missing_data



Searching for missing values


(array([], dtype=int64), array([], dtype=int64))

In [120]:
#Report what the empty-string search actually found instead of assuming the
#outcome: missing_data[0] holds the row position of every matching cell.
empty_cell_count = len(missing_data[0])
if empty_cell_count == 0:
    print("No missing values found.\n")
else:
    print(str(empty_cell_count) + " missing values found.\n")

No missing values found.



## <center>Additional Data Extraction</center>

In [121]:
#Parse the Start date strings once and keep the parsed values in the frame
start_times = pd.to_datetime(full_data['Start date'])
full_data['Start date'] = start_times

#Derive the calendar parts used by the plots and the month selector
full_data['Month'] = start_times.dt.month_name()  #full month name, e.g. "January"
full_data['Day'] = start_times.dt.day             #day of the month
full_data['Hour'] = start_times.dt.hour           #hour of the day

print("\n\nAdded Month, Day, and Hour columns")

#Check: show the frame with the new columns
full_data



Added Month, Day, and Hour columns


Unnamed: 0,Duration,Start date,End date,Start station number,Start station,End station number,End station,Bike number,Member type,Month,Day,Hour
0,230,2019-01-01 00:04:48,2019-01-01 00:08:39,31203,14th & Rhode Island Ave NW,31200,Massachusetts Ave & Dupont Circle NW,E00141,Member,January,1,0
1,1549,2019-01-01 00:06:37,2019-01-01 00:32:27,31321,15th St & Constitution Ave NW,31114,18th St & Wyoming Ave NW,W24067,Casual,January,1,0
2,177,2019-01-01 00:08:46,2019-01-01 00:11:44,31104,Adams Mill & Columbia Rd NW,31323,Woodley Park Metro / Calvert St & Connecticut ...,W22654,Casual,January,1,0
3,228,2019-01-01 00:08:47,2019-01-01 00:12:35,31281,8th & O St NW,31280,11th & S St NW,W22336,Member,January,1,0
4,1300,2019-01-01 00:12:29,2019-01-01 00:34:10,31014,Lynn & 19th St North,31923,Columbia Pike & S Taylor St,70004,Member,January,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
3398412,130,2019-12-31 23:57:36,2019-12-31 23:59:46,31011,Crystal Dr & 23rd St S,31009,Crystal Dr & 27th St S,W21285,Member,December,31,23
3398413,664,2019-12-31 23:57:47,2020-01-01 00:08:51,31125,15th & W St NW,31281,8th & O St NW,W24197,Member,December,31,23
3398414,389,2019-12-31 23:59:37,2020-01-01 00:06:06,31047,Braddock Rd Metro,31085,Mount Vernon Ave & E Nelson Ave,W21281,Member,December,31,23
3398415,962,2019-12-31 23:59:38,2020-01-01 00:15:40,31236,37th & O St NW / Georgetown University,31214,17th & Corcoran St NW,W00534,Member,December,31,23


In [122]:
#Label of the extra option that stands for "no month filter"
ALL = 'All'


def unique_months_and_all(array):
    """Return the distinct values of ``array`` followed by the ``ALL`` label.

    ``array`` must provide ``unique()`` whose result has ``tolist()``
    (it is called with a pandas Series in this notebook). The distinct values
    keep the order ``unique()`` returns them in; ``ALL`` is always last.
    """
    return [*array.unique().tolist(), ALL]

In [123]:
#Visual Output
#Builds the month selector and the Output widgets that hold each result

#Graphical Month Button Selector
month_selection = widgets.ToggleButtons(
    options = unique_months_and_all(full_data.Month),
    #Use the ALL label that unique_months_and_all appends to the options, so the
    #default stays a valid option even if that label is ever changed.
    value=ALL,
    button_style='info',
)

#Variables for widget outputs: one Output per section shown further down.
#filter() clears and refills each of them on every selection change.
selection = widgets.Output()
projected1 = widgets.Output()
plotter1 = widgets.Output()
plotter2 = widgets.Output()
plotter3 = widgets.Output()
plotter4 = widgets.Output()
plotter5 = widgets.Output()

#Plot/model helpers and the function run depending on user selection

def _plot_count_histogram(data, column, ticks):
    """Draw the number of rentals per integer value of ``data[column]``."""
    sb.set_theme(style="ticks")
    f, ax = plt.subplots(figsize=(10, 10))
    sb.despine(f)
    #Day and Hour are integers: discrete=True draws one unit-wide bar centred
    #on each value, instead of automatically sized bins that need not line up
    #with the ticks set below.
    sb.histplot(data, x=column, discrete=True, ax=ax)
    ax.set_xticks(ticks)
    plt.show()


def _plot_density(data, column, ticks):
    """Draw a filled kernel density estimate of ``data[column]`` limited to the tick range."""
    f, ax = plt.subplots(figsize=(10, 10))
    #fill= replaces the shade= argument, which seaborn deprecates
    sb.kdeplot(x=data[column], fill=True, ax=ax)
    ax.set_xlim(ticks[0], ticks[-1])
    ax.set_xticks(ticks)
    plt.show()


def _plot_duration_scatter(data):
    """Scatter every rental's Duration against its start Hour."""
    sb.set_theme(style="whitegrid")
    f, ax = plt.subplots(figsize=(10, 10))
    sb.despine(f, left=True, bottom=True)
    #No palette argument: there is no hue variable, so a palette has no effect
    sb.scatterplot(x="Hour", y="Duration", data=data, ax=ax)
    ax.set_xticks(range(0, 24))
    plt.show()


def _fit_duration_regression(data):
    """Plot Duration against each feature, then fit a linear regression and print its summary."""
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split

    feature_columns = ['Start station number', 'Day']

    sb.pairplot(data,
                x_vars=feature_columns,
                y_vars='Duration',
                height=8, aspect=0.8,
                kind='reg')
    plt.show()

    X = data[feature_columns]
    y = data['Duration']

    #Fixed random_state keeps the train/test split the same on every run
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

    print("\nX Train Count")
    print(X_train.shape)

    print("\ny Train Count")
    print(y_train.shape)

    print("\nX Test Count")
    print(X_test.shape)

    print("\ny Test Count")
    print(y_test.shape)

    #Fit the regression model on the training part
    linreg = LinearRegression()
    linreg.fit(X_train, y_train)

    print("\nRegression Intercept")
    print(linreg.intercept_)

    print("\nRegression Coeffecients")
    print(linreg.coef_)

    print("\nFeatures with Coefficients")
    print(list(zip(feature_columns, linreg.coef_)))

    #Predict the held-out part and report the root mean squared error
    y_pred = linreg.predict(X_test)

    print("\nRMSE of testing")
    print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))


def filter(month):
    """Rebuild every output widget for the selected month.

    Parameters
    ----------
    month : str
        A value of the ``Month`` column, or the ``ALL`` label to keep every month.

    Clears all Output widgets, selects the rows of ``full_data`` whose
    "Start station number" is not 0 (and whose Month matches, unless ALL is
    selected), then fills the widgets with the selected rows, the Day/Hour
    plots, the Duration scatter plot and the regression summary. The last thing
    written is "PROCESSING COMPLETED", which signals that the run has finished.

    Note: the name shadows Python's built-in ``filter`` inside this notebook;
    it is kept because the event handler calls it by this name.
    """
    for out in (selection, projected1, plotter1, plotter2, plotter3, plotter4, plotter5):
        out.clear_output()

    # Filters data based on user selection
    keep = full_data["Start station number"] != 0
    if month != ALL:
        keep &= full_data.Month == month
    filtered_data = full_data[keep]

    #Nothing to plot or fit: report it and still emit the completion signal
    if filtered_data.empty:
        with selection:
            print("No rentals match the current selection.")
        with projected1:
            print("\n\nPROCESSING COMPLETED")
        return

    day_ticks = range(1, 32)
    hour_ticks = range(0, 24)

    #General (Truncated) Dataset
    with selection:
        display(filtered_data)

    #Plots for further Data Analysis
    with plotter1:
        _plot_count_histogram(filtered_data, "Day", day_ticks)

    with plotter2:
        _plot_density(filtered_data, "Day", day_ticks)

    with plotter3:
        _plot_count_histogram(filtered_data, "Hour", hour_ticks)

    with plotter4:
        _plot_density(filtered_data, "Hour", hour_ticks)

    with plotter5:
        _plot_duration_scatter(filtered_data)

    with projected1:
        _fit_duration_regression(filtered_data)
        print("\n\nPROCESSING COMPLETED")


def month_selection_eventhandler(change):
    """Observer callback: re-run filter() with the value the selector changed to."""
    newly_selected = change.new
    filter(newly_selected)


#Subscribe the callback to changes of the selector's 'value' only
month_selection.observe(month_selection_eventhandler, names='value')


## <center>Month Selection</center>

In [124]:
#Prompt for the month selector that the next cell displays
print("Please select a month below.")

Please select a month below.


In [125]:
#Show the month toggle buttons; a change of selection calls month_selection_eventhandler
display(month_selection)

ToggleButtons(button_style='info', index=12, options=('January', 'February', 'March', 'April', 'May', 'June', …

### <center>Filtered Data by Selection (Truncated)</center>

In [126]:
#Output widget that filter() fills with the filtered DataFrame
display(selection)

Output()

## <center>Data Analysis Plots</center>

### <center>Total Count of Bikes Rented (By Day)</center>

In [127]:
#Output widget that filter() fills with the histogram of the Day column
display(plotter1)

Output()

### <center>Density of Bikes Rented (By Day)</center>

In [128]:
#Output widget that filter() fills with the density plot of the Day column
display(plotter2)

Output()

### <center>Total Count of Bikes Rented (By Hour)</center>

In [129]:
#Output widget that filter() fills with the histogram of the Hour column
display(plotter3)

Output()

### <center>Density of Bikes Rented (By Hour)</center>

In [130]:
#Output widget that filter() fills with the density plot of the Hour column
display(plotter4)

Output()

### <center>Duration of Each Rental (By Hour)</center>

In [131]:
#Output widget that filter() fills with the scatter plot of Duration against Hour
display(plotter5)

Output()

## <center>Projected Data Correlation</center>

### <center>Duration Based on Location and Day</center>

In [132]:
#Output widget that filter() fills with the regression plots and printed model summary,
#ending with "PROCESSING COMPLETED"
display(projected1)

Output()

### <center><font color='red'>WARNING</font></center>
<center>After a month is selected, the notebook is still running calculations until "PROCESSING COMPLETED" is printed directly above this "WARNING".</center>

#NOTE(review): looks like a leftover manual check - it builds the same row selection
#that filter("December") computes, and nothing else in the visible notebook reads it.
#Confirm whether this cell is still needed.
fake_selection = full_data[(full_data.Month == "December") & (full_data["Start station number"] != 0)]  
fake_selection