# Yan-Ferrighetto Project 2

### Part 1: Data Loading &amp; Preparation

1.1 Load the data

In [1]:
!pip install geopy
!pip install imblearn



In [2]:
import glob
import pandas as pd
import numpy as np
import os

In [3]:
joined_files = os.path.join("./Source", "JC*.csv")
joined_list = glob.glob(joined_files)
pre_df = pd.concat(map(pd.read_csv, joined_list), ignore_index=True)
pre_df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,74A4206E7487CBC9,docked_bike,2021-05-23 16:51:00,2021-05-23 18:38:21,9 St HBLR - Jackson St & 8 St,HB305,9 St HBLR - Jackson St & 8 St,HB305,40.747907,-74.038411,40.747907,-74.038412,casual
1,58EEE2950FFE01CE,docked_bike,2021-05-31 16:54:47,2021-05-31 16:55:28,9 St HBLR - Jackson St & 8 St,HB305,9 St HBLR - Jackson St & 8 St,HB305,40.747907,-74.038411,40.747907,-74.038412,member
2,1429D912C16EEE59,docked_bike,2021-05-25 16:19:34,2021-05-25 17:03:06,9 St HBLR - Jackson St & 8 St,HB305,9 St HBLR - Jackson St & 8 St,HB305,40.747907,-74.038411,40.747907,-74.038412,casual
3,FE9C5B74167CBCCD,docked_bike,2021-05-22 17:32:19,2021-05-22 17:41:27,9 St HBLR - Jackson St & 8 St,HB305,Grand St & 2 St,HB405,40.747907,-74.038411,40.73913,-74.03618,casual
4,B88D37626F000BBA,docked_bike,2021-05-14 09:48:34,2021-05-14 10:17:36,Union St,JC051,Newark Ave,JC032,40.718211,-74.083639,40.721525,-74.046305,casual


In [4]:
pre_df.shape

(743074, 13)

In [5]:
"""
pre_df.dropna(inplace=True)
pre_df.shape
"""

'\npre_df.dropna(inplace=True)\npre_df.shape\n'

The NYC bikeshare data is from February of 2021 to March 2022. We originally planned on utilizing data beginning when the COVID-19 pandemic shutdowns began in the U.S. in March 2020, but unfortunately the data collected changed in January 2021, creating continuity issues. The dataset has 743,163 records and 13 columns.

1.1 Preprocessing

In [6]:
pre_df = pre_df.drop(['ride_id', 'start_station_name', 'end_station_name'], 1)
pre_df.head()

  pre_df = pre_df.drop(['ride_id', 'start_station_name', 'end_station_name'], 1)


Unnamed: 0,rideable_type,started_at,ended_at,start_station_id,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,docked_bike,2021-05-23 16:51:00,2021-05-23 18:38:21,HB305,HB305,40.747907,-74.038411,40.747907,-74.038412,casual
1,docked_bike,2021-05-31 16:54:47,2021-05-31 16:55:28,HB305,HB305,40.747907,-74.038411,40.747907,-74.038412,member
2,docked_bike,2021-05-25 16:19:34,2021-05-25 17:03:06,HB305,HB305,40.747907,-74.038411,40.747907,-74.038412,casual
3,docked_bike,2021-05-22 17:32:19,2021-05-22 17:41:27,HB305,HB405,40.747907,-74.038411,40.73913,-74.03618,casual
4,docked_bike,2021-05-14 09:48:34,2021-05-14 10:17:36,JC051,JC032,40.718211,-74.083639,40.721525,-74.046305,casual


Add feature for ZipCode and City

In [7]:
a = pd.read_csv('./Source/out.csv')
a['ZipCode'] = a['address'].str[:5].astype(int)
a

Unnamed: 0.1,Unnamed: 0,lat,lng,address,ZipCode
0,1,40.721630,-74.049967,07302,7302
1,2,40.749984,-74.027150,07030,7030
2,3,40.745983,-74.028199,07030,7030
3,12,40.721630,-74.049968,07302,7302
4,28,40.745984,-74.028199,07030,7030
...,...,...,...,...,...
430,714140,40.752271,-73.987706,10018,10018
431,723276,40.801343,-73.971146,10025,10025
432,724586,40.861560,-73.912190,10468,10468
433,736040,40.705945,-74.013219,10006,10006


In [8]:
ny = pd.read_csv('./Source/nyc-zip-codes.csv')
ny['City'] = ny['Borough']
nj = pd.read_csv('./Source/nj-zip-codes.csv')
zip_city = pd.concat([ny, nj])
zip_city = zip_city[['ZipCode', 'City']]
zip_city

Unnamed: 0,ZipCode,City
0,10453,Bronx
1,10457,Bronx
2,10460,Bronx
3,10458,Bronx
4,10467,Bronx
...,...,...
166,7036,Linden
167,7065,Rahway
168,7203,Roselle
169,7204,Roselle Park


In [9]:
b = pd.merge(a, zip_city, how='left', on='ZipCode')
b = b[['lat', 'lng', 'ZipCode', 'City']]
b

Unnamed: 0,lat,lng,ZipCode,City
0,40.721630,-74.049967,7302,Jersey City
1,40.749984,-74.027150,7030,Hoboken
2,40.745983,-74.028199,7030,Hoboken
3,40.721630,-74.049968,7302,Jersey City
4,40.745984,-74.028199,7030,Hoboken
...,...,...,...,...
433,40.752271,-73.987706,10018,Manhattan
434,40.801343,-73.971146,10025,Manhattan
435,40.861560,-73.912190,10468,Bronx
436,40.705945,-74.013219,10006,Manhattan


In [10]:
pre_df = pre_df.merge(b, how='left', left_on=['start_lat', 'start_lng'], right_on=['lat', 'lng'])

In [11]:
pre_df['start_zip'] = pre_df['ZipCode']
pre_df['start_city'] = pre_df['City']
pre_df = pre_df.drop(['lat','lng','ZipCode','City'], axis=1)

In [12]:
pre_df = pre_df.merge(b, how='left', left_on=['end_lat', 'end_lng'], right_on=['lat', 'lng'])

In [13]:
pre_df['end_zip'] = pre_df['ZipCode']
pre_df['end_city'] = pre_df['City']
pre_df = pre_df.drop(['lat','lng','ZipCode','City'], axis=1)

In [14]:
pre_df

Unnamed: 0,rideable_type,started_at,ended_at,start_station_id,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,start_zip,start_city,end_zip,end_city
0,docked_bike,2021-05-23 16:51:00,2021-05-23 18:38:21,HB305,HB305,40.747907,-74.038411,40.747907,-74.038412,casual,7030,Hoboken,7030.0,Hoboken
1,docked_bike,2021-05-31 16:54:47,2021-05-31 16:55:28,HB305,HB305,40.747907,-74.038411,40.747907,-74.038412,member,7030,Hoboken,7030.0,Hoboken
2,docked_bike,2021-05-25 16:19:34,2021-05-25 17:03:06,HB305,HB305,40.747907,-74.038411,40.747907,-74.038412,casual,7030,Hoboken,7030.0,Hoboken
3,docked_bike,2021-05-22 17:32:19,2021-05-22 17:41:27,HB305,HB405,40.747907,-74.038411,40.739130,-74.036180,casual,7030,Hoboken,7030.0,Hoboken
4,docked_bike,2021-05-14 09:48:34,2021-05-14 10:17:36,JC051,JC032,40.718211,-74.083639,40.721525,-74.046305,casual,7304,Jersey City,7302.0,Jersey City
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
743076,docked_bike,2021-02-24 18:05:45,2021-02-24 18:12:27,JC014,JC008,40.718355,-74.038914,40.728745,-74.032108,member,7302,Jersey City,7310.0,Jersey City
743077,docked_bike,2021-02-22 18:22:04,2021-02-22 18:28:45,JC014,JC008,40.718355,-74.038914,40.728745,-74.032108,member,7302,Jersey City,7310.0,Jersey City
743078,docked_bike,2021-02-13 12:21:49,2021-02-13 12:27:51,JC014,JC008,40.718355,-74.038914,40.728745,-74.032108,member,7302,Jersey City,7310.0,Jersey City
743079,docked_bike,2021-02-25 11:20:29,2021-02-25 11:25:23,JC014,JC072,40.718355,-74.038914,40.712419,-74.038526,member,7302,Jersey City,7302.0,Jersey City


Distance

In [15]:
pre_df[pre_df['end_lat'].isnull()]

Unnamed: 0,rideable_type,started_at,ended_at,start_station_id,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,start_zip,start_city,end_zip,end_city
5547,docked_bike,2021-05-28 00:18:55,2021-05-29 01:18:49,JC063,,40.711130,-74.078900,,,casual,7305,Jersey City,,
5587,docked_bike,2021-05-27 22:00:08,2021-05-28 23:00:04,JC084,,40.714358,-74.066610,,,casual,7304,Jersey City,,
5590,docked_bike,2021-05-27 19:09:47,2021-05-28 20:09:41,JC084,,40.714358,-74.066610,,,casual,7304,Jersey City,,
5591,docked_bike,2021-05-27 22:00:28,2021-05-28 23:00:23,JC084,,40.714358,-74.066610,,,casual,7304,Jersey City,,
5593,docked_bike,2021-05-08 00:22:05,2021-05-09 01:22:00,JC082,,40.721650,-74.042884,,,casual,7302,Jersey City,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
742743,docked_bike,2021-02-16 17:39:55,2021-02-16 17:54:53,JC003,,40.717732,-74.043845,,,member,7302,Jersey City,,
742804,docked_bike,2021-02-12 14:24:38,2021-02-13 15:24:32,JC003,,40.717732,-74.043845,,,casual,7302,Jersey City,,
742810,docked_bike,2021-02-16 18:51:33,2021-02-16 20:49:07,JC027,,40.725289,-74.045571,,,member,7302,Jersey City,,
742870,docked_bike,2021-02-09 05:48:18,2021-02-09 07:22:41,JC027,,40.725289,-74.045571,,,member,7302,Jersey City,,


In [16]:
from geopy import distance

def cal_distance(from_lat, from_lng, to_lat, to_lng):
    go = (from_lat, from_lng)
    to = (to_lat, to_lng)
    try:
        return distance.great_circle(go, to)
    except ValueError:
        return np.nan

pre_df['distance'] = pre_df.apply(lambda row: cal_distance(row.start_lat,row.start_lng, row.end_lat, row.end_lng), axis=1)

In [17]:
pre_df[pre_df['end_lat'].isnull()]

Unnamed: 0,rideable_type,started_at,ended_at,start_station_id,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,start_zip,start_city,end_zip,end_city,distance
5547,docked_bike,2021-05-28 00:18:55,2021-05-29 01:18:49,JC063,,40.711130,-74.078900,,,casual,7305,Jersey City,,,
5587,docked_bike,2021-05-27 22:00:08,2021-05-28 23:00:04,JC084,,40.714358,-74.066610,,,casual,7304,Jersey City,,,
5590,docked_bike,2021-05-27 19:09:47,2021-05-28 20:09:41,JC084,,40.714358,-74.066610,,,casual,7304,Jersey City,,,
5591,docked_bike,2021-05-27 22:00:28,2021-05-28 23:00:23,JC084,,40.714358,-74.066610,,,casual,7304,Jersey City,,,
5593,docked_bike,2021-05-08 00:22:05,2021-05-09 01:22:00,JC082,,40.721650,-74.042884,,,casual,7302,Jersey City,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
742743,docked_bike,2021-02-16 17:39:55,2021-02-16 17:54:53,JC003,,40.717732,-74.043845,,,member,7302,Jersey City,,,
742804,docked_bike,2021-02-12 14:24:38,2021-02-13 15:24:32,JC003,,40.717732,-74.043845,,,casual,7302,Jersey City,,,
742810,docked_bike,2021-02-16 18:51:33,2021-02-16 20:49:07,JC027,,40.725289,-74.045571,,,member,7302,Jersey City,,,
742870,docked_bike,2021-02-09 05:48:18,2021-02-09 07:22:41,JC027,,40.725289,-74.045571,,,member,7302,Jersey City,,,


In [18]:
pre_df['distance'] = pre_df['distance'].astype(str).str[:-3].astype('float64', errors = 'ignore')

In [19]:
pre_df[pre_df['end_lat'].isnull()]

Unnamed: 0,rideable_type,started_at,ended_at,start_station_id,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,start_zip,start_city,end_zip,end_city,distance
5547,docked_bike,2021-05-28 00:18:55,2021-05-29 01:18:49,JC063,,40.711130,-74.078900,,,casual,7305,Jersey City,,,
5587,docked_bike,2021-05-27 22:00:08,2021-05-28 23:00:04,JC084,,40.714358,-74.066610,,,casual,7304,Jersey City,,,
5590,docked_bike,2021-05-27 19:09:47,2021-05-28 20:09:41,JC084,,40.714358,-74.066610,,,casual,7304,Jersey City,,,
5591,docked_bike,2021-05-27 22:00:28,2021-05-28 23:00:23,JC084,,40.714358,-74.066610,,,casual,7304,Jersey City,,,
5593,docked_bike,2021-05-08 00:22:05,2021-05-09 01:22:00,JC082,,40.721650,-74.042884,,,casual,7302,Jersey City,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
742743,docked_bike,2021-02-16 17:39:55,2021-02-16 17:54:53,JC003,,40.717732,-74.043845,,,member,7302,Jersey City,,,
742804,docked_bike,2021-02-12 14:24:38,2021-02-13 15:24:32,JC003,,40.717732,-74.043845,,,casual,7302,Jersey City,,,
742810,docked_bike,2021-02-16 18:51:33,2021-02-16 20:49:07,JC027,,40.725289,-74.045571,,,member,7302,Jersey City,,,
742870,docked_bike,2021-02-09 05:48:18,2021-02-09 07:22:41,JC027,,40.725289,-74.045571,,,member,7302,Jersey City,,,


In [20]:
pre_df['distance'] = pre_df['distance'].fillna(0)

In [21]:
pre_df[pre_df['end_lat'].isnull()]

Unnamed: 0,rideable_type,started_at,ended_at,start_station_id,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,start_zip,start_city,end_zip,end_city,distance
5547,docked_bike,2021-05-28 00:18:55,2021-05-29 01:18:49,JC063,,40.711130,-74.078900,,,casual,7305,Jersey City,,,
5587,docked_bike,2021-05-27 22:00:08,2021-05-28 23:00:04,JC084,,40.714358,-74.066610,,,casual,7304,Jersey City,,,
5590,docked_bike,2021-05-27 19:09:47,2021-05-28 20:09:41,JC084,,40.714358,-74.066610,,,casual,7304,Jersey City,,,
5591,docked_bike,2021-05-27 22:00:28,2021-05-28 23:00:23,JC084,,40.714358,-74.066610,,,casual,7304,Jersey City,,,
5593,docked_bike,2021-05-08 00:22:05,2021-05-09 01:22:00,JC082,,40.721650,-74.042884,,,casual,7302,Jersey City,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
742743,docked_bike,2021-02-16 17:39:55,2021-02-16 17:54:53,JC003,,40.717732,-74.043845,,,member,7302,Jersey City,,,
742804,docked_bike,2021-02-12 14:24:38,2021-02-13 15:24:32,JC003,,40.717732,-74.043845,,,casual,7302,Jersey City,,,
742810,docked_bike,2021-02-16 18:51:33,2021-02-16 20:49:07,JC027,,40.725289,-74.045571,,,member,7302,Jersey City,,,
742870,docked_bike,2021-02-09 05:48:18,2021-02-09 07:22:41,JC027,,40.725289,-74.045571,,,member,7302,Jersey City,,,


Drop weird ridetype

In [22]:
pre_df['rideable_type'].value_counts()

classic_bike              582589
docked_bike               141306
electric_bike              19185
motivate_dockless_bike         1
Name: rideable_type, dtype: int64

In [23]:
pre_df = pre_df[(pre_df.rideable_type != 'motivate_dockless_bike')]

Add feature - Intra city

In [24]:
import numpy as np
pre_df['intra'] = np.where(pre_df['start_city']!=pre_df['end_city'], 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pre_df['intra'] = np.where(pre_df['start_city']!=pre_df['end_city'], 1, 0)


In [25]:
pre_df

Unnamed: 0,rideable_type,started_at,ended_at,start_station_id,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,start_zip,start_city,end_zip,end_city,distance,intra
0,docked_bike,2021-05-23 16:51:00,2021-05-23 18:38:21,HB305,HB305,40.747907,-74.038411,40.747907,-74.038412,casual,7030,Hoboken,7030.0,Hoboken,5.021014915867473e-05,0
1,docked_bike,2021-05-31 16:54:47,2021-05-31 16:55:28,HB305,HB305,40.747907,-74.038411,40.747907,-74.038412,member,7030,Hoboken,7030.0,Hoboken,5.021014915867473e-05,0
2,docked_bike,2021-05-25 16:19:34,2021-05-25 17:03:06,HB305,HB305,40.747907,-74.038411,40.747907,-74.038412,casual,7030,Hoboken,7030.0,Hoboken,5.021014915867473e-05,0
3,docked_bike,2021-05-22 17:32:19,2021-05-22 17:41:27,HB305,HB405,40.747907,-74.038411,40.739130,-74.036180,casual,7030,Hoboken,7030.0,Hoboken,0.9938925900796551,0
4,docked_bike,2021-05-14 09:48:34,2021-05-14 10:17:36,JC051,JC032,40.718211,-74.083639,40.721525,-74.046305,casual,7304,Jersey City,7302.0,Jersey City,3.1678936428785565,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
743076,docked_bike,2021-02-24 18:05:45,2021-02-24 18:12:27,JC014,JC008,40.718355,-74.038914,40.728745,-74.032108,member,7302,Jersey City,7310.0,Jersey City,1.28982334621709,0
743077,docked_bike,2021-02-22 18:22:04,2021-02-22 18:28:45,JC014,JC008,40.718355,-74.038914,40.728745,-74.032108,member,7302,Jersey City,7310.0,Jersey City,1.28982334621709,0
743078,docked_bike,2021-02-13 12:21:49,2021-02-13 12:27:51,JC014,JC008,40.718355,-74.038914,40.728745,-74.032108,member,7302,Jersey City,7310.0,Jersey City,1.28982334621709,0
743079,docked_bike,2021-02-25 11:20:29,2021-02-25 11:25:23,JC014,JC072,40.718355,-74.038914,40.712419,-74.038526,member,7302,Jersey City,7302.0,Jersey City,0.66088514879881,0


Start at end at

In [26]:
pre_df[["started_at", "ended_at"]] = pre_df[["started_at", "ended_at"]].apply(pd.to_datetime)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pre_df[["started_at", "ended_at"]] = pre_df[["started_at", "ended_at"]].apply(pd.to_datetime)


In [27]:
pre_df['duration'] = pre_df['ended_at']-pre_df['started_at']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pre_df['duration'] = pre_df['ended_at']-pre_df['started_at']


In [28]:
pre_df['duration_secs'] = pre_df['duration'].dt.total_seconds()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pre_df['duration_secs'] = pre_df['duration'].dt.total_seconds()


In [29]:
import numpy as np
#Time of day and season are calculated using started_at derived values
[month%12 // 3 + 1 for month in range(1, 13)]
pre_df['season_num'] = pre_df['started_at'].dt.month%12 // 3 + 1
#pre_df['season'] = pre_df['season_num'].map({1:'Winter',2:'Spring', 3: 'Summer', 4: 'Fall'})
#pre_df['season'] = pre_df['season'].astype("string")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pre_df['season_num'] = pre_df['started_at'].dt.month%12 // 3 + 1


In [30]:
#Time of day and season are calculated using started_at derived values
pre_df['time_of_day'] = (pre_df['started_at'].dt.hour % 24 + 4) // 4
pre_df['time_of_day'].replace({1: 'Late Night',
                      2: 'Early Morning',
                      3: 'Morning',
                      4: 'Noon',
                      5: 'Evening',
                      6: 'Night'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pre_df['time_of_day'] = (pre_df['started_at'].dt.hour % 24 + 4) // 4
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pre_df['time_of_day'].replace({1: 'Late Night',


In [31]:
pre_df

Unnamed: 0,rideable_type,started_at,ended_at,start_station_id,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,start_zip,start_city,end_zip,end_city,distance,intra,duration,duration_secs,season_num,time_of_day
0,docked_bike,2021-05-23 16:51:00,2021-05-23 18:38:21,HB305,HB305,40.747907,-74.038411,40.747907,-74.038412,casual,7030,Hoboken,7030.0,Hoboken,5.021014915867473e-05,0,0 days 01:47:21,6441.0,2,Evening
1,docked_bike,2021-05-31 16:54:47,2021-05-31 16:55:28,HB305,HB305,40.747907,-74.038411,40.747907,-74.038412,member,7030,Hoboken,7030.0,Hoboken,5.021014915867473e-05,0,0 days 00:00:41,41.0,2,Evening
2,docked_bike,2021-05-25 16:19:34,2021-05-25 17:03:06,HB305,HB305,40.747907,-74.038411,40.747907,-74.038412,casual,7030,Hoboken,7030.0,Hoboken,5.021014915867473e-05,0,0 days 00:43:32,2612.0,2,Evening
3,docked_bike,2021-05-22 17:32:19,2021-05-22 17:41:27,HB305,HB405,40.747907,-74.038411,40.739130,-74.036180,casual,7030,Hoboken,7030.0,Hoboken,0.9938925900796551,0,0 days 00:09:08,548.0,2,Evening
4,docked_bike,2021-05-14 09:48:34,2021-05-14 10:17:36,JC051,JC032,40.718211,-74.083639,40.721525,-74.046305,casual,7304,Jersey City,7302.0,Jersey City,3.1678936428785565,0,0 days 00:29:02,1742.0,2,Morning
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
743076,docked_bike,2021-02-24 18:05:45,2021-02-24 18:12:27,JC014,JC008,40.718355,-74.038914,40.728745,-74.032108,member,7302,Jersey City,7310.0,Jersey City,1.28982334621709,0,0 days 00:06:42,402.0,1,Evening
743077,docked_bike,2021-02-22 18:22:04,2021-02-22 18:28:45,JC014,JC008,40.718355,-74.038914,40.728745,-74.032108,member,7302,Jersey City,7310.0,Jersey City,1.28982334621709,0,0 days 00:06:41,401.0,1,Evening
743078,docked_bike,2021-02-13 12:21:49,2021-02-13 12:27:51,JC014,JC008,40.718355,-74.038914,40.728745,-74.032108,member,7302,Jersey City,7310.0,Jersey City,1.28982334621709,0,0 days 00:06:02,362.0,1,Noon
743079,docked_bike,2021-02-25 11:20:29,2021-02-25 11:25:23,JC014,JC072,40.718355,-74.038914,40.712419,-74.038526,member,7302,Jersey City,7302.0,Jersey City,0.66088514879881,0,0 days 00:04:54,294.0,1,Morning


In [32]:
pre_df[pre_df['end_lat'].isnull()]

Unnamed: 0,rideable_type,started_at,ended_at,start_station_id,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,start_zip,start_city,end_zip,end_city,distance,intra,duration,duration_secs,season_num,time_of_day
5547,docked_bike,2021-05-28 00:18:55,2021-05-29 01:18:49,JC063,,40.711130,-74.078900,,,casual,7305,Jersey City,,,,1,1 days 00:59:54,89994.0,2,Late Night
5587,docked_bike,2021-05-27 22:00:08,2021-05-28 23:00:04,JC084,,40.714358,-74.066610,,,casual,7304,Jersey City,,,,1,1 days 00:59:56,89996.0,2,Night
5590,docked_bike,2021-05-27 19:09:47,2021-05-28 20:09:41,JC084,,40.714358,-74.066610,,,casual,7304,Jersey City,,,,1,1 days 00:59:54,89994.0,2,Evening
5591,docked_bike,2021-05-27 22:00:28,2021-05-28 23:00:23,JC084,,40.714358,-74.066610,,,casual,7304,Jersey City,,,,1,1 days 00:59:55,89995.0,2,Night
5593,docked_bike,2021-05-08 00:22:05,2021-05-09 01:22:00,JC082,,40.721650,-74.042884,,,casual,7302,Jersey City,,,,1,1 days 00:59:55,89995.0,2,Late Night
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
742743,docked_bike,2021-02-16 17:39:55,2021-02-16 17:54:53,JC003,,40.717732,-74.043845,,,member,7302,Jersey City,,,,1,0 days 00:14:58,898.0,1,Evening
742804,docked_bike,2021-02-12 14:24:38,2021-02-13 15:24:32,JC003,,40.717732,-74.043845,,,casual,7302,Jersey City,,,,1,1 days 00:59:54,89994.0,1,Noon
742810,docked_bike,2021-02-16 18:51:33,2021-02-16 20:49:07,JC027,,40.725289,-74.045571,,,member,7302,Jersey City,,,,1,0 days 01:57:34,7054.0,1,Evening
742870,docked_bike,2021-02-09 05:48:18,2021-02-09 07:22:41,JC027,,40.725289,-74.045571,,,member,7302,Jersey City,,,,1,0 days 01:34:23,5663.0,1,Early Morning


# Part 2

### Data Cleaning

In [33]:
pre_df_p2 = pre_df

In [34]:

pre_df_p2 = pre_df_p2.drop(['duration', 'started_at', 'ended_at', 'start_lat', 'start_lng', 'end_lat', 'end_lng', 'start_zip', 'end_zip', 'end_station_id', 'end_city', 'start_city'], axis=1)


In [35]:
pre_df_p2['distance'] = pre_df_p2['distance'].replace('', np.NAN)
pre_df_p2[pre_df_p2['distance'].isnull()]

Unnamed: 0,rideable_type,start_station_id,member_casual,distance,intra,duration_secs,season_num,time_of_day
5547,docked_bike,JC063,casual,,1,89994.0,2,Late Night
5587,docked_bike,JC084,casual,,1,89996.0,2,Night
5590,docked_bike,JC084,casual,,1,89994.0,2,Evening
5591,docked_bike,JC084,casual,,1,89995.0,2,Night
5593,docked_bike,JC082,casual,,1,89995.0,2,Late Night
...,...,...,...,...,...,...,...,...
742743,docked_bike,JC003,member,,1,898.0,1,Evening
742804,docked_bike,JC003,casual,,1,89994.0,1,Noon
742810,docked_bike,JC027,member,,1,7054.0,1,Evening
742870,docked_bike,JC027,member,,1,5663.0,1,Early Morning


In [36]:

from sklearn.preprocessing import LabelEncoder

non_numeric_cols = ['member_casual', 'time_of_day', 'rideable_type', 'start_station_id']
for col in non_numeric_cols:
    pre_df_p2[col] = LabelEncoder().fit_transform(pre_df_p2[col].values)

pre_df_p2

Unnamed: 0,rideable_type,start_station_id,member_casual,distance,intra,duration_secs,season_num,time_of_day
0,1,12,0,5.021014915867473e-05,0,6441.0,2,1
1,1,12,1,5.021014915867473e-05,0,41.0,2,1
2,1,12,0,5.021014915867473e-05,0,2612.0,2,1
3,1,12,0,0.9938925900796551,0,548.0,2,1
4,1,51,0,3.1678936428785565,0,1742.0,2,3
...,...,...,...,...,...,...,...,...
743076,1,39,1,1.28982334621709,0,402.0,1,1
743077,1,39,1,1.28982334621709,0,401.0,1,1
743078,1,39,1,1.28982334621709,0,362.0,1,5
743079,1,39,1,0.66088514879881,0,294.0,1,3


Split train test dataset

In [37]:
from sklearn.model_selection import train_test_split
train_set_p2, test_set_p2 = train_test_split(pre_df_p2, test_size=0.2, random_state=35)

## In train dataset:

Drop Unuseful features

Drop duplicates

In [38]:
dups = train_set_p2.duplicated()
train_set_p2[dups]

Unnamed: 0,rideable_type,start_station_id,member_casual,distance,intra,duration_secs,season_num,time_of_day
395741,1,34,1,0.6470440688819047,0,188.0,2,3
294569,0,46,1,0.6670196269168461,0,237.0,4,3
567285,0,27,0,0.976675801683034,0,357.0,4,1
261482,0,64,1,0.6204420455380038,0,180.0,3,0
290580,0,60,1,0.7686893564668077,0,296.0,4,1
...,...,...,...,...,...,...,...,...
282413,0,54,1,0.6924918349345762,0,211.0,4,1
643007,0,65,1,0.31416832462829586,0,120.0,4,5
29053,1,35,1,0.5707006757143257,0,303.0,2,1
361768,0,29,1,0.43247977514785535,0,152.0,4,5


In [39]:
train_set_p2.drop_duplicates(inplace=True)

Check if there are any null values (should not have any since we drop them during pre-process part)

In [40]:
train_set_p2.isnull().sum()

rideable_type          0
start_station_id       0
member_casual          0
distance            1877
intra                  0
duration_secs          0
season_num             0
time_of_day            0
dtype: int64

In [41]:
X_train_p2 = train_set_p2.drop('rideable_type', axis=1)
y_train_p2 = train_set_p2['rideable_type'].copy()

Class Imbalance

In [42]:
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=0)

X_train_p2, y_train_p2 = rus.fit_resample(X_train_p2, y_train_p2)

Pipeline

In [43]:
X_train_p2

Unnamed: 0,start_station_id,member_casual,distance,intra,duration_secs,season_num,time_of_day
0,79,0,0.7495877412392384,0,549.0,4,4
1,1,0,0.0,0,639.0,3,1
2,33,0,3.0257739258365133,0,682.0,4,1
3,33,1,0.8297759871467439,0,354.0,4,3
4,79,1,0.8873286951646564,0,221.0,1,1
...,...,...,...,...,...,...,...
45169,2,0,1.5564652033904733,0,370.0,2,1
45170,33,1,1.409124270662168,0,345.0,2,5
45171,22,1,0.40622353685465457,0,124.0,2,3
45172,4,0,1.3686253750214619,1,594.0,2,5


In [44]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer


num_pipeline = Pipeline([
        ('imp', IterativeImputer()),
        ('std_scaler', StandardScaler()),
    ])

In [46]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

num_attribs = ['distance', 'duration_secs']
cat_attribs = ['member_casual', 'time_of_day', 'season_num', 'intra', 'start_station_id']

preprocessor = ColumnTransformer([
        ('num', num_pipeline, num_attribs),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_attribs),
    ])

In [47]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

clf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=35))
    ]
)

param_grid = [
    {'classifier__n_estimators': [50, 100]},
  ]

grid_search = GridSearchCV(clf, param_grid, cv=2)
grid_search.fit(X_train_p2, y_train_p2)

GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('imp',
                                                                                          IterativeImputer()),
                                                                                         ('std_scaler',
                                                                                          StandardScaler())]),
                                                                         ['distance',
                                                                          'duration_secs']),
                                                                        ('cat',
                                                                         OneHotEncoder(handle_unknown='ignore'),
                                                  

Test set - doing exactly as the train dataset

Drop duplicated rows

In [386]:
dups = test_set_p2.duplicated()
dups.any()

True

In [387]:
test_set_p2.drop_duplicates(inplace=True)

Check if there are null values, (impute during pipeline just like train sets)

In [388]:
test_set_p2.isnull().sum()

rideable_type         0
start_station_id      0
member_casual         0
distance            547
intra                 0
duration_secs         0
season_num            0
time_of_day           0
dtype: int64

Split X and y

In [389]:
X_test = test_set_p2.drop('rideable_type', axis=1)
y_test = test_set_p2['rideable_type'].copy()

Model performance

In [390]:
from sklearn.metrics import mean_squared_error, confusion_matrix, classification_report, ConfusionMatrixDisplay
final_model = grid_search.best_estimator_

#grid_search.score(X_test, y_test)

print(classification_report(y_test, grid_search.predict(X_test)))

              precision    recall  f1-score   support

           0       0.95      0.81      0.88    115462
           1       0.51      0.66      0.57     27742
           2       0.28      0.85      0.42      3847

    accuracy                           0.79    147051
   macro avg       0.58      0.77      0.62    147051
weighted avg       0.85      0.79      0.81    147051



### Part 2: Classification Question 1

Based on the time spent, start station, end station, and distance, can we classify the user type and bike type (dock, electric, manual)? 

### Part 3: Classification Question 2

Based on predictive analysis, can we predict the length of trip and time of day (e.g. commuting hours, late nights, etc.)?

The above predictive and classification analysis will allow the NYC bike program stakeholders to predict and understand how riders are using their bikes. Based off of this information, they can create user personas and targeted advertising campaigns to grow ridership. This will facilitate a more efficient use of advertising dollars for the expansion.

NOtes: trips intra/inter neighborhods, pay atention to perf variables(falsepos/false neg)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=6751d308-8904-49e9-ab76-ff469d35696c' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>