### Import Dependencies

In [1]:
import pandas as pd
import os
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session

from config import conn_string

**get all files for loading**

In [2]:
# path
station_path = os.path.join('.', "station_info.csv")
weather_path = os.path.join('.', "historical_weather.csv")

#bikes
toronto_bikes = os.path.join("Resources/1_toronto_canada", "toronto_bikes.csv")
vancouver_bikes = os.path.join("Resources/2_vancouver_canada", "vancouver_bikes.csv")
boston_bikes = os.path.join("Resources/3_boston_us", "boston_bikes.csv")
nyc_bikes = os.path.join("Resources/4_NewYork_US", "nyc_bikes.csv")

# membertypes
toronto_users = os.path.join("Resources/1_toronto_canada", "toronto_membertypes.csv")
vancouver_users = os.path.join("Resources/2_vancouver_canada", "vancouver_membertypes.csv")
boston_users = os.path.join("Resources/3_boston_us", "boston_membertypes.csv")
nyc_users = os.path.join("Resources/4_NewYork_US", "nyc_membertypes.csv")

# ridership
toronto_ = os.path.join("Resources/1_toronto_canada", "toronto_bikeshare.csv")
vancouver_ = os.path.join("Resources/2_vancouver_canada", "vancouver_bikeshare.csv")
boston_ = os.path.join("Resources/3_boston_us", "boston_bikeshare.csv")
nyc_ = os.path.join("Resources/4_NewYork_US", "nyc_bikeshare.csv")


In [3]:
# read files
stns = pd.read_csv(station_path, encoding="utf-8")
weather = pd.read_csv(weather_path, encoding="utf-8")

# bikes
tor_bikes = pd.read_csv(toronto_bikes, encoding="utf-8")
van_bikes = pd.read_csv(vancouver_bikes, encoding="utf-8")
bos_bikes = pd.read_csv(boston_bikes, encoding="utf-8")
nyc_bikes = pd.read_csv(nyc_bikes, encoding="utf-8")

# member_types
tor_users = pd.read_csv(toronto_users, encoding="utf-8")
van_users = pd.read_csv(vancouver_users, encoding="utf-8")
bos_users = pd.read_csv(boston_users, encoding="utf-8")
nyc_users = pd.read_csv(nyc_users, encoding="utf-8")

# pricing index


# ridership
toronto = pd.read_csv(toronto_, encoding="utf-8")
vancouver = pd.read_csv(vancouver_, encoding="utf-8")
boston = pd.read_csv(boston_, encoding="utf-8")
newyork = pd.read_csv(nyc_, encoding="utf-8")


**Locations**

In [4]:
# create locations table
locations = [{"location_id": 1,"city":"Toronto", "country":"Canada" }, 
             {"location_id": 2,"city":"Vancouver", "country":"Canada"},
             {"location_id": 3,"city":"Boston", "country":"USA"},
             {"location_id": 4,"city":"New York", "country":"USA"}
            ]

df = pd.DataFrame(locations)
df.set_index("location_id", inplace=True)
df


Unnamed: 0_level_0,city,country
location_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toronto,Canada
2,Vancouver,Canada
3,Boston,USA
4,New York,USA


**Stations**

In [5]:
stns.head()

Unnamed: 0,station_id,station_name,latitude,longitude,location_id
0,7000,Fort York Blvd / Capreol Ct,43.639832,-79.395954,1
1,7001,Lower Jarvis St / The Esplanade,43.64783,-79.370698,1
2,7002,St. George St / Bloor St W,43.667333,-79.399429,1
3,7003,Madison Ave / Bloor St W,43.667158,-79.402761,1
4,7004,University Ave / Elm St,43.656518,-79.389099,1


**Historical Weather**

In [6]:
weather.head()

Unnamed: 0,location_id,forecast_date,maxtempc,humidity,total_precip,avg_cloudcover,avg_windspeed
0,1,2019-01-01,1,77.25,3.6,50.625,16.875
1,1,2019-01-02,-5,69.0,0.2,41.75,8.625
2,1,2019-01-03,-1,68.5,1.3,68.25,17.75
3,1,2019-01-04,2,77.25,0.0,7.625,18.125
4,1,2019-01-05,3,81.125,0.0,23.75,11.25


**Bikes**

In [7]:
tor_bikes.head()

Unnamed: 0,bike_id,location_id
0,1296,1
1,2947,1
2,2293,1
3,283,1
4,1799,1


In [8]:
van_bikes.head()

Unnamed: 0,bike_id,location_id
0,388.0,2
1,1880.0,2
2,779.0,2
3,2123.0,2
4,444.0,2


In [9]:
bos_bikes.head()

Unnamed: 0,bike_id,location_id
0,3689,3
1,4142,3
2,1628,3
3,2969,3
4,3469,3


In [10]:
nyc_bikes.head()

Unnamed: 0,bike_id,location_id
0,15839,4
1,32723,4
2,27451,4
3,21579,4
4,35379,4


**Member Types**

In [11]:
tor_users.head()

Unnamed: 0,member_type,location_id
0,Annual Member,1
1,Casual Member,1


In [12]:
van_users.head()

Unnamed: 0,member_type,location_id
0,365 Standard,2
1,365 Plus,2
2,Vancity Community Pass,2
3,365 Day Pass Plus SALE,2
4,365 Day Founding Standard,2


In [13]:
bos_users.head()

Unnamed: 0,member_type,location_id
0,Subscriber,3
1,Customer,3


In [14]:
nyc_users.head()

Unnamed: 0,member_type,location_id
0,Subscriber,4
1,Customer,4


**Pricing Index**

**Ridership**

In [15]:
# Toronto
toronto.head()

Unnamed: 0,trip_duration,start_date,end_date,start_station_id,end_station_id,bike_id,member_type,location_id
0,1547,01/01/2019 00:08,01/01/2019 00:33,7021,7233,1296,Annual Member,1
1,1112,01/01/2019 00:10,01/01/2019 00:29,7160,7051,2947,Annual Member,1
2,589,01/01/2019 00:15,01/01/2019 00:25,7055,7013,2293,Annual Member,1
3,259,01/01/2019 00:16,01/01/2019 00:20,7012,7235,283,Annual Member,1
4,281,01/01/2019 00:19,01/01/2019 00:24,7041,7257,1799,Annual Member,1


In [16]:
# Vancouver
vancouver.head()

Unnamed: 0,trip_duration,start_date,end_date,start_station_id,end_station_id,bike_id,member_type,location_id
0,177,2/1/2019 0:00,2/1/2019 0:00,198,298,388.0,365 Standard,2
1,221,2/1/2019 0:00,2/1/2019 0:00,193,173,1880.0,365 Plus,2
2,3272,2/1/2019 0:00,2/1/2019 0:00,63,143,779.0,Vancity Community Pass,2
3,157,2/1/2019 0:00,2/1/2019 0:00,64,78,2123.0,365 Day Pass Plus SALE,2
4,516,1/31/2019 23:00,2/1/2019 0:00,60,196,444.0,365 Day Pass Plus SALE,2


In [17]:
# Boston
boston.head()

Unnamed: 0,trip_duration,start_date,end_date,start_station_id,end_station_id,bike_id,member_type,location_id
0,371,2019-01-01 00:09:13.7980,2019-01-01 00:15:25.3360,80,179,3689,Subscriber,3
1,264,2019-01-01 00:33:56.1820,2019-01-01 00:38:20.8800,117,189,4142,Subscriber,3
2,458,2019-01-01 00:41:54.6000,2019-01-01 00:49:33.2730,68,96,1628,Subscriber,3
3,364,2019-01-01 00:43:32.5710,2019-01-01 00:49:37.4260,89,334,2969,Subscriber,3
4,681,2019-01-01 00:49:56.4640,2019-01-01 01:01:17.7010,73,367,3469,Subscriber,3


In [18]:
# New York
newyork.head()

Unnamed: 0,trip_duration,start_date,end_date,start_station_id,end_station_id,bike_id,member_type,location_id
0,320,2019-01-01 00:01:47.4010,2019-01-01 00:07:07.5810,3160.0,3283.0,15839,Subscriber,4
1,316,2019-01-01 00:04:43.7360,2019-01-01 00:10:00.6080,519.0,518.0,32723,Subscriber,4
2,591,2019-01-01 00:06:03.9970,2019-01-01 00:15:55.4380,3171.0,3154.0,27451,Subscriber,4
3,2719,2019-01-01 00:07:03.5450,2019-01-01 00:52:22.6500,504.0,3709.0,21579,Subscriber,4
4,303,2019-01-01 00:07:35.9450,2019-01-01 00:12:39.5020,229.0,503.0,35379,Subscriber,4


#### Load data

In [19]:
# connect to database
cnx = f'postgresql://{conn_string}'
engine = create_engine(cnx)

In [20]:
# confirm tables
engine.table_names()

['locations',
 'membertypes',
 'stations',
 'bikes',
 'ridership',
 'pricing',
 'historicalweather']

In [None]:
# load locations into database
df.to_sql(name='locations', con=engine, if_exists='append', index=True)

In [None]:
# load stations into database
stns.to_sql(name='stations', con=engine, if_exists='append', index=False)


In [None]:
# load historical weather into database
weather.to_sql(name='historicalweather', con=engine, if_exists='append', index=False)

In [None]:
# load toronto bikes into database
tor_bikes.to_sql(name='bikes', con=engine, if_exists='append', index=False)

In [None]:
# load vancouver bikes into database
van_bikes.to_sql(name='bikes', con=engine, if_exists='append', index=False)

In [None]:
# load boston bikes into database
bos_bikes.to_sql(name='bikes', con=engine, if_exists='append', index=False)

In [None]:
# load new york bikes into database
nyc_bikes.to_sql(name='bikes', con=engine, if_exists='append', index=False)

In [None]:
# load toronto member types
tor_users.to_sql(name='membertypes', con=engine, if_exists='append', index=False)

In [None]:
# load vancouver member types
van_users.to_sql(name='membertypes', con=engine, if_exists='append', index=False)

In [None]:
# load boston member types
bos_users.to_sql(name='membertypes', con=engine, if_exists='append', index=False)

In [None]:
# load new york member types
nyc_users.to_sql(name='membertypes', con=engine, if_exists='append', index=False)

### Load Ridership

In [21]:
# check reference to stations table
# create session
session = Session(bind=engine)
# declare a base using automap_base
Base = automap_base()
# use the Base class to reflect the database tables
Base.prepare(engine, reflect=True)
Base.classes.keys()
Stations = Base.classes.stations

***Verify that all station_ids used in bikeshare exists in Stations table***

In [22]:
# get all stations
all_stations = session.query(Stations.station_id).all()
all_stns_df = pd.DataFrame(all_stations)
all_stns_df

Unnamed: 0,station_id
0,7000
1,7001
2,7002
3,7003
4,7004
...,...
2396,4248
2397,4249
2398,4250
2399,4252


### ***Toronto***

In [23]:
# check for Toronto records with start_station_id not in stations table
missing_start = toronto[-toronto["start_station_id"].isin(all_stns_df["station_id"])]
missing_start = missing_start.rename(columns={"start_station_id":"station_id"})

In [24]:
missing_start

Unnamed: 0,trip_duration,start_date,end_date,station_id,end_station_id,bike_id,member_type,location_id


In [25]:
# check for Toronto records with end_station_id not in stations table
missing_end = toronto[-toronto["end_station_id"].isin(all_stns_df["station_id"])]
missing_end = missing_end.rename(columns={"start_station_id":"station_id"})


In [26]:
missing_end

Unnamed: 0,trip_duration,start_date,end_date,station_id,end_station_id,bike_id,member_type,location_id


*Load rows with station_id existing in stations table*

In [27]:
toronto = toronto[toronto["start_station_id"].isin(all_stns_df["station_id"])]
toronto = toronto[toronto["end_station_id"].isin(all_stns_df["station_id"])]

In [28]:
toronto

Unnamed: 0,trip_duration,start_date,end_date,start_station_id,end_station_id,bike_id,member_type,location_id
0,1547,01/01/2019 00:08,01/01/2019 00:33,7021,7233,1296,Annual Member,1
1,1112,01/01/2019 00:10,01/01/2019 00:29,7160,7051,2947,Annual Member,1
2,589,01/01/2019 00:15,01/01/2019 00:25,7055,7013,2293,Annual Member,1
3,259,01/01/2019 00:16,01/01/2019 00:20,7012,7235,283,Annual Member,1
4,281,01/01/2019 00:19,01/01/2019 00:24,7041,7257,1799,Annual Member,1
...,...,...,...,...,...,...,...,...
99995,465,02/27/2019 07:03,02/27/2019 07:11,7320,7323,3729,Annual Member,1
99996,360,02/27/2019 07:04,02/27/2019 07:10,7143,7252,3641,Annual Member,1
99997,447,02/27/2019 07:05,02/27/2019 07:12,7054,7033,1091,Annual Member,1
99998,407,02/27/2019 07:05,02/27/2019 07:12,7031,7129,1028,Annual Member,1


In [30]:
# get all Toronto stations
stns_toronto = session.query(Stations.station_id).filter(Stations.location_id==1).all()
stns_toronto = pd.DataFrame(stns_toronto)
stns_toronto = stns_toronto.rename(columns={"station_id": "start_station_id"})
stns_toronto

Unnamed: 0,start_station_id
0,7000
1,7001
2,7002
3,7003
4,7004
...,...
602,7660
603,7662
604,7663
605,7664


In [31]:
# remove Toronto records with start_station_id not in stations table
tor_df = pd.merge(toronto, stns_toronto, how='inner', on='start_station_id')

In [34]:
tor_df

Unnamed: 0,trip_duration,start_date,end_date,start_station_id,end_station_id,bike_id,member_type,location_id
0,1547,01/01/2019 00:08,01/01/2019 00:33,7021,7233,1296,Annual Member,1
1,453,01/01/2019 10:35,01/01/2019 10:42,7021,7253,1118,Annual Member,1
2,632,01/01/2019 14:13,01/01/2019 14:24,7021,7323,111,Annual Member,1
3,234,01/01/2019 19:05,01/01/2019 19:09,7021,7052,942,Annual Member,1
4,843,01/02/2019 09:14,01/02/2019 09:28,7021,7036,1814,Annual Member,1
...,...,...,...,...,...,...,...,...
99995,1255,02/02/2019 14:24,02/02/2019 14:45,7345,7076,3156,Casual Member,1
99996,1240,02/02/2019 14:24,02/02/2019 14:45,7345,7076,1233,Casual Member,1
99997,1226,02/03/2019 15:23,02/03/2019 15:43,7345,7108,2974,Annual Member,1
99998,1515,02/04/2019 16:06,02/04/2019 16:31,7345,7313,851,Annual Member,1


In [32]:
# rename Toronto station column to check if end_station_id exists in stations table
stns_toronto = stns_toronto.rename(columns={"start_station_id": "end_station_id"})
stns_toronto

Unnamed: 0,end_station_id
0,7000
1,7001
2,7002
3,7003
4,7004
...,...
602,7660
603,7662
604,7663
605,7664


In [36]:
#remove Toronto records with end_station_id not in stations table
toronto = pd.merge(tor_df, stns_toronto, how='inner', on='end_station_id')

In [61]:
# count number of records for insert to ridership table
toronto.count()

trip_duration       100000
start_date          100000
end_date            100000
start_station_id    100000
end_station_id      100000
bike_id             100000
member_type         100000
location_id         100000
dtype: int64

In [None]:
# Load Toronto bikeshare data to ridership table
toronto.to_sql(name='ridership', con=engine, if_exists='append', index=False, index_label='id')

### ***Vancouver***

In [38]:
# check for Vancouver records with start_station_id missing in stations table
missing_start = vancouver[-vancouver["start_station_id"].isin(all_stns_df["station_id"])]
missing_start = missing_start.rename(columns={"start_station_id":"station_id"})


In [39]:
missing_start

Unnamed: 0,trip_duration,start_date,end_date,station_id,end_station_id,bike_id,member_type,location_id
8712,28,1/25/2019 18:00,1/25/2019 18:00,990,990,1306.0,VIP,2
9630,170,1/25/2019 9:00,1/25/2019 9:00,990,225,1412.0,VIP,2
11715,30,1/23/2019 17:00,1/23/2019 17:00,991,990,681.0,VIP,2
12212,1600,1/23/2019 11:00,1/23/2019 11:00,990,54,412.0,VIP,2
12238,34,1/23/2019 10:00,1/23/2019 10:00,990,990,412.0,VIP,2
12249,26,1/23/2019 10:00,1/23/2019 10:00,991,991,412.0,VIP,2
12273,642,1/23/2019 10:00,1/23/2019 10:00,990,6,843.0,365 Day Pass Plus SALE,2
14000,5510,1/21/2019 14:00,1/21/2019 16:00,990,72,1775.0,365 Plus,2
14005,5755,1/21/2019 14:00,1/21/2019 16:00,990,72,1127.0,365 Plus,2
15203,768,1/20/2019 15:00,1/20/2019 16:00,990,254,504.0,VIP,2


In [40]:
# check for Vancouver records with end_station_id missing in stations table
missing_end = vancouver[-vancouver["end_station_id"].isin(all_stns_df["station_id"])]
missing_end = missing_end.rename(columns={"start_station_id":"station_id"})


In [41]:
missing_end

Unnamed: 0,trip_duration,start_date,end_date,station_id,end_station_id,bike_id,member_type,location_id
1215,1035,1/31/2019 8:00,1/31/2019 8:00,107,990,1542.0,VIP,2
4437,662,1/29/2019 6:00,1/29/2019 6:00,242,990,235.0,VIP,2
5764,885,1/28/2019 8:00,1/28/2019 9:00,15,990,824.0,VIP,2
5805,4839,1/28/2019 8:00,1/28/2019 9:00,254,991,1623.0,VIP,2
5811,1208,1/28/2019 8:00,1/28/2019 8:00,206,990,1133.0,VIP,2
6544,706,1/27/2019 14:00,1/27/2019 14:00,150,990,1034.0,VIP,2
8712,28,1/25/2019 18:00,1/25/2019 18:00,990,990,1306.0,VIP,2
9575,186,1/25/2019 10:00,1/25/2019 10:00,225,990,1412.0,VIP,2
11715,30,1/23/2019 17:00,1/23/2019 17:00,991,990,681.0,VIP,2
12238,34,1/23/2019 10:00,1/23/2019 10:00,990,990,412.0,VIP,2


In [42]:
# check Vancouver records with start_station_id and end_station_id existing in stations table
vancouver = vancouver[vancouver["start_station_id"].isin(all_stns_df["station_id"])]
vancouver = vancouver[vancouver["end_station_id"].isin(all_stns_df["station_id"])]

In [43]:
vancouver

Unnamed: 0,trip_duration,start_date,end_date,start_station_id,end_station_id,bike_id,member_type,location_id
0,177,2/1/2019 0:00,2/1/2019 0:00,198,298,388.0,365 Standard,2
1,221,2/1/2019 0:00,2/1/2019 0:00,193,173,1880.0,365 Plus,2
2,3272,2/1/2019 0:00,2/1/2019 0:00,63,143,779.0,Vancity Community Pass,2
3,157,2/1/2019 0:00,2/1/2019 0:00,64,78,2123.0,365 Day Pass Plus SALE,2
4,516,1/31/2019 23:00,2/1/2019 0:00,60,196,444.0,365 Day Pass Plus SALE,2
...,...,...,...,...,...,...,...,...
36740,1382,1/1/2019 0:00,1/1/2019 1:00,105,84,511.0,24 Hour,2
36741,1009,1/1/2019 0:00,1/1/2019 0:00,88,218,723.0,365 Corporate Standard,2
36742,1144,1/1/2019 0:00,1/1/2019 1:00,139,167,938.0,365 Day Founding Plus,2
36743,1105,1/1/2019 0:00,1/1/2019 0:00,88,218,1450.0,365 Day Founding Plus,2


In [45]:
# get all Vancouver stations
stns_vancouver = session.query(Stations.station_id).filter(Stations.location_id==2).all()
stns_vancouver = pd.DataFrame(stns_vancouver)
stns_vancouver = stns_vancouver.rename(columns={"station_id": "start_station_id"})
stns_vancouver

Unnamed: 0,start_station_id
0,1
1,2
2,4
3,5
4,6
...,...
197,285
198,287
199,297
200,298


In [46]:
# remove Vancouver records with start_station_id not in stations table
vancouver_df = pd.merge(vancouver, stns_vancouver, how='inner', on='start_station_id')

In [48]:
# rename Vancouver station column to end_station_id
stns_vancouver = stns_vancouver.rename(columns={"start_station_id": "end_station_id"})
stns_vancouver.head()

Unnamed: 0,end_station_id
0,1
1,2
2,4
3,5
4,6


In [49]:
#remove Vancouver records with end_station_id not in stations table
vancouver = pd.merge(vancouver_df, stns_vancouver, how='inner', on='end_station_id')

In [50]:
# check number of records for insert to database
vancouver.count()

trip_duration       36680
start_date          36680
end_date            36680
start_station_id    36680
end_station_id      36680
bike_id             36680
member_type         36680
location_id         36680
dtype: int64

In [None]:
# Load Vancouver bikeshare data to ridership table
vancouver.to_sql(name='ridership', con=engine, if_exists='append', index=False, index_label='id')

### ***Boston***

In [52]:
# check Boston data for missing start_station_id
missing_start = boston[-boston["start_station_id"].isin(all_stns_df["station_id"])]
missing_start

Unnamed: 0,trip_duration,start_date,end_date,start_station_id,end_station_id,bike_id,member_type,location_id
215,610,2019-01-01 11:05:52.2480,2019-01-01 11:16:02.6380,42,40,2648,Subscriber,3
302,1657,2019-01-01 11:59:53.3000,2019-01-01 12:27:30.3620,42,65,3553,Customer,3
306,1509,2019-01-01 12:02:14.3990,2019-01-01 12:27:24.3260,42,65,2790,Customer,3
486,1874,2019-01-01 13:13:17.7800,2019-01-01 13:44:32.1500,42,42,4118,Customer,3
488,1854,2019-01-01 13:13:35.4750,2019-01-01 13:44:29.5620,42,42,4441,Customer,3
...,...,...,...,...,...,...,...,...
69339,365,2019-01-31 17:35:58.4630,2019-01-31 17:42:03.9100,42,22,4462,Subscriber,3
69375,448,2019-01-31 17:45:04.7500,2019-01-31 17:52:33.3280,42,190,2269,Subscriber,3
69566,1308,2019-01-31 18:49:23.9190,2019-01-31 19:11:12.3160,42,139,4463,Subscriber,3
69615,443,2019-01-31 19:14:51.1830,2019-01-31 19:22:14.8730,42,6,4467,Customer,3


In [53]:
# check for missing end_station_id
missing_end = boston[-boston["end_station_id"].isin(all_stns_df["station_id"])]
missing_end

Unnamed: 0,trip_duration,start_date,end_date,start_station_id,end_station_id,bike_id,member_type,location_id
44,622,2019-01-01 06:06:01.8390,2019-01-01 06:16:24.1470,157,42,4553,Subscriber,3
226,1646,2019-01-01 11:11:29.5610,2019-01-01 11:38:55.9730,356,42,2423,Customer,3
227,1596,2019-01-01 11:11:43.6340,2019-01-01 11:38:20.4030,356,42,2790,Customer,3
261,959,2019-01-01 11:31:57.0740,2019-01-01 11:47:56.1730,105,42,3955,Subscriber,3
316,1043,2019-01-01 12:08:56.8120,2019-01-01 12:26:20.0560,178,42,4371,Customer,3
...,...,...,...,...,...,...,...,...
69296,228,2019-01-31 17:28:19.5150,2019-01-31 17:32:07.7730,36,42,4462,Subscriber,3
69405,323,2019-01-31 17:52:26.5220,2019-01-31 17:57:50.2330,374,42,3415,Subscriber,3
69480,252,2019-01-31 18:19:35.3470,2019-01-31 18:23:47.8430,374,42,4355,Subscriber,3
69516,309,2019-01-31 18:28:31.2450,2019-01-31 18:33:40.6540,16,42,2601,Subscriber,3


In [54]:
boston = boston[boston["start_station_id"].isin(all_stns_df["station_id"])]
boston = boston[boston["end_station_id"].isin(all_stns_df["station_id"])]
boston

Unnamed: 0,trip_duration,start_date,end_date,start_station_id,end_station_id,bike_id,member_type,location_id
0,371,2019-01-01 00:09:13.7980,2019-01-01 00:15:25.3360,80,179,3689,Subscriber,3
1,264,2019-01-01 00:33:56.1820,2019-01-01 00:38:20.8800,117,189,4142,Subscriber,3
2,458,2019-01-01 00:41:54.6000,2019-01-01 00:49:33.2730,68,96,1628,Subscriber,3
3,364,2019-01-01 00:43:32.5710,2019-01-01 00:49:37.4260,89,334,2969,Subscriber,3
4,681,2019-01-01 00:49:56.4640,2019-01-01 01:01:17.7010,73,367,3469,Subscriber,3
...,...,...,...,...,...,...,...,...
69867,431,2019-01-31 23:48:35.5680,2019-01-31 23:55:47.0140,81,23,2696,Subscriber,3
69868,567,2019-01-31 23:49:11.7920,2019-01-31 23:58:39.3080,178,116,3010,Subscriber,3
69869,420,2019-01-31 23:49:55.6920,2019-01-31 23:56:55.7720,108,87,3676,Subscriber,3
69870,249,2019-01-31 23:51:19.3780,2019-01-31 23:55:28.9770,68,178,2203,Subscriber,3


In [55]:
# get all Boston stations
stns_boston = session.query(Stations.station_id).filter(Stations.location_id==3).all()
stns_boston = pd.DataFrame(stns_boston)
stns_boston = stns_boston.rename(columns={"station_id": "start_station_id"})
stns_boston

Unnamed: 0,start_station_id
0,3
1,4
2,5
3,6
4,7
...,...
373,495
374,496
375,497
376,498


In [56]:
# remove Boston records with start_station_id not in stations table
df = pd.merge(boston, stns_boston, how='inner', on='start_station_id')

In [58]:
# rename Boston station column to end_station_id
stns_boston = stns_boston.rename(columns={"start_station_id": "end_station_id"})
stns_boston.head()

Unnamed: 0,end_station_id
0,3
1,4
2,5
3,6
4,7


In [59]:
#remove Boston records with end_station_id not in stations table
boston_df = pd.merge(df, stns_boston, how='inner', on='end_station_id')

In [60]:
# count number of Boston records for insert to ridership table
boston_df.count()

trip_duration       68210
start_date          68210
end_date            68210
start_station_id    68210
end_station_id      68210
bike_id             68210
member_type         68210
location_id         68210
dtype: int64

In [None]:
# load Boston records to ridership table
boston_df.to_sql(name='ridership', con=engine, if_exists='append', index=False, index_label='id')

### ***New York***

In [62]:
# check for New York records with missing start_station_id in stations table
missing_start = newyork[-newyork["start_station_id"].isin(all_stns_df["station_id"])]
missing_start

Unnamed: 0,trip_duration,start_date,end_date,start_station_id,end_station_id,bike_id,member_type,location_id
6,280,2019-01-01 00:09:21.0060,2019-01-01 00:14:01.1510,3675.0,3288.0,35391,Subscriber,4
72,740,2019-01-01 00:42:16.3560,2019-01-01 00:54:36.8120,524.0,526.0,34089,Subscriber,4
97,967,2019-01-01 00:50:10.0810,2019-01-01 01:06:17.7230,524.0,236.0,35052,Subscriber,4
138,207,2019-01-01 01:00:48.7520,2019-01-01 01:04:15.8860,3360.0,3434.0,25854,Subscriber,4
143,642,2019-01-01 01:01:34.8600,2019-01-01 01:12:17.7070,3341.0,3529.0,28933,Subscriber,4
...,...,...,...,...,...,...,...,...
99956,467,2019-01-03 21:40:06.5770,2019-01-03 21:47:54.1780,3631.0,436.0,29558,Subscriber,4
99965,139,2019-01-03 21:40:54.0590,2019-01-03 21:43:13.1320,382.0,3463.0,35075,Subscriber,4
99971,125,2019-01-03 21:41:20.4860,2019-01-03 21:43:26.2770,243.0,298.0,35636,Subscriber,4
99992,610,2019-01-03 21:42:59.7330,2019-01-03 21:53:10.0820,3660.0,3255.0,33448,Customer,4


In [63]:
# check for New York records with missing end_station_id in stations table
missing_end = newyork[-newyork["end_station_id"].isin(all_stns_df["station_id"])]
missing_end

Unnamed: 0,trip_duration,start_date,end_date,start_station_id,end_station_id,bike_id,member_type,location_id
1,316,2019-01-01 00:04:43.7360,2019-01-01 00:10:00.6080,519.0,518.0,32723,Subscriber,4
18,829,2019-01-01 00:19:08.1030,2019-01-01 00:32:57.3880,3165.0,3295.0,32106,Subscriber,4
20,736,2019-01-01 00:21:02.9570,2019-01-01 00:33:19.4110,3165.0,3295.0,16761,Customer,4
28,1068,2019-01-01 00:25:13.8750,2019-01-01 00:43:01.9350,394.0,3431.0,32042,Subscriber,4
58,187,2019-01-01 00:35:03.5980,2019-01-01 00:38:10.6250,484.0,524.0,34089,Subscriber,4
...,...,...,...,...,...,...,...,...
99903,509,2019-01-03 21:37:30.1870,2019-01-03 21:46:00.1730,340.0,3701.0,35319,Subscriber,4
99904,333,2019-01-03 21:37:26.4860,2019-01-03 21:42:59.9570,174.0,3431.0,32840,Subscriber,4
99909,388,2019-01-03 21:37:41.5310,2019-01-03 21:44:09.7860,3284.0,3168.0,28117,Subscriber,4
99914,307,2019-01-03 21:38:13.3420,2019-01-03 21:43:20.4560,3435.0,3474.0,27723,Subscriber,4


In [65]:
# check New York records with start_station_id and end_station_id both in stations table
newyork = newyork[newyork["start_station_id"].isin(all_stns_df["station_id"])]
newyork = newyork[newyork["end_station_id"].isin(all_stns_df["station_id"])]
newyork

Unnamed: 0,trip_duration,start_date,end_date,start_station_id,end_station_id,bike_id,member_type,location_id
0,320,2019-01-01 00:01:47.4010,2019-01-01 00:07:07.5810,3160.0,3283.0,15839,Subscriber,4
2,591,2019-01-01 00:06:03.9970,2019-01-01 00:15:55.4380,3171.0,3154.0,27451,Subscriber,4
3,2719,2019-01-01 00:07:03.5450,2019-01-01 00:52:22.6500,504.0,3709.0,21579,Subscriber,4
4,303,2019-01-01 00:07:35.9450,2019-01-01 00:12:39.5020,229.0,503.0,35379,Subscriber,4
5,535,2019-01-01 00:08:33.1790,2019-01-01 00:17:28.3540,3630.0,3529.0,30315,Subscriber,4
...,...,...,...,...,...,...,...,...
99994,774,2019-01-03 21:42:58.5580,2019-01-03 21:55:52.7160,276.0,223.0,33988,Subscriber,4
99995,61,2019-01-03 21:43:05.5590,2019-01-03 21:44:06.8270,3137.0,3137.0,20902,Subscriber,4
99996,523,2019-01-03 21:43:19.0950,2019-01-03 21:52:02.7520,476.0,507.0,18518,Subscriber,4
99997,1179,2019-01-03 21:43:13.7950,2019-01-03 22:02:53.6280,3158.0,3383.0,16710,Customer,4


In [66]:
# get New York stations data
stns_newyork = session.query(Stations.station_id).filter(Stations.location_id==4).all()
stns_newyork = pd.DataFrame(stns_newyork)
stns_newyork = stns_newyork.rename(columns={"station_id": "start_station_id"})
stns_newyork

Unnamed: 0,start_station_id
0,3477
1,72
2,79
3,82
4,83
...,...
1209,4248
1210,4249
1211,4250
1212,4252


In [67]:
# remove New York records with start_station_id not existing in stations table
ny_df = pd.merge(newyork, stns_newyork, how='inner', on='start_station_id')

In [69]:
# rename New Yorks stations column to end_station_id
stns_newyork = stns_newyork.rename(columns={"start_station_id": "end_station_id"})
stns_newyork.head()

Unnamed: 0,end_station_id
0,3477
1,72
2,79
3,82
4,83


In [70]:
# remove New York records with end_station_id not existing in stations table
newyork_df = pd.merge(ny_df, stns_newyork, how='inner', on='end_station_id')

In [72]:
# count number of New York records for insert to ridership table
newyork_df.count()

trip_duration       81231
start_date          81231
end_date            81231
start_station_id    81231
end_station_id      81231
bike_id             81231
member_type         81231
location_id         81231
dtype: int64

In [None]:
newyork_df.to_sql(name='ridership', con=engine, if_exists='append', index=False, index_label='id')