In [1]:
import os

import pandas as pd
from sqlalchemy import create_engine, inspect

from config import conn_string

**Read all files**

In [2]:
# Lookup tables stored next to the notebook
station_path = os.path.join('.', "station_info.csv")
weather_path = os.path.join('.', "historical_weather.csv")

# One resource folder per city
toronto_dir = "Resources/1_toronto_canada"
vancouver_dir = "Resources/2_vancouver_canada"
boston_dir = "Resources/3_boston_us"
newyork_dir = "Resources/4_NewYork_US"

# Bike inventory CSVs
toronto_bikes = os.path.join(toronto_dir, "toronto_bikes.csv")
vancouver_bikes = os.path.join(vancouver_dir, "vancouver_bikes.csv")
boston_bikes = os.path.join(boston_dir, "boston_bikes.csv")
nyc_bikes = os.path.join(newyork_dir, "nyc_bikes.csv")

# Member type CSVs
toronto_users = os.path.join(toronto_dir, "toronto_membertypes.csv")
vancouver_users = os.path.join(vancouver_dir, "vancouver_membertypes.csv")
boston_users = os.path.join(boston_dir, "boston_membertypes.csv")
nyc_users = os.path.join(newyork_dir, "nyc_membertypes.csv")

# Ridership (trip) CSVs; the trailing underscore marks a path, the bare city
# name is used for the DataFrame read from it
toronto_ = os.path.join(toronto_dir, "toronto_bikeshare.csv")
vancouver_ = os.path.join(vancouver_dir, "vancouver_bikeshare.csv")
boston_ = os.path.join(boston_dir, "boston_bikeshare.csv")
nyc_ = os.path.join(newyork_dir, "nyc_bikeshare.csv")


In [3]:
# read files
# Lookup tables stored next to the notebook.
stns = pd.read_csv(station_path, encoding="utf-8")
weather = pd.read_csv(weather_path, encoding="utf-8")

# bikes
tor_bikes = pd.read_csv(toronto_bikes, encoding="utf-8")
van_bikes = pd.read_csv(vancouver_bikes, encoding="utf-8")
bos_bikes = pd.read_csv(boston_bikes, encoding="utf-8")
# `nyc_bikes` is also the name of the path variable set in the path cell, so
# binding the DataFrame to it destroys the path. The path is spelled out here
# so this cell can be executed more than once without passing a DataFrame to
# read_csv.
nyc_bikes = pd.read_csv(
    os.path.join("Resources/4_NewYork_US", "nyc_bikes.csv"), encoding="utf-8"
)

# member_types
tor_users = pd.read_csv(toronto_users, encoding="utf-8")
van_users = pd.read_csv(vancouver_users, encoding="utf-8")
bos_users = pd.read_csv(boston_users, encoding="utf-8")
# Same name clash as `nyc_bikes`: the path is built inline for the same reason.
nyc_users = pd.read_csv(
    os.path.join("Resources/4_NewYork_US", "nyc_membertypes.csv"), encoding="utf-8"
)

# pricing index
# (nothing is read for the pricing table in this cell)


# ridership
toronto = pd.read_csv(toronto_, encoding="utf-8")
vancouver = pd.read_csv(vancouver_, encoding="utf-8")
boston = pd.read_csv(boston_, encoding="utf-8")
newyork = pd.read_csv(nyc_, encoding="utf-8")


In [28]:
# Re-read the Boston ridership CSV, replacing whatever `boston` currently holds.
# NOTE(review): duplicates the read in the "read files" cell; presumably used to
# reset `boston` after the filtering cell in the Boston section — confirm it is still needed.
boston = pd.read_csv(boston_, encoding="utf-8")

In [29]:
# Non-null value count for every column of the Boston ridership frame.
boston.count()

trip_duration       2522771
start_date          2522771
end_date            2522771
start_station_id    2522771
end_station_id      2522771
bike_id             2522771
member_type         2522771
location_id         2522771
dtype: int64

**Locations**

In [None]:
# Reference data for the four cities; location_id becomes the frame's index
location_fields = ("location_id", "city", "country")
location_rows = (
    (1, "Toronto", "Canada"),
    (2, "Vancouver", "Canada"),
    (3, "Boston", "USA"),
    (4, "New York", "USA"),
)
locations = [dict(zip(location_fields, row)) for row in location_rows]

df = pd.DataFrame(locations).set_index("location_id")
df


**Stations**

In [None]:
# Preview the first rows of the stations frame read from station_info.csv.
stns.head()

**Historical Weather**

In [None]:
# Preview the first rows of the weather frame read from historical_weather.csv.
weather.head()

**Bikes**

In [None]:
# Preview the first rows of the Toronto bikes frame.
tor_bikes.head()

In [None]:
# Preview the first rows of the Vancouver bikes frame.
van_bikes.head()

In [None]:
# Preview the first rows of the Boston bikes frame.
bos_bikes.head()

In [None]:
# Preview the first rows of the New York bikes frame.
# By this point `nyc_bikes` must hold the DataFrame from the "read files" cell,
# not the CSV path the same name is first assigned in the path cell.
nyc_bikes.head()

**Member Types**

In [None]:
# Preview the first rows of the Toronto member types frame.
tor_users.head()

In [None]:
# Preview the first rows of the Vancouver member types frame.
van_users.head()

In [None]:
# Preview the first rows of the Boston member types frame.
bos_users.head()

In [None]:
# Preview the first rows of the New York member types frame.
# By this point `nyc_users` must hold the DataFrame from the "read files" cell,
# not the CSV path the same name is first assigned in the path cell.
nyc_users.head()

**Pricing Index**

**Ridership**

In [None]:
# Toronto: preview the first rows of the Toronto ridership frame

toronto.head()

### Load data

In [4]:
# Build the PostgreSQL connection URL from `conn_string`, which is imported
# from the local config module so it is not written into the notebook.
cnx = f'postgresql://{conn_string}'
# Engine shared by every to_sql load and by the ORM session below.
engine = create_engine(cnx)

In [5]:
# confirm tables
# Engine.table_names() is deprecated as of SQLAlchemy 1.4 and removed in 2.0;
# the inspector exposes the same list of table names and is also available in
# earlier releases.
inspect(engine).get_table_names()

['locations',
 'membertypes',
 'stations',
 'bikes',
 'ridership',
 'pricing',
 'historicalweather']

In [None]:
# load locations into database
# index=True writes the location_id index as a column; if_exists='append'
# inserts into the existing table instead of replacing it.
# NOTE(review): `df` is rebound to a Boston merge result in the Boston section,
# so this cell must run before that one (or the locations frame be rebuilt first).
df.to_sql(name='locations', con=engine, if_exists='append', index=True)

In [None]:
# load stations into database
# Rows are appended to the existing `stations` table; the DataFrame index is not written.
stns.to_sql(name='stations', con=engine, if_exists='append', index=False)


In [None]:
# load historical weather into database
# Rows are appended to the existing `historicalweather` table; the DataFrame index is not written.
weather.to_sql(name='historicalweather', con=engine, if_exists='append', index=False)

In [None]:
# load toronto bikes into database
# All four city bike frames are appended to the same `bikes` table.
tor_bikes.to_sql(name='bikes', con=engine, if_exists='append', index=False)

In [None]:
# load vancouver bikes into database
# Appended to the shared `bikes` table; the DataFrame index is not written.
van_bikes.to_sql(name='bikes', con=engine, if_exists='append', index=False)

In [None]:
# load boston bikes into database
# Appended to the shared `bikes` table; the DataFrame index is not written.
bos_bikes.to_sql(name='bikes', con=engine, if_exists='append', index=False)

In [None]:
# load new york bikes into database
# Appended to the shared `bikes` table; the DataFrame index is not written.
# `nyc_bikes` must be the DataFrame from the "read files" cell here, not the CSV path.
nyc_bikes.to_sql(name='bikes', con=engine, if_exists='append', index=False)

In [None]:
# load toronto member types
# All four city member type frames are appended to the same `membertypes` table.
tor_users.to_sql(name='membertypes', con=engine, if_exists='append', index=False)

In [None]:
# load vancouver member types
# Appended to the shared `membertypes` table; the DataFrame index is not written.
van_users.to_sql(name='membertypes', con=engine, if_exists='append', index=False)

In [None]:
# load boston member types
# Appended to the shared `membertypes` table; the DataFrame index is not written.
bos_users.to_sql(name='membertypes', con=engine, if_exists='append', index=False)

In [None]:
# load new york member types
# Appended to the shared `membertypes` table; the DataFrame index is not written.
# `nyc_users` must be the DataFrame from the "read files" cell here, not the CSV path.
nyc_users.to_sql(name='membertypes', con=engine, if_exists='append', index=False)

**Load Ridership**

In [6]:
# NOTE(review): declarative_base is imported but not used in any visible cell.
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session

# check reference to stations table
# create session
session = Session(bind=engine)
# declare a base using automap_base
Base = automap_base()
# use the Base class to reflect the database tables
Base.prepare(engine, reflect=True)
# Not the last expression in the cell, so the list of mapped class names is
# evaluated but not displayed.
Base.classes.keys()
# Mapped class for the `stations` table, queried by the cells below.
Stations = Base.classes.stations

*Verify that all station_ids used in the bikeshare data exist in the Stations table*

In [7]:
# Fetch every station_id stored in the stations table (all locations).
all_stations = session.query(Stations.station_id).all()
# Wrap the result rows in a DataFrame; its station_id column is the lookup
# used by the isin() checks in the city sections below.
all_stns_df = pd.DataFrame(all_stations)
all_stns_df

Unnamed: 0,station_id
0,7000
1,7001
2,7002
3,7003
4,7004
...,...
2396,4248
2397,4249
2398,4250
2399,4252


***Toronto***

In [None]:
# Toronto trips whose start station is absent from the stations table;
# the start-station column is relabelled station_id in the result.
start_is_known = toronto["start_station_id"].isin(all_stns_df["station_id"])
missing_start = toronto[~start_is_known].rename(columns={"start_station_id": "station_id"})


In [None]:
# Display the rows collected in `missing_start` by the preceding cell.
missing_start

In [None]:
# Toronto trips whose end station is absent from the stations table.
end_is_known = toronto["end_station_id"].isin(all_stns_df["station_id"])
# The end-station column is the one that failed the lookup, so that is the
# column relabelled station_id; start_station_id keeps its own name.
missing_end = toronto[~end_is_known].rename(columns={"end_station_id": "station_id"})


In [None]:
# Display the rows collected in `missing_end` by the preceding cell.
missing_end

*Load only the rows whose start and end station_ids exist in the stations table*

In [None]:
# Keep only Toronto trips whose start AND end stations are both present in the
# stations table.
known_station_ids = all_stns_df["station_id"]
start_is_known = toronto["start_station_id"].isin(known_station_ids)
end_is_known = toronto["end_station_id"].isin(known_station_ids)
toronto = toronto[start_is_known & end_is_known]

In [None]:
# Display the Toronto ridership frame as left by the filtering cell above.
toronto

In [None]:
# Append the filtered Toronto trips to the `ridership` table.
# index=False keeps the DataFrame index out of the insert, so index_label is
# not passed: pandas only uses it when the index is written.
# chunksize limits how many rows are written per batch instead of sending the
# whole frame at once.
toronto.to_sql(name='ridership', con=engine, if_exists='append', index=False, chunksize=10000)

***Vancouver***

In [None]:
# Vancouver trips whose start station is absent from the stations table;
# the start-station column is relabelled station_id in the result.
start_is_known = vancouver["start_station_id"].isin(all_stns_df["station_id"])
missing_start = vancouver[~start_is_known].rename(columns={"start_station_id": "station_id"})


In [None]:
# Display the rows collected in `missing_start` by the preceding cell.
missing_start

In [None]:
# Vancouver trips whose end station is absent from the stations table.
end_is_known = vancouver["end_station_id"].isin(all_stns_df["station_id"])
# The end-station column is the one that failed the lookup, so that is the
# column relabelled station_id; start_station_id keeps its own name.
missing_end = vancouver[~end_is_known].rename(columns={"end_station_id": "station_id"})


In [None]:
# Display the rows collected in `missing_end` by the preceding cell.
missing_end

*Load only the rows whose start and end station_ids exist in the stations table*

In [None]:
# Keep only Vancouver trips whose start AND end stations are both present in
# the stations table.
known_station_ids = all_stns_df["station_id"]
start_is_known = vancouver["start_station_id"].isin(known_station_ids)
end_is_known = vancouver["end_station_id"].isin(known_station_ids)
vancouver = vancouver[start_is_known & end_is_known]

In [None]:
# Display the Vancouver ridership frame as left by the filtering cell above.
vancouver

In [None]:
# Append the filtered Vancouver trips to the `ridership` table.
# index=False keeps the DataFrame index out of the insert, so index_label is
# not passed: pandas only uses it when the index is written.
# chunksize limits how many rows are written per batch instead of sending the
# whole frame at once.
vancouver.to_sql(name='ridership', con=engine, if_exists='append', index=False, chunksize=10000)

***Boston***

In [30]:
# Boston trips whose start station is absent from the stations table
start_is_known = boston["start_station_id"].isin(all_stns_df["station_id"])
missing_start = boston[~start_is_known]
missing_start

Unnamed: 0,trip_duration,start_date,end_date,start_station_id,end_station_id,bike_id,member_type,location_id
215,610,2019-01-01 11:05:52.2480,2019-01-01 11:16:02.6380,42,40,2648,Subscriber,3
302,1657,2019-01-01 11:59:53.3000,2019-01-01 12:27:30.3620,42,65,3553,Customer,3
306,1509,2019-01-01 12:02:14.3990,2019-01-01 12:27:24.3260,42,65,2790,Customer,3
486,1874,2019-01-01 13:13:17.7800,2019-01-01 13:44:32.1500,42,42,4118,Customer,3
488,1854,2019-01-01 13:13:35.4750,2019-01-01 13:44:29.5620,42,42,4441,Customer,3
...,...,...,...,...,...,...,...,...
2519857,1224,2019-12-29 17:04:24.2110,2019-12-29 17:24:49.1520,42,342,2124,Subscriber,3
2520099,1791,2019-12-29 19:21:01.0570,2019-12-29 19:50:52.5650,42,41,4082,Subscriber,3
2521348,166,2019-12-31 11:41:20.5730,2019-12-31 11:44:07.3100,42,399,4723,Subscriber,3
2522391,3522,2019-12-31 18:20:43.0640,2019-12-31 19:19:25.4720,42,370,4000,Subscriber,3


In [None]:
# Ad-hoc lookup of Boston trips that start at station 88.
# NOTE(review): 88 is unexplained and the cell has no recorded output; looks
# like leftover debugging — confirm whether it can be removed.
boston.loc[boston['start_station_id']==88]

In [31]:
# Boston trips whose end station is absent from the stations table
end_is_known = boston["end_station_id"].isin(all_stns_df["station_id"])
missing_end = boston[~end_is_known]
missing_end

Unnamed: 0,trip_duration,start_date,end_date,start_station_id,end_station_id,bike_id,member_type,location_id
44,622,2019-01-01 06:06:01.8390,2019-01-01 06:16:24.1470,157,42,4553,Subscriber,3
226,1646,2019-01-01 11:11:29.5610,2019-01-01 11:38:55.9730,356,42,2423,Customer,3
227,1596,2019-01-01 11:11:43.6340,2019-01-01 11:38:20.4030,356,42,2790,Customer,3
261,959,2019-01-01 11:31:57.0740,2019-01-01 11:47:56.1730,105,42,3955,Subscriber,3
316,1043,2019-01-01 12:08:56.8120,2019-01-01 12:26:20.0560,178,42,4371,Customer,3
...,...,...,...,...,...,...,...,...
2522337,2414,2019-12-31 17:57:17.2330,2019-12-31 18:37:31.9600,54,42,3361,Subscriber,3
2522350,700,2019-12-31 18:01:52.0070,2019-12-31 18:13:32.9020,72,42,4401,Subscriber,3
2522492,1118,2019-12-31 19:04:43.5650,2019-12-31 19:23:21.5790,3,42,4739,Subscriber,3
2522682,1150,2019-12-31 21:46:51.4280,2019-12-31 22:06:01.9680,107,42,5091,Subscriber,3


In [None]:
# Keep only Boston trips whose start AND end stations are both present in the
# stations table, then show the result.
known_station_ids = all_stns_df["station_id"]
start_is_known = boston["start_station_id"].isin(known_station_ids)
end_is_known = boston["end_station_id"].isin(known_station_ids)
boston = boston[start_is_known & end_is_known]
boston

In [None]:
# NOTE(review): commented-out experiment; d1 and d2 are not defined in any
# visible cell. Candidate for removal.
# intersection_df = pd.merge(d1, d2, how ='inner', on =['start_station_id', 'station_id'])

In [32]:
# Back-of-envelope count of Boston rows expected to survive the station filter.
# 2522771 matches the row count reported by boston.count(); 14129 and 15948 are
# presumably the row counts of missing_start and missing_end — TODO confirm.
# NOTE(review): if so, a trip missing both stations is subtracted twice here.
print(2522771-(14129+15948))

2492694


In [26]:
# Non-null value count per column of `boston`, used to check how many rows it holds.
boston.count()

trip_duration       2522771
start_date          2522771
end_date            2522771
start_station_id    2522771
end_station_id      2522771
bike_id             2522771
member_type         2522771
location_id         2522771
dtype: int64

In [23]:
# Station ids registered for Boston (location_id 3), exposed under the column
# name start_station_id so the frame can be merged against the trip data.
boston_station_rows = (
    session.query(Stations.station_id)
    .filter(Stations.location_id==3)
    .all()
)
stns_boston = pd.DataFrame(boston_station_rows).rename(
    columns={"station_id": "start_station_id"}
)
stns_boston

Unnamed: 0,start_station_id
0,3
1,4
2,5
3,6
4,7
...,...
373,495
374,496
375,497
376,498


In [24]:
# Inner-join the Boston trips to the Boston station ids on start_station_id,
# which keeps only trips whose start station is registered for location 3.
# NOTE(review): this rebinds `df`, the name the Locations section uses for the
# locations frame; the locations load cell must not be re-run after this one.
df = pd.merge(boston, stns_boston, how='inner', on='start_station_id')

In [33]:
# Relabel the station-id column so the same frame can be joined on end_station_id.
# NOTE(review): after this rename stns_boston no longer has a start_station_id
# column, so the start-station merge cell cannot be re-run without rebuilding
# stns_boston first.
stns_boston = stns_boston.rename(columns={"start_station_id": "end_station_id"})
stns_boston

Unnamed: 0,end_station_id
0,3
1,4
2,5
3,6
4,7
...,...
373,495
374,496
375,497
376,498


In [35]:
# Second inner join, this time on end_station_id: `df` already holds the trips
# with a known start station, so boston_df keeps only trips where both the
# start and the end station are registered for Boston.
boston_df = pd.merge(df, stns_boston, how='inner', on='end_station_id')

In [36]:
# Display the merged Boston frame that is loaded into the database below.
boston_df 

Unnamed: 0,trip_duration,start_date,end_date,start_station_id,end_station_id,bike_id,member_type,location_id
0,371,2019-01-01 00:09:13.7980,2019-01-01 00:15:25.3360,80,179,3689,Subscriber,3
1,304,2019-01-01 00:54:48.9310,2019-01-01 00:59:53.5810,80,179,3861,Subscriber,3
2,354,2019-01-01 17:54:32.7970,2019-01-01 18:00:26.8420,80,179,4053,Subscriber,3
3,397,2019-01-01 17:57:05.5680,2019-01-01 18:03:43.0000,80,179,4160,Subscriber,3
4,320,2019-01-01 18:33:41.3500,2019-01-01 18:39:01.9270,80,179,2893,Subscriber,3
...,...,...,...,...,...,...,...,...
2480339,2156,2019-09-22 14:36:12.3820,2019-09-22 15:12:08.6720,430,430,4882,Customer,3
2480340,5288,2019-10-17 06:43:00.1410,2019-10-17 08:11:09.0090,430,430,2179,Customer,3
2480341,1734,2019-10-26 16:00:40.9080,2019-10-26 16:29:35.1080,430,430,5871,Customer,3
2480342,1719,2019-10-26 16:00:47.4760,2019-10-26 16:29:26.5250,430,430,2179,Customer,3


In [37]:
# Append the station-validated Boston trips to the `ridership` table.
# index=False keeps the DataFrame index out of the insert, so index_label is
# not passed: pandas only uses it when the index is written.
# chunksize limits how many rows are written per batch; the displayed index of
# this frame runs past 2.4 million, so it is not sent in a single batch.
boston_df.to_sql(name='ridership', con=engine, if_exists='append', index=False, chunksize=10000)