In [1]:
import pandas as pd
import os
from sqlalchemy import create_engine
from config import conn_string

**read all files**

In [2]:
# Lookup tables shared by every city live next to the notebook.
station_path = os.path.join('.', "station_info.csv")
weather_path = os.path.join('.', "historical_weather.csv")

# One resource folder per city.
TORONTO_DIR = "Resources/1_toronto_canada"
VANCOUVER_DIR = "Resources/2_vancouver_canada"
BOSTON_DIR = "Resources/3_boston_us"
NEWYORK_DIR = "Resources/4_NewYork_US"

# Toronto: bikes, member types, ridership
toronto_bikes = os.path.join(TORONTO_DIR, "toronto_bikes.csv")
toronto_users = os.path.join(TORONTO_DIR, "toronto_membertypes.csv")
toronto_ = os.path.join(TORONTO_DIR, "toronto_bikeshare.csv")

# Vancouver: bikes, member types, ridership
vancouver_bikes = os.path.join(VANCOUVER_DIR, "vancouver_bikes.csv")
vancouver_users = os.path.join(VANCOUVER_DIR, "vancouver_membertypes.csv")
vancouver_ = os.path.join(VANCOUVER_DIR, "vancouver_bikeshare.csv")

# Boston: bikes, member types, ridership
boston_bikes = os.path.join(BOSTON_DIR, "boston_bikes.csv")
boston_users = os.path.join(BOSTON_DIR, "boston_membertypes.csv")
boston_ = os.path.join(BOSTON_DIR, "boston_bikeshare.csv")

# New York: bikes, member types, ridership
nyc_bikes = os.path.join(NEWYORK_DIR, "nyc_bikes.csv")
nyc_users = os.path.join(NEWYORK_DIR, "nyc_membertypes.csv")
nyc_ = os.path.join(NEWYORK_DIR, "nyc_bikeshare.csv")


In [3]:
# Read every CSV into a DataFrame (all files are read as UTF-8).
stns = pd.read_csv(station_path, encoding="utf-8")
weather = pd.read_csv(weather_path, encoding="utf-8")

# bikes
tor_bikes = pd.read_csv(toronto_bikes, encoding="utf-8")
van_bikes = pd.read_csv(vancouver_bikes, encoding="utf-8")
bos_bikes = pd.read_csv(boston_bikes, encoding="utf-8")
# `nyc_bikes` is the path variable from the paths cell AND the DataFrame name bound
# here. Reading from that name would fail on a second run of this cell (it would then
# hold a DataFrame), so the path is built explicitly to keep the cell re-runnable.
nyc_bikes = pd.read_csv(
    os.path.join("Resources/4_NewYork_US", "nyc_bikes.csv"), encoding="utf-8"
)

# member_types
tor_users = pd.read_csv(toronto_users, encoding="utf-8")
van_users = pd.read_csv(vancouver_users, encoding="utf-8")
bos_users = pd.read_csv(boston_users, encoding="utf-8")
# Same name clash as `nyc_bikes` above: `nyc_users` is rebound from path to DataFrame.
nyc_users = pd.read_csv(
    os.path.join("Resources/4_NewYork_US", "nyc_membertypes.csv"), encoding="utf-8"
)

# pricing index


# ridership
toronto = pd.read_csv(toronto_, encoding="utf-8")
vancouver = pd.read_csv(vancouver_, encoding="utf-8")
boston = pd.read_csv(boston_, encoding="utf-8")
newyork = pd.read_csv(nyc_, encoding="utf-8")


In [None]:
# Read the Boston ridership CSV again, replacing whatever `boston` currently holds.
# NOTE(review): presumably used to restore the unfiltered frame after the Boston
# filter cell has rebound `boston` — confirm whether this cell is still needed.
boston = pd.read_csv(filepath_or_buffer=boston_, encoding="utf-8")

In [None]:
boston.count()

**Locations**

In [None]:
# (location_id, city, country) for each city covered by the notebook.
location_rows = [
    (1, "Toronto", "Canada"),
    (2, "Vancouver", "Canada"),
    (3, "Boston", "USA"),
    (4, "New York", "USA"),
]
locations = [
    {"location_id": loc_id, "city": city, "country": country}
    for loc_id, city, country in location_rows
]

# Index the frame by location_id so the id is written as a column when the frame is
# loaded with index=True.
df = pd.DataFrame(locations).set_index("location_id")
df


**Stations**

In [None]:
stns.head()

**Historical Weather**

In [None]:
weather.head()

**Bikes**

In [None]:
tor_bikes.head()

In [None]:
van_bikes.head()

In [None]:
bos_bikes.head()

In [None]:
nyc_bikes.head()

**Member Types**

In [None]:
tor_users.head()

In [None]:
van_users.head()

In [None]:
bos_users.head()

In [None]:
nyc_users.head()

**Pricing Index**

**Ridership**

In [None]:
# Toronto

toronto.head()

### Load data

In [4]:
# Build the PostgreSQL URL from the connection string imported from config
# (presumably user:password@host:port/dbname — confirm against config.py).
cnx = 'postgresql://{}'.format(conn_string)
engine = create_engine(cnx)

In [5]:
# confirm tables
# NOTE(review): Engine.table_names() is deprecated as of SQLAlchemy 1.4 and removed in
# 2.0; sqlalchemy.inspect(engine).get_table_names() is the replacement — confirm the
# SQLAlchemy version this notebook is pinned to before upgrading.
engine.table_names()

['locations',
 'membertypes',
 'stations',
 'bikes',
 'ridership',
 'pricing',
 'historicalweather']

In [None]:
# Append the locations lookup rows to the `locations` table; index=True writes the
# frame's location_id index as a column.
# NOTE(review): `df` is rebound to a Boston merge result further down the notebook,
# so this cell must run before that one.
df.to_sql(
    name='locations',
    con=engine,
    index=True,
    if_exists='append',
)

In [None]:
# Append the station rows to the `stations` table (DataFrame index not written).
stns.to_sql(
    name='stations',
    con=engine,
    index=False,
    if_exists='append',
)


In [None]:
# Append the historical weather rows to the `historicalweather` table.
weather.to_sql(
    name='historicalweather',
    con=engine,
    index=False,
    if_exists='append',
)

In [None]:
# Append the Toronto bike rows to the shared `bikes` table.
tor_bikes.to_sql(
    name='bikes',
    con=engine,
    index=False,
    if_exists='append',
)

In [None]:
# Append the Vancouver bike rows to the shared `bikes` table.
van_bikes.to_sql(
    name='bikes',
    con=engine,
    index=False,
    if_exists='append',
)

In [None]:
# Append the Boston bike rows to the shared `bikes` table.
bos_bikes.to_sql(
    name='bikes',
    con=engine,
    index=False,
    if_exists='append',
)

In [None]:
# Append the New York bike rows to the shared `bikes` table.
# `nyc_bikes` is the DataFrame here; the read cell rebinds the name from the CSV path.
nyc_bikes.to_sql(
    name='bikes',
    con=engine,
    index=False,
    if_exists='append',
)

In [None]:
# Append the Toronto member types to the shared `membertypes` table.
tor_users.to_sql(
    name='membertypes',
    con=engine,
    index=False,
    if_exists='append',
)

In [None]:
# Append the Vancouver member types to the shared `membertypes` table.
van_users.to_sql(
    name='membertypes',
    con=engine,
    index=False,
    if_exists='append',
)

In [None]:
# Append the Boston member types to the shared `membertypes` table.
bos_users.to_sql(
    name='membertypes',
    con=engine,
    index=False,
    if_exists='append',
)

In [None]:
# Append the New York member types to the shared `membertypes` table.
# `nyc_users` is the DataFrame here; the read cell rebinds the name from the CSV path.
nyc_users.to_sql(
    name='membertypes',
    con=engine,
    index=False,
    if_exists='append',
)

**Load Ridership**

In [6]:
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session

# Reflect the tables that already exist in the database into automap classes.
Base = automap_base()
Base.prepare(engine, reflect=True)
# Lists the mapped class names; the value is discarded because this is not the
# cell's last expression.
Base.classes.keys()

# Session used for the station lookups below, plus a handle on the mapped
# `stations` table.
session = Session(bind=engine)
Stations = Base.classes.stations

*Verify that all station_ids used in the bikeshare data exist in the Stations table*

In [7]:
# Station ids for every city: the query has no location filter.
station_id_query = session.query(Stations.station_id)
all_stations = station_id_query.all()

# One-column frame (station_id) used for the membership checks below.
all_stns_df = pd.DataFrame(all_stations)
all_stns_df

Unnamed: 0,station_id
0,7000
1,7001
2,7002
3,7003
4,7004
...,...
2396,4248
2397,4249
2398,4250
2399,4252


***Toronto***

In [None]:
# Toronto trips whose start station id has no match in the stations table, with the
# offending column relabelled as station_id.
start_is_known = toronto["start_station_id"].isin(all_stns_df["station_id"])
missing_start = toronto[~start_is_known].rename(columns={"start_station_id":"station_id"})


In [None]:
missing_start

In [None]:
# Toronto trips whose end station id has no match in the stations table.
end_is_known = toronto["end_station_id"].isin(all_stns_df["station_id"])
missing_end = toronto[~end_is_known]
# Relabel the END station column: it is the id that failed the lookup. Renaming
# start_station_id here (as in the missing_start cell) would label the wrong id.
missing_end = missing_end.rename(columns={"end_station_id":"station_id"})


In [None]:
missing_end

*Load only the rows whose start and end station_ids exist in the stations table*

In [None]:
# Keep only trips whose start AND end stations are Toronto stations.
# The end-station foreign key on ridership is on (end_station_id, location_id), as the
# New York load error in this notebook shows, so a station id that exists only under
# another city must not count as a match. Toronto is location_id 1 in the locations
# table built above.
toronto_station_ids = [
    station_id
    for (station_id,) in session.query(Stations.station_id).filter(Stations.location_id == 1)
]
start_known = toronto["start_station_id"].isin(toronto_station_ids)
end_known = toronto["end_station_id"].isin(toronto_station_ids)
toronto = toronto[start_known & end_known]

In [None]:
toronto

In [None]:
# Append the filtered Toronto trips to the `ridership` table.
toronto.to_sql(
    name='ridership',
    con=engine,
    index=False,
    index_label='id',
    if_exists='append',
)

***Vancouver***

In [None]:
# Vancouver trips whose start station id has no match in the stations table, with the
# offending column relabelled as station_id.
start_is_known = vancouver["start_station_id"].isin(all_stns_df["station_id"])
missing_start = vancouver[~start_is_known].rename(columns={"start_station_id":"station_id"})


In [None]:
missing_start

In [None]:
# Vancouver trips whose end station id has no match in the stations table.
end_is_known = vancouver["end_station_id"].isin(all_stns_df["station_id"])
missing_end = vancouver[~end_is_known]
# Relabel the END station column: it is the id that failed the lookup. Renaming
# start_station_id here (as in the missing_start cell) would label the wrong id.
missing_end = missing_end.rename(columns={"end_station_id":"station_id"})


In [None]:
missing_end

*Load only the rows whose start and end station_ids exist in the stations table*

In [None]:
# Keep only trips whose start AND end stations are Vancouver stations.
# The end-station foreign key on ridership is on (end_station_id, location_id), as the
# New York load error in this notebook shows, so a station id that exists only under
# another city must not count as a match. Vancouver is location_id 2 in the locations
# table built above.
vancouver_station_ids = [
    station_id
    for (station_id,) in session.query(Stations.station_id).filter(Stations.location_id == 2)
]
start_known = vancouver["start_station_id"].isin(vancouver_station_ids)
end_known = vancouver["end_station_id"].isin(vancouver_station_ids)
vancouver = vancouver[start_known & end_known]

In [None]:
vancouver

In [None]:
# Append the filtered Vancouver trips to the `ridership` table.
vancouver.to_sql(
    name='ridership',
    con=engine,
    index=False,
    index_label='id',
    if_exists='append',
)

***Boston***

In [None]:
# Boston trips whose start station id has no match in the stations table.
start_is_known = boston["start_station_id"].isin(all_stns_df["station_id"])
missing_start = boston[~start_is_known]
missing_start

In [None]:
boston.loc[boston['start_station_id']==88]

In [None]:
# Boston trips whose end station id has no match in the stations table.
end_is_known = boston["end_station_id"].isin(all_stns_df["station_id"])
missing_end = boston[~end_is_known]
missing_end

In [None]:
# Drop Boston trips whose start or end station id is absent from the stations table.
# NOTE(review): all_stns_df holds station ids from every city, so this check does not
# guarantee the station belongs to Boston; the merges against location 3 stations
# further down apply that restriction.
start_known = boston["start_station_id"].isin(all_stns_df["station_id"])
end_known = boston["end_station_id"].isin(all_stns_df["station_id"])
boston = boston[start_known & end_known]
boston

In [None]:
# intersection_df = pd.merge(d1, d2, how ='inner', on =['start_station_id', 'station_id'])

In [None]:
# Scratch arithmetic on hard-coded counts — presumably total Boston trips minus the rows
# with unknown start / end stations; TODO confirm where the three numbers come from.
print(2522771-(14129+15948))

In [None]:
boston.count()

In [None]:
# Station ids registered under location 3 (Boston), exposed as `start_station_id` so
# the frame can be merged against the trips' start column.
boston_station_rows = (
    session.query(Stations.station_id)
    .filter(Stations.location_id==3)
    .all()
)
stns_boston = pd.DataFrame(boston_station_rows).rename(
    columns={"station_id": "start_station_id"}
)
stns_boston

In [None]:
# Inner-join the trips with Boston's stations on the start station, which drops trips
# that start at a station not registered for Boston.
# NOTE(review): this rebinds `df`, which held the locations frame earlier in the notebook.
df = boston.merge(stns_boston, on='start_station_id', how='inner')

In [None]:
# Relabel the same Boston station ids as `end_station_id` for the end-station merge.
# After this cell `stns_boston` no longer has a `start_station_id` column.
stns_boston = stns_boston.rename({"start_station_id": "end_station_id"}, axis="columns")
stns_boston

In [None]:
# Inner-join on the end station as well, which leaves only trips whose start and end
# stations are both registered for Boston.
boston_df = df.merge(stns_boston, on='end_station_id', how='inner')

In [None]:
boston_df 

In [None]:
# Append the Boston trips that survived both station merges to the `ridership` table.
boston_df.to_sql(
    name='ridership',
    con=engine,
    index=False,
    index_label='id',
    if_exists='append',
)

***New York***

In [8]:
# New York trips whose start station id has no match in the stations table.
start_is_known = newyork["start_station_id"].isin(all_stns_df["station_id"])
missing_start = newyork[~start_is_known]
missing_start

Unnamed: 0,trip_duration,start_date,end_date,start_station_id,end_station_id,bike_id,member_type,location_id
6,280,2019-01-01 00:09:21.0060,2019-01-01 00:14:01.1510,3675.0,3288.0,35391,Subscriber,4
72,740,2019-01-01 00:42:16.3560,2019-01-01 00:54:36.8120,524.0,526.0,34089,Subscriber,4
97,967,2019-01-01 00:50:10.0810,2019-01-01 01:06:17.7230,524.0,236.0,35052,Subscriber,4
138,207,2019-01-01 01:00:48.7520,2019-01-01 01:04:15.8860,3360.0,3434.0,25854,Subscriber,4
143,642,2019-01-01 01:01:34.8600,2019-01-01 01:12:17.7070,3341.0,3529.0,28933,Subscriber,4
...,...,...,...,...,...,...,...,...
20551371,754,2019-12-31 23:32:32.5780,2019-12-31 23:45:06.6020,3295.0,3501.0,28831,Subscriber,4
20551377,1273,2019-12-31 23:33:03.5930,2019-12-31 23:54:17.2570,520.0,3387.0,27358,Subscriber,4
20551385,610,2019-12-31 23:33:33.7010,2019-12-31 23:43:44.1560,3341.0,3810.0,40091,Subscriber,4
20551454,399,2019-12-31 23:44:34.4320,2019-12-31 23:51:13.7680,3320.0,3307.0,33907,Subscriber,4


In [9]:
# New York trips whose end station id has no match in the stations table.
end_is_known = newyork["end_station_id"].isin(all_stns_df["station_id"])
missing_end = newyork[~end_is_known]
missing_end

Unnamed: 0,trip_duration,start_date,end_date,start_station_id,end_station_id,bike_id,member_type,location_id
1,316,2019-01-01 00:04:43.7360,2019-01-01 00:10:00.6080,519.0,518.0,32723,Subscriber,4
18,829,2019-01-01 00:19:08.1030,2019-01-01 00:32:57.3880,3165.0,3295.0,32106,Subscriber,4
20,736,2019-01-01 00:21:02.9570,2019-01-01 00:33:19.4110,3165.0,3295.0,16761,Customer,4
28,1068,2019-01-01 00:25:13.8750,2019-01-01 00:43:01.9350,394.0,3431.0,32042,Subscriber,4
58,187,2019-01-01 00:35:03.5980,2019-01-01 00:38:10.6250,484.0,524.0,34089,Subscriber,4
...,...,...,...,...,...,...,...,...
20551335,986,2019-12-31 23:29:28.0850,2019-12-31 23:45:54.6670,3725.0,518.0,39903,Subscriber,4
20551382,1039,2019-12-31 23:33:21.6760,2019-12-31 23:50:41.0580,3141.0,520.0,19596,Subscriber,4
20551385,610,2019-12-31 23:33:33.7010,2019-12-31 23:43:44.1560,3341.0,3810.0,40091,Subscriber,4
20551434,418,2019-12-31 23:40:58.3430,2019-12-31 23:47:56.9030,545.0,518.0,16971,Subscriber,4


In [10]:
newyork.count()

trip_duration       20551517
start_date          20551517
end_date            20551517
start_station_id    20551517
end_station_id      20551517
bike_id             20551517
member_type         20551517
location_id         20551517
dtype: int64

In [12]:
# Drop New York trips whose start or end station id is absent from the stations table.
# NOTE(review): all_stns_df holds station ids from every city, so a trip can pass this
# check with an id that only exists under another location; loading this frame
# directly hits the (end_station_id, location_id) foreign key error recorded below.
start_known = newyork["start_station_id"].isin(all_stns_df["station_id"])
end_known = newyork["end_station_id"].isin(all_stns_df["station_id"])
newyork = newyork[start_known & end_known]
newyork

Unnamed: 0,trip_duration,start_date,end_date,start_station_id,end_station_id,bike_id,member_type,location_id
0,320,2019-01-01 00:01:47.4010,2019-01-01 00:07:07.5810,3160.0,3283.0,15839,Subscriber,4
2,591,2019-01-01 00:06:03.9970,2019-01-01 00:15:55.4380,3171.0,3154.0,27451,Subscriber,4
3,2719,2019-01-01 00:07:03.5450,2019-01-01 00:52:22.6500,504.0,3709.0,21579,Subscriber,4
4,303,2019-01-01 00:07:35.9450,2019-01-01 00:12:39.5020,229.0,503.0,35379,Subscriber,4
5,535,2019-01-01 00:08:33.1790,2019-01-01 00:17:28.3540,3630.0,3529.0,30315,Subscriber,4
...,...,...,...,...,...,...,...,...
20551512,93,2019-12-31 23:58:18.0160,2019-12-31 23:59:51.6360,3141.0,3142.0,28734,Subscriber,4
20551513,786,2019-12-31 23:58:59.8960,2020-01-01 00:12:06.3460,490.0,513.0,41418,Subscriber,4
20551514,351,2019-12-31 23:59:03.6950,2020-01-01 00:04:54.8730,3349.0,3368.0,27746,Subscriber,4
20551515,1571,2019-12-31 23:59:21.3610,2020-01-01 00:25:32.9420,252.0,366.0,28506,Subscriber,4


In [13]:
# NOTE(review): as recorded in this cell's output, this load fails with a
# ForeignKeyViolation on (end_station_id, location_id): `newyork` was only checked
# against station ids from all cities, not against New York's own stations. The frame
# restricted to location 4 stations (newyork_df) is loaded further down instead.
newyork.to_sql(name='ridership', con=engine, if_exists='append', index=False, index_label='id')

IntegrityError: (psycopg2.errors.ForeignKeyViolation) insert or update on table "ridership" violates foreign key constraint "fk_rides_stations_end"
DETAIL:  Key (end_station_id, location_id)=(195, 4) is not present in table "stations".

[SQL: INSERT INTO ridership (trip_duration, start_date, end_date, start_station_id, end_station_id, bike_id, member_type, location_id) VALUES (%(trip_duration)s, %(start_date)s, %(end_date)s, %(start_station_id)s, %(end_station_id)s, %(bike_id)s, %(member_type)s, %(location_id)s)]
[parameters: ({'trip_duration': 320, 'start_date': '2019-01-01 00:01:47.4010', 'end_date': '2019-01-01 00:07:07.5810', 'start_station_id': 3160.0, 'end_station_id': 3283.0, 'bike_id': 15839, 'member_type': 'Subscriber', 'location_id': 4}, {'trip_duration': 591, 'start_date': '2019-01-01 00:06:03.9970', 'end_date': '2019-01-01 00:15:55.4380', 'start_station_id': 3171.0, 'end_station_id': 3154.0, 'bike_id': 27451, 'member_type': 'Subscriber', 'location_id': 4}, {'trip_duration': 2719, 'start_date': '2019-01-01 00:07:03.5450', 'end_date': '2019-01-01 00:52:22.6500', 'start_station_id': 504.0, 'end_station_id': 3709.0, 'bike_id': 21579, 'member_type': 'Subscriber', 'location_id': 4}, {'trip_duration': 303, 'start_date': '2019-01-01 00:07:35.9450', 'end_date': '2019-01-01 00:12:39.5020', 'start_station_id': 229.0, 'end_station_id': 503.0, 'bike_id': 35379, 'member_type': 'Subscriber', 'location_id': 4}, {'trip_duration': 535, 'start_date': '2019-01-01 00:08:33.1790', 'end_date': '2019-01-01 00:17:28.3540', 'start_station_id': 3630.0, 'end_station_id': 3529.0, 'bike_id': 30315, 'member_type': 'Subscriber', 'location_id': 4}, {'trip_duration': 1547, 'start_date': '2019-01-01 00:11:22.9070', 'end_date': '2019-01-01 00:37:10.0550', 'start_station_id': 285.0, 'end_station_id': 3358.0, 'bike_id': 35551, 'member_type': 'Subscriber', 'location_id': 4}, {'trip_duration': 300, 'start_date': '2019-01-01 00:12:17.9310', 'end_date': '2019-01-01 00:17:18.8300', 'start_station_id': 3394.0, 'end_station_id': 3398.0, 'bike_id': 18636, 'member_type': 'Subscriber', 'location_id': 4}, {'trip_duration': 123, 'start_date': '2019-01-01 00:12:19.6030', 'end_date': '2019-01-01 00:14:23.1040', 'start_station_id': 432.0, 'end_station_id': 3656.0, 'bike_id': 35206, 'member_type': 'Subscriber', 'location_id': 4}  ... displaying 10 of 18727352 total bound parameter sets ...  
{'trip_duration': 1571, 'start_date': '2019-12-31 23:59:21.3610', 'end_date': '2020-01-01 00:25:32.9420', 'start_station_id': 252.0, 'end_station_id': 366.0, 'bike_id': 28506, 'member_type': 'Subscriber', 'location_id': 4}, {'trip_duration': 310, 'start_date': '2019-12-31 23:59:55.2960', 'end_date': '2020-01-01 00:05:05.7270', 'start_station_id': 3351.0, 'end_station_id': 3379.0, 'bike_id': 21013, 'member_type': 'Subscriber', 'location_id': 4})]
(Background on this error at: http://sqlalche.me/e/13/gkpj)

In [15]:
# Station ids registered under location 4 (New York), exposed as `start_station_id` so
# the frame can be merged against the trips' start column.
newyork_station_rows = (
    session.query(Stations.station_id)
    .filter(Stations.location_id==4)
    .all()
)
stns_newyork = pd.DataFrame(newyork_station_rows).rename(
    columns={"station_id": "start_station_id"}
)
stns_newyork

Unnamed: 0,start_station_id
0,3477
1,72
2,79
3,82
4,83
...,...
1209,4248
1210,4249
1211,4250
1212,4252


In [16]:
# Inner-join the trips with New York's stations on the start station, which drops
# trips that start at a station not registered for New York.
ny_df = newyork.merge(stns_newyork, on='start_station_id', how='inner')

In [17]:
# Relabel the same New York station ids as `end_station_id` for the end-station merge.
# After this cell `stns_newyork` no longer has a `start_station_id` column.
stns_newyork = stns_newyork.rename({"start_station_id": "end_station_id"}, axis="columns")
stns_newyork

Unnamed: 0,end_station_id
0,3477
1,72
2,79
3,82
4,83
...,...
1209,4248
1210,4249
1211,4250
1212,4252


In [18]:
# Inner-join on the end station as well, which leaves only trips whose start and end
# stations are both registered for New York.
newyork_df = ny_df.merge(stns_newyork, on='end_station_id', how='inner')

In [19]:
newyork_df

Unnamed: 0,trip_duration,start_date,end_date,start_station_id,end_station_id,bike_id,member_type,location_id
0,320,2019-01-01 00:01:47.4010,2019-01-01 00:07:07.5810,3160.0,3283.0,15839,Subscriber,4
1,353,2019-01-03 16:21:31.0120,2019-01-03 16:27:24.3650,3160.0,3283.0,32547,Subscriber,4
2,427,2019-01-08 17:14:19.8670,2019-01-08 17:21:27.6760,3160.0,3283.0,33767,Subscriber,4
3,436,2019-01-10 17:33:18.0890,2019-01-10 17:40:34.1690,3160.0,3283.0,20155,Subscriber,4
4,440,2019-01-15 17:22:49.7350,2019-01-15 17:30:10.4370,3160.0,3283.0,20203,Subscriber,4
...,...,...,...,...,...,...,...,...
17742183,528,2019-12-21 11:08:16.1030,2019-12-21 11:17:04.1250,3874.0,3884.0,30672,Subscriber,4
17742184,62,2019-12-24 04:28:00.4850,2019-12-24 04:29:02.4870,3875.0,3884.0,29924,Subscriber,4
17742185,193,2019-12-20 04:29:18.8340,2019-12-20 04:32:31.9060,3896.0,3884.0,40415,Subscriber,4
17742186,2861,2019-12-24 16:41:21.7110,2019-12-24 17:29:03.0580,3884.0,3884.0,29924,Customer,4


In [None]:
# Append the New York trips that survived both station merges to the `ridership` table.
# The frame holds roughly 17.7 million rows, so rows are written in batches via
# `chunksize` instead of binding every row in one insert call. `index_label` is dropped
# because it only names the index column, which index=False does not write.
newyork_df.to_sql(
    name='ridership',
    con=engine,
    if_exists='append',
    index=False,
    chunksize=100_000,
)

In [None]:
# Sample pricing label; no other visible cell reads it.
# NOTE(review): presumably a starting point for the pricing table — confirm.
pricestr = "Annual Membership $99"
