### Import Dependencies

In [None]:
import os

import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import inspect
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session

from config import conn_string
#from config_testdb import conn_string

**get all files for loading**

In [None]:
# Paths of every CSV file that feeds the database load.

# shared reference files live in the current directory
data_root = '.'
station_path = os.path.join(data_root, "station_info.csv")
weather_path = os.path.join(data_root, "historical_weather.csv")
pricing_path = os.path.join(data_root, "pricing_index.csv")

# one Resources sub-folder per city
toronto_dir = "Resources/1_toronto_canada"
vancouver_dir = "Resources/2_vancouver_canada"
boston_dir = "Resources/3_boston_us"
nyc_dir = "Resources/4_NewYork_US"

# bikes
toronto_bikes = os.path.join(toronto_dir, "toronto_bikes.csv")
vancouver_bikes = os.path.join(vancouver_dir, "vancouver_bikes.csv")
boston_bikes = os.path.join(boston_dir, "boston_bikes.csv")
nyc_bikes = os.path.join(nyc_dir, "nyc_bikes.csv")

# member types
toronto_users = os.path.join(toronto_dir, "toronto_membertypes.csv")
vancouver_users = os.path.join(vancouver_dir, "vancouver_membertypes.csv")
boston_users = os.path.join(boston_dir, "boston_membertypes.csv")
nyc_users = os.path.join(nyc_dir, "nyc_membertypes.csv")

# ridership (trailing underscore marks the path; the bare name is used later for the data)
toronto_ = os.path.join(toronto_dir, "toronto_bikeshare.csv")
vancouver_ = os.path.join(vancouver_dir, "vancouver_bikeshare.csv")
boston_ = os.path.join(boston_dir, "boston_bikeshare.csv")
nyc_ = os.path.join(nyc_dir, "nyc_bikeshare.csv")


**read all files into dataframes**

In [None]:
# Load every source CSV into its own DataFrame.
def _load_csv(csv_path):
    """Read one UTF-8 encoded CSV file into a DataFrame."""
    return pd.read_csv(csv_path, encoding="utf-8")


# stations and historical weather
stns = _load_csv(station_path)
weather = _load_csv(weather_path)

# bikes
tor_bikes = _load_csv(toronto_bikes)
van_bikes = _load_csv(vancouver_bikes)
bos_bikes = _load_csv(boston_bikes)
# NOTE: nyc_bikes (and nyc_users below) rebinds the name that held the file
# path to the loaded DataFrame, so this cell cannot be re-run on its own
# without first re-running the cell that defines the paths.
nyc_bikes = _load_csv(nyc_bikes)

# member types
tor_users = _load_csv(toronto_users)
van_users = _load_csv(vancouver_users)
bos_users = _load_csv(boston_users)
nyc_users = _load_csv(nyc_users)

# pricing index
pricing_index = _load_csv(pricing_path)

# ridership
toronto = _load_csv(toronto_)
vancouver = _load_csv(vancouver_)
boston = _load_csv(boston_)
newyork = _load_csv(nyc_)


**Locations**

In [None]:
# Build the locations lookup table: one row per city, keyed by location_id.
locations = [
    {"location_id": location_id, "city": city, "country": country}
    for location_id, (city, country) in enumerate(
        [
            ("Toronto", "Canada"),
            ("Vancouver", "Canada"),
            ("Boston", "USA"),
            ("New York", "USA"),
        ],
        start=1,
    )
]

# location_id becomes the index so it can be written out as the key column
df = pd.DataFrame(locations).set_index("location_id")
df


**Stations**

In [None]:
# Preview the first rows of the stations data read from station_info.csv
stns.head()

**Historical Weather**

In [None]:
# Preview the first rows of the historical weather data
weather.head()

**Bikes**

In [None]:
# Preview the first rows of the Toronto bikes data
tor_bikes.head()

In [None]:
# Preview the first rows of the Vancouver bikes data
van_bikes.head()

In [None]:
# Preview the first rows of the Boston bikes data
bos_bikes.head()

In [None]:
# Preview the first rows of the New York bikes data
# (nyc_bikes holds the loaded DataFrame at this point, not the file path)
nyc_bikes.head()

**Member Types**

In [None]:
# Preview the first rows of the Toronto member types data
tor_users.head()

In [None]:
# Preview the first rows of the Vancouver member types data
van_users.head()

In [None]:
# Preview the first rows of the Boston member types data
bos_users.head()

In [None]:
# Preview the first rows of the New York member types data
# (nyc_users holds the loaded DataFrame at this point, not the file path)
nyc_users.head()

**Pricing Index**

In [None]:
# Preview the first rows of the pricing index data
pricing_index.head()

**Ridership**

In [None]:
# Preview the first rows of the Toronto ridership data
toronto.head()

In [None]:
# Preview the first rows of the Vancouver ridership data
vancouver.head()

In [None]:
# Preview the first rows of the Boston ridership data
boston.head()

In [None]:
# Preview the first rows of the New York ridership data
newyork.head()

### Load data

#### Connect to database

In [None]:
# Build the PostgreSQL URL from the configured connection string and
# create the engine that every load below writes through
cnx = 'postgresql://{}'.format(conn_string)
engine = create_engine(cnx)

In [None]:
# Confirm which tables already exist in the target database.
# Engine.table_names() is deprecated since SQLAlchemy 1.4 and removed in 2.0;
# the inspector is the supported way to list table names.
inspect(engine).get_table_names()

***Locations***

In [None]:
# Append the locations rows to the "locations" table; index=True writes the
# location_id index as a column
df.to_sql("locations", engine, index=True, if_exists="append")
print("locations")

***Stations***

In [None]:
# Append the station rows to the "stations" table (DataFrame index is not written)
stns.to_sql("stations", engine, index=False, if_exists="append")
print("stations")

***Historical Weather***

In [None]:
# Append the historical weather rows to the "historicalweather" table
weather.to_sql("historicalweather", engine, index=False, if_exists="append")
print("weather")

***Bikes***

In [None]:
# Append the Toronto bikes to the shared "bikes" table
tor_bikes.to_sql("bikes", engine, index=False, if_exists="append")
print("tor_bikes")

In [None]:
# Append the Vancouver bikes to the shared "bikes" table
van_bikes.to_sql("bikes", engine, index=False, if_exists="append")
print("van_bikes")

In [None]:
# Append the Boston bikes to the shared "bikes" table
bos_bikes.to_sql("bikes", engine, index=False, if_exists="append")
print("bos_bikes")

In [None]:
# Append the New York bikes to the shared "bikes" table
nyc_bikes.to_sql("bikes", engine, index=False, if_exists="append")
print("nyc_bikes")

***Member Types***

In [None]:
# Append the Toronto member types to the shared "membertypes" table
tor_users.to_sql("membertypes", engine, index=False, if_exists="append")
print("tor_users")

In [None]:
# Append the Vancouver member types to the shared "membertypes" table
van_users.to_sql("membertypes", engine, index=False, if_exists="append")
print("van_users")

In [None]:
# Append the Boston member types to the shared "membertypes" table
bos_users.to_sql("membertypes", engine, index=False, if_exists="append")
print("bos_users")

In [None]:
# Append the New York member types to the shared "membertypes" table
nyc_users.to_sql("membertypes", engine, index=False, if_exists="append")
print("nyc_users")

***Pricing Index***

In [None]:
# Append the pricing index rows to the "pricing" table.
# index_label was dropped: it only names the index column, and no index column
# is written when index=False, so it had no effect.
# NOTE(review): presumably price_id is generated by the database - confirm against the schema.
pricing_index.to_sql(name='pricing', con=engine, if_exists='append', index=False)
print("pricing_index")

### Load Ridership

#### check reference to stations table

In [None]:
# Open a session on the engine for the station look-ups
session = Session(engine)

# Reflect the existing database tables into automap classes
Base = automap_base()
Base.prepare(engine, reflect=True)

# Mapped class for the "stations" table
Stations = Base.classes.stations

In [None]:
# List the names of the classes automap produced from the reflected tables
Base.classes.keys()

***Verify that all station_ids used in bikeshare exist in the Stations table***

In [None]:
# Fetch every station_id stored in the stations table; the resulting frame
# is the reference list the ridership files are validated against
station_id_query = session.query(Stations.station_id)
all_stations = station_id_query.all()
all_stns_df = pd.DataFrame(all_stations)
all_stns_df

### ***Toronto***

In [None]:
# Toronto trips whose start_station_id is absent from the stations table;
# the offending column is relabelled station_id for inspection
missing_start = (
    toronto.loc[~toronto["start_station_id"].isin(all_stns_df["station_id"])]
    .rename(columns={"start_station_id": "station_id"})
)

In [None]:
# Display the Toronto trips whose start station is not in the stations table
missing_start

In [None]:
# Toronto trips whose end_station_id is absent from the stations table.
# The offending column is relabelled station_id for inspection; the original
# renamed start_station_id here (copy-paste slip), which mislabelled the
# start station as the missing one.
missing_end = toronto[~toronto["end_station_id"].isin(all_stns_df["station_id"])]
missing_end = missing_end.rename(columns={"end_station_id": "station_id"})


In [None]:
# Display the Toronto trips whose end station is not in the stations table
missing_end

*Load rows with station_id existing in stations table*

In [None]:
# Keep only Toronto trips whose start AND end station both exist in the stations table
toronto = toronto[
    toronto["start_station_id"].isin(all_stns_df["station_id"])
    & toronto["end_station_id"].isin(all_stns_df["station_id"])
]

In [None]:
# Display the Toronto trips that remain after the station filter
toronto

In [None]:
# Station ids registered for Toronto (location_id 1), with the column named
# start_station_id so it can be joined against the trips' start station
stns_toronto = pd.DataFrame(
    session.query(Stations.station_id).filter(Stations.location_id == 1).all()
).rename(columns={"station_id": "start_station_id"})
stns_toronto

In [None]:
# Inner join on start_station_id: drops Toronto trips that start at a station
# outside the Toronto station list
tor_df = toronto.merge(stns_toronto, on="start_station_id", how="inner")

In [None]:
# Display the Toronto trips that survived the start-station join
tor_df

In [None]:
# Relabel the Toronto station list as end_station_id so the same ids can be
# joined against the trips' end station
stns_toronto = stns_toronto.rename(columns=dict(start_station_id="end_station_id"))
stns_toronto

In [None]:
# Inner join on end_station_id: drops Toronto trips that end at a station
# outside the Toronto station list
toronto = tor_df.merge(stns_toronto, on="end_station_id", how="inner")

In [None]:
# Report the per-column non-null counts of the Toronto rows about to be inserted
record_counts = toronto.count()
print(record_counts)

In [None]:
# Append the validated Toronto trips to the "ridership" table.
# index_label was dropped: it only names the index column, and no index column
# is written when index=False, so it had no effect.
toronto.to_sql(name='ridership', con=engine, if_exists='append', index=False)

### ***Vancouver***

In [None]:
# Vancouver trips whose start_station_id is absent from the stations table;
# the offending column is relabelled station_id for inspection
missing_start = (
    vancouver.loc[~vancouver["start_station_id"].isin(all_stns_df["station_id"])]
    .rename(columns={"start_station_id": "station_id"})
)


In [None]:
# Display the Vancouver trips whose start station is not in the stations table
missing_start

In [None]:
# Vancouver trips whose end_station_id is absent from the stations table.
# The offending column is relabelled station_id for inspection; the original
# renamed start_station_id here (copy-paste slip), which mislabelled the
# start station as the missing one.
missing_end = vancouver[~vancouver["end_station_id"].isin(all_stns_df["station_id"])]
missing_end = missing_end.rename(columns={"end_station_id": "station_id"})


In [None]:
# Display the Vancouver trips whose end station is not in the stations table
missing_end

In [None]:
# Keep only Vancouver trips whose start AND end station both exist in the stations table
vancouver = vancouver[
    vancouver["start_station_id"].isin(all_stns_df["station_id"])
    & vancouver["end_station_id"].isin(all_stns_df["station_id"])
]

In [None]:
# Display the Vancouver trips that remain after the station filter
vancouver

In [None]:
# Station ids registered for Vancouver (location_id 2), with the column named
# start_station_id so it can be joined against the trips' start station
stns_vancouver = pd.DataFrame(
    session.query(Stations.station_id).filter(Stations.location_id == 2).all()
).rename(columns={"station_id": "start_station_id"})
stns_vancouver

In [None]:
# Inner join on start_station_id: drops Vancouver trips that start at a
# station outside the Vancouver station list
vancouver_df = vancouver.merge(stns_vancouver, on="start_station_id", how="inner")

In [None]:
# Relabel the Vancouver station list as end_station_id so the same ids can be
# joined against the trips' end station
stns_vancouver = stns_vancouver.rename(columns=dict(start_station_id="end_station_id"))
stns_vancouver.head()

In [None]:
# Inner join on end_station_id: drops Vancouver trips that end at a station
# outside the Vancouver station list
vancouver = vancouver_df.merge(stns_vancouver, on="end_station_id", how="inner")

In [None]:
# Report the per-column non-null counts of the Vancouver rows about to be
# inserted. The counts are computed once; the original also made an extra
# count() call whose result was discarded.
print(vancouver.count())

In [None]:
# Append the validated Vancouver trips to the "ridership" table.
# index_label was dropped: it only names the index column, and no index column
# is written when index=False, so it had no effect.
vancouver.to_sql(name='ridership', con=engine, if_exists='append', index=False)

### ***Boston***

In [None]:
# Boston trips whose start_station_id is absent from the stations table
missing_start = boston.loc[~boston["start_station_id"].isin(all_stns_df["station_id"])]
missing_start

In [None]:
# Boston trips whose end_station_id is absent from the stations table
missing_end = boston.loc[~boston["end_station_id"].isin(all_stns_df["station_id"])]
missing_end

In [None]:
# Keep only Boston trips whose start AND end station both exist in the stations table
boston = boston[
    boston["start_station_id"].isin(all_stns_df["station_id"])
    & boston["end_station_id"].isin(all_stns_df["station_id"])
]
boston

In [None]:
# Station ids registered for Boston (location_id 3), with the column named
# start_station_id so it can be joined against the trips' start station
stns_boston = pd.DataFrame(
    session.query(Stations.station_id).filter(Stations.location_id == 3).all()
).rename(columns={"station_id": "start_station_id"})
stns_boston

In [None]:
# Inner join on start_station_id: drops Boston trips that start at a station
# outside the Boston station list.
# NOTE(review): this rebinds `df`, which earlier held the locations table;
# re-running the locations load after this cell would write trip rows instead.
df = boston.merge(stns_boston, on="start_station_id", how="inner")

In [None]:
# Relabel the Boston station list as end_station_id so the same ids can be
# joined against the trips' end station
stns_boston = stns_boston.rename(columns=dict(start_station_id="end_station_id"))
stns_boston.head()

In [None]:
# Inner join on end_station_id: drops Boston trips that end at a station
# outside the Boston station list
boston_df = df.merge(stns_boston, on="end_station_id", how="inner")

In [None]:
# Report the per-column non-null counts of the Boston rows about to be
# inserted. The counts are computed once; the original also made an extra
# count() call whose result was discarded.
print(boston_df.count())

In [None]:
# Append the validated Boston trips to the "ridership" table.
# index_label was dropped: it only names the index column, and no index column
# is written when index=False, so it had no effect.
boston_df.to_sql(name='ridership', con=engine, if_exists='append', index=False)

### ***New York***

In [None]:
# New York trips whose start_station_id is absent from the stations table
missing_start = newyork.loc[~newyork["start_station_id"].isin(all_stns_df["station_id"])]
missing_start

In [None]:
# New York trips whose end_station_id is absent from the stations table
missing_end = newyork.loc[~newyork["end_station_id"].isin(all_stns_df["station_id"])]
missing_end

In [None]:
# Keep only New York trips whose start AND end station both exist in the stations table
newyork = newyork[
    newyork["start_station_id"].isin(all_stns_df["station_id"])
    & newyork["end_station_id"].isin(all_stns_df["station_id"])
]
newyork

In [None]:
# Station ids registered for New York (location_id 4), with the column named
# start_station_id so it can be joined against the trips' start station
stns_newyork = pd.DataFrame(
    session.query(Stations.station_id).filter(Stations.location_id == 4).all()
).rename(columns={"station_id": "start_station_id"})
stns_newyork

In [None]:
# Inner join on start_station_id: drops New York trips that start at a
# station outside the New York station list
ny_df = newyork.merge(stns_newyork, on="start_station_id", how="inner")

In [None]:
# Relabel the New York station list as end_station_id so the same ids can be
# joined against the trips' end station
stns_newyork = stns_newyork.rename(columns=dict(start_station_id="end_station_id"))
stns_newyork.head()

In [None]:
# Inner join on end_station_id: drops New York trips that end at a station
# outside the New York station list
newyork_df = ny_df.merge(stns_newyork, on="end_station_id", how="inner")

In [None]:
# Report the per-column non-null counts of the New York rows about to be
# inserted. The counts are computed once; the original also made an extra
# count() call whose result was discarded.
print(newyork_df.count())

In [None]:
# Append the validated New York trips to the "ridership" table.
# index_label was dropped: it only names the index column, and no index column
# is written when index=False, so it had no effect.
newyork_df.to_sql(name='ridership', con=engine, if_exists='append', index=False)

**Close session**

In [None]:
# Close the session that was opened for the station look-ups
session.close()