In [None]:
# import dependencies
import pandas as pd
import psycopg2
import sqlalchemy
import numpy as np
import matplotlib as plt

%matplotlib inline

In [None]:
# connect Postgres db 
from sqlalchemy import create_engine

In [None]:
# Connect to the local CitiBike Postgres database.
# The connection URL is read from the CITIBIKE_DB_URL environment variable so
# real credentials never have to be committed with the notebook. When the
# variable is not set we fall back to the original local default, where the
# username and pw are both 'postgres'.
import os

engine = create_engine(
    os.environ.get(
        "CITIBIKE_DB_URL",
        "postgresql://postgres:postgres@localhost:5432/CitiBike",
    )
)
# Single shared connection that the pd.read_sql calls in this notebook use
conn = engine.connect()

In [None]:
# Smoke-test the connection with a simple query that also lists the tables
# in the public schema of the CitiBike database.
tables = pd.read_sql(
    sql=(
        "SELECT * from information_schema.tables "
        "WHERE table_catalog = 'CitiBike' AND table_schema = 'public'"
    ),
    con=conn,
)
tables.head(n=50)


###  Notice the tables have poor naming choices and NO PK: instead of using sortable names like 01_2018, I went with january_2018 and so on... don't make these mistakes!


### Create the new table to combine the data in pgAdmin

CREATE TABLE cumulative (
	-- surrogate key; the serial default numbers each inserted row
	id SERIAL PRIMARY KEY,
	-- trip length (the later notebook queries compare it against seconds)
	tripduration INTEGER NOT NULL,
	-- NOTE(review): DATE stores no time of day - confirm that is intended here
	starttime DATE,
	stoptime DATE,
	-- start station
	start_station_id VARCHAR(20),
	start_station_name VARCHAR,
	start_station_latitude FLOAT,
	start_station_longitude FLOAT,
	-- end station
	end_station_id VARCHAR(20),
	end_station_name VARCHAR,
	end_station_latitude FLOAT,
	end_station_longitude FLOAT,
	-- bike and rider details
	bikeid VARCHAR(20),
	usertype VARCHAR(20),
	birth_year VARCHAR(4),
	gender INTEGER
);

### NOTE

If you were smarter than I was, and remembered to add the PK to your initial tables, then all you need to do is copy a table setup into the new table instead of typing all the columns as in the above block.
#### Here is the code to do just that: 
SELECT * FROM september_2019; -- choose any existing table to copy the format

-- Create an empty table with the same columns as september_2019.
-- NOTE(review): CREATE TABLE ... AS copies column names and types only, not the
-- primary key or other constraints; confirm and re-add the PK on cumulative if needed.
CREATE TABLE cumulative AS TABLE september_2019 WITH NO DATA;

SELECT * FROM cumulative;  -- test that we have the matching empty table


### Add tables into cumulative table
-- Insert data from a directly imported monthly table.
-- id is left out of the column list so its serial default assigns the key;
-- SELECT * must therefore return the remaining 15 columns in exactly this order.
INSERT INTO public.cumulative (
	tripduration,
	starttime,
	stoptime,
	start_station_id,
	start_station_name,
	start_station_latitude,
	start_station_longitude,
	end_station_id,
	end_station_name,
	end_station_latitude,
	end_station_longitude,
	bikeid,
	usertype,
	birth_year,
	gender
)
SELECT *
FROM april_2018;

-- and spot-check the result with

SELECT *
FROM cumulative
LIMIT 10;

#### repeat this statement, changing the table name, for the remaining 22 tables:

-- Same append as before, for the next monthly table; columns are matched by position.
INSERT INTO public.cumulative (
	tripduration, starttime, stoptime,
	start_station_id, start_station_name, start_station_latitude, start_station_longitude,
	end_station_id, end_station_name, end_station_latitude, end_station_longitude,
	bikeid, usertype, birth_year, gender
)
SELECT *
FROM april_2019;

### Now we are ready to query the combined table

In [None]:
# First look at the combined data: how many distinct start stations are there?
station_count = pd.read_sql(
    sql="SELECT COUNT(DISTINCT start_station_name) FROM cumulative",
    con=conn,
)
station_count

### We want to try to remove any unnecessary data from the table to speed up our queries and prevent Tableau from crashing

### First let's test if there are candidates to remove

Since the [CitiBike pricing](https://www.citibikenyc.com/pricing) page states that the maximum time you can keep a bike is 45 minutes, let's look there first.

In [None]:
# The 50 longest trips that ran past the 45-minute mark (45*60 seconds)
testmaxduration = pd.read_sql(
    sql=(
        "SELECT tripduration FROM cumulative "
        "WHERE tripduration > (45*60) "
        "ORDER BY tripduration DESC LIMIT 50"
    ),
    con=conn,
)
testmaxduration.head(n=50)

### We can see there are plenty of trips that are over the 45 min mark, which could mean the bikes failed to properly dock, they were stolen or something else. 

In [None]:
# Work out how many seconds there are in one day.
h, m, s = 24, 60, 60  # hours per day, minutes per hour, seconds per minute
d = h * m * s
print(f'There are {d} seconds per day')

#### I'd like to take a look at some of the data for trips greater than 1 day

In [None]:
# The 200 longest trips that lasted more than one day (86400 seconds),
# together with their start station, bike and rider columns.
testmaxduration = pd.read_sql(
    sql=(
        "SELECT tripduration, starttime, start_station_id, start_station_name, "
        "bikeid, usertype, birth_year, gender "
        "FROM cumulative WHERE tripduration > (86400) "
        "ORDER BY tripduration DESC LIMIT 200"
    ),
    con=conn,
)

In [None]:
# Wrap the query result in a DataFrame and preview its first 50 rows
tmd = pd.DataFrame(data=testmaxduration)
tmd.head(n=50)

In [None]:
# Same filter on trips longer than one day, now with the end station id and
# name included alongside the start station columns.
testmax2 = pd.read_sql(
    sql=(
        "SELECT tripduration, starttime, start_station_id, end_station_id, "
        "start_station_name, end_station_name, bikeid, usertype, birth_year, gender "
        "FROM cumulative WHERE tripduration > (86400) "
        "order BY tripduration DESC LIMIT 50"
    ),
    con=conn,
)
testmax2.head(n=50)

There does **NOT** appear to be a relationship between extended trip duration and any of the other factors, so **I am going to remove any rows with a trip duration greater than 1 day or less than 90 seconds.** The source data already excludes trips shorter than 60 seconds, but I am going to be stricter and not count anything shorter than 90 seconds as a trip.

<div class="alert alert-block alert-info">
    <b>Warning:</b> The following statement took <strong>5 min and 30 sec</strong> to run in postgres, so this may be a good time to go make a sandwich or actually go for a bike ride.
</div>


<img width="320" height="320" src="Images/newbikes.png">


In [30]:
# Get an approximate row count for newbikes to judge whether it is a potential
# candidate for uploading to Tableau.
# Reading reltuples from pg_class happens to be MUCH FASTER than simply using COUNT.
nb_count = pd.read_sql(
    sql="SELECT reltuples AS approximate_row_count FROM pg_class WHERE relname = 'newbikes'",
    con=conn,
)
nb_count

Unnamed: 0,approximate_row_count
0,19024368.0


# 19 million rows! 

In [31]:
# Out of curiosity, what was the initial approximate count in the cumulative table?
cumul_ct = pd.read_sql(
    sql="SELECT reltuples AS approximate_row_count FROM pg_class WHERE relname = 'cumulative'",
    con=conn,
)
cumul_ct

Unnamed: 0,approximate_row_count
0,19147684.0


In [32]:
# Difference between the two approximate row counts (cumulative minus newbikes)
cumul_ct.sub(nb_count)

Unnamed: 0,approximate_row_count
0,123316.0


### If only there was a way to make smaller and more meaningful tables...

<img width="300" height="300" src="Images/death-brainstorm.png">


@Ideas: make smaller tables grouping and joining trips by:
* gender, age
* usertype
* quarters
* most active stations
* aggregate functions
* top n 

In [28]:
# Splitting the data by quarter gives roughly 19,024,000 / 7.66 rows per quarter.
# The divisor is 7.66 quarters since we are missing december 2019.
int(19_024_000 / 7.66)

2483550

But since the maximum number of rows in an Excel worksheet is 1,048,576, and we don't want to use files that large anyway,
we will need to make smaller files.

### Back to pgadmin
We are going to create a table of the starting stations and the count of trips each station recorded

-- Number of trips recorded per starting station, busiest station first
SELECT
	COUNT(id),
	start_station_name
FROM newbikes
GROUP BY start_station_name
ORDER BY 1 DESC;

### Using pgadmin, we can export this table to a csv to load into Tableau
(we can also create a df in python here and write to csv)

<img width="400" height="300" src="https://github.com/JonRinko/Citi-Bike-Data-Analysis/blob/master/Images/start_stations.PNG?raw=true">

### In order to map this in Tableau, let's inner join this with the station longitudes and latitudes in a new table

<img width="400" height="300" src="https://github.com/JonRinko/Citi-Bike-Data-Analysis/blob/master/Images/start-station-join-coords.PNG?raw=true">

### From just one csv, we can create 3 visuals and a [Dashboard](https://public.tableau.com/profile/jon4546#!/vizhome/CitiBikeStartStationTripCountsDashboard/Dashboard1?publish=yes) in Tableau! 
<img width="700" height="500" src="https://github.com/JonRinko/Citi-Bike-Data-Analysis/blob/master/Images/dash1.PNG?raw=true">

In [None]:
# TODO add vid of dashboard and link 

## Bonus: 
Use Linear Regression to Forecast December's Data 
<br>Use same Linear Regression to Forecast November's Data and compare to actual  