In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import glob
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

### Iterate through CSVs, concat files, and create DataFrame:

In [4]:
files = glob.glob('data/JC-2016*-citibike-tripdata.csv')
df_list = []
for filename in files: 
    data = pd.read_csv(filename)
    df_list.append(data)
df = pd.concat(df_list)    

### Inspect the dataframe:

Start Time, Stop Time, and Birth Year seem like they should be timestamps and dates, but are objects. 

df.describe

User Type has 380 null values, <br> and Birth Year has 18,999 null values

df.shape

In [11]:
#Are there more starts or ends? 
start_station_names = df['Start Station Name'].unique()

In [13]:
#There are more ends
end_station_names = df['End Station Name'].unique()

### Haversine formula calculates trip distance

In [16]:
#Earth radius in kilometers (use 3958.8 for miles)
R = 6371  

# Convert latitude and longitude from degrees to radians
df['start_lat_rad'] = np.radians(df['Start Station Latitude'])
df['end_lat_rad'] = np.radians(df['End Station Latitude'])
df['start_lon_rad'] = np.radians(df['Start Station Longitude'])
df['end_lon_rad'] = np.radians(df['End Station Longitude'])

# Differences in coordinates
df['delta_lat'] = df['end_lat_rad'] - df['start_lat_rad']
df['delta_lon'] = df['end_lon_rad'] - df['start_lon_rad']

# Haversine formula
a = np.sin(df['delta_lat'] / 2)**2 + np.cos(df['start_lat_rad']) * np.cos(df['end_lat_rad']) * np.sin(df['delta_lon'] / 2)**2
c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

# Compute trip distance
df['Trip Length (km)'] = R * c

# Drop intermediate columns if needed
df.drop(columns=['start_lat_rad', 'end_lat_rad', 'start_lon_rad', 'end_lon_rad', 'delta_lat', 'delta_lon'], inplace=True)

print(df[['Start Station Latitude', 'End Station Latitude', 'Start Station Longitude', 'End Station Longitude', 'Trip Length (km)']])


       Start Station Latitude  End Station Latitude  Start Station Longitude  \
0                   40.727224             40.727596               -74.033759   
1                   40.730743             40.725340               -74.063784   
2                   40.716247             40.742677               -74.033459   
3                   40.712774             40.727596               -74.036486   
4                   40.727596             40.712774               -74.044247   
...                       ...                   ...                      ...   
19483               40.719586             40.724176               -74.043117   
19484               40.724176             40.721525               -74.050656   
19485               40.728745             40.728745               -74.032108   
19486               40.734961             40.746730               -74.059503   
19487               40.717732             40.721525               -74.043845   

       End Station Longitude  Trip Leng

### Changing DataTypes and Naming the Index

In [19]:
df['Trip Duration'] = pd.to_timedelta(df['Trip Duration'], unit='s')
df['Start Time'] = pd.to_datetime(df['Start Time'])
df['Stop Time'] = pd.to_datetime(df['Stop Time'])
df.index.name = 'Trip ID'

### Multiple Imputation for Birth Year NaN

In [22]:
#Providing a test series for trialing the Imputation
birth_year_test = {'X': [1975.0, 1985.0, 1976.0, 1974.0, 1974.0, np.nan, 1990.0, 1988.0, 1984.0, 1986.0]}

In [24]:
#Creating the DataFrame for the test data
df_test = pd.DataFrame(data = birth_year_test)

In [26]:
#Setting up the imputer
imp = IterativeImputer(max_iter=10, random_state=0)

In [28]:
imp.fit(df_test)

In [30]:
# Applying Imputation to DataFrame
df['Birth Year'] = np.round(imp.fit_transform(df[['Birth Year']]), 1)

In [32]:
# New Column 'Age'
df['Age'] = 2016 - df['Birth Year']

### Addressing NaN User Types

In [35]:
user_type = df['User Type'].unique()
user_type

array(['Subscriber', 'Customer', nan], dtype=object)

In [37]:
df = df.dropna()

### Read in and inspect the weather csv

In [40]:
nw = pd.read_csv('data/newark_airport_2016.csv')
nw.head()

Unnamed: 0,STATION,NAME,DATE,AWND,PGTM,PRCP,SNOW,SNWD,TAVG,TMAX,TMIN,TSUN,WDF2,WDF5,WSF2,WSF5
0,USW00014734,"NEWARK LIBERTY INTERNATIONAL AIRPORT, NJ US",2016-01-01,12.75,,0.0,0.0,0.0,41,43,34,,270,280.0,25.9,35.1
1,USW00014734,"NEWARK LIBERTY INTERNATIONAL AIRPORT, NJ US",2016-01-02,9.4,,0.0,0.0,0.0,36,42,30,,260,260.0,21.0,25.1
2,USW00014734,"NEWARK LIBERTY INTERNATIONAL AIRPORT, NJ US",2016-01-03,10.29,,0.0,0.0,0.0,37,47,28,,270,250.0,23.9,30.0
3,USW00014734,"NEWARK LIBERTY INTERNATIONAL AIRPORT, NJ US",2016-01-04,17.22,,0.0,0.0,0.0,32,35,14,,330,330.0,25.9,33.1
4,USW00014734,"NEWARK LIBERTY INTERNATIONAL AIRPORT, NJ US",2016-01-05,9.84,,0.0,0.0,0.0,19,31,10,,360,350.0,25.1,31.1


### Cleaning the weather frame

In [43]:
# Don't need the completely empty columns
nw.drop(['PGTM', 'TSUN'], axis=1, inplace=True)

In [45]:
# Changing the data types
nw['DATE'] = pd.to_datetime(nw['DATE'])

In [47]:
#Drop a few incomplete rows
nw = nw.dropna()

### Setting Up SQL DB

Questions to answer: 
* Time on the bike: Max, Min, Average, Distribution
* Distance on the bike: Max, Min, Average, Distribution Difference between Start Latitude/longitude and End Latitude/longitude
* Temps: Max, Min, Average, Distribution

* Count of trips by gender
* Length of trip by gender
* Count of trips by age
* Length of trip by age
* Avg temp to avg bike trip
* Count of trips in a temp range (with gender, with age)
* Count of trips with snow (with gender, with age)
* Count of trips with wind (with gender, with age)
* Count of trips with sun (with gender, with age)
* Avg time of trips in a temp range
* Count of trips started in each temp range
* Length of trips with snow
* Length of trips with wind speed > avg wind speed
* Length of trips with wind speed < avg wind speed
* Length of trips with sun
* When do non-subscribers use bikes the most? At any certain station?

Columns: 
df
Trip Duration
Start Station Latitude
Start Station Longitude
End Station Latitude
End Station Longitude
Gender
Birth Year
NEW COLUMN: Trip Length

nw 
TAVG
TMAX
TMIN
SNWD
AWND


In [51]:
from sqlalchemy import create_engine
from psycopg2.extras import execute_values
import psycopg2

In [53]:
DB_USER = 'lydiakonstanski'
DB_PASSWORD = 'Celeste'
DB_HOST = 'localhost'
DB_PORT = '5432'
DB_NAME = "Citibike_db"

# Create the database engine
engine = create_engine(f'postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}')

# Connect to PostgreSQL
conn = psycopg2.connect(
    dbname=DB_NAME, user=DB_USER, password=DB_PASSWORD, host=DB_HOST, port=DB_PORT
)
cur = conn.cursor()

### Filling DataBase established on Postbird

In [56]:
# BIKES
# Convert 'Bike ID' column to a list of tuples (required format for single column execute_values)
bike_tuples = [(int(row),) for row in df['Bike ID'].unique()]

# Define the INSERT query
insert_bikes = "INSERT INTO bikes (id) VALUES %s"

# Use execute_values for bulk insert (efficient for large data)
execute_values(cur, insert_bikes, bike_tuples)

In [64]:
unique_stations = df['End Station ID'].unique()

In [66]:
df['unique_stations'] = pd.DataFrame(unique_stations)

In [68]:
#LOCATIONS
locations_tuples = [tuple(row) for row in df[['End Station ID', 'unique_stations', 'End Station Latitude', 'End Station Longitude']].dropna().to_numpy()]
locations_insert = 'INSERT INTO locations (id, station_name, latitude, longitude) VALUES %s'
execute_values(cur, locations_insert, locations_tuples)

InFailedSqlTransaction: current transaction is aborted, commands ignored until end of transaction block


In [None]:
#WEATHER 
weather_tuples =[tuple(row) for row in nw[['DATE', 'TAVG', 'TMIN', 'TMAX', 'PRCP', 'SNWD', 'AWND']].dropna().to_numpy()]
weather_insert = 'INSERT INTO weather (date, temp_avg, temp_min, temp_max, precipitation, snow_depth, avg_wind) VALUES %s'
execute_values(cur, weather_insert, weather_tuples)

In [None]:
#TRIPS
trips_tuples = [tuple(row) for row in df[['Start Time', 'Bike ID', 'Start Station ID', 'End Station ID', 'Start Time', 'Stop Time', 'Trip Duration', 'Trip Length (km)' ]].dropna().to_numpy()]
trips_insert = 'INSERT INTO trips (date, bike_id, start_location, end_location, start_time, end_time, trip_duration, trip_distance) VALUES %s'
execute_values(cur, trips_insert, trips_tuples)

In [None]:
# Commit and close
conn.commit()
cur.close()
conn.close()