In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import glob
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [3]:
files = glob.glob('data/JC-2016*-citibike-tripdata.csv')
df_list = []
for filename in files: 
    data = pd.read_csv(filename)
    df_list.append(data)
df = pd.concat(df_list)    

Start Time, Stop Time, and Birth Year seem like they should be timestamps and dates, but are objects. 

df.describe

User Type has 380 null values, <br> and Birth Year has 18,999 null values

df.shape

In [9]:
df['Trip Duration'] = pd.to_timedelta(df['Trip Duration'], unit='s')
df['Start Time'] = pd.to_datetime(df['Start Time'])
df['Stop Time'] = pd.to_datetime(df['Stop Time'])
df.index.name = 'Trip ID'

In [11]:
birth_year_test = {'X': [1975.0, 1985.0, 1976.0, 1974.0, 1974.0, np.nan, 1990.0, 1988.0, 1984.0, 1986.0]}

In [13]:
df_test = pd.DataFrame(data = birth_year_test)

In [15]:
imp = IterativeImputer(max_iter=10, random_state=0)

In [17]:
imp.fit(df_test)

In [19]:
df['Birth Year'] = np.round(imp.fit_transform(df[['Birth Year']]), 1)

In [21]:
user_type = df['User Type'].unique()
user_type

array(['Subscriber', 'Customer', nan], dtype=object)

In [23]:
df = df.dropna()

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 247204 entries, 0 to 19487
Data columns (total 15 columns):
 #   Column                   Non-Null Count   Dtype          
---  ------                   --------------   -----          
 0   Trip Duration            247204 non-null  timedelta64[ns]
 1   Start Time               247204 non-null  datetime64[ns] 
 2   Stop Time                247204 non-null  datetime64[ns] 
 3   Start Station ID         247204 non-null  int64          
 4   Start Station Name       247204 non-null  object         
 5   Start Station Latitude   247204 non-null  float64        
 6   Start Station Longitude  247204 non-null  float64        
 7   End Station ID           247204 non-null  int64          
 8   End Station Name         247204 non-null  object         
 9   End Station Latitude     247204 non-null  float64        
 10  End Station Longitude    247204 non-null  float64        
 11  Bike ID                  247204 non-null  int64          
 12  User Typ

In [27]:
nw = pd.read_csv('data/newark_airport_2016.csv')
nw.head()

Unnamed: 0,STATION,NAME,DATE,AWND,PGTM,PRCP,SNOW,SNWD,TAVG,TMAX,TMIN,TSUN,WDF2,WDF5,WSF2,WSF5
0,USW00014734,"NEWARK LIBERTY INTERNATIONAL AIRPORT, NJ US",2016-01-01,12.75,,0.0,0.0,0.0,41,43,34,,270,280.0,25.9,35.1
1,USW00014734,"NEWARK LIBERTY INTERNATIONAL AIRPORT, NJ US",2016-01-02,9.4,,0.0,0.0,0.0,36,42,30,,260,260.0,21.0,25.1
2,USW00014734,"NEWARK LIBERTY INTERNATIONAL AIRPORT, NJ US",2016-01-03,10.29,,0.0,0.0,0.0,37,47,28,,270,250.0,23.9,30.0
3,USW00014734,"NEWARK LIBERTY INTERNATIONAL AIRPORT, NJ US",2016-01-04,17.22,,0.0,0.0,0.0,32,35,14,,330,330.0,25.9,33.1
4,USW00014734,"NEWARK LIBERTY INTERNATIONAL AIRPORT, NJ US",2016-01-05,9.84,,0.0,0.0,0.0,19,31,10,,360,350.0,25.1,31.1


In [29]:
nw.drop(['PGTM', 'TSUN'], axis=1, inplace=True)

In [31]:
nw['DATE'] = pd.to_datetime(nw['DATE'])

In [33]:
nw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   STATION  366 non-null    object        
 1   NAME     366 non-null    object        
 2   DATE     366 non-null    datetime64[ns]
 3   AWND     366 non-null    float64       
 4   PRCP     366 non-null    float64       
 5   SNOW     366 non-null    float64       
 6   SNWD     366 non-null    float64       
 7   TAVG     366 non-null    int64         
 8   TMAX     366 non-null    int64         
 9   TMIN     366 non-null    int64         
 10  WDF2     366 non-null    int64         
 11  WDF5     364 non-null    float64       
 12  WSF2     366 non-null    float64       
 13  WSF5     364 non-null    float64       
dtypes: datetime64[ns](1), float64(7), int64(4), object(2)
memory usage: 40.2+ KB


In [35]:
nw = nw.dropna()

In [37]:
start_station_names = df['Start Station Name'].unique()

In [39]:
end_station_names = df['End Station Name'].unique()

In [41]:
weather_station_names = nw['NAME'].unique()

Questions to answer: 
* Time on the bike: Max, Min, Average, Distribution
* Distance on the bike: Max, Min, Average, Distribution Difference between Start Latitude/longitude and End Latitude/longitude
* Temps: Max, Min, Average, Distribution

* Count of trips by gender
* Length of trip by gender
* Count of trips by age
* Length of trip by age
* Avg temp to avg bike trip
* Count of trips in a temp range (with gender, with age)
* Count of trips with snow (with gender, with age)
* Count of trips with wind (with gender, with age)
* Count of trips with sun (with gender, with age)
* Avg time of trips in a temp range
* Count of trips started in each temp range
* Length of trips with snow
* Length of trips with wind speed > avg wind speed
* Length of trips with wind speed < avg wind speed
* Length of trips with sun
* When do non-subscribers use bikes the most? At any certain station?

Columns: 
df
Trip Duration
Start Station Latitude
Start Station Longitude
End Station Latitude
End Station Longitude
Gender
Birth Year
NEW COLUMN: Trip Length

nw 
TAVG
TMAX
TMIN
SNWD
AWND


In [44]:
# Haversine formula calculates trip distance
#Earth radius in kilometers (use 3958.8 for miles)
R = 6371  

# Convert latitude and longitude from degrees to radians
df['start_lat_rad'] = np.radians(df['Start Station Latitude'])
df['end_lat_rad'] = np.radians(df['End Station Latitude'])
df['start_lon_rad'] = np.radians(df['Start Station Longitude'])
df['end_lon_rad'] = np.radians(df['End Station Longitude'])

# Differences in coordinates
df['delta_lat'] = df['end_lat_rad'] - df['start_lat_rad']
df['delta_lon'] = df['end_lon_rad'] - df['start_lon_rad']

# Haversine formula
a = np.sin(df['delta_lat'] / 2)**2 + np.cos(df['start_lat_rad']) * np.cos(df['end_lat_rad']) * np.sin(df['delta_lon'] / 2)**2
c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

# Compute trip distance
df['Trip Length (km)'] = R * c

# Drop intermediate columns if needed
df.drop(columns=['start_lat_rad', 'end_lat_rad', 'start_lon_rad', 'end_lon_rad', 'delta_lat', 'delta_lon'], inplace=True)

print(df[['Start Station Latitude', 'End Station Latitude', 'Start Station Longitude', 'End Station Longitude', 'Trip Length (km)']])


       Start Station Latitude  End Station Latitude  Start Station Longitude  \
0                   40.727224             40.727596               -74.033759   
1                   40.730743             40.725340               -74.063784   
2                   40.716247             40.742677               -74.033459   
3                   40.712774             40.727596               -74.036486   
4                   40.727596             40.712774               -74.044247   
...                       ...                   ...                      ...   
19483               40.719586             40.724176               -74.043117   
19484               40.724176             40.721525               -74.050656   
19485               40.728745             40.728745               -74.032108   
19486               40.734961             40.746730               -74.059503   
19487               40.717732             40.721525               -74.043845   

       End Station Longitude  Trip Leng

In [46]:
df['Age'] = 2016 - df['Birth Year']

In [48]:
!pip install psycopg2-binary sqlalchemy pandas



In [54]:
from sqlalchemy import create_engine
from psycopg2.extras import execute_values
import psycopg2

In [62]:
DB_USER = 'lydiakonstanski'
DB_PASSWORD = 'Celeste'
DB_HOST = 'localhost'
DB_PORT = '5432'
DB_NAME = "Citibike_db"

# Create the database engine
engine = create_engine(f'postgresql+psycopg2://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}')

# Connect to PostgreSQL
conn = psycopg2.connect(
    dbname=DB_NAME, user=DB_USER, password=DB_PASSWORD, host=DB_HOST, port=DB_PORT
)
cur = conn.cursor()

# BIKES
# Convert 'Bike ID' column to a list of tuples (required format for execute_values)
#bike_tuples = [(int(row),) for row in df['Bike ID'].unique()]

# Define the INSERT query
# insert_bikes = "INSERT INTO bikes (id) VALUES %s"

# Use execute_values for bulk insert (efficient for large data)
#execute_values(cur, insert_bikes, bike_tuples)



#USERS
user_tuples = [tuple(row) for row in df[['Age', 'Gender', 'User Type', 'Trip ID']].dropna().to_numpy()]
user_insert = 'INSERT INTO users (age, gender, user_type, trip_id) VALUES %s'
execute_values(cur, user_insert, user_tuples)

#LOCATIONS
locations_tuples = 
locations_insert = 
execute_values(cur, user_insert, user_tuples)
#WEATHER 

#TRIPS

# Commit and close
conn.commit()
cur.close()
conn.close()

In [64]:
df.head()

Unnamed: 0,Trip Duration,Start Time,Stop Time,Start Station ID,Start Station Name,Start Station Latitude,Start Station Longitude,End Station ID,End Station Name,End Station Latitude,End Station Longitude,Bike ID,User Type,Birth Year,Gender,Trip Length (km),Age
0,0 days 00:06:01,2016-02-01 00:31:18,2016-02-01 00:37:19,3202,Newport PATH,40.727224,-74.033759,3203,Hamilton Park,40.727596,-74.044247,24393,Subscriber,1975.0,1,0.884786,41.0
1,0 days 00:04:57,2016-02-01 01:55:05,2016-02-01 02:00:02,3195,Sip Ave,40.730743,-74.063784,3194,McGinley Square,40.72534,-74.067622,24394,Subscriber,1985.0,2,0.682284,31.0
2,0 days 00:19:15,2016-02-01 02:40:05,2016-02-01 02:59:20,3183,Exchange Place,40.716247,-74.033459,3210,Pershing Field,40.742677,-74.051789,24676,Subscriber,1976.0,1,3.320056,40.0
3,0 days 00:29:29,2016-02-01 05:11:28,2016-02-01 05:40:58,3214,Essex Light Rail,40.712774,-74.036486,3203,Hamilton Park,40.727596,-74.044247,24700,Subscriber,1974.0,2,1.773164,42.0
4,0 days 00:15:35,2016-02-01 05:48:24,2016-02-01 06:03:59,3203,Hamilton Park,40.727596,-74.044247,3214,Essex Light Rail,40.712774,-74.036486,24639,Subscriber,1974.0,2,1.773164,42.0
