# Wrangling and cleaning CitiBike 2022

In [1]:
import pandas as pd
import numpy as np
import os
import requests
import json
from datetime import datetime

In [2]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# Presumably this is where the data-path variables read with os.getenv(...)
# later in the notebook are defined — confirm against the project's .env.
load_dotenv()

True

In [3]:

# Path of the CitiBike 2022 source file, read from the environment
# (os.getenv yields None when the variable is not set).
# NOTE(review): the loading cell below repeats this exact lookup, so this cell is redundant.
citibike_2022_path = os.getenv("CITIBIKE_2022_full")

In [5]:
# Load the CitiBike 2022 trips, reading only the columns this notebook uses.

citibike_2022_path = os.getenv("CITIBIKE_2022_full")
wanted_columns = [
    'ride_id', 'rideable_type', 'member_casual',
    'started_at', 'ended_at',
    'start_station_name', 'end_station_name',
]
citibike_df = pd.read_csv(citibike_2022_path, usecols=wanted_columns)

In [6]:
citibike_df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,end_station_name,member_casual
0,BFD29218AB271154,electric_bike,2022-01-21 13:13:43.392,2022-01-21 13:22:31.463,West End Ave & W 107 St,Mt Morris Park W & W 120 St,member
1,7C953F2FD7BE1302,classic_bike,2022-01-10 11:30:54.162,2022-01-10 11:41:43.422,4 Ave & 3 St,Boerum Pl\t& Pacific St,member
2,95893ABD40CED4B8,electric_bike,2022-01-26 10:52:43.096,2022-01-26 11:06:35.227,1 Ave & E 62 St,5 Ave & E 29 St,member
3,F853B50772137378,classic_bike,2022-01-03 08:35:48.247,2022-01-03 09:10:50.475,2 Ave & E 96 St,5 Ave & E 29 St,member
4,7590ADF834797B4B,classic_bike,2022-01-22 14:14:23.043,2022-01-22 14:34:57.474,6 Ave & W 34 St,5 Ave & E 29 St,member


In [7]:
citibike_df.shape

(29838806, 7)

In [8]:
# Parse both ride timestamps from strings (object dtype) into datetime64 values.
for ts_col in ('started_at', 'ended_at'):
    citibike_df[ts_col] = pd.to_datetime(citibike_df[ts_col])


In [35]:
# Calendar day of each ride (start timestamp truncated to midnight);
# used later as the key for joining the daily weather table.
ride_start = citibike_df['started_at']
citibike_df['date'] = ride_start.dt.floor('D')


In [10]:
# Month name of the ride's start timestamp (e.g. "January").
started_dt = citibike_df['started_at'].dt
citibike_df['month'] = started_dt.month_name()

In [11]:
# Day-of-week name of the ride's start timestamp (e.g. "Friday").
started_dt = citibike_df['started_at'].dt
citibike_df['weekday'] = started_dt.day_name()

In [12]:
# Hour of day in which the ride started, taken from the start timestamp.
started_dt = citibike_df['started_at'].dt
citibike_df['start_hour'] = started_dt.hour

In [13]:
# Ride duration as a timedelta: end timestamp minus start timestamp.
ride_end = citibike_df['ended_at']
ride_start = citibike_df['started_at']
citibike_df['duration'] = ride_end - ride_start

In [14]:
citibike_df['duration'].head()

0   0 days 00:08:48.071000
1   0 days 00:10:49.260000
2   0 days 00:13:52.131000
3   0 days 00:35:02.228000
4   0 days 00:20:34.431000
Name: duration, dtype: timedelta64[ns]

In [15]:
# Ride duration expressed in (fractional) minutes.
duration_seconds = citibike_df['duration'].dt.total_seconds()
citibike_df['duration_min'] = duration_seconds / 60

In [16]:
# Target column layout, grouped by theme: calendar features first, then the
# ride's identity, its timing, and finally the start/end stations.
calendar_cols = ['date', 'month', 'weekday', 'start_hour']
ride_cols = ['ride_id', 'member_casual', 'rideable_type']
timing_cols = ['started_at', 'ended_at', 'duration', 'duration_min']
station_cols = ['start_station_name', 'end_station_name']

new_order = calendar_cols + ride_cols + timing_cols + station_cols


In [17]:
# Rearrange the columns into the layout defined by new_order.
citibike_df = citibike_df.loc[:, new_order]


In [58]:
citibike_df.head()

Unnamed: 0,date,month,weekday,start_hour,ride_id,member_casual,rideable_type,started_at,ended_at,duration,duration_min,start_station_name,end_station_name
0,2022-01-21,January,Friday,13,BFD29218AB271154,member,electric_bike,2022-01-21 13:13:43.392,2022-01-21 13:22:31.463,0 days 00:08:48.071000,8.801183,West End Ave & W 107 St,Mt Morris Park W & W 120 St
1,2022-01-10,January,Monday,11,7C953F2FD7BE1302,member,classic_bike,2022-01-10 11:30:54.162,2022-01-10 11:41:43.422,0 days 00:10:49.260000,10.821,4 Ave & 3 St,Boerum Pl\t& Pacific St
2,2022-01-26,January,Wednesday,10,95893ABD40CED4B8,member,electric_bike,2022-01-26 10:52:43.096,2022-01-26 11:06:35.227,0 days 00:13:52.131000,13.86885,1 Ave & E 62 St,5 Ave & E 29 St
3,2022-01-03,January,Monday,8,F853B50772137378,member,classic_bike,2022-01-03 08:35:48.247,2022-01-03 09:10:50.475,0 days 00:35:02.228000,35.037133,2 Ave & E 96 St,5 Ave & E 29 St
4,2022-01-22,January,Saturday,14,7590ADF834797B4B,member,classic_bike,2022-01-22 14:14:23.043,2022-01-22 14:34:57.474,0 days 00:20:34.431000,20.57385,6 Ave & W 34 St,5 Ave & E 29 St


In [36]:
citibike_df.dtypes

date                   datetime64[ns]
month                          object
weekday                        object
start_hour                      int32
ride_id                        object
member_casual                  object
rideable_type                  object
started_at             datetime64[ns]
ended_at               datetime64[ns]
duration              timedelta64[ns]
duration_min                  float64
start_station_name             object
end_station_name               object
dtype: object

In [None]:
# Persist the cleaned ride-level table as CSV.
# The destination is read from the CITIBIKE_2022_CLEAN environment variable so the
# notebook is not tied to one machine; the previous hard-coded location is kept as
# the fallback so an existing setup keeps writing to the same file.
# NOTE(review): the row index is written as the first CSV column (pandas default),
# whereas the weather/rides export at the end of this notebook passes index=False —
# confirm which form downstream readers expect.
citibike_clean_path = os.getenv("CITIBIKE_2022_CLEAN") or (
    r"C:\Users\analy\Documents\Case-Study5--NewYork_Bike-sharing-service_Stategic_Analysis\Data\final_output\citybike2022_clean.csv"
)
citibike_df.to_csv(citibike_clean_path)

## Daily Rides Aggregation 

In [66]:
# Count the rides started on each calendar day and keep the result as a
# one-column frame ('ride_count') indexed by date, in chronological order.
daily_counts = citibike_df.groupby("date")["ride_id"].count()
rides_df = daily_counts.to_frame(name='ride_count').sort_index()
rides_df.shape

(402, 1)

In [67]:
rides_df.head()

Unnamed: 0_level_0,ride_count
date,Unnamed: 1_level_1
2021-01-30,1
2021-02-15,1
2021-03-11,1
2021-03-14,1
2021-03-31,1


In [68]:
# Keep only the days that fall in 2022; the raw data also contains a handful of
# days outside that year (e.g. 2021 dates with a single ride each).
in_2022 = rides_df.index.year == 2022
rides_df = rides_df.loc[in_2022]
rides_df.shape

(365, 1)

In [69]:
rides_df.head()

Unnamed: 0_level_0,ride_count
date,Unnamed: 1_level_1
2022-01-01,20428
2022-01-02,43009
2022-01-03,33189
2022-01-04,36842
2022-01-05,34230


# Merging the weather table with the aggregated CitiBike 2022 daily rides

In [63]:
# Load the daily weather table from the path given in the environment.
weather_df_path = os.getenv("WEATHER_OUTPUT")
weather_df = pd.read_csv(weather_df_path)


In [64]:
weather_df.head()

Unnamed: 0,date,tavg,prcp
0,2022-01-01,11.6,19.3
1,2022-01-02,11.4,1.0
2,2022-01-03,1.4,0.0
3,2022-01-04,-2.7,0.0
4,2022-01-05,3.2,6.1


In [65]:
weather_df.shape

(365, 3)

In [70]:
weather_df.dtypes

date     object
tavg    float64
prcp    float64
dtype: object

In [71]:
# Convert the weather dates from strings to datetime64 so they share the dtype
# of the ride dates they are merged with.
date_strings = weather_df['date']
weather_df['date'] = pd.to_datetime(date_strings)

In [72]:
weather_df.dtypes

date    datetime64[ns]
tavg           float64
prcp           float64
dtype: object

In [73]:
import gc
# Force a full garbage-collection pass before the merge.
# NOTE(review): citibike_df is still referenced at this point, so this presumably
# reclaims little memory; `del citibike_df` first if the aim is to free the ride table.
gc.collect()

9364

In [74]:
# Left-join the daily ride counts onto the weather table by date, so every
# weather day is kept (rides_df carries 'date' as its index name).
rides_temp = weather_df.merge(rides_df, on='date', how='left')

In [75]:
rides_temp.shape

(365, 4)

In [76]:
rides_temp.head()

Unnamed: 0,date,tavg,prcp,ride_count
0,2022-01-01,11.6,19.3,20428
1,2022-01-02,11.4,1.0,43009
2,2022-01-03,1.4,0.0,33189
3,2022-01-04,-2.7,0.0,36842
4,2022-01-05,3.2,6.1,34230


In [77]:
# Count missing values per column. Because the merge is a left join on the weather
# table, a non-zero ride_count here would mean a weather date had no matching ride day.
rides_temp.isna().sum()

date          0
tavg          0
prcp          0
ride_count    0
dtype: int64

In [79]:
# Save the combined weather + daily ride-count table as CSV (without the row index).
rides_temp_path = os.getenv("CITIBIKE_2022_WEATHER")
if not rides_temp_path:
    # os.getenv returns None for an unset variable, and DataFrame.to_csv(None)
    # writes no file — it only returns the CSV text. Fail loudly so the export
    # is never skipped silently.
    raise RuntimeError(
        "Environment variable CITIBIKE_2022_WEATHER is not set; "
        "cannot save the merged weather/rides table."
    )
rides_temp.to_csv(rides_temp_path, index=False)
