## CitiBike trip data for the year 2018
https://ride.citibikenyc.com/system-data

Data Schema:

*   Trip Duration (seconds)
*   Start Time and Date
*   Stop Time and Date
*   Start Station Name
*   End Station Name
*   Station ID
*   Station Lat/Long
*   Bike ID
*   User Type (Customer = 24-hour pass or 3-day pass user; Subscriber = Annual    Member)
*   Gender (Zero=unknown; 1=male; 2=female)
*   Year of Birth
 

In [221]:
import pandas as pd
import numpy as np
import haversine as hs
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
%matplotlib inline
from glob import iglob

import warnings
# Silence every Python warning for the rest of the notebook.
# NOTE(review): this presumably also hides pandas warnings (e.g. about
# in-place edits on slices) raised by later cells — confirm this is intended.
warnings.filterwarnings("ignore")


In [222]:
# Glob pattern for the 2018 trip CSV files (JC-2018*.csv).
path = r'D:\Beuth Sems\Thesis\dataset\JC-2018*.csv'

# glob yields matches in arbitrary, OS-dependent order; sort them so the
# concatenated row order (and therefore the RangeIndex) is the same on every
# run. `recursive=True` was dropped: it only affects patterns containing '**'.
all_rec = sorted(iglob(path))
if not all_rec:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f"No trip files match {path!r}")

# Read each file lazily and stack them into one frame with a fresh 0..n-1 index.
dataframes = (pd.read_csv(f) for f in all_rec)
tripdata = pd.concat(dataframes, ignore_index=True)

  

In [223]:
# Summary statistics of the numeric trip columns.
tripdata.describe()

Unnamed: 0,tripduration,start station id,start station latitude,start station longitude,end station id,end station latitude,end station longitude,bikeid,birth year,gender
count,353892.0,353892.0,353892.0,353892.0,353892.0,353892.0,353892.0,353892.0,353892.0,353892.0
mean,673.393,3264.996606,40.722724,-74.046039,3258.408418,40.722326,-74.045505,29452.498808,1980.387875,1.152388
std,7004.022,138.429108,0.007251,0.010755,147.610023,0.007095,0.010759,2529.992371,10.268528,0.500198
min,61.0,3183.0,40.69264,-74.096937,127.0,40.679331,-74.096937,14697.0,1887.0,0.0
25%,228.0,3192.0,40.718211,-74.050444,3186.0,40.717732,-74.049968,26315.0,1974.0,1.0
50%,335.0,3205.0,40.721525,-74.043845,3203.0,40.721124,-74.043117,29493.0,1983.0,1.0
75%,549.0,3272.0,40.727224,-74.038051,3272.0,40.727224,-74.037683,29679.0,1988.0,1.0
max,2061932.0,3694.0,40.748716,-74.032108,3694.0,40.814326,-73.947821,35009.0,2002.0,2.0


In [224]:
# Count missing values per column.
tripdata.isnull().sum()

tripduration               0
starttime                  0
stoptime                   0
start station id           0
start station name         0
start station latitude     0
start station longitude    0
end station id             0
end station name           0
end station latitude       0
end station longitude      0
bikeid                     0
usertype                   0
birth year                 0
gender                     0
dtype: int64

In [225]:
# (rows, columns) of the combined trip table.
tripdata.shape

(353892, 15)

In [226]:
# Column dtypes and non-null counts.
tripdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 353892 entries, 0 to 353891
Data columns (total 15 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   tripduration             353892 non-null  int64  
 1   starttime                353892 non-null  object 
 2   stoptime                 353892 non-null  object 
 3   start station id         353892 non-null  int64  
 4   start station name       353892 non-null  object 
 5   start station latitude   353892 non-null  float64
 6   start station longitude  353892 non-null  float64
 7   end station id           353892 non-null  int64  
 8   end station name         353892 non-null  object 
 9   end station latitude     353892 non-null  float64
 10  end station longitude    353892 non-null  float64
 11  bikeid                   353892 non-null  int64  
 12  usertype                 353892 non-null  object 
 13  birth year               353892 non-null  int64  
 14  gend

### calculate travelled distances based on lat and lon


In [227]:
##!pip install haversine

In [228]:
from haversine import Unit
import haversine as hs

def find_distance(slat, slon, elat, elon):
    """Distance in miles between a start and an end coordinate.

    Both points are handed to ``hs.haversine`` as ``(latitude, longitude)``
    tuples with ``unit=Unit.MILES``; the result is rounded to one decimal.
    """
    miles = hs.haversine((slat, slon), (elat, elon), unit=Unit.MILES)
    return round(miles, 1)

In [229]:

# Walk the four coordinate columns in parallel instead of using a row-wise
# DataFrame.apply(axis=1), which builds a Series object for every trip.
# find_distance receives the same four values per trip as before.
tripdata['dist'] = [
    find_distance(slat, slon, elat, elon)
    for slat, slon, elat, elon in zip(
        tripdata['start station latitude'],
        tripdata['start station longitude'],
        tripdata['end station latitude'],
        tripdata['end station longitude'],
    )
]

In [230]:
# Preview the first five trips, now including the new 'dist' column.
tripdata.head(5)

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender,dist
0,932,2018-01-01 02:06:17.5410,2018-01-01 02:21:50.0270,3183,Exchange Place,40.716247,-74.033459,3199,Newport Pkwy,40.728745,-74.032108,31929,Subscriber,1992,1,0.9
1,550,2018-01-01 12:06:18.0390,2018-01-01 12:15:28.4430,3183,Exchange Place,40.716247,-74.033459,3199,Newport Pkwy,40.728745,-74.032108,31845,Subscriber,1969,2,0.9
2,510,2018-01-01 12:06:56.9780,2018-01-01 12:15:27.8100,3183,Exchange Place,40.716247,-74.033459,3199,Newport Pkwy,40.728745,-74.032108,31708,Subscriber,1946,1,0.9
3,354,2018-01-01 14:53:10.1860,2018-01-01 14:59:05.0960,3183,Exchange Place,40.716247,-74.033459,3267,Morris Canal,40.712419,-74.038526,31697,Subscriber,1994,1,0.4
4,250,2018-01-01 17:34:30.1920,2018-01-01 17:38:40.9840,3183,Exchange Place,40.716247,-74.033459,3639,Harborside,40.719252,-74.034234,31861,Subscriber,1991,1,0.2


## Parse the start and stop timestamps and split the start time into year, month, day-of-week and hour columns

In [231]:
# Convert both timestamp columns from strings to datetime64.
for ts_col in ('starttime', 'stoptime'):
    tripdata[ts_col] = pd.to_datetime(tripdata[ts_col])

# Derive calendar features from the trip start; 'month' and 'day' hold the
# month name and the weekday name respectively, not numbers.
trip_start = tripdata['starttime'].dt
tripdata['year'] = trip_start.year
tripdata['month'] = trip_start.month_name()
tripdata['day'] = trip_start.day_name()
tripdata['hour'] = trip_start.hour

In [232]:
# Display the frame to check the derived year/month/day/hour columns.
tripdata

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender,dist,year,month,day,hour
0,932,2018-01-01 02:06:17.541,2018-01-01 02:21:50.027,3183,Exchange Place,40.716247,-74.033459,3199,Newport Pkwy,40.728745,-74.032108,31929,Subscriber,1992,1,0.9,2018,January,Monday,2
1,550,2018-01-01 12:06:18.039,2018-01-01 12:15:28.443,3183,Exchange Place,40.716247,-74.033459,3199,Newport Pkwy,40.728745,-74.032108,31845,Subscriber,1969,2,0.9,2018,January,Monday,12
2,510,2018-01-01 12:06:56.978,2018-01-01 12:15:27.810,3183,Exchange Place,40.716247,-74.033459,3199,Newport Pkwy,40.728745,-74.032108,31708,Subscriber,1946,1,0.9,2018,January,Monday,12
3,354,2018-01-01 14:53:10.186,2018-01-01 14:59:05.096,3183,Exchange Place,40.716247,-74.033459,3267,Morris Canal,40.712419,-74.038526,31697,Subscriber,1994,1,0.4,2018,January,Monday,14
4,250,2018-01-01 17:34:30.192,2018-01-01 17:38:40.984,3183,Exchange Place,40.716247,-74.033459,3639,Harborside,40.719252,-74.034234,31861,Subscriber,1991,1,0.2,2018,January,Monday,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
353887,1081,2018-12-22 11:51:46.060,2018-12-22 12:09:47.473,3694,Jackson Square,40.711130,-74.078900,3269,Brunswick & 6th,40.726012,-74.050389,29586,Subscriber,1993,1,1.8,2018,December,Saturday,11
353888,344,2018-12-25 21:40:09.866,2018-12-25 21:45:54.267,3694,Jackson Square,40.711130,-74.078900,3280,Astor Place,40.719282,-74.071262,26241,Subscriber,1983,2,0.7,2018,December,Tuesday,21
353889,1233,2018-12-29 12:55:45.969,2018-12-29 13:16:19.596,3694,Jackson Square,40.711130,-74.078900,3186,Grove St PATH,40.719586,-74.043117,29294,Subscriber,1988,1,2.0,2018,December,Saturday,12
353890,1057,2018-12-30 15:32:09.332,2018-12-30 15:49:46.351,3694,Jackson Square,40.711130,-74.078900,3213,Van Vorst Park,40.718489,-74.047727,29475,Subscriber,1991,2,1.7,2018,December,Sunday,15


In [233]:
#!pip install holidays

In [234]:
from datetime import date
import holidays

# US holiday calendar for 2018, built once and reused for the lookup table
# (previously a second, unused calendar object was created as well).
us_holidays = holidays.US(years=2018)

# [date, holiday name] pairs, sorted so the table is in chronological order
# regardless of the calendar's internal iteration order.
data = [[day, name] for day, name in sorted(us_holidays.items())]

df_holiday = pd.DataFrame(data, columns=['date', 'holiday'])

# Display the holiday table.
df_holiday

Unnamed: 0,date,holiday
0,2018-01-01,New Year's Day
1,2018-01-15,Martin Luther King Jr. Day
2,2018-02-19,Washington's Birthday
3,2018-05-28,Memorial Day
4,2018-07-04,Independence Day
5,2018-09-03,Labor Day
6,2018-10-08,Columbus Day
7,2018-11-11,Veterans Day
8,2018-11-12,Veterans Day (Observed)
9,2018-11-22,Thanksgiving


### Add holiday details to trip data

In [235]:
# Date part of each trip's start time; used as the join key with df_holiday.
tripdata['date'] = tripdata['starttime'].dt.date

In [236]:
# Left join keeps every trip; dates without a holiday get NaN in 'holiday'.
# validate='many_to_one' raises a MergeError if df_holiday ever contains a
# duplicated date, which would otherwise silently duplicate trip rows.
df = tripdata.merge(
    df_holiday,
    on='date',
    how='left',
    validate='many_to_one',
)

In [237]:
# Label non-holiday dates explicitly. Assign the result back to the column:
# the previous chained in-place fillna on df['holiday'] is not guaranteed to
# write through to df, and does nothing when pandas Copy-on-Write is enabled.
df['holiday'] = df['holiday'].fillna('No Holiday')

In [238]:
# Preview the first ten rows with the merged 'holiday' column.
df.head(10)

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,...,usertype,birth year,gender,dist,year,month,day,hour,date,holiday
0,932,2018-01-01 02:06:17.541,2018-01-01 02:21:50.027,3183,Exchange Place,40.716247,-74.033459,3199,Newport Pkwy,40.728745,...,Subscriber,1992,1,0.9,2018,January,Monday,2,2018-01-01,New Year's Day
1,550,2018-01-01 12:06:18.039,2018-01-01 12:15:28.443,3183,Exchange Place,40.716247,-74.033459,3199,Newport Pkwy,40.728745,...,Subscriber,1969,2,0.9,2018,January,Monday,12,2018-01-01,New Year's Day
2,510,2018-01-01 12:06:56.978,2018-01-01 12:15:27.810,3183,Exchange Place,40.716247,-74.033459,3199,Newport Pkwy,40.728745,...,Subscriber,1946,1,0.9,2018,January,Monday,12,2018-01-01,New Year's Day
3,354,2018-01-01 14:53:10.186,2018-01-01 14:59:05.096,3183,Exchange Place,40.716247,-74.033459,3267,Morris Canal,40.712419,...,Subscriber,1994,1,0.4,2018,January,Monday,14,2018-01-01,New Year's Day
4,250,2018-01-01 17:34:30.192,2018-01-01 17:38:40.984,3183,Exchange Place,40.716247,-74.033459,3639,Harborside,40.719252,...,Subscriber,1991,1,0.2,2018,January,Monday,17,2018-01-01,New Year's Day
5,613,2018-01-01 22:05:05.874,2018-01-01 22:15:19.419,3183,Exchange Place,40.716247,-74.033459,3203,Hamilton Park,40.727596,...,Subscriber,1982,1,1.0,2018,January,Monday,22,2018-01-01,New Year's Day
6,290,2018-01-02 12:13:51.794,2018-01-02 12:18:42.107,3183,Exchange Place,40.716247,-74.033459,3267,Morris Canal,40.712419,...,Subscriber,1958,1,0.4,2018,January,Tuesday,12,2018-01-02,No Holiday
7,381,2018-01-02 12:50:03.343,2018-01-02 12:56:24.644,3183,Exchange Place,40.716247,-74.033459,3205,JC Medical Center,40.71654,...,Subscriber,1989,2,0.8,2018,January,Tuesday,12,2018-01-02,No Holiday
8,318,2018-01-02 13:55:58.243,2018-01-02 14:01:16.881,3183,Exchange Place,40.716247,-74.033459,3275,Columbus Drive,40.718355,...,Subscriber,1960,1,0.3,2018,January,Tuesday,13,2018-01-02,No Holiday
9,1852,2018-01-02 16:55:29.639,2018-01-02 17:26:22.305,3183,Exchange Place,40.716247,-74.033459,3281,Leonard Gordon Park,40.74591,...,Subscriber,1976,1,2.4,2018,January,Tuesday,16,2018-01-02,No Holiday


## Read the weather data

In [239]:
# Load the weather CSV from the dataset directory next to this notebook.
weather_data = pd.read_csv("../dataset/newyork_weather.csv")
# Show column dtypes and non-null counts to find the sparsely filled columns.
weather_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365 entries, 0 to 364
Data columns (total 26 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   STATION    365 non-null    object 
 1   NAME       365 non-null    object 
 2   LATITUDE   365 non-null    float64
 3   LONGITUDE  365 non-null    float64
 4   ELEVATION  365 non-null    float64
 5   DATE       365 non-null    object 
 6   AWND       281 non-null    float64
 7   PGTM       0 non-null      float64
 8   PRCP       365 non-null    float64
 9   SNOW       365 non-null    float64
 10  SNWD       365 non-null    float64
 11  TAVG       0 non-null      float64
 12  TMAX       365 non-null    int64  
 13  TMIN       365 non-null    int64  
 14  TSUN       0 non-null      float64
 15  WDF2       281 non-null    float64
 16  WDF5       281 non-null    float64
 17  WSF2       281 non-null    float64
 18  WSF5       281 non-null    float64
 19  WT01       176 non-null    float64
 20  WT02      

In [240]:
# Missing values per weather column.
weather_data.isnull().sum()


STATION        0
NAME           0
LATITUDE       0
LONGITUDE      0
ELEVATION      0
DATE           0
AWND          84
PGTM         365
PRCP           0
SNOW           0
SNWD           0
TAVG         365
TMAX           0
TMIN           0
TSUN         365
WDF2          84
WDF5          84
WSF2          84
WSF5          84
WT01         189
WT02         355
WT03         340
WT04         365
WT05         365
WT06         364
WT08         304
dtype: int64

In [241]:
## selected only columns which have full data
columns = ['DATE','STATION','NAME','ELEVATION','PRCP','SNOW','SNWD','TMAX','TMIN']
# Take an explicit copy so that later column edits act on an independent
# frame rather than on a slice of weather_data.
weather_df = weather_data[columns].copy()
# The bare string below is the cell's last expression, so it is echoed as the
# cell output; it documents the retained columns.
'''
Schema Description:
TMAX -- Maximun Temp
TMIN - Minimum Temp
ELEVATION -----
PRCP----Precipitation
SNOW --- Snowfall
SNWD  ---Snow depth
'''

'\nSchema Description:\nTMAX -- Maximun Temp\nTMIN - Minimum Temp\nELEVATION -----\nPRCP----Precipitation\nSNOW --- Snowfall\nSNWD  ---Snow depth\n'

In [242]:
# rename() returns a new frame; rebinding to it (instead of renaming in place)
# means the edits below never operate on a slice of weather_data.
weather_df = weather_df.rename(columns={'DATE': 'date'})
# Parse the date strings to datetime64 for the date-keyed join with the trips.
weather_df['date'] = pd.to_datetime(weather_df['date'])
weather_df.dtypes


date         datetime64[ns]
STATION              object
NAME                 object
ELEVATION           float64
PRCP                float64
SNOW                float64
SNWD                float64
TMAX                  int64
TMIN                  int64
dtype: object

In [243]:
# Convert the trip date key to datetime64 so it matches the dtype of
# weather_df['date'] in the merge that follows.
df['date'] = pd.to_datetime(df['date'])
df.dtypes

tripduration                        int64
starttime                  datetime64[ns]
stoptime                   datetime64[ns]
start station id                    int64
start station name                 object
start station latitude            float64
start station longitude           float64
end station id                      int64
end station name                   object
end station latitude              float64
end station longitude             float64
bikeid                              int64
usertype                           object
birth year                          int64
gender                              int64
dist                              float64
year                                int64
month                              object
day                                object
hour                                int64
date                       datetime64[ns]
holiday                            object
dtype: object

In [244]:
# Attach the daily weather record to every trip by calendar date.
# validate='many_to_one' raises a MergeError if weather_df holds more than one
# row per date (e.g. several stations), which would otherwise silently
# duplicate trip rows.
complete_df = df.merge(
    weather_df,
    on='date',
    how='left',
    validate='many_to_one',
)

In [245]:
# Preview the trips joined with the weather columns.
complete_df.head()

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,...,date,holiday,STATION,NAME,ELEVATION,PRCP,SNOW,SNWD,TMAX,TMIN
0,932,2018-01-01 02:06:17.541,2018-01-01 02:21:50.027,3183,Exchange Place,40.716247,-74.033459,3199,Newport Pkwy,40.728745,...,2018-01-01,New Year's Day,USW00094728,"NY CITY CENTRAL PARK, NY US",42.7,0.0,0.0,0.0,19,7
1,550,2018-01-01 12:06:18.039,2018-01-01 12:15:28.443,3183,Exchange Place,40.716247,-74.033459,3199,Newport Pkwy,40.728745,...,2018-01-01,New Year's Day,USW00094728,"NY CITY CENTRAL PARK, NY US",42.7,0.0,0.0,0.0,19,7
2,510,2018-01-01 12:06:56.978,2018-01-01 12:15:27.810,3183,Exchange Place,40.716247,-74.033459,3199,Newport Pkwy,40.728745,...,2018-01-01,New Year's Day,USW00094728,"NY CITY CENTRAL PARK, NY US",42.7,0.0,0.0,0.0,19,7
3,354,2018-01-01 14:53:10.186,2018-01-01 14:59:05.096,3183,Exchange Place,40.716247,-74.033459,3267,Morris Canal,40.712419,...,2018-01-01,New Year's Day,USW00094728,"NY CITY CENTRAL PARK, NY US",42.7,0.0,0.0,0.0,19,7
4,250,2018-01-01 17:34:30.192,2018-01-01 17:38:40.984,3183,Exchange Place,40.716247,-74.033459,3639,Harborside,40.719252,...,2018-01-01,New Year's Day,USW00094728,"NY CITY CENTRAL PARK, NY US",42.7,0.0,0.0,0.0,19,7


In [246]:
# Count missing values per column after the weather join.
complete_df.isnull().sum()

tripduration               0
starttime                  0
stoptime                   0
start station id           0
start station name         0
start station latitude     0
start station longitude    0
end station id             0
end station name           0
end station latitude       0
end station longitude      0
bikeid                     0
usertype                   0
birth year                 0
gender                     0
dist                       0
year                       0
month                      0
day                        0
hour                       0
date                       0
holiday                    0
STATION                    0
NAME                       0
ELEVATION                  0
PRCP                       0
SNOW                       0
SNWD                       0
TMAX                       0
TMIN                       0
dtype: int64

## add seasonality data
1. spring (March-May)
2. summer (June-August)
3. autumn (September-November) 
4. winter (December-February)

In [247]:
def add_seasonality(month):
    """Return the season label for an English month name.

    'March'..'May' -> 'spring', 'June'..'August' -> 'summer',
    'September'..'November' -> 'autumn', 'December'..'February' -> 'winter'.
    Any other value (including differently-cased names) yields None.
    """
    months_by_season = (
        ('spring', ('March', 'April', 'May')),
        ('summer', ('June', 'July', 'August')),
        ('autumn', ('September', 'October', 'November')),
        ('winter', ('December', 'January', 'February')),
    )
    for season, months in months_by_season:
        if month in months:
            return season
    return None

In [248]:
# List the distinct month names that add_seasonality will receive.
complete_df['month'].unique()

array(['January', 'February', 'March', 'April', 'May', 'June', 'July',
       'August', 'September', 'October', 'November', 'December'],
      dtype=object)

In [249]:

# Map each month name to its season. Series.map on the single 'month' column
# gives the same labels as the earlier row-wise DataFrame.apply(axis=1)
# without building a Series object for every row.
complete_df['seasons'] = complete_df['month'].map(add_seasonality)

In [269]:
# Display the final table including the new 'seasons' column.
complete_df

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,...,holiday,STATION,NAME,ELEVATION,PRCP,SNOW,SNWD,TMAX,TMIN,seasons
0,932,2018-01-01 02:06:17.541,2018-01-01 02:21:50.027,3183,Exchange Place,40.716247,-74.033459,3199,Newport Pkwy,40.728745,...,New Year's Day,USW00094728,"NY CITY CENTRAL PARK, NY US",42.7,0.00,0.0,0.0,19,7,winter
1,550,2018-01-01 12:06:18.039,2018-01-01 12:15:28.443,3183,Exchange Place,40.716247,-74.033459,3199,Newport Pkwy,40.728745,...,New Year's Day,USW00094728,"NY CITY CENTRAL PARK, NY US",42.7,0.00,0.0,0.0,19,7,winter
2,510,2018-01-01 12:06:56.978,2018-01-01 12:15:27.810,3183,Exchange Place,40.716247,-74.033459,3199,Newport Pkwy,40.728745,...,New Year's Day,USW00094728,"NY CITY CENTRAL PARK, NY US",42.7,0.00,0.0,0.0,19,7,winter
3,354,2018-01-01 14:53:10.186,2018-01-01 14:59:05.096,3183,Exchange Place,40.716247,-74.033459,3267,Morris Canal,40.712419,...,New Year's Day,USW00094728,"NY CITY CENTRAL PARK, NY US",42.7,0.00,0.0,0.0,19,7,winter
4,250,2018-01-01 17:34:30.192,2018-01-01 17:38:40.984,3183,Exchange Place,40.716247,-74.033459,3639,Harborside,40.719252,...,New Year's Day,USW00094728,"NY CITY CENTRAL PARK, NY US",42.7,0.00,0.0,0.0,19,7,winter
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
353887,1081,2018-12-22 11:51:46.060,2018-12-22 12:09:47.473,3694,Jackson Square,40.711130,-74.078900,3269,Brunswick & 6th,40.726012,...,No Holiday,USW00094728,"NY CITY CENTRAL PARK, NY US",42.7,0.02,0.0,0.0,54,40,winter
353888,344,2018-12-25 21:40:09.866,2018-12-25 21:45:54.267,3694,Jackson Square,40.711130,-74.078900,3280,Astor Place,40.719282,...,Christmas Day,USW00094728,"NY CITY CENTRAL PARK, NY US",42.7,0.00,0.0,0.0,40,33,winter
353889,1233,2018-12-29 12:55:45.969,2018-12-29 13:16:19.596,3694,Jackson Square,40.711130,-74.078900,3186,Grove St PATH,40.719586,...,No Holiday,USW00094728,"NY CITY CENTRAL PARK, NY US",42.7,0.00,0.0,0.0,57,39,winter
353890,1057,2018-12-30 15:32:09.332,2018-12-30 15:49:46.351,3694,Jackson Square,40.711130,-74.078900,3213,Van Vorst Park,40.718489,...,No Holiday,USW00094728,"NY CITY CENTRAL PARK, NY US",42.7,0.00,0.0,0.0,40,33,winter


In [277]:
# Distinct season labels present in the 'seasons' column.
complete_df['seasons'].unique()

array(['winter', 'spring', 'summer', 'autumn'], dtype=object)

In [278]:
# Write the merged trip / holiday / weather / season table to disk, leaving
# out the DataFrame index.
output_csv = "../dataset/complete_bike_sharing.csv"
complete_df.to_csv(output_csv, index=False)