In [154]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt
import datetime as dt
import os 




### Concatenate Datasets from January to June 2023

In [155]:
# Read every file in the 2023 data directory into its own DataFrame.
# list_of_dfs holds the file names and df_list holds the loaded frames, in the
# same order. os.listdir makes no ordering guarantee, so that order is
# whatever the filesystem reports.
DATA_DIR = 'datasets/2023'

list_of_dfs = list(os.listdir(DATA_DIR))
df_list = [pd.read_csv(os.path.join(DATA_DIR, filename)) for filename in list_of_dfs]


In [156]:
# Bind each loaded frame to a month name by its position in df_list.
# NOTE(review): these positions follow the order os.listdir returned the
# files in, which is not guaranteed — confirm each label matches its file.
df_may, df_february, df_march, df_april, df_june, df_january = (
    df_list[position] for position in range(6)
)


In [157]:
# Monthly frames listed in calendar order, January through June.
my_list = [
    df_january,
    df_february,
    df_march,
    df_april,
    df_may,
    df_june,
]

In [158]:
# Stack the monthly frames row-wise into a single frame; each frame keeps
# its own row labels.
all_trips = pd.concat(objs=my_list, axis=0)

### Data Cleaning

In [159]:
# Count the missing values in every column.
all_trips.isnull().sum()

ride_id                    0
rideable_type              0
started_at                 0
ended_at                   0
start_station_name    132564
start_station_id      132564
end_station_name      142260
end_station_id        142260
start_lat                  0
start_lng                  0
end_lat                 2875
end_lng                 2875
member_casual              0
dtype: int64

In [160]:
# Remove every row that has at least one missing value, modifying all_trips
# in place. The shape is printed before and after so the number of discarded
# rides is visible.
print(f'Before dropping missing values: {all_trips.shape}')
all_trips.dropna(how='any',inplace=True)
print(f'After dropping missing values: {all_trips.shape}')

Before dropping missing values: (1953963, 13)
After dropping missing values: (1755337, 13)


In [161]:
# Order the rides by start time, modifying the frame in place.
all_trips.sort_values(by='started_at', ascending=True, inplace=True)

In [162]:
# Parse both ride timestamps into pandas datetimes.
for timestamp_column in ('started_at', 'ended_at'):
    all_trips[timestamp_column] = pd.to_datetime(all_trips[timestamp_column])

In [163]:
# Ride length as a timedelta (end minus start), then the same length
# expressed as a number of seconds.
all_trips['trip_duration'] = all_trips['ended_at'].sub(all_trips['started_at'])
all_trips['trip_duration_seconds'] = (
    all_trips['trip_duration'].dt.total_seconds()
)

In [164]:
# Preview the first five rides.
all_trips.head(n=5)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,trip_duration,trip_duration_seconds
128323,4B88BE3AEA4FD3C2,classic_bike,2023-01-01 00:03:48,2023-01-01 00:41:23,Lincoln Memorial,31258.0,20th & E St NW,31204.0,38.888255,-77.049436,38.8963,-77.045,member,0 days 00:37:35,2255.0
148254,B9CBD9EBCD559256,electric_bike,2023-01-01 00:05:08,2023-01-01 00:16:09,14th & Belmont St NW,31119.0,16th & Irving St NW,31122.0,38.921118,-77.031776,38.928893,-77.03625,member,0 days 00:11:01,661.0
155929,0514F0FDDA83CC2D,classic_bike,2023-01-01 00:05:36,2023-01-01 00:32:44,Lincoln Memorial,31258.0,S Troy St & 26th St S,31056.0,38.888255,-77.049436,38.847977,-77.075104,casual,0 days 00:27:08,1628.0
72858,BD93FC917AE9F223,classic_bike,2023-01-01 00:06:16,2023-01-01 00:38:13,Court House Metro / 15th St & N Uhle St,31089.0,Wilson Blvd & Ft Myer Dr,31015.0,38.890612,-77.084801,38.8946,-77.072305,casual,0 days 00:31:57,1917.0
72854,179364770202D6D8,classic_bike,2023-01-01 00:06:51,2023-01-01 00:37:13,Court House Metro / 15th St & N Uhle St,31089.0,Wilson Blvd & Ft Myer Dr,31015.0,38.890612,-77.084801,38.8946,-77.072305,casual,0 days 00:30:22,1822.0


In [165]:
# Reorder the columns so the two duration columns sit right after the
# ride timestamps.
column_order = [
    'ride_id', 'rideable_type',
    'started_at', 'ended_at',
    'trip_duration', 'trip_duration_seconds',
    'start_station_name', 'start_station_id',
    'end_station_name', 'end_station_id',
    'start_lat', 'start_lng',
    'end_lat', 'end_lng',
    'member_casual',
]
all_trips = all_trips[column_order]

### Get Weather Data

In [166]:
import requests
import json 

class WeatherData():
    """Small client for hourly data from the Open-Meteo archive endpoint.

    The query location and timezone are stored as pre-encoded URL fragments
    (latitude 38.8951, longitude -77.0364, America/New_York) and are appended
    verbatim to every request URL.
    """

    # Hourly variables requested when the caller does not pass ``hourly_list``.
    DEFAULT_HOURLY = ('precipitation', 'cloudcover', 'windspeed_10m', 'temperature_2m')

    def __init__(self):
        self.base_url = "https://archive-api.open-meteo.com/v1/archive"
        self.coordinates = '?latitude=38.8951&longitude=-77.0364'
        self.timezone = '&timezone=America%2FNew_York'

    def get_weather(self, start_date='2011-01-01', end_date='2012-12-31',
                    hourly_list=None, timeout=30):
        """Request hourly weather for a date range and return the decoded JSON.

        Parameters
        ----------
        start_date, end_date : str
            Date strings inserted verbatim into the ``start_date`` and
            ``end_date`` query parameters.
        hourly_list : sequence of str, optional
            Names of the hourly variables to request; they are joined with
            commas. When omitted, ``DEFAULT_HOURLY`` is used.
        timeout : float, optional
            Seconds passed to ``requests.get`` as its ``timeout`` so that a
            stalled connection cannot block the notebook indefinitely.

        Returns
        -------
        The object produced by decoding the response body as JSON.

        Raises
        ------
        requests.HTTPError
            If the server answers with an error status code.
        """
        # A None sentinel replaces the former list default, so no mutable
        # object is shared between calls.
        if hourly_list is None:
            hourly_list = self.DEFAULT_HOURLY

        timeframe = '&start_date=' + start_date + '&end_date=' + end_date
        hourly = '&hourly=' + ",".join(hourly_list)
        api_url = self.base_url + self.coordinates + timeframe + hourly + self.timezone

        resp = requests.get(api_url, timeout=timeout)
        # Fail here with a clear HTTP error rather than later with a KeyError
        # on an error payload that has no 'hourly' section.
        resp.raise_for_status()
        return resp.json()

    def concat_to_df(self, df, my_weather):
        """Copy hourly weather values into ``df`` as new columns, in place.

        For every key in ``my_weather['hourly']`` a column named
        ``'<measure>_<unit>'`` is written, where the unit comes from
        ``my_weather['hourly_units']``. Each index label of ``df`` is used
        directly as the subscript into that measure's sequence of values, so
        the labels must be valid subscripts for it.

        Returns None; ``df`` is modified.
        """
        for measure, hourly_measure in my_weather['hourly'].items():
            unit_of_measure = my_weather['hourly_units'][measure]
            # The column name is the same for every row, so build it once.
            name = measure + '_' + unit_of_measure

            # Only the index labels are needed; iterating them avoids
            # materialising a Series for every row as iterrows() would.
            for index in df.index:
                df.at[index, name] = hourly_measure[index]

            
# Fetch hourly weather from 2023-01-01 through 2023-07-01 using the
# client's default set of hourly variables.
dc_weather = WeatherData()
my_weather = dc_weather.get_weather(
    start_date='2023-01-01',
    end_date='2023-07-01',
)


In [167]:
# Build a frame from the 'hourly' section of the API response.
hourly_data = my_weather['hourly']
df_weather = pd.DataFrame(data=hourly_data)

In [168]:
# Show the hourly weather frame (the rendered output elides the middle rows).
df_weather

Unnamed: 0,time,precipitation,cloudcover,windspeed_10m,temperature_2m
0,2023-01-01T00:00,0.0,2,11.4,10.7
1,2023-01-01T01:00,0.0,1,12.7,10.1
2,2023-01-01T02:00,0.0,2,14.6,9.9
3,2023-01-01T03:00,0.0,3,14.7,10.1
4,2023-01-01T04:00,0.0,3,13.4,9.5
...,...,...,...,...,...
4363,2023-07-01T19:00,2.8,93,11.7,24.6
4364,2023-07-01T20:00,1.5,100,11.9,24.0
4365,2023-07-01T21:00,0.1,79,10.3,23.6
4366,2023-07-01T22:00,0.1,100,8.8,23.3


In [169]:
# Parse the timestamp into a 'started_at' column, matching the column name
# used for ride start times in the trip data.
df_weather['started_at'] = pd.to_datetime(df_weather['time'])

# Round temperatures to zero decimal places.
df_weather['temperature_2m'] = df_weather['temperature_2m'].round()

# The raw 'time' strings are no longer needed once parsed.
df_weather.drop(columns='time', inplace=True)

# Shorten the variable names that carry a measurement-height suffix.
df_weather.rename(
    columns={'temperature_2m': 'temperature', 'windspeed_10m': 'windspeed'},
    inplace=True,
)

In [172]:
# Move the ride start time out of the columns and into the index, in place.
all_trips.set_index(keys='started_at', drop=True, inplace=True)

In [173]:
# Move the weather timestamp out of the columns and into the index, in place.
df_weather.set_index(keys='started_at', drop=True, inplace=True)

### Merge trip data with weather data

In [177]:
# Join on the two indexes: each ride receives the latest weather row whose
# timestamp is at or before the ride's start time.
final_df = pd.merge_asof(
    left=all_trips,
    right=df_weather,
    left_index=True,
    right_index=True,
    direction='backward',
)

# Work on an independent copy so final_df stays untouched.
trip_data = final_df.copy(deep=True)

In [178]:
# Bucket ride durations into five equal-width intervals over the observed range.
# NOTE(review): equal-width bins span the full min-to-max range, so a handful
# of extreme or negative durations determine the edges and nearly every ride
# lands in the first bin.
trip_data['trip_duration_bins'] = pd.cut(
    x=trip_data['trip_duration_seconds'],
    bins=5,
)

In [181]:
# Rides whose recorded end time is earlier than their start time.
trip_data.loc[trip_data['trip_duration_seconds'].lt(0)]

Unnamed: 0_level_0,ride_id,rideable_type,ended_at,trip_duration,trip_duration_seconds,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,precipitation,cloudcover,windspeed,temperature,trip_duration_bins
started_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2023-01-11 08:51:55,18767473F78A7C81,classic_bike,2023-01-11 08:50:32,-1 days +23:58:37,-83.0,Convention Center / 7th & M St NW,31223.0,Convention Center / 7th & M St NW,31223.0,38.905737,-77.02227,38.905737,-77.02227,member,0.0,30,6.0,-0.0,"(-2219.405, 426394.0]"
2023-02-19 17:56:26,3FEDCD9A4D6C7A85,classic_bike,2023-02-19 17:56:25,-1 days +23:59:59,-1.0,5th & K St NW,31600.0,5th & K St NW,31600.0,38.90304,-77.019027,38.90304,-77.019027,member,0.0,1,18.0,14.0,"(-2219.405, 426394.0]"
2023-03-07 08:37:10,2751DBCCB3790F9B,classic_bike,2023-03-07 08:35:43,-1 days +23:58:33,-87.0,15th & P St NW,31201.0,15th & P St NW,31201.0,38.909801,-77.034427,38.909801,-77.034427,member,0.0,20,11.2,4.0,"(-2219.405, 426394.0]"
2023-04-02 23:58:38,8DCD496965C76BD4,electric_bike,2023-04-02 23:58:18,-1 days +23:59:40,-20.0,23rd & E St NW,31260.0,23rd & E St NW,31260.0,38.896124,-77.049737,38.896104,-77.049882,casual,0.0,0,7.2,6.0,"(-2219.405, 426394.0]"
2023-05-12 19:01:30,86D37BE7C943697F,classic_bike,2023-05-12 19:01:27,-1 days +23:59:57,-3.0,8th & O St NW,31281.0,8th & O St NW,31281.0,38.90864,-77.02277,38.90864,-77.02277,member,0.0,30,5.6,26.0,"(-2219.405, 426394.0]"
2023-06-08 17:44:02,01610120C501449A,classic_bike,2023-06-08 17:44:01,-1 days +23:59:59,-1.0,Eads St & 15th St S,31000.0,Eads St & 15th St S,31000.0,38.858971,-77.05323,38.858971,-77.05323,member,0.0,32,5.2,25.0,"(-2219.405, 426394.0]"


In [182]:
# Number of rides per duration bin, most populated bin first.
trip_data['trip_duration_bins'].value_counts(sort=True, ascending=False)

(-2219.405, 426394.0]     1755293
(426394.0, 852875.0]           24
(1279356.0, 1705837.0]         10
(852875.0, 1279356.0]           8
(1705837.0, 2132318.0]          2
Name: trip_duration_bins, dtype: int64