In [2]:
# importing libraries to use
import pandas as pd 
from datetime import datetime

In [4]:
df_bikes = pd.read_csv("./Trip_Data_022023.csv")

# Let's take a look at the data we're dealing with:
print("Cyclistic's dataframe has", df_bikes.shape[0], "rows and", df_bikes.shape[1], "columns")

Cyclistic's dataframe has 190445 rows and 13 columns


In [5]:
# Let's take a quick look at the first 5 rows of the dataframe:
df_bikes.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,CBCD0D7777F0E45F,classic_bike,2023-02-14 11:59:42,2023-02-14 12:13:38,Southport Ave & Clybourn Ave,TA1309000030,Clark St & Schiller St,TA1309000024,41.920771,-87.663712,41.907993,-87.631501,casual
1,F3EC5FCE5FF39DE9,electric_bike,2023-02-15 13:53:48,2023-02-15 13:59:08,Clarendon Ave & Gordon Ter,13379,Sheridan Rd & Lawrence Ave,TA1309000041,41.957879,-87.649584,41.969517,-87.654691,casual
2,E54C1F27FA9354FF,classic_bike,2023-02-19 11:10:57,2023-02-19 11:35:01,Southport Ave & Clybourn Ave,TA1309000030,Aberdeen St & Monroe St,13156,41.920771,-87.663712,41.880419,-87.655519,member
3,3D561E04F739CC45,electric_bike,2023-02-26 16:12:05,2023-02-26 16:39:55,Southport Ave & Clybourn Ave,TA1309000030,Franklin St & Adams St (Temp),TA1309000008,41.920873,-87.663733,41.879434,-87.635504,member
4,0CB4B4D53B2DBE05,electric_bike,2023-02-20 11:55:23,2023-02-20 12:05:48,Prairie Ave & Garfield Blvd,TA1307000160,Cottage Grove Ave & 63rd St,KA1503000054,41.794827,-87.618795,41.780531,-87.60597,member


In [6]:
# We can also check for the type of data that there is in each column:
df_bikes.dtypes

ride_id                object
rideable_type          object
started_at             object
ended_at               object
start_station_name     object
start_station_id       object
end_station_name       object
end_station_id         object
start_lat             float64
start_lng             float64
end_lat               float64
end_lng               float64
member_casual          object
dtype: object

In [7]:
# Columns 'started at' and 'ended at' are not an object type, so, we should change that:
df_bikes['started_at'] = pd.to_datetime(df_bikes['started_at'])
df_bikes['ended_at'] = pd.to_datetime(df_bikes['ended_at'])

In [8]:
# We can now check again if it's changed or not:
df_bikes.dtypes

ride_id                       object
rideable_type                 object
started_at            datetime64[ns]
ended_at              datetime64[ns]
start_station_name            object
start_station_id              object
end_station_name              object
end_station_id                object
start_lat                    float64
start_lng                    float64
end_lat                      float64
end_lng                      float64
member_casual                 object
dtype: object

The types are correct in relation to info displayed in the dataframe.

In [9]:
# We have some columns that have different types of data in it that may be worth to look at.
print("The column 'rideable_type' has", df_bikes['rideable_type'].nunique(), "different types of bikes registered, which are",
     df_bikes['rideable_type'].unique()[0], df_bikes['rideable_type'].value_counts()[0],
      df_bikes['rideable_type'].unique()[1], df_bikes['rideable_type'].value_counts()[1], "and",
       df_bikes['rideable_type'].unique()[2], df_bikes['rideable_type'].value_counts()[2] )


The column 'rideable_type' has 3 different types of bikes registered, which are classic_bike 98362 electric_bike 89888 and docked_bike 2195


In [10]:
# We can also see what types of members have been registered in this dataframe and how many for each:
print("There are", df_bikes['member_casual'].nunique(), "different types of members registered, which are",
     df_bikes['member_casual'].unique()[0], df_bikes['member_casual'].value_counts()[0], "and",
      df_bikes['member_casual'].unique()[1], df_bikes['member_casual'].value_counts()[1] )

There are 2 different types of members registered, which are casual 147429 and member 43016


In [11]:
# Let's consider now the nan values that may have been registered, first we'll check if they exist in the dataframe:
df_bikes.isna().sum()

ride_id                   0
rideable_type             0
started_at                0
ended_at                  0
start_station_name    25473
start_station_id      25605
end_station_name      26738
end_station_id        26879
start_lat                 0
start_lng                 0
end_lat                 116
end_lng                 116
member_casual             0
dtype: int64

In [12]:
# Finding some of the indexes where the column 'start_station_name' is NaN:
df_bikes[df_bikes.start_station_name.isna()]==True

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
1385,False,False,False,False,False,False,False,False,False,False,False,False,False
1386,False,False,False,False,False,False,False,False,False,False,False,False,False
1387,False,False,False,False,False,False,False,False,False,False,False,False,False
1388,False,False,False,False,False,False,False,False,False,False,False,False,False
1389,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
189375,False,False,False,False,False,False,False,False,False,False,False,False,False
189376,False,False,False,False,False,False,False,False,False,False,False,False,False
189377,False,False,False,False,False,False,False,False,False,False,False,False,False
189378,False,False,False,False,False,False,False,False,False,False,False,False,False


In [13]:
# We can see that all the rest of the data is okay for this row index, for now we'll consider these 
# as long as we have information from started and ended times.
df_bikes.iloc[1385]

ride_id                     97DC27C4C2F5AC4B
rideable_type                  electric_bike
started_at               2023-02-26 07:15:01
ended_at                 2023-02-26 07:16:26
start_station_name                       NaN
start_station_id                         NaN
end_station_name      Clark St & Schiller St
end_station_id                  TA1309000024
start_lat                              41.91
start_lng                             -87.63
end_lat                            41.907993
end_lng                           -87.631501
member_casual                         member
Name: 1385, dtype: object

In [39]:
# Let's calculate the length of each ride by subtracting the columns "started_at" from the column "ended_at":
df_bikes["ride_length"] = df_bikes["started_at"].dt.time - df_bikes["ended_at"].dt.time
df_bikes

TypeError: unsupported operand type(s) for -: 'datetime.time' and 'datetime.time'

In [43]:
df_bikes['started_time'] = df_bikes['started_at'].dt.time
df_bikes['ended_time'] = df_bikes['ended_at'].dt.time

start_delta = timedelta(hours=df_bikes['started_time'].hour, minutes=df_bikes['started_time'].minute, seconds=df_bikes['started_time'].second)
end_delta = timedelta(hours=df_bikes['started_time'].hour, minutes=df_bikes['started_time'].minute, seconds=df_bikes['started_time'].second)
df_bikes['length'] = start_delta - end_delta



AttributeError: type object 'datetime.datetime' has no attribute 'timedelta'

In [42]:
df_bikes['started_time'].dt.hour

AttributeError: Can only use .dt accessor with datetimelike values

In [22]:
datetime.time

<method 'time' of 'datetime.datetime' objects>

In [72]:
df_bikes["started_at"] = pd.to_datetime(df_bikes["started_at"])

In [75]:
type(df_bikes.started_at.time())

AttributeError: 'Series' object has no attribute 'time'