In [1]:
import pandas as pd
import numpy as np

# Pipeline Summary

**Data Source:** https://www.kaggle.com/datasets/shane3martin/cyclist-capstone-project

**Usage Dataset:** 202008-divvy-tripdata.csv

**Original Columns:** ride_id, rideable_type, started_at, ended_at, start_station_name, start_station_id, end_station_name, end_station_id, start_lat, start_lng, end_lat, end_lng, member_casual

**Original Datetime range analysed:** 2020-07-31 until 2020-08-31

**Date - Processed:** 2026-02-21 09:54 | Pipeline v1.0

# Before vs After

In [4]:
# Load the raw monthly trip export and the pipeline's processed output so the
# cells below can compare them side by side.
raw_csv_path = "../data/raw/202008-divvy-tripdata.csv"
processed_csv_path = "../data/processed/dataset_not_null_features_removed_outliers"

original_df = pd.read_csv(raw_csv_path)
transformed_df = pd.read_csv(processed_csv_path)

In [13]:
# Row counts before and after the cleaning pipeline.
lines_original, lines_transformed = len(original_df), len(transformed_df)

# Rows dropped by the pipeline, in absolute terms and as a share of the raw data.
dif_lines = lines_original - lines_transformed
dif_lines_percent = round(dif_lines / lines_original * 100, 2)

print(f"We removed {dif_lines} lines from the original dataset , wich represents {dif_lines_percent}% of original dataset")

We removed 79960 lines from the original dataset , wich represents 12.85% of original dataset


In [23]:
original_columns = set(original_df.columns)
transformed_columns = set(transformed_df.columns)

# Walk the transformed frame's own column order instead of iterating a set
# difference: set iteration order for strings can change from one kernel run
# to the next, which made the printed list non-reproducible.
new_columns = [col for col in transformed_df.columns if col not in original_columns]

print(f"We create {len(new_columns)} columns:")
for i in new_columns:
    print("* " + i)

We create 9 columns:
* hour_start
* ride_duration_minutes
* hour_end
* week_day
* distance_m
* distance_km
* is_weekend
* round_trip
* ride_duration


**Almost 13% of the original rows were removed, which is a high share of data loss.**

Most of the removed rows were outliers, which would obviously have compromised any interpretation had they been kept.

# Validate Basic KPIs

In [26]:
# Parse both trip timestamp columns into datetimes so the `.dt` accessor
# can be used on them further down.
for ts_col in ('started_at', 'ended_at'):
    transformed_df[ts_col] = pd.to_datetime(transformed_df[ts_col])

In [28]:
def summary_KPIs(df):
    """Compute headline ride KPIs for a trips DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``ride_id``, ``started_at``,
        ``ride_duration_minutes``, ``distance_m`` and ``distance_km``.
        ``started_at`` may already be datetime-typed or may still hold
        datetime strings (e.g. straight from ``pd.read_csv``); it is parsed
        inside the function either way.

    Returns
    -------
    dict
        ``avg_rides_per_day``  - mean number of rides per calendar day,
        ``top_3_busiest_days`` - ``{date: ride count}`` for the three days
        with the most rides,
        ``avg_duration_min``   - mean of ``ride_duration_minutes``,
        ``avg_distance_m``     - mean of ``distance_m``,
        ``avg_speed_kmh``      - mean of the per-ride speeds.
        All numeric values are rounded to two decimals.

    Notes
    -----
    Rows whose duration is zero, negative or missing are ignored.
    The input frame is not modified.
    """
    # Keep only rides with a strictly positive duration; this also protects
    # the speed computation below from dividing by zero.
    valid = df[df['ride_duration_minutes'] > 0]

    # Parse here so the function does not depend on an earlier cell having
    # converted the column; for datetime-typed input this changes nothing.
    start_day = pd.to_datetime(valid['started_at']).dt.date

    # A. Average rides per day
    rides_per_day = valid.groupby(start_day)['ride_id'].count()
    avg_rides_per_day = rides_per_day.mean()

    # B. Top 3 days with most rides
    top_3_days = rides_per_day.sort_values(ascending=False).head(3)

    # C. Average ride duration (minutes)
    avg_duration = valid['ride_duration_minutes'].mean()

    # D. Average distance (meters)
    avg_distance_m = valid['distance_m'].mean()

    # E. Average speed (km/h): distance (km) / time (hours), computed per ride.
    # This is the mean of the per-ride speeds, which is not the same number as
    # total distance divided by total time; very short rides weigh as much as
    # long ones.
    speed_kmh = valid['distance_km'] / (valid['ride_duration_minutes'] / 60)
    avg_speed = speed_kmh.mean()

    return {
        "avg_rides_per_day": round(avg_rides_per_day, 2),
        "top_3_busiest_days": top_3_days.to_dict(),
        "avg_duration_min": round(avg_duration, 2),
        "avg_distance_m": round(avg_distance_m, 2),
        "avg_speed_kmh": round(avg_speed, 2)
    }

In [34]:
# Compute the KPIs on the cleaned dataset and list them one per line.
results = summary_KPIs(transformed_df)
for metric_name, metric_value in results.items():
    print(f"{metric_name}: {metric_value}")

avg_rides_per_day: 17494.0
top_3_busiest_days: {datetime.date(2020, 8, 22): 24351, datetime.date(2020, 8, 8): 24278, datetime.date(2020, 8, 29): 23676}
avg_duration_min: 19.47
avg_distance_m: 2019.75
avg_speed_kmh: 28.41
