In [17]:
import pandas as pd
import numpy as np
import seaborn as sns
import math

In [3]:
# Read the April 2023 Divvy trip export into memory and preview the first rows.
df = pd.read_csv(filepath_or_buffer='202304-divvy-tripdata.csv')
df.head(n=5)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,8FE8F7D9C10E88C7,electric_bike,2023-04-02 08:37:28,2023-04-02 08:41:37,,,,,41.8,-87.6,41.79,-87.6,member
1,34E4ED3ADF1D821B,electric_bike,2023-04-19 11:29:02,2023-04-19 11:52:12,,,,,41.87,-87.65,41.93,-87.68,member
2,5296BF07A2F77CB5,electric_bike,2023-04-19 08:41:22,2023-04-19 08:43:22,,,,,41.93,-87.66,41.93,-87.66,member
3,40759916B76D5D52,electric_bike,2023-04-19 13:31:30,2023-04-19 13:35:09,,,,,41.92,-87.65,41.91,-87.65,member
4,77A96F460101AC63,electric_bike,2023-04-19 12:05:36,2023-04-19 12:10:26,,,,,41.91,-87.65,41.91,-87.63,member


In [4]:
# Column-level summary: dtype and non-null count of every column.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 426590 entries, 0 to 426589
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   ride_id             426590 non-null  object 
 1   rideable_type       426590 non-null  object 
 2   started_at          426590 non-null  object 
 3   ended_at            426590 non-null  object 
 4   start_station_name  362776 non-null  object 
 5   start_station_id    362776 non-null  object 
 6   end_station_name    357960 non-null  object 
 7   end_station_id      357960 non-null  object 
 8   start_lat           426590 non-null  float64
 9   start_lng           426590 non-null  float64
 10  end_lat             426155 non-null  float64
 11  end_lng             426155 non-null  float64
 12  member_casual       426590 non-null  object 
dtypes: float64(4), object(9)
memory usage: 42.3+ MB


Confirmo los distintos tipos de bicicletas y la proporción de viajes de cada uno.

In [7]:
# Share of trips per bike type (normalize=True returns fractions instead of counts).
df['rideable_type'].value_counts(normalize=True)

electric_bike    0.581272
classic_bike     0.397895
docked_bike      0.020833
Name: rideable_type, dtype: float64

Tenemos un porcentaje pequeño de "docked_bike". No hay datos exactos de qué significa este tipo; se puede interpretar como las "bicicletas reclinadas, triciclos manuales y bicicletas de carga" mencionadas en la documentación del proyecto.

In [8]:
# Share of trips per rider type; also exposes any unexpected category labels.
df['member_casual'].value_counts(normalize=True)

member    0.654739
casual    0.345261
Name: member_casual, dtype: float64

Confirmo también que la columna destinada al tipo de miembro no tenga errores.

In [11]:
# Show the dtype pandas assigned to each column.
df.dtypes

ride_id                object
rideable_type          object
started_at             object
ended_at               object
start_station_name     object
start_station_id       object
end_station_name       object
end_station_id         object
start_lat             float64
start_lng             float64
end_lat               float64
end_lng               float64
member_casual          object
dtype: object

In [19]:
def calcular_distancia(lat1, lon1, lat2, lon2, radio_tierra=6371.0):
    """Return the great-circle distance between two points (haversine formula).

    Args:
        lat1: Latitude of the first point, in decimal degrees.
        lon1: Longitude of the first point, in decimal degrees.
        lat2: Latitude of the second point, in decimal degrees.
        lon2: Longitude of the second point, in decimal degrees.
        radio_tierra: Radius of the sphere; the result is expressed in the
            same unit. Defaults to 6371.0, the approximate Earth radius in
            kilometres.

    Returns:
        float: Distance along the surface of the sphere. If any coordinate
        is NaN the result is NaN, so rows with missing coordinates stay
        missing instead of turning into a bogus distance.
    """
    # Convert the coordinates from degrees to radians.
    lat1_rad = math.radians(lat1)
    lon1_rad = math.radians(lon1)
    lat2_rad = math.radians(lat2)
    lon2_rad = math.radians(lon2)

    # Latitude and longitude deltas.
    dlat = lat2_rad - lat1_rad
    dlon = lon2_rad - lon1_rad

    # Haversine formula.
    a = math.sin(dlat / 2)**2 + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(dlon / 2)**2

    # Floating-point rounding can push `a` a hair above 1 for (near-)antipodal
    # points, which would make sqrt(1 - a) raise "math domain error".
    # The comparison is False for NaN, so missing values still propagate.
    if a > 1.0:
        a = 1.0

    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return radio_tierra * c

In [20]:
# Straight-line (haversine) distance between the start and end point of every
# trip, in the unit implied by calcular_distancia's radius (kilometres).
# Iterating over the zipped coordinate columns yields the same values as
# `df.apply(..., axis=1)` but avoids building one Series object per row,
# which dominates the runtime on a frame with hundreds of thousands of rows.
# Trips with missing end coordinates get NaN.
df['distance'] = [
    calcular_distancia(lat_ini, lon_ini, lat_fin, lon_fin)
    for lat_ini, lon_ini, lat_fin, lon_fin in zip(
        df['start_lat'], df['start_lng'], df['end_lat'], df['end_lng']
    )
]

In [21]:
# Preview the first rows again to check the newly added `distance` column.
df.head(n=5)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,distance
0,8FE8F7D9C10E88C7,electric_bike,2023-04-02 08:37:28,2023-04-02 08:41:37,,,,,41.8,-87.6,41.79,-87.6,member,1.111949
1,34E4ED3ADF1D821B,electric_bike,2023-04-19 11:29:02,2023-04-19 11:52:12,,,,,41.87,-87.65,41.93,-87.68,member,7.118733
2,5296BF07A2F77CB5,electric_bike,2023-04-19 08:41:22,2023-04-19 08:43:22,,,,,41.93,-87.66,41.93,-87.66,member,0.0
3,40759916B76D5D52,electric_bike,2023-04-19 13:31:30,2023-04-19 13:35:09,,,,,41.92,-87.65,41.91,-87.65,member,1.111949
4,77A96F460101AC63,electric_bike,2023-04-19 12:05:36,2023-04-19 12:10:26,,,,,41.91,-87.65,41.91,-87.63,member,1.655014


In [31]:
# Average trip distance for each bike type.
(
    df
    .groupby(by='rideable_type')['distance']
    .mean()
)

rideable_type
classic_bike     1.934538
docked_bike      2.129310
electric_bike    2.098000
Name: distance, dtype: float64

En el promedio de distancia según tipo de vehículo vemos que el tipo con mayor distancia es docked_bike, que es también el que menos cantidad de registros tiene. La diferencia no es significativa, pero aun así es llamativa dada la poca cantidad de este tipo de vehículo en comparación con los otros.

In [32]:
# Average trip distance for each rider type (member vs. casual).
(
    df
    .groupby(by='member_casual')['distance']
    .mean()
)

member_casual
casual    2.037176
member    2.031733
Name: distance, dtype: float64

In [33]:
# Save the table, including the computed `distance` column, for later use.
# index=False leaves out the auto-generated row index, which would otherwise be
# written as an unnamed first column and come back as `Unnamed: 0` on reload.
df.to_csv('cyclistic.csv', index=False)

In [34]:
# Number of trips that started at each station; rows without a start station
# id are left out by groupby's default handling of missing keys.
(
    df
    .groupby(by='start_station_id')['ride_id']
    .count()
)

start_station_id
021320           121
1011               7
1018               2
1027               2
1031               1
                ... 
chargingstx07    194
chargingstx1     921
chargingstx3      64
chargingstx4     666
chargingstx5     113
Name: ride_id, Length: 1032, dtype: int64