# Python, Pandas et Traitement d'un dataset
---



*Le jeu de données à analyser se trouve à l'adresse suivante : (https://divvy-tripdata.s3.amazonaws.com/index.html). Il y a un fichier par mois, pour un total de 12 fichiers. Chaque fichier contient 13 colonnes avec des types de données variés. Nous allons fusionner les fichiers en un seul et le nommer 'combined_data'.*

In [136]:
# Importation des bibliothèques
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import datetime
import requests
import zipfile
import io
import os
from math import radians, sin, cos, sqrt, atan2

In [137]:
####### Importation des datasets depuis le serveur AWS dédiéc (A EXECUTER UNE SEULE FOIS !!!!)

# Liste des noms de fichiers
file_names = [
    "202101-divvy-tripdata.zip",
    "202102-divvy-tripdata.zip",
    "202103-divvy-tripdata.zip",
    "202104-divvy-tripdata.zip",
    "202105-divvy-tripdata.zip",
    "202106-divvy-tripdata.zip",
    "202107-divvy-tripdata.zip",
    "202108-divvy-tripdata.zip",
    "202109-divvy-tripdata.zip",
    "202110-divvy-tripdata.zip",
    "202111-divvy-tripdata.zip",
    "202112-divvy-tripdata.zip"
]

# URL de base
base_url = "https://divvy-tripdata.s3.amazonaws.com/"

# Télécharger chaque fichier
for file_name in file_names:

    url = base_url + file_name
    response = requests.get(url)

    # Extraire le fichier Zip dans le dossier local "wild_divvy_data"
    with zipfile.ZipFile(io.BytesIO(response.content)) as the_zip:
      the_zip.extractall("wild_divvy_data")


In [138]:
folder_path = 'wild_divvy_data'

# Obtenir une liste de tous les fichiers CSV dans le dossier local
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

# Initialiser un DataFrame vide pour stocker les données combinées
combined_data = pd.DataFrame()

# Parcourir chaque fichier CSV et concaténer ses données au DataFrame combiné
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    data = pd.read_csv(file_path)
    combined_data = pd.concat([combined_data, data], ignore_index=True)

# Afficher les données combinées
combined_data

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,E19E6F1B8D4C42ED,electric_bike,2021-01-23 16:14:19,2021-01-23 16:24:44,California Ave & Cortez St,17660,,,41.900341,-87.696743,41.890000,-87.720000,member
1,DC88F20C2C55F27F,electric_bike,2021-01-27 18:43:08,2021-01-27 18:47:12,California Ave & Cortez St,17660,,,41.900333,-87.696707,41.900000,-87.690000,member
2,EC45C94683FE3F27,electric_bike,2021-01-21 22:35:54,2021-01-21 22:37:14,California Ave & Cortez St,17660,,,41.900313,-87.696643,41.900000,-87.700000,member
3,4FA453A75AE377DB,electric_bike,2021-01-07 13:31:13,2021-01-07 13:42:55,California Ave & Cortez St,17660,,,41.900399,-87.696662,41.920000,-87.690000,member
4,BE5E8EB4E7263A0B,electric_bike,2021-01-23 02:24:02,2021-01-23 02:24:45,California Ave & Cortez St,17660,,,41.900326,-87.696697,41.900000,-87.700000,casual
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5595058,847431F3D5353AB7,electric_bike,2021-12-12 13:36:55,2021-12-12 13:56:08,Canal St & Madison St,13341,,,41.882289,-87.639752,41.890000,-87.610000,casual
5595059,CF407BBC3B9FAD63,electric_bike,2021-12-06 19:37:50,2021-12-06 19:44:51,Canal St & Madison St,13341,Kingsbury St & Kinzie St,KA1503000043,41.882123,-87.640053,41.889106,-87.638862,member
5595060,60BB69EBF5440E92,electric_bike,2021-12-02 08:57:04,2021-12-02 09:05:21,Canal St & Madison St,13341,Dearborn St & Monroe St,TA1305000006,41.881956,-87.639955,41.880254,-87.629603,member
5595061,C414F654A28635B8,electric_bike,2021-12-13 09:00:26,2021-12-13 09:14:39,Lawndale Ave & 16th St,362.0,,,41.860000,-87.720000,41.850000,-87.710000,member


1. Afficher les types des colonnes

In [139]:
combined_data.dtypes.to_frame()

Unnamed: 0,0
ride_id,object
rideable_type,object
started_at,object
ended_at,object
start_station_name,object
start_station_id,object
end_station_name,object
end_station_id,object
start_lat,float64
start_lng,float64


2. Afficher le nombre de valeurs manquantes par colonne

In [140]:
combined_data.isna().sum().to_frame()

Unnamed: 0,0
ride_id,0
rideable_type,0
started_at,0
ended_at,0
start_station_name,690809
start_station_id,690806
end_station_name,739170
end_station_id,739170
start_lat,0
start_lng,0


3. Identifier et compter les lignes dupliquées pour toutes les colonnes

In [40]:
print(combined_data.duplicated().sum())

0


## Traitement

*Commentaire :*

4. Créer une nouvelle dataframe en supprimant tous les trajets avec des valeurs manquantes et afficher le nombre de lignes et de colonnes de la nouvelle dataframe

In [158]:
df_divvy = combined_data.dropna().sort_values(by='started_at').reset_index(drop=True)

df_divvy.shape

(4588302, 13)

- Je regarde la taille du dataset

In [156]:
df_divvy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4588302 entries, 0 to 4588301
Data columns (total 13 columns):
 #   Column              Dtype  
---  ------              -----  
 0   ride_id             object 
 1   rideable_type       object 
 2   started_at          object 
 3   ended_at            object 
 4   start_station_name  object 
 5   start_station_id    object 
 6   end_station_name    object 
 7   end_station_id      object 
 8   start_lat           float64
 9   start_lng           float64
 10  end_lat             float64
 11  end_lng             float64
 12  member_casual       object 
dtypes: float64(4), object(9)
memory usage: 455.1+ MB


In [157]:
# POSSIBILITE OU NON DE CHANGER LE FORMAT DES VARIABLES NUMERIQUES POUR DIMINUER LA TAILLE DU FICHIER 
df_divvy[list(df_divvy.select_dtypes('number').columns)] = df_divvy[list(df_divvy.select_dtypes('number').columns)].astype('float32')

display(df_divvy.info(),
        df_divvy)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4588302 entries, 0 to 4588301
Data columns (total 13 columns):
 #   Column              Dtype  
---  ------              -----  
 0   ride_id             object 
 1   rideable_type       object 
 2   started_at          object 
 3   ended_at            object 
 4   start_station_name  object 
 5   start_station_id    object 
 6   end_station_name    object 
 7   end_station_id      object 
 8   start_lat           float32
 9   start_lng           float32
 10  end_lat             float32
 11  end_lng             float32
 12  member_casual       object 
dtypes: float32(4), object(9)
memory usage: 385.1+ MB


None

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,0D139A3203274B87,classic_bike,2021-01-01 00:02:24,2021-01-01 00:08:39,State St & 33rd St,13216,MLK Jr Dr & 29th St,TA1307000139,41.834736,-87.625816,41.842052,-87.616997,member
1,C7AE8E9CDB197A8E,classic_bike,2021-01-01 00:06:55,2021-01-01 00:26:36,Lakeview Ave & Fullerton Pkwy,TA1309000019,Ritchie Ct & Banks St,KA1504000134,41.925858,-87.638969,41.906864,-87.626221,member
2,3097EF26414C7016,classic_bike,2021-01-01 00:12:21,2021-01-01 00:12:33,Montrose Harbor,TA1308000012,Montrose Harbor,TA1308000012,41.963982,-87.638184,41.963982,-87.638184,member
3,938D5D1998A5470E,classic_bike,2021-01-01 00:12:27,2021-01-01 00:12:30,Montrose Harbor,TA1308000012,Montrose Harbor,TA1308000012,41.963982,-87.638184,41.963982,-87.638184,casual
4,6604F61AE4B14BC1,electric_bike,2021-01-01 00:12:49,2021-01-01 00:43:59,Western Ave & Howard St,527,Campbell Ave & Fullerton Ave,15648,42.018860,-87.690025,41.924679,-87.689331,member
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4588297,7B3D0E8AB0842D5E,classic_bike,2021-12-31 23:58:21,2022-01-01 00:46:56,Michigan Ave & Lake St,TA1305000011,Clinton St & Tilden St,13037,41.886024,-87.624115,41.875885,-87.640793,casual
4588298,B5AEBEF3B5F41C77,electric_bike,2021-12-31 23:58:45,2022-01-01 00:47:07,Michigan Ave & Lake St,TA1305000011,Clinton St & Tilden St,13037,41.885994,-87.624779,41.875885,-87.640793,casual
4588299,CE1BE016BCE85CCB,electric_bike,2021-12-31 23:59:27,2022-01-01 00:32:34,Clark St & Ida B Wells Dr,TA1305000009,Clark St & Ida B Wells Dr,TA1305000009,41.875832,-87.631348,41.875931,-87.630585,casual
4588300,B0466FF51982DE4B,electric_bike,2021-12-31 23:59:39,2022-01-01 00:21:08,Millennium Park,13008,Michigan Ave & 14th St,TA1307000124,41.881130,-87.624046,41.864059,-87.623726,member


## Colonnes supplémentaires et transformation des données

Nous ajouterons la distance parcourue en kilomètres à partir des latitudes et longitudes de départ et d'arrivée données. Nous utiliserons la formule de Haversine.

In [143]:
def haversine(lat1, lon1, lat2, lon2):
# Rayon de la Terre en kilomètres
    R = 6371.0

    # Convertir la latitude et la longitude de degrés en radians
    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])

    # Calculer les différences entre les latitudes et les longitudes
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    # Formule de Haversine
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    # Calculer la distance
    distance = R * c

    return distance

5. Ajouter une nouvelle colonne "distance_travelled_km" en appliquant la fonction haversine sur tous les trajets.

> On rappelle le prototype de la fonction haversine(lat1, lon1, lat2, lon2)



Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,distance_travelled_km
0,89E7AA6C29227EFF,classic_bike,2021-02-12 16:14:56,2021-02-12 16:21:43,Glenwood Ave & Touhy Ave,525,Sheridan Rd & Columbia Ave,660,42.012701,-87.666058,42.004583,-87.661406,member,0.981104
1,0FEFDE2603568365,classic_bike,2021-02-14 17:52:38,2021-02-14 18:12:09,Glenwood Ave & Touhy Ave,525,Bosworth Ave & Howard St,16806,42.012701,-87.666058,42.019537,-87.669563,casual,0.813412
2,E6159D746B2DBB91,electric_bike,2021-02-09 19:10:18,2021-02-09 19:19:10,Clark St & Lake St,KA1503000012,State St & Randolph St,TA1305000029,41.885795,-87.631101,41.884866,-87.627498,member,0.315636
3,B32D3199F1C2E75B,classic_bike,2021-02-02 17:49:41,2021-02-02 17:54:06,Wood St & Chicago Ave,637,Honore St & Division St,TA1305000034,41.895634,-87.672069,41.903119,-87.673935,member,0.846501
4,83E463F23575F4BF,electric_bike,2021-02-23 15:07:23,2021-02-23 15:22:37,State St & 33rd St,13216,Emerald Ave & 31st St,TA1309000055,41.834733,-87.625827,41.838163,-87.645123,member,1.64346


In [None]:
# METHODE EN PRENANT LES COLONNES SPECIFIQUES
df_divvy['distance_travelled_km'] = df_divvy[['start_lat', 'start_lng', 'end_lat', 'end_lng']].apply(lambda x: haversine(x[0], x[1], x[2], x[3]), axis=1)

df_divvy.head()

In [160]:
# METHODE 2 : EN UTILISANT LE DATASET COMPLET ET EN JOUANT SUR LES LIGNES (PLUS RAPIDE)
df_divvy['distance_travelled_km'] = df_divvy.apply(lambda row: haversine(row['start_lat'], row['start_lng'], row['end_lat'], row['end_lng']), axis=1)

6. Analyser les statistiques concernant la colonne "distance_travelled_km".

Que remarquez vous?

In [161]:
df_divvy['distance_travelled_km'].describe().to_frame()

Unnamed: 0,distance_travelled_km
count,4588302.0
mean,2.128845
std,1.879802
min,0.0
25%,0.9038498
50%,1.619725
75%,2.814214
max,33.80018


*Réponse :*

Les trajets sont essentiellement de courtes distances :
- inférieures à 2kms pour la moitié d'entre eux
- inférieures à 3kms pour les 3/4 des courses.     
On note la présence de course à 0 et un max à 33kms

7. Pour remédier à ce problème, ajouter une nouvelle colonne "ride_duration_s" pour calculer la période du trajet en secondes

In [162]:
df_divvy['ride_duration_s'] = (pd.to_datetime(df_divvy['ended_at']) - pd.to_datetime(df_divvy['started_at'])).dt.total_seconds()

df_divvy.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,distance_travelled_km,ride_duration_s
0,0D139A3203274B87,classic_bike,2021-01-01 00:02:24,2021-01-01 00:08:39,State St & 33rd St,13216,MLK Jr Dr & 29th St,TA1307000139,41.834734,-87.625813,41.842052,-87.617,member,1.093249,375.0
1,C7AE8E9CDB197A8E,classic_bike,2021-01-01 00:06:55,2021-01-01 00:26:36,Lakeview Ave & Fullerton Pkwy,TA1309000019,Ritchie Ct & Banks St,KA1504000134,41.925858,-87.638973,41.906866,-87.626217,member,2.360881,1181.0
2,3097EF26414C7016,classic_bike,2021-01-01 00:12:21,2021-01-01 00:12:33,Montrose Harbor,TA1308000012,Montrose Harbor,TA1308000012,41.963982,-87.638181,41.963982,-87.638181,member,0.0,12.0
3,938D5D1998A5470E,classic_bike,2021-01-01 00:12:27,2021-01-01 00:12:30,Montrose Harbor,TA1308000012,Montrose Harbor,TA1308000012,41.963982,-87.638181,41.963982,-87.638181,casual,0.0,3.0
4,6604F61AE4B14BC1,electric_bike,2021-01-01 00:12:49,2021-01-01 00:43:59,Western Ave & Howard St,527,Campbell Ave & Fullerton Ave,15648,42.018858,-87.690022,41.92468,-87.689328,member,10.472291,1870.0


8. Analyser les statistiques concernant la colonne "ride_duration_s".

Que remarquez vous?

In [163]:
df_divvy['ride_duration_s'].describe().to_frame()

Unnamed: 0,ride_duration_s
count,4588302.0
mean,1308.638
std,11065.5
min,-3354.0
25%,417.0
50%,732.0
75%,1327.0
max,3356649.0


*Réponse :*

On remarque la présence de durée négative.      
Il nous faut voir ce qui se passe sur ces trajets. Y-a t-il une distance parcourue, etc.        
Nous avons également 3 356 649 secondes pour un trajet 

In [164]:
df_divvy[df_divvy['ride_duration_s'] <= 0].sort_values(by='ride_duration_s')

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,distance_travelled_km,ride_duration_s
4225233,FD8AF7324ABAE9DA,electric_bike,2021-11-07 01:56:51,2021-11-07 01:00:57,Clark St & North Ave,13128,Larrabee St & Webster Ave,13193,41.911738,-87.632145,41.921762,-87.644034,casual,1.486670,-3354.0
4225221,508B09A5FB0737DC,classic_bike,2021-11-07 01:54:50,2021-11-07 01:00:45,Sedgwick St & Webster Ave,13191,Sedgwick St & North Ave,TA1307000038,41.922167,-87.638888,41.911386,-87.638677,member,1.198920,-3245.0
4225229,6F9E76F5EDAAC1B8,electric_bike,2021-11-07 01:55:42,2021-11-07 01:01:55,Milwaukee Ave & Wabansia Ave,13243,Western Ave & Division St,13241,41.912580,-87.681424,41.902906,-87.687367,member,1.182794,-3227.0
4225222,7AECC76D1562B51C,classic_bike,2021-11-07 01:54:58,2021-11-07 01:01:29,Sheffield Ave & Wrightwood Ave,TA1309000023,Southport Ave & Wellington Ave,TA1307000006,41.928712,-87.653833,41.935733,-87.663576,casual,1.122079,-3209.0
4225224,CDB307B8494885AD,electric_bike,2021-11-07 01:55:09,2021-11-07 01:02:26,Sedgwick St & Webster Ave,13191,Halsted St & Roscoe St,TA1309000025,41.922190,-87.638957,41.943615,-87.649029,casual,2.523825,-3163.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2287737,A4F0CEEE2CACE722,classic_bike,2021-07-28 12:50:27,2021-07-28 12:50:27,Clark St & Drummond Pl,TA1307000142,Clark St & Drummond Pl,TA1307000142,41.931248,-87.644336,41.931248,-87.644336,member,0.000000,0.0
2319377,412BBDCAF07E4D5D,classic_bike,2021-07-29 18:33:14,2021-07-29 18:33:14,DuSable Lake Shore Dr & North Blvd,LF-005,DuSable Lake Shore Dr & North Blvd,LF-005,41.911722,-87.626804,41.911722,-87.626804,member,0.000000,0.0
2381103,77BD53CEDEE4E7B1,classic_bike,2021-07-31 22:55:54,2021-07-31 22:55:54,Larrabee St & Webster Ave,13193,Larrabee St & Webster Ave,13193,41.921822,-87.644140,41.921822,-87.644140,casual,0.000000,0.0
2102771,CCCF93E09080ADA5,classic_bike,2021-07-20 09:42:58,2021-07-20 09:42:58,Halsted St & Dickens Ave,13192,Halsted St & Dickens Ave,13192,41.919936,-87.648830,41.919936,-87.648830,member,0.000000,0.0


Je note la présence de 198 lignes "problématiques" sur 4,588,302.       
J'opte pour la suppression de ces lignes car elles ne constitue que 0.0043% de mes données

In [151]:
df_divvy.sort_values(by='ride_duration_s', ascending=False)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,distance_travelled_km,ride_duration_s
1157578,F043F0F6A1AA4F85,docked_bike,2021-06-05 02:27:26,2021-07-13 22:51:35,Michigan Ave & Lake St,TA1305000011,Malcolm X College Vaccination Site,631,41.886024,-87.624117,41.877618,-87.673895,casual,4.225656,3356649.0
1154279,7F0578ABF030FC83,docked_bike,2021-06-04 22:03:33,2021-07-13 14:15:14,Streeter Dr & Grand Ave,13022,Base - 2132 W Hubbard Warehouse,Hubbard Bike-checking (LBS-WH-TEST),41.892278,-87.612043,41.889955,-87.680651,casual,5.684911,3341501.0
653325,BDA1217EC8532C7B,docked_bike,2021-05-02 02:56:07,2021-06-08 13:37:43,State St & Van Buren St,TA1305000035,Wells St & Polk St,SL-011,41.877181,-87.627844,41.872596,-87.633502,casual,0.692373,3235296.0
1185860,E6E1E5BFFD72ECEC,docked_bike,2021-06-05 23:33:51,2021-07-12 13:55:14,Clark St & Lake St,KA1503000012,Base - 2132 W Hubbard Warehouse,Hubbard Bike-checking (LBS-WH-TEST),41.886021,-87.630876,41.889955,-87.680651,casual,4.143492,3162083.0
1866111,99415B003B7E7EEE,docked_bike,2021-07-08 19:29:49,2021-08-11 21:56:58,Kedzie Ave & Milwaukee Ave,13085,Base - 2132 W Hubbard Warehouse,Hubbard Bike-checking (LBS-WH-TEST),41.929567,-87.707857,41.889955,-87.680651,casual,4.946659,2946429.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4225224,CDB307B8494885AD,electric_bike,2021-11-07 01:55:09,2021-11-07 01:02:26,Sedgwick St & Webster Ave,13191,Halsted St & Roscoe St,TA1309000025,41.922190,-87.638957,41.943615,-87.649029,casual,2.523825,-3163.0
4225222,7AECC76D1562B51C,classic_bike,2021-11-07 01:54:58,2021-11-07 01:01:29,Sheffield Ave & Wrightwood Ave,TA1309000023,Southport Ave & Wellington Ave,TA1307000006,41.928712,-87.653833,41.935733,-87.663576,casual,1.122079,-3209.0
4225229,6F9E76F5EDAAC1B8,electric_bike,2021-11-07 01:55:42,2021-11-07 01:01:55,Milwaukee Ave & Wabansia Ave,13243,Western Ave & Division St,13241,41.912580,-87.681424,41.902906,-87.687367,member,1.182794,-3227.0
4225221,508B09A5FB0737DC,classic_bike,2021-11-07 01:54:50,2021-11-07 01:00:45,Sedgwick St & Webster Ave,13191,Sedgwick St & North Ave,TA1307000038,41.922167,-87.638888,41.911386,-87.638677,member,1.198920,-3245.0


9. Supprimer les trajets de moins d'une minute

In [57]:
df_divvy[df_divvy['ride_duration_s'] < 60]

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,distance_travelled_km,ride_duration_s
2,3097EF26414C7016,classic_bike,2021-01-01 00:12:21,2021-01-01 00:12:33,Montrose Harbor,TA1308000012,Montrose Harbor,TA1308000012,41.963982,-87.638181,41.963982,-87.638181,member,0.000000,12.0
3,938D5D1998A5470E,classic_bike,2021-01-01 00:12:27,2021-01-01 00:12:30,Montrose Harbor,TA1308000012,Montrose Harbor,TA1308000012,41.963982,-87.638181,41.963982,-87.638181,casual,0.000000,3.0
87,455B2BA0C5839C65,classic_bike,2021-01-01 02:29:49,2021-01-01 02:30:21,McClurg Ct & Erie St,KA1503000041,McClurg Ct & Erie St,KA1503000041,41.894503,-87.617854,41.894503,-87.617854,casual,0.000000,32.0
174,4E0B310844B50762,classic_bike,2021-01-01 06:26:58,2021-01-01 06:27:08,Wilton Ave & Belmont Ave,TA1307000134,Wilton Ave & Belmont Ave,TA1307000134,41.940180,-87.653040,41.940180,-87.653040,member,0.000000,10.0
302,A087592F66310E85,classic_bike,2021-01-01 09:19:01,2021-01-01 09:19:24,Lake Shore Dr & Belmont Ave,TA1309000049,Lake Shore Dr & Belmont Ave,TA1309000049,41.940775,-87.639192,41.940775,-87.639192,member,0.000000,23.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4587913,813BB5E66E46A301,electric_bike,2021-12-31 20:51:25,2021-12-31 20:51:52,Meade Ave & Diversey Ave,353,Meade Ave & Diversey Ave,353,41.930000,-87.780000,41.930000,-87.780000,casual,0.000000,27.0
4588036,38F19ADE4902F470,classic_bike,2021-12-31 21:48:19,2021-12-31 21:49:03,Kedzie Ave & Palmer Ct,13292,Kedzie Ave & Palmer Ct,13292,41.921525,-87.707322,41.921525,-87.707322,member,0.000000,44.0
4588086,345651F70D60B7C9,electric_bike,2021-12-31 22:17:02,2021-12-31 22:17:04,Ogden Ave & Chicago Ave,TA1305000020,Ogden Ave & Chicago Ave,TA1305000020,41.896367,-87.654035,41.896362,-87.654061,member,0.002232,2.0
4588225,A064AACB04136862,classic_bike,2021-12-31 23:19:14,2021-12-31 23:19:17,Dusable Harbor,KA1503000064,Dusable Harbor,KA1503000064,41.886976,-87.612813,41.886976,-87.612813,member,0.000000,3.0


La suppression des 59,369 lignes dont la durée est inférieure à la minute, représente 1.09% de mes données.     
Je peux parfaitement le faire sans biaiser mon jeu de données

In [169]:
df_divvy_clean = df_divvy[~(df_divvy['ride_duration_s'] < 60)]

df_divvy_clean.shape

(4528933, 15)

10. Créer une colonne "speed_mph" afin d'analyser la vitesse en calculant la vitesse du km/h vers mph.



Pour répondre à cette question, je dois tout d'abord convertir les secondes en heure (soit une division par 3600).      
Ensuite, je devrai diviser les kilomètres par les heures.       
Pour terminer je devrai faire la conversion des km/h en mph (multiplication par 0,62137119)

In [170]:
def fn_caluclate_speed(km, h):

    return (km / (h/3600)) * 0.62137119

df_divvy_clean['speed_mph'] = df_divvy_clean.apply(lambda row: round(fn_caluclate_speed(row['distance_travelled_km'], row['ride_duration_s']), 3), axis=1)

In [171]:
df_divvy_clean['speed_mph'].describe().to_frame()

Unnamed: 0,speed_mph
count,4528933.0
mean,5.790967
std,2.945405
min,0.0
25%,4.123
50%,6.097
75%,7.738
max,42.232


In [172]:
df_divvy_clean[(df_divvy_clean['speed_mph'] == 0) | (df_divvy_clean['distance_travelled_km'] == 0)]

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,distance_travelled_km,ride_duration_s,speed_mph
14,3A27773729F0B363,classic_bike,2021-01-01 00:35:35,2021-01-01 00:46:02,Wells St & Elm St,KA1504000135,Wells St & Elm St,KA1504000135,41.903222,-87.634324,41.903222,-87.634324,member,0.0,627.0,0.0
77,306B28E16ECBA0DD,classic_bike,2021-01-01 02:18:09,2021-01-01 02:27:33,Clark St & Winnemac Ave,TA1309000035,Clark St & Winnemac Ave,TA1309000035,41.973348,-87.667855,41.973348,-87.667855,casual,0.0,564.0,0.0
80,996F76BCEDFCEE4D,docked_bike,2021-01-01 02:20:32,2021-01-01 02:21:36,LaSalle St & Washington St,13006,LaSalle St & Washington St,13006,41.882664,-87.632530,41.882664,-87.632530,casual,0.0,64.0,0.0
97,A9B22294987DA9D2,docked_bike,2021-01-01 02:47:27,2021-01-01 03:20:03,Michigan Ave & 8th St,623,Michigan Ave & 8th St,623,41.872773,-87.623981,41.872773,-87.623981,casual,0.0,1956.0,0.0
98,CB804831BBB3F7BC,docked_bike,2021-01-01 02:47:41,2021-01-01 03:21:51,Michigan Ave & 8th St,623,Michigan Ave & 8th St,623,41.872773,-87.623981,41.872773,-87.623981,casual,0.0,2050.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4588236,DBB263F59BB66273,classic_bike,2021-12-31 23:24:15,2022-01-01 00:26:55,Field Blvd & South Water St,15534,Field Blvd & South Water St,15534,41.886349,-87.617517,41.886349,-87.617517,casual,0.0,3760.0,0.0
4588239,03A5E32DC1BE7D0E,classic_bike,2021-12-31 23:24:58,2022-01-01 00:26:47,Field Blvd & South Water St,15534,Field Blvd & South Water St,15534,41.886349,-87.617517,41.886349,-87.617517,casual,0.0,3709.0,0.0
4588255,2370278412D49B63,classic_bike,2021-12-31 23:31:39,2021-12-31 23:32:56,Michigan Ave & Ida B Wells Dr,TA1305000010,Michigan Ave & Ida B Wells Dr,TA1305000010,41.876243,-87.624426,41.876243,-87.624426,member,0.0,77.0,0.0
4588290,6007E15F50A14DBF,classic_bike,2021-12-31 23:52:32,2022-01-01 00:08:35,Dearborn St & Monroe St,TA1305000006,Dearborn St & Monroe St,TA1305000006,41.881320,-87.629521,41.881320,-87.629521,casual,0.0,963.0,0.0


In [174]:
df_divvy_clean = df_divvy_clean[(df_divvy_clean['speed_mph'] != 0) | (df_divvy_clean['distance_travelled_km'] != 0)]

df_divvy_clean.shape

(4295746, 16)

11. Supprimer les données avec une vitesse supérieure à 45 km/h (~28 mph, vitesse maximale des vélos aux États-Unis).

In [175]:
# Pour rester précis, c'est 27,9617037 mille/heure

df_divvy_clean[df_divvy_clean['speed_mph'] >= 27.9617037]

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,distance_travelled_km,ride_duration_s,speed_mph
16647,14F41BA143DB5511,electric_bike,2021-01-07 16:42:12,2021-01-07 16:50:04,Canal St & Harrison St,13326,Michigan Ave & Ida B Wells Dr,TA1305000010,41.874228,-87.639593,41.894838,-87.730776,member,7.888718,472.0,37.387
3512008,EF8EDC133E481ACC,classic_bike,2021-09-22 13:50:31,2021-09-22 13:51:31,Lincoln Ave & Belmont Ave,TA1309000042,Lincoln Ave & Diversey Pkwy,TA1307000064,41.939365,-87.668385,41.932225,-87.658617,member,1.132768,60.0,42.232
3649243,172BA8E029A7A91E,classic_bike,2021-09-29 16:18:20,2021-09-29 16:19:29,Wells St & Concord Ln,TA1308000050,Sedgwick St & Webster Ave,13191,41.912133,-87.634656,41.922167,-87.638888,member,1.169387,69.0,37.911


Je vais donc supprimer 3 enregistrements

In [176]:
df_divvy_clean = df_divvy_clean[~(df_divvy_clean['speed_mph'] >= 27.9617037)]

df_divvy_clean.shape

(4295743, 16)

12. Supprimer les colonnes ("ride_id" et "speed_kph") pour un chargement plus rapide

In [177]:
df_divvy_clean.drop(columns='ride_id', inplace=True)

13. * Supprimer les espaces de fin pour les colonnes de type chaîne de caractères.

  * Il y a des données qui sont des trajets de test (avec 'test' inclus dans le nom de la station et l'identifiant de la station). Nous supprimerons également ces données.

In [178]:
# 1. Je vais prendre toutes les colonnes objets et effectuer la suppression des espaces en fin uniquement, comme demandé

for col in list(df_divvy_clean.select_dtypes('object').columns):
    df_divvy_clean[col] = df_divvy_clean[col].apply(lambda x: x.rstrip())

In [180]:
# 2. Je supprime les trajets de test

df_divvy_clean.loc[~((df_divvy_clean['start_station_name'].str.contains('test|TEST')) | 
               (df_divvy_clean['start_station_id'].str.contains('test|TEST'))), :]

df_divvy_clean = df_divvy_clean.loc[~((df_divvy_clean['start_station_name'].str.contains('test|TEST')) | 
                                        (df_divvy_clean['start_station_id'].str.contains('test|TEST'))), :]

In [181]:
# Je reset les index pour avoir plus de faciliter à trouver mes enregistrements pour vérifier mes fonctions suivantes
df_divvy_clean.reset_index(drop=True, inplace=True)

14. Ajouter les catégories suivantes pour découvrir les motifs basés sur le temps :

* day_of_week : Lundi, Mardi, Mercredi, etc.
* day_type : Jour de semaine ou Week-end
* month : Janvier, Février, Mars, etc.
* season : Hiver, Printemps, Été, Automne

    **Créer une fonction pour day_type**

    **Créer une fonction pour season**

In [182]:
# Traitement de day_of_week
df_divvy_clean['day_of_week'] = pd.to_datetime(df_divvy_clean['started_at']).dt.day_name()

# Traitement 'month'
df_divvy_clean['month'] = pd.to_datetime(df_divvy_clean['started_at']).dt.month_name()

df_divvy_clean.head()

Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,distance_travelled_km,ride_duration_s,speed_mph,day_of_week,month
0,classic_bike,2021-01-01 00:02:24,2021-01-01 00:08:39,State St & 33rd St,13216,MLK Jr Dr & 29th St,TA1307000139,41.834734,-87.625813,41.842052,-87.617,member,1.093249,375.0,6.521,Friday,January
1,classic_bike,2021-01-01 00:06:55,2021-01-01 00:26:36,Lakeview Ave & Fullerton Pkwy,TA1309000019,Ritchie Ct & Banks St,KA1504000134,41.925858,-87.638973,41.906866,-87.626217,member,2.360881,1181.0,4.472,Friday,January
2,electric_bike,2021-01-01 00:12:49,2021-01-01 00:43:59,Western Ave & Howard St,527,Campbell Ave & Fullerton Ave,15648,42.018858,-87.690022,41.92468,-87.689328,member,10.472291,1870.0,12.527,Friday,January
3,electric_bike,2021-01-01 00:13:12,2021-01-01 00:33:26,Montrose Harbor,TA1308000012,Lakefront Trail & Bryn Mawr Ave,KA1504000152,41.963901,-87.638211,41.984012,-87.652338,casual,2.522755,1214.0,4.648,Friday,January
4,classic_bike,2021-01-01 00:14:03,2021-01-01 00:33:22,Montrose Harbor,TA1308000012,Lakefront Trail & Bryn Mawr Ave,KA1504000152,41.963982,-87.638181,41.984037,-87.65231,member,2.517371,1159.0,4.859,Friday,January


In [183]:
#Fonction pour 'day_type'
# Je vais d'abord essayer avec une valeur

from datetime import datetime

datetime.strptime(df_divvy_clean['started_at'][868], '%Y-%m-%d %H:%M:%S').weekday()

5

In [184]:
# Création de ma fonction

def fn_attrib_weekday(date):

    from datetime import datetime

    if datetime.strptime(date, '%Y-%m-%d %H:%M:%S').weekday() >= 5: return('Weekend')
    
    else: return('Weekday')

fn_attrib_weekday(df_divvy_clean['started_at'][868])

'Weekend'

In [185]:
# Application de la fonction pour 'day_type'

df_divvy_clean['day_type'] = df_divvy_clean["started_at"].apply(fn_attrib_weekday)

df_divvy_clean.head()

Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,distance_travelled_km,ride_duration_s,speed_mph,day_of_week,month,day_type
0,classic_bike,2021-01-01 00:02:24,2021-01-01 00:08:39,State St & 33rd St,13216,MLK Jr Dr & 29th St,TA1307000139,41.834734,-87.625813,41.842052,-87.617,member,1.093249,375.0,6.521,Friday,January,Weekday
1,classic_bike,2021-01-01 00:06:55,2021-01-01 00:26:36,Lakeview Ave & Fullerton Pkwy,TA1309000019,Ritchie Ct & Banks St,KA1504000134,41.925858,-87.638973,41.906866,-87.626217,member,2.360881,1181.0,4.472,Friday,January,Weekday
2,electric_bike,2021-01-01 00:12:49,2021-01-01 00:43:59,Western Ave & Howard St,527,Campbell Ave & Fullerton Ave,15648,42.018858,-87.690022,41.92468,-87.689328,member,10.472291,1870.0,12.527,Friday,January,Weekday
3,electric_bike,2021-01-01 00:13:12,2021-01-01 00:33:26,Montrose Harbor,TA1308000012,Lakefront Trail & Bryn Mawr Ave,KA1504000152,41.963901,-87.638211,41.984012,-87.652338,casual,2.522755,1214.0,4.648,Friday,January,Weekday
4,classic_bike,2021-01-01 00:14:03,2021-01-01 00:33:22,Montrose Harbor,TA1308000012,Lakefront Trail & Bryn Mawr Ave,KA1504000152,41.963982,-87.638181,41.984037,-87.65231,member,2.517371,1159.0,4.859,Friday,January,Weekday


In [None]:
#Application de la fonction pour 'day_type'


Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,distance_travelled_km,ride_duration_s,day_of_week,month,day_type
0,classic_bike,2021-02-12 16:14:56,2021-02-12 16:21:43,Glenwood Ave & Touhy Ave,525,Sheridan Rd & Columbia Ave,660,42.012701,-87.666058,42.004583,-87.661406,member,0.981104,407.0,Friday,February,Weekday
1,classic_bike,2021-02-14 17:52:38,2021-02-14 18:12:09,Glenwood Ave & Touhy Ave,525,Bosworth Ave & Howard St,16806,42.012701,-87.666058,42.019537,-87.669563,casual,0.813412,1171.0,Sunday,February,Weekend
2,electric_bike,2021-02-09 19:10:18,2021-02-09 19:19:10,Clark St & Lake St,KA1503000012,State St & Randolph St,TA1305000029,41.885795,-87.631101,41.884866,-87.627498,member,0.315636,532.0,Tuesday,February,Weekday
3,classic_bike,2021-02-02 17:49:41,2021-02-02 17:54:06,Wood St & Chicago Ave,637,Honore St & Division St,TA1305000034,41.895634,-87.672069,41.903119,-87.673935,member,0.846501,265.0,Tuesday,February,Weekday
4,electric_bike,2021-02-23 15:07:23,2021-02-23 15:22:37,State St & 33rd St,13216,Emerald Ave & 31st St,TA1309000055,41.834733,-87.625827,41.838163,-87.645123,member,1.64346,914.0,Tuesday,February,Weekday


In [186]:
# Fonction pour 'season'
# Création de ma fonction

def fn_attrib_season(date):

    from datetime import datetime

    if 9 > datetime.strptime(date, '%Y-%m-%d %H:%M:%S').month >= 6: return('Summer')
    
    elif 12 > datetime.strptime(date, '%Y-%m-%d %H:%M:%S').month >= 9: return('Autumn')

    elif (2 >= datetime.strptime(date, '%Y-%m-%d %H:%M:%S').month) or \
         (datetime.strptime(date, '%Y-%m-%d %H:%M:%S').month == 12): return('Winter')

    elif 6 > datetime.strptime(date, '%Y-%m-%d %H:%M:%S').month >= 3: return('Spring')

    else: return('Flag')

fn_attrib_season(df_divvy_clean['started_at'][134560])


'Spring'

In [187]:
# Application de la fonction pour 'season'

df_divvy_clean['season'] = df_divvy_clean['started_at'].apply(fn_attrib_season)

df_divvy_clean.head()

Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,distance_travelled_km,ride_duration_s,speed_mph,day_of_week,month,day_type,season
0,classic_bike,2021-01-01 00:02:24,2021-01-01 00:08:39,State St & 33rd St,13216,MLK Jr Dr & 29th St,TA1307000139,41.834734,-87.625813,41.842052,-87.617,member,1.093249,375.0,6.521,Friday,January,Weekday,Winter
1,classic_bike,2021-01-01 00:06:55,2021-01-01 00:26:36,Lakeview Ave & Fullerton Pkwy,TA1309000019,Ritchie Ct & Banks St,KA1504000134,41.925858,-87.638973,41.906866,-87.626217,member,2.360881,1181.0,4.472,Friday,January,Weekday,Winter
2,electric_bike,2021-01-01 00:12:49,2021-01-01 00:43:59,Western Ave & Howard St,527,Campbell Ave & Fullerton Ave,15648,42.018858,-87.690022,41.92468,-87.689328,member,10.472291,1870.0,12.527,Friday,January,Weekday,Winter
3,electric_bike,2021-01-01 00:13:12,2021-01-01 00:33:26,Montrose Harbor,TA1308000012,Lakefront Trail & Bryn Mawr Ave,KA1504000152,41.963901,-87.638211,41.984012,-87.652338,casual,2.522755,1214.0,4.648,Friday,January,Weekday,Winter
4,classic_bike,2021-01-01 00:14:03,2021-01-01 00:33:22,Montrose Harbor,TA1308000012,Lakefront Trail & Bryn Mawr Ave,KA1504000152,41.963982,-87.638181,41.984037,-87.65231,member,2.517371,1159.0,4.859,Friday,January,Weekday,Winter


In [None]:
#Application de la fonction pour 'season'


Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,distance_travelled_km,ride_duration_s,day_of_week,month,day_type,season
0,classic_bike,2021-02-12 16:14:56,2021-02-12 16:21:43,Glenwood Ave & Touhy Ave,525,Sheridan Rd & Columbia Ave,660,42.012701,-87.666058,42.004583,-87.661406,member,0.981104,407.0,Friday,February,Weekday,Winter
1,classic_bike,2021-02-14 17:52:38,2021-02-14 18:12:09,Glenwood Ave & Touhy Ave,525,Bosworth Ave & Howard St,16806,42.012701,-87.666058,42.019537,-87.669563,casual,0.813412,1171.0,Sunday,February,Weekend,Winter
2,electric_bike,2021-02-09 19:10:18,2021-02-09 19:19:10,Clark St & Lake St,KA1503000012,State St & Randolph St,TA1305000029,41.885795,-87.631101,41.884866,-87.627498,member,0.315636,532.0,Tuesday,February,Weekday,Winter
3,classic_bike,2021-02-02 17:49:41,2021-02-02 17:54:06,Wood St & Chicago Ave,637,Honore St & Division St,TA1305000034,41.895634,-87.672069,41.903119,-87.673935,member,0.846501,265.0,Tuesday,February,Weekday,Winter
4,electric_bike,2021-02-23 15:07:23,2021-02-23 15:22:37,State St & 33rd St,13216,Emerald Ave & 31st St,TA1309000055,41.834733,-87.625827,41.838163,-87.645123,member,1.64346,914.0,Tuesday,February,Weekday,Winter


15. Ajouter la colonne 'route_type' (même 'start_station_name' et 'end_station_name') pour classifier les trajets comme trajet aller-retour ou trajet aller simple.

In [188]:
df_divvy_clean.loc[df_divvy_clean['start_station_name'] == df_divvy_clean['end_station_name'], :]

Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,distance_travelled_km,ride_duration_s,speed_mph,day_of_week,month,day_type,season
36,electric_bike,2021-01-01 01:25:13,2021-01-01 01:41:12,Columbus Dr & Randolph St,13263,Columbus Dr & Randolph St,13263,41.884378,-87.619360,41.884557,-87.619510,casual,0.023385,959.0,0.055,Friday,January,Weekday,Winter
234,electric_bike,2021-01-01 08:13:49,2021-01-01 08:24:34,Sangamon St & Washington Blvd,13409,Sangamon St & Washington Blvd,13409,41.882901,-87.651062,41.883276,-87.650915,member,0.043412,645.0,0.151,Friday,January,Weekday,Winter
259,electric_bike,2021-01-01 08:52:43,2021-01-01 09:24:58,Halsted St & Dickens Ave,13192,Halsted St & Dickens Ave,13192,41.920045,-87.648724,41.919790,-87.648852,member,0.030278,1935.0,0.035,Friday,January,Weekday,Winter
269,electric_bike,2021-01-01 09:06:44,2021-01-01 09:27:10,Dearborn St & Erie St,13045,Dearborn St & Erie St,13045,41.894101,-87.629217,41.894035,-87.629424,member,0.018591,1226.0,0.034,Friday,January,Weekday,Winter
299,electric_bike,2021-01-01 09:59:18,2021-01-01 10:06:31,Dearborn St & Erie St,13045,Dearborn St & Erie St,13045,41.893735,-87.628758,41.893993,-87.629507,member,0.068351,433.0,0.353,Friday,January,Weekday,Winter
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4295487,electric_bike,2021-12-31 23:09:23,2021-12-31 23:15:44,Larrabee St & Oak St,KA1504000116,Larrabee St & Oak St,KA1504000116,41.900181,-87.643033,41.900219,-87.642985,casual,0.005850,381.0,0.034,Friday,December,Weekday,Winter
4295507,electric_bike,2021-12-31 23:18:58,2021-12-31 23:28:07,Halsted St & 51st St,KA1503000010,Halsted St & 51st St,KA1503000010,41.801694,-87.645278,41.801832,-87.645214,casual,0.016152,549.0,0.066,Friday,December,Weekday,Winter
4295564,electric_bike,2021-12-31 23:49:59,2022-01-01 00:15:53,Pine Grove Ave & Irving Park Rd,TA1308000022,Pine Grove Ave & Irving Park Rd,TA1308000022,41.954422,-87.648104,41.954383,-87.648043,casual,0.006664,1554.0,0.010,Friday,December,Weekday,Winter
4295572,electric_bike,2021-12-31 23:56:50,2022-01-01 00:24:40,Federal St & Polk St,SL-008,Federal St & Polk St,SL-008,41.872108,-87.629491,41.872078,-87.629544,casual,0.005556,1670.0,0.007,Friday,December,Weekday,Winter


In [189]:
#Fonction for 'route_type'

def fn_attrib_route_type(row):

    if row['start_station_name'] == row['end_station_name']: return 'One-way trip'

    else: return 'Round trip'

fn_attrib_route_type(df_divvy_clean.iloc[12])

'Round trip'

In [190]:
df_divvy_clean['route_type'] = df_divvy_clean.apply(fn_attrib_route_type, axis=1)

df_divvy_clean.head()

Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,distance_travelled_km,ride_duration_s,speed_mph,day_of_week,month,day_type,season,route_type
0,classic_bike,2021-01-01 00:02:24,2021-01-01 00:08:39,State St & 33rd St,13216,MLK Jr Dr & 29th St,TA1307000139,41.834734,-87.625813,41.842052,-87.617,member,1.093249,375.0,6.521,Friday,January,Weekday,Winter,Round trip
1,classic_bike,2021-01-01 00:06:55,2021-01-01 00:26:36,Lakeview Ave & Fullerton Pkwy,TA1309000019,Ritchie Ct & Banks St,KA1504000134,41.925858,-87.638973,41.906866,-87.626217,member,2.360881,1181.0,4.472,Friday,January,Weekday,Winter,Round trip
2,electric_bike,2021-01-01 00:12:49,2021-01-01 00:43:59,Western Ave & Howard St,527,Campbell Ave & Fullerton Ave,15648,42.018858,-87.690022,41.92468,-87.689328,member,10.472291,1870.0,12.527,Friday,January,Weekday,Winter,Round trip
3,electric_bike,2021-01-01 00:13:12,2021-01-01 00:33:26,Montrose Harbor,TA1308000012,Lakefront Trail & Bryn Mawr Ave,KA1504000152,41.963901,-87.638211,41.984012,-87.652338,casual,2.522755,1214.0,4.648,Friday,January,Weekday,Winter,Round trip
4,classic_bike,2021-01-01 00:14:03,2021-01-01 00:33:22,Montrose Harbor,TA1308000012,Lakefront Trail & Bryn Mawr Ave,KA1504000152,41.963982,-87.638181,41.984037,-87.65231,member,2.517371,1159.0,4.859,Friday,January,Weekday,Winter,Round trip


In [None]:
#Application 'route_type'


Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,distance_travelled_km,ride_duration_s,day_of_week,month,day_type,season,route_type
0,classic_bike,2021-02-12 16:14:56,2021-02-12 16:21:43,Glenwood Ave & Touhy Ave,525,Sheridan Rd & Columbia Ave,660,42.012701,-87.666058,42.004583,-87.661406,member,0.981104,407.0,Friday,February,Weekday,Winter,One-way trip
1,classic_bike,2021-02-14 17:52:38,2021-02-14 18:12:09,Glenwood Ave & Touhy Ave,525,Bosworth Ave & Howard St,16806,42.012701,-87.666058,42.019537,-87.669563,casual,0.813412,1171.0,Sunday,February,Weekend,Winter,One-way trip
2,electric_bike,2021-02-09 19:10:18,2021-02-09 19:19:10,Clark St & Lake St,KA1503000012,State St & Randolph St,TA1305000029,41.885795,-87.631101,41.884866,-87.627498,member,0.315636,532.0,Tuesday,February,Weekday,Winter,One-way trip
3,classic_bike,2021-02-02 17:49:41,2021-02-02 17:54:06,Wood St & Chicago Ave,637,Honore St & Division St,TA1305000034,41.895634,-87.672069,41.903119,-87.673935,member,0.846501,265.0,Tuesday,February,Weekday,Winter,One-way trip
4,electric_bike,2021-02-23 15:07:23,2021-02-23 15:22:37,State St & 33rd St,13216,Emerald Ave & 31st St,TA1309000055,41.834733,-87.625827,41.838163,-87.645123,member,1.64346,914.0,Tuesday,February,Weekday,Winter,One-way trip


16. Ajouter la colonne "ride_duration_min" qui représente la durée du trajet en minutes.

In [191]:
df_divvy_clean['ride_duration_min'] = round(df_divvy_clean['ride_duration_s'] / 60, 3)

df_divvy_clean.head()

Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,...,member_casual,distance_travelled_km,ride_duration_s,speed_mph,day_of_week,month,day_type,season,route_type,ride_duration_min
0,classic_bike,2021-01-01 00:02:24,2021-01-01 00:08:39,State St & 33rd St,13216,MLK Jr Dr & 29th St,TA1307000139,41.834734,-87.625813,41.842052,...,member,1.093249,375.0,6.521,Friday,January,Weekday,Winter,Round trip,6.25
1,classic_bike,2021-01-01 00:06:55,2021-01-01 00:26:36,Lakeview Ave & Fullerton Pkwy,TA1309000019,Ritchie Ct & Banks St,KA1504000134,41.925858,-87.638973,41.906866,...,member,2.360881,1181.0,4.472,Friday,January,Weekday,Winter,Round trip,19.683
2,electric_bike,2021-01-01 00:12:49,2021-01-01 00:43:59,Western Ave & Howard St,527,Campbell Ave & Fullerton Ave,15648,42.018858,-87.690022,41.92468,...,member,10.472291,1870.0,12.527,Friday,January,Weekday,Winter,Round trip,31.167
3,electric_bike,2021-01-01 00:13:12,2021-01-01 00:33:26,Montrose Harbor,TA1308000012,Lakefront Trail & Bryn Mawr Ave,KA1504000152,41.963901,-87.638211,41.984012,...,casual,2.522755,1214.0,4.648,Friday,January,Weekday,Winter,Round trip,20.233
4,classic_bike,2021-01-01 00:14:03,2021-01-01 00:33:22,Montrose Harbor,TA1308000012,Lakefront Trail & Bryn Mawr Ave,KA1504000152,41.963982,-87.638181,41.984037,...,member,2.517371,1159.0,4.859,Friday,January,Weekday,Winter,Round trip,19.317


17. Renommer tes colonnes pour qu'elles soient pretes pour la dataViz

In [192]:
columns_renamed = {
                    'rideable_type' : 'bike_type',
                    'member_casual' : 'user_type',
                    'started_at' : 'start_time',
                    'ended_at' : 'end_time' 
                }

df_divvy_clean.rename(columns=columns_renamed, inplace=True)

df_divvy_clean.head()

Unnamed: 0,bike_type,start_time,end_time,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,...,user_type,distance_travelled_km,ride_duration_s,speed_mph,day_of_week,month,day_type,season,route_type,ride_duration_min
0,classic_bike,2021-01-01 00:02:24,2021-01-01 00:08:39,State St & 33rd St,13216,MLK Jr Dr & 29th St,TA1307000139,41.834734,-87.625813,41.842052,...,member,1.093249,375.0,6.521,Friday,January,Weekday,Winter,Round trip,6.25
1,classic_bike,2021-01-01 00:06:55,2021-01-01 00:26:36,Lakeview Ave & Fullerton Pkwy,TA1309000019,Ritchie Ct & Banks St,KA1504000134,41.925858,-87.638973,41.906866,...,member,2.360881,1181.0,4.472,Friday,January,Weekday,Winter,Round trip,19.683
2,electric_bike,2021-01-01 00:12:49,2021-01-01 00:43:59,Western Ave & Howard St,527,Campbell Ave & Fullerton Ave,15648,42.018858,-87.690022,41.92468,...,member,10.472291,1870.0,12.527,Friday,January,Weekday,Winter,Round trip,31.167
3,electric_bike,2021-01-01 00:13:12,2021-01-01 00:33:26,Montrose Harbor,TA1308000012,Lakefront Trail & Bryn Mawr Ave,KA1504000152,41.963901,-87.638211,41.984012,...,casual,2.522755,1214.0,4.648,Friday,January,Weekday,Winter,Round trip,20.233
4,classic_bike,2021-01-01 00:14:03,2021-01-01 00:33:22,Montrose Harbor,TA1308000012,Lakefront Trail & Bryn Mawr Ave,KA1504000152,41.963982,-87.638181,41.984037,...,member,2.517371,1159.0,4.859,Friday,January,Weekday,Winter,Round trip,19.317


18. Vérifier les caractéristiques des dataframes avant l'export du dataset nettoyé

In [193]:
df_divvy_final = df_divvy_clean[['bike_type', 'user_type', 'start_time', 'end_time',
                                 'day_of_week',	'day_type',	'month', 'season', 'start_station_name',
                                 'end_station_name', 'route_type', 'start_lat',	'start_lng',
                                'end_lat', 'end_lng', 'distance_travelled_km', 'ride_duration_s', 'ride_duration_min']]

df_divvy_final

Unnamed: 0,bike_type,user_type,start_time,end_time,day_of_week,day_type,month,season,start_station_name,end_station_name,route_type,start_lat,start_lng,end_lat,end_lng,distance_travelled_km,ride_duration_s,ride_duration_min
0,classic_bike,member,2021-01-01 00:02:24,2021-01-01 00:08:39,Friday,Weekday,January,Winter,State St & 33rd St,MLK Jr Dr & 29th St,Round trip,41.834734,-87.625813,41.842052,-87.617000,1.093249,375.0,6.250
1,classic_bike,member,2021-01-01 00:06:55,2021-01-01 00:26:36,Friday,Weekday,January,Winter,Lakeview Ave & Fullerton Pkwy,Ritchie Ct & Banks St,Round trip,41.925858,-87.638973,41.906866,-87.626217,2.360881,1181.0,19.683
2,electric_bike,member,2021-01-01 00:12:49,2021-01-01 00:43:59,Friday,Weekday,January,Winter,Western Ave & Howard St,Campbell Ave & Fullerton Ave,Round trip,42.018858,-87.690022,41.924680,-87.689328,10.472291,1870.0,31.167
3,electric_bike,casual,2021-01-01 00:13:12,2021-01-01 00:33:26,Friday,Weekday,January,Winter,Montrose Harbor,Lakefront Trail & Bryn Mawr Ave,Round trip,41.963901,-87.638211,41.984012,-87.652338,2.522755,1214.0,20.233
4,classic_bike,member,2021-01-01 00:14:03,2021-01-01 00:33:22,Friday,Weekday,January,Winter,Montrose Harbor,Lakefront Trail & Bryn Mawr Ave,Round trip,41.963982,-87.638181,41.984037,-87.652310,2.517371,1159.0,19.317
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4295574,classic_bike,casual,2021-12-31 23:58:21,2022-01-01 00:46:56,Friday,Weekday,December,Winter,Michigan Ave & Lake St,Clinton St & Tilden St,Round trip,41.886024,-87.624117,41.875885,-87.640795,1.782554,2915.0,48.583
4295575,electric_bike,casual,2021-12-31 23:58:45,2022-01-01 00:47:07,Friday,Weekday,December,Winter,Michigan Ave & Lake St,Clinton St & Tilden St,Round trip,41.885992,-87.624782,41.875885,-87.640795,1.737988,2902.0,48.367
4295576,electric_bike,casual,2021-12-31 23:59:27,2022-01-01 00:32:34,Friday,Weekday,December,Winter,Clark St & Ida B Wells Dr,Clark St & Ida B Wells Dr,One-way trip,41.875831,-87.631348,41.875933,-87.630585,0.064204,1987.0,33.117
4295577,electric_bike,member,2021-12-31 23:59:39,2022-01-01 00:21:08,Friday,Weekday,December,Winter,Millennium Park,Michigan Ave & 14th St,Round trip,41.881130,-87.624046,41.864059,-87.623727,1.898363,1289.0,21.483
