# Exploratory and statistical insights

In [70]:
# Read the raw station history, merge the mechanical and electrical bike counts
# into a single 'available' column, parse 'date' into datetimes, and report
# whether any null value is present.

import pandas as pd

column_names = [
    'date',
    'capacity',
    'available_mechanical',
    'available_electrical',
    'station_name',
    'station_geo',
    'operative',
]

# header=None: no row of the file is used as a header, the names above are applied instead.
df = (
    pd.read_csv('data/historique_stations.csv', header=None, names=column_names)
    .assign(
        available=lambda frame: frame['available_mechanical'] + frame['available_electrical'],
        date=lambda frame: pd.to_datetime(frame['date']),
    )
    .drop(columns=['available_mechanical', 'available_electrical'])
)

# Single boolean: True if at least one cell of the refined frame is null.
print(df.isnull().values.any())
display(df)
df.dtypes


False


Unnamed: 0,date,capacity,station_name,station_geo,operative,available
0,2020-11-26 12:59:00+00:00,35,Benjamin Godard - Victor Hugo,"48.86598,2.27572",True,9
1,2020-11-26 12:59:00+00:00,55,André Mazet - Saint-André des Arts,"48.85376,2.33910",True,27
2,2020-11-26 12:59:00+00:00,20,Charonne - Robert et Sonia Delauney,"48.85591,2.39257",True,0
3,2020-11-26 12:59:00+00:00,21,Toudouze - Clauzel,"48.87930,2.33736",True,1
4,2020-11-26 12:59:00+00:00,30,Mairie du 12ème,"48.84086,2.38755",True,4
...,...,...,...,...,...,...
10986725,2021-04-09 14:37:00+00:00,38,Général Michel Bizot - Claude Decaen,"48.83481,2.40093",True,6
10986726,2021-04-09 14:37:00+00:00,20,Ivry - Baudricourt,"48.82470,2.36311",True,3
10986727,2021-04-09 14:37:00+00:00,39,Saint-Mandé - Docteur Arnold Netter,"48.84463,2.40495",True,17
10986728,2021-04-09 14:37:00+00:00,21,Saint-Marcel - Hôpital,"48.83950,2.36099",True,16


date            datetime64[ns, UTC]
capacity                      int64
station_name                 object
station_geo                  object
operative                      bool
available                     int64
dtype: object

## Number of stations and total capacity (highest and lowest)

In [71]:
# Count the distinct stations, then total the per-station capacity twice: once
# with each station's largest recorded capacity and once with its smallest.
# The two totals differ when a station reports more than one capacity value.
#
# A grouped max()/min() yields the same per-station values as sorting the whole
# frame and keeping the first row of every group, without sorting all rows twice.
capacity_by_station = df.groupby('station_name')['capacity']

# dropna=False keeps the same count as len(df['station_name'].unique()).
num_stations = df['station_name'].nunique(dropna=False)
highest_capacity = capacity_by_station.max().sum()
lowest_capacity = capacity_by_station.min().sum()

print('Number of stations :')
print(num_stations)

print('Highest total capacity :')
print(highest_capacity)

print('Lowest total capacity :')
print(lowest_capacity)


Number of stations :
1399
Highest total capacity :
44037
Lowest total capacity :
43864


## Should we ignore some stations? Check for capacity = 0, inoperative stations, unresponsive stations, ...

### Capacity of 0 (probably bugs) : 

In [74]:
# Number of rows with a capacity of 0 for each station, smallest count first.
display(
    df.loc[df['capacity'].eq(0)]
    .groupby('station_name')
    .size()
    .sort_values()
)

station_name
Hôpital Européen Georges Pompidou          2
Jean Jaurès - Paul Lafargue                2
Arago - Paul Lafargue                    680
Chabanais - Petits Champs               1154
Université Paris Dauphine               3884
Parc Floral du Bel Air                  3978
Berthier - Porte de Clichy              7866
Champs de Manoeuvre - Cartoucherie      7866
Longchamp - Suresnes                    7866
Quai de la Gare - Pont de Bercy         7866
Saint-Michel - Luxembourg               7866
Verdun - Pierre et Marie Curie          7866
Victor Massé - Jean-Baptiste Pigalle    7866
dtype: int64

### Capacity of 0 but with bikes available (definitely bugs) :

In [75]:
# Per-station count of rows that report bikes available although capacity is 0.
display(
    df.loc[df['capacity'].eq(0) & df['available'].gt(0)]
    .groupby('station_name')
    .size()
    .sort_values()
)

station_name
Chabanais - Petits Champs    31
dtype: int64

In [None]:
# How many distinct snapshot dates are there, and which one is the most recent?
# The distinct dates are kept in date_arr1 to investigate the sampling frequency.

date_arr1 = df['date'].unique()
num_date, latest = len(date_arr1), date_arr1.max()

# One value per line, same as printing each item separately.
print("Number of different dates :", num_date, "Latest date :", latest, sep="\n")

# Number of stations that have a record for every one of those dates.
display(
    df.groupby('station_name')['date']
    .nunique()
    .eq(num_date)
    .sum()
)

# The 50 stations with the most rows flagged as not operative ('operative' is boolean).
display(
    df.loc[~df['operative']]
    .groupby('station_name')
    .size()
    .sort_values()
    .tail(50)
)

In [50]:
import numpy as np

# Gaps between consecutive distinct snapshot dates: the array without its first
# element minus the array without its last element (matching lengths).
#
# The distinct dates are recomputed from df instead of slicing the existing
# date_arr1, because this cell rebinds date_arr1: slicing it in place dropped
# one more element on every re-run of the cell.
# NOTE(review): the gaps are only meaningful if df rows are in chronological
# order — confirm.

unique_dates = df['date'].unique()
date_arr2 = unique_dates[:-1]
date_arr1 = unique_dates[1:]
granularity = date_arr1 - date_arr2

# Five largest gaps, then five smallest gaps.
print(sorted(granularity, reverse = True)[:5])
print(sorted(granularity)[:5])

#Time granularity is uneven, we should resample and bin the data

# Names of the stations flagged as not operative at least once.
# .unique has to be called: without the parentheses inop_df was bound to the
# method object rather than to the array of names.
inop_df = df[df['operative'] == False]['station_name'].unique()
#display(inop_df)

Number of different dates :
7866


station_name
 Jean Bleuzen - Square du 11 Novembre    7866
 Place Léon Gambetta                     7866
11 Novembre 1918 - 8 Mai 1945            7866
18 juin 1940 - Buzenval                  7866
8 Mai 1945 - 10 Juillet 1940             7866
                                         ... 
Youri Gagarine - Commune de Paris        7866
Youri Gagarine - Karl Marx               7866
station formation alfortville               5
Édouard Vaillant - Place Jean Jaurès     7866
Île de la Jatte                          7866
Name: date, Length: 1399, dtype: int64

[Timedelta('3 days 19:46:00'), Timedelta('0 days 03:41:00'), Timedelta('0 days 03:24:00'), Timedelta('0 days 02:15:00'), Timedelta('0 days 02:05:00')]
[Timedelta('0 days 00:01:00'), Timedelta('0 days 00:01:00'), Timedelta('0 days 00:01:00'), Timedelta('0 days 00:03:00'), Timedelta('0 days 00:03:00')]


Unnamed: 0,date,capacity,available_mechanical,available_electrical,station_name,station_geo,operative
1393,2020-11-26 12:59:00+00:00,20,1,3,Ivry - Baudricourt,"48.82470,2.36311",True
2790,2020-11-26 13:06:00+00:00,20,2,4,Ivry - Baudricourt,"48.82470,2.36311",True
4187,2020-11-26 13:21:00+00:00,20,1,3,Ivry - Baudricourt,"48.82470,2.36311",True
5584,2020-11-26 13:32:00+00:00,20,0,3,Ivry - Baudricourt,"48.82470,2.36311",True
6981,2020-11-26 13:47:00+00:00,20,0,3,Ivry - Baudricourt,"48.82470,2.36311",True
...,...,...,...,...,...,...,...
10981132,2021-04-09 13:44:00+00:00,20,0,1,Ivry - Baudricourt,"48.82470,2.36311",True
10982530,2021-04-09 13:53:00+00:00,20,0,1,Ivry - Baudricourt,"48.82470,2.36311",True
10983928,2021-04-09 14:04:00+00:00,20,0,3,Ivry - Baudricourt,"48.82470,2.36311",True
10985327,2021-04-09 14:25:00+00:00,20,2,1,Ivry - Baudricourt,"48.82470,2.36311",True


In [65]:
# Show every row whose reported capacity is 0.
display(df.loc[df['capacity'].eq(0)])

Unnamed: 0,date,capacity,available_mechanical,available_electrical,station_name,station_geo,operative
206,2020-11-26 12:59:00+00:00,0,0,0,Verdun - Pierre et Marie Curie,"48.81274,2.37079",False
262,2020-11-26 12:59:00+00:00,0,0,0,Chabanais - Petits Champs,"48.86728,2.33673",False
379,2020-11-26 12:59:00+00:00,0,0,0,Longchamp - Suresnes,"48.86301,2.24083",False
860,2020-11-26 12:59:00+00:00,0,0,0,Quai de la Gare - Pont de Bercy,"48.83731,2.37409",False
964,2020-11-26 12:59:00+00:00,0,0,0,Berthier - Porte de Clichy,"48.89430,2.31256",False
...,...,...,...,...,...,...,...
10986296,2021-04-09 14:37:00+00:00,0,0,0,Berthier - Porte de Clichy,"48.89430,2.31256",False
10986466,2021-04-09 14:37:00+00:00,0,0,0,Saint-Michel - Luxembourg,"48.84500,2.33979",False
10986532,2021-04-09 14:37:00+00:00,0,0,0,Champs de Manoeuvre - Cartoucherie,"48.83407,2.44561",False
10986540,2021-04-09 14:37:00+00:00,0,0,0,Jean Jaurès - Paul Lafargue,"48.88501,2.24857",False
