In [1]:
import os
import re
import ast
import typing
import requests
import icalendar
import dask.dataframe as dd
import glob
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import matplotlib as mpl
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
from datetime import date, datetime
from icalendar import Calendar, Event, vCalAddress, vText

import xgboost as xgb
from xgboost import plot_importance

from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder

%matplotlib inline

# Seed NumPy's global random generator so random steps repeat across runs
np.random.seed(31415)

# Default to wide, short figures and show every DataFrame column when displaying frames
sns.set(rc={'figure.figsize':(15,3)})
pd.set_option('display.max_columns', None)

# NOTE(review): this silences every warning for the whole session, including
# deprecation notices from the libraries used below
import warnings
warnings.filterwarnings('ignore')

# Inject CSS so elements with the `.container` class take 90% of the page width
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



# Loading Data

In [2]:
# Disabled one-off download step, kept for provenance: for each month of 2020-2023 it
# fetches the monthly station archive from the Barcelona open-data portal with wget,
# extracts it with 7z and then deletes the archive.
#i2m = list(zip(range(1,13), ['Gener','Febrer','Marc','Abril','Maig','Juny','Juliol','Agost','Setembre','Octubre','Novembre','Desembre']))
#for year in [2023, 2022, 2021, 2020]:
    #for month, month_name in i2m:        
        #os.system(f"wget 'https://opendata-ajuntament.barcelona.cat/resources/bcn/BicingBCN/{year}_{month:02d}_{month_name}_BicingNou_ESTACIONS.7z'")
        #os.system(f"7z x '{year}_{month:02d}_{month_name}_BicingNou_ESTACIONS.7z'")
        #os.system(f"rm '{year}_{month:02d}_{month_name}_BicingNou_ESTACIONS.7z'")

In [3]:
# Load a single monthly file (March 2020) into pandas for a first look at the schema
df_2020_03 = pd.read_csv('./data/2020_03_Marc_BicingNou_ESTACIONS.csv')

In [4]:
# Preview the first rows of the single-month sample
df_2020_03.head()

Unnamed: 0,station_id,num_bikes_available,num_bikes_available_types.mechanical,num_bikes_available_types.ebike,num_docks_available,is_installed,is_renting,is_returning,last_reported,is_charging_station,status,last_updated,ttl
0,1,24,23,1,16,1,1,1,1583017000.0,True,IN_SERVICE,1583017224,25
1,2,5,4,1,19,1,1,1,1583017000.0,True,IN_SERVICE,1583017224,25
2,3,6,6,0,18,1,1,1,1583017000.0,True,IN_SERVICE,1583017224,25
3,4,4,3,1,15,1,1,1,1583017000.0,True,IN_SERVICE,1583017224,25
4,5,14,14,0,25,1,1,1,1583017000.0,True,IN_SERVICE,1583017224,25


In [5]:
# (rows, columns) of the single-month sample
df_2020_03.shape

(2408419, 13)

In [6]:
# Column dtypes and memory usage of the single-month sample
df_2020_03.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2408419 entries, 0 to 2408418
Data columns (total 13 columns):
 #   Column                                Dtype  
---  ------                                -----  
 0   station_id                            int64  
 1   num_bikes_available                   int64  
 2   num_bikes_available_types.mechanical  int64  
 3   num_bikes_available_types.ebike       int64  
 4   num_docks_available                   int64  
 5   is_installed                          int64  
 6   is_renting                            int64  
 7   is_returning                          int64  
 8   last_reported                         float64
 9   is_charging_station                   bool   
 10  status                                object 
 11  last_updated                          int64  
 12  ttl                                   int64  
dtypes: bool(1), float64(1), int64(10), object(1)
memory usage: 222.8+ MB


In [7]:
# Count the missing values in each column of the single-month sample
missing_val = df_2020_03.isna().sum()

In [8]:
# Show the per-column missing-value counts computed above
print(missing_val)

station_id                              0
num_bikes_available                     0
num_bikes_available_types.mechanical    0
num_bikes_available_types.ebike         0
num_docks_available                     0
is_installed                            0
is_renting                              0
is_returning                            0
last_reported                           0
is_charging_station                     0
status                                  0
last_updated                            0
ttl                                     0
dtype: int64


In [9]:
# Glob pattern matching every monthly station-status CSV under ./data
file_pattern = './data/202*_*_*_BicingNou_ESTACIONS.csv'

# Numeric columns are read as float64 so that missing values can be represented as NaN
float_columns = [
    'station_id',
    'num_bikes_available',
    'num_bikes_available_types.mechanical',
    'num_bikes_available_types.ebike',
    'num_docks_available',
    'is_installed',
    'is_renting',
    'is_returning',
    'last_reported',
    'last_updated',
    'ttl',
]
column_dtypes = {name: 'float64' for name in float_columns}
column_dtypes['status'] = 'object'
# Read as object (not bool) so that NA values in this column can be handled
column_dtypes['is_charging_station'] = 'object'

# One lazy Dask DataFrame per file, in sorted filename order; each call gets its own dtype dict
dfs = [
    dd.read_csv(csv_path, dtype=dict(column_dtypes), assume_missing=True)
    for csv_path in sorted(glob.glob(file_pattern))
]

# Stack the monthly frames row-wise, discard the per-file index and keep the result in memory.
# Note: Dask resets the index within each partition, so index labels restart at 0 per
# partition rather than being unique across the whole frame.
df_concatenated = (
    dd.concat(dfs, axis=0)
    .reset_index(drop=True)
    .persist()
)

In [10]:
# The row count of a Dask DataFrame is lazy and has to be computed explicitly;
# the column count is available directly
shape = df_concatenated.shape
num_rows, num_columns = shape[0].compute(), shape[1]

print("Number of rows:", num_rows)
print("Number of columns:", num_columns)

Number of rows: 200973778
Number of columns: 15


In [11]:
# Count missing values per column across the concatenated frame (.compute() runs the Dask graph)
missing_values = df_concatenated.isna().sum().compute()

In [12]:
# Heading and per-column counts, each on its own line
print("Missing values per column:", missing_values, sep="\n")

Missing values per column:
station_id                                   3765
num_bikes_available                          3765
num_bikes_available_types.mechanical         3765
num_bikes_available_types.ebike              3765
num_docks_available                          3765
is_installed                                 3765
is_renting                                   3765
is_returning                                 3765
last_reported                                3765
is_charging_station                          3765
status                                       3765
last_updated                                 3765
ttl                                          3765
traffic                                 200138821
V1                                      200973778
dtype: int64


Remove the rows in which every variable is missing (i.e. rows that are entirely empty); rows with at least one recorded value are kept.

In [13]:
# Discard rows in which every column is missing, then drop the old index and keep the
# result in memory. Dask resets the index within each partition, so index labels restart
# at 0 per partition rather than being unique across the whole frame.
df_cleaned = (
    df_concatenated
    .dropna(how='all')
    .reset_index(drop=True)
    .persist()
)

# The row count is lazy and must be computed; the column count is available directly
shape = df_cleaned.shape
num_rows, num_columns = shape[0].compute(), shape[1]

print("Number of rows:", num_rows)
print("Number of columns:", num_columns)

Number of rows: 200970013
Number of columns: 15


In [14]:
# Recount missing values per column after dropping the all-empty rows
missing_values = df_cleaned.isna().sum().compute()
print("Missing values per column:", missing_values, sep="\n")

Missing values per column:
station_id                                      0
num_bikes_available                             0
num_bikes_available_types.mechanical            0
num_bikes_available_types.ebike                 0
num_docks_available                             0
is_installed                                    0
is_renting                                      0
is_returning                                    0
last_reported                                   0
is_charging_station                             0
status                                          0
last_updated                                    0
ttl                                             0
traffic                                 200135056
V1                                      200970013
dtype: int64


In [18]:
# Define the custom functions
def get_datetime(milliseconds: float) -> datetime:
    """Convert a Unix timestamp into a naive ``datetime`` in the machine's local timezone.

    Args:
        milliseconds: Epoch timestamp. Despite the parameter name (kept so existing
            keyword callers keep working), ``datetime.fromtimestamp`` interprets the
            value as *seconds* since the epoch; floats are accepted.

    Returns:
        The corresponding local-time ``datetime`` with no tzinfo attached.
    """
    return datetime.fromtimestamp(milliseconds)

def create_date_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame with calendar columns derived from ``last_reported``.

    The input frame is left untouched, so the function is safe to re-run on the same
    frame and to pass to ``map_partitions``.

    Args:
        df: Frame with a ``last_reported`` column of epoch timestamps in seconds.

    Returns:
        A copy of ``df`` with ``date``, ``month``, ``year``, ``hour`` and ``day``
        appended, in that order. All values are expressed in the local timezone of
        the machine running the code, because they come from ``get_datetime``.
    """
    # Convert once and derive every calendar field from the same Series
    stamps = pd.to_datetime(df['last_reported'].apply(get_datetime))
    # assign() builds a new DataFrame instead of writing columns into the caller's object
    return df.assign(
        date=stamps,
        month=stamps.dt.month,
        year=stamps.dt.year,
        hour=stamps.dt.hour,
        day=stamps.dt.day,
    )

In [24]:
# Build an empty frame with the same columns *and dtypes* as df_cleaned, then run it
# through create_date_df so Dask receives accurate metadata for the transformed output.
# An empty frame built from column names alone would declare every raw column as object
# dtype, which does not match the float64 columns the partitions actually hold.
sample_df = create_date_df(pd.DataFrame({
    column: pd.Series(dtype=dtype)
    for column, dtype in df_cleaned.dtypes.items()
}))

In [None]:
# Add the calendar columns partition by partition (sample_df describes the output schema),
# drop the old index and keep the result in memory. Dask resets the index within each
# partition, so index labels restart at 0 per partition rather than being globally unique.
df_transformed = (
    df_cleaned
    .map_partitions(create_date_df, meta=sample_df)
    .reset_index(drop=True)
    .persist()
)

# The row count is lazy and must be computed; the column count is available directly
shape = df_transformed.shape
num_rows, num_columns = shape[0].compute(), shape[1]

print("Number of rows:", num_rows)
print("Number of columns:", num_columns)