# Pre processing information

In [12]:
%pip install dask



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [13]:
import pandas as pd
def clean_is_installed(value):
    if isinstance(value, str):
        value = value.lower()
        if value in ['true', 'false']:
            return 1 if value == 'true' else 0
    try:
        return float(value)
    except ValueError:
        return None


def to_bool(x):
    if pd.isna(x):
        return False
    if isinstance(x, str):
        return x.lower() in ['true', '1']
    return bool(x)


def clean_is_renting_returning(value):
    if value in ['0', '1']:
        return int(value)
    return None


def clean_num_docks_available(value):
    try:
        return float(value)
    except ValueError:
        return None


def cleanColumns(df):
    # Convert 'is_installed' to boolean using a custom function
    df['is_installed'] = df['is_installed'].map_partitions(
        lambda s: s.map(clean_is_installed))

    # Convert 'is_renting' and 'is_returning' to boolean using a custom function
    df['is_renting'] = df['is_renting'].map_partitions(
        lambda s: s.map(clean_is_renting_returning))
    df['is_returning'] = df['is_returning'].map_partitions(
        lambda s: s.map(clean_is_renting_returning))

    # Convert 'num_docks_available' to float using a custom function
    df['num_docks_available'] = df['num_docks_available'].map_partitions(
        lambda s: s.map(clean_num_docks_available))

    # Convert 'is_charging_station' to boolean using a custom function
    df['is_charging_station'] = df['is_charging_station'].map_partitions(
        lambda s: s.map(to_bool))
    
    return df

### Pre-process the bicing station information

In [14]:
import dask.dataframe as dd
import pandas as pd
import glob
from datetime import  datetime

# Define the path to the directory containing the CSV files
csv_files = glob.glob('bicing_data/*.csv')

# Read all files into a single Dask DataFrame, treating all columns as strings initially
df = dd.read_csv(csv_files, assume_missing=True, dtype=str)


# last_updated column has unecessary information
columns_to_drop = ['last_updated']
df = df.drop(columns=columns_to_drop)


# Get all unique columns across all CSV files
all_columns = set(df.columns)
# Ensure all columns are present in the DataFrame
for col in all_columns:
    if col not in df.columns:
        df[col] = None





df = cleanColumns(df)




# Convert columns to their appropriate data types
df = df.astype({
    'is_charging_station': 'Int64', 
    'is_installed': 'Int64',
    'is_renting': 'Int64',
    'is_returning': 'Int64',
    'num_bikes_available': 'Int64',
    'num_bikes_available_types.ebike': 'Int64',
    'num_bikes_available_types.mechanical': 'Int64',
    'num_docks_available': 'Int64',
    'station_id': 'Int64',
    'status': 'object',
    'ttl': 'float64'
})


df = df.persist()

### Add day, month, hour and year columns to our df

In [15]:

# Function to convert Unix timestamp to datetime
def convert_unix_to_datetime(s):
    # Convert to numeric, coercing errors to NaN
    s = pd.to_numeric(s, errors='coerce')
    # Convert numeric values to datetime, coercing errors
    return pd.to_datetime(s, unit='s', errors='coerce')

def define_last_reported_column(df):
   # Apply the conversion function to the last_reported column
    df['last_reported'] = df['last_reported'].map_partitions(
    convert_unix_to_datetime, meta=('last_reported', 'datetime64[ns]'))

    return df

# Function to extract datetime components
def extract_datetime_components(df):
    df['year'] = df['last_reported'].dt.year
    df['month'] = df['last_reported'].dt.month
    df['day'] = df['last_reported'].dt.day
    df['hour'] = df['last_reported'].dt.hour
    return df


df = define_last_reported_column(df)

df.head()
df = df.map_partitions(extract_datetime_components)

df.head()


Unnamed: 0,station_id,num_bikes_available,num_bikes_available_types.mechanical,num_bikes_available_types.ebike,num_docks_available,is_installed,is_renting,is_returning,last_reported,is_charging_station,status,ttl,year,month,day,hour
0,1,27,25,2,17,1,1,1,2020-01-31 22:59:52,1,IN_SERVICE,21.0,2020,1,31,22
1,2,0,0,0,26,1,1,1,2020-01-31 22:56:48,1,IN_SERVICE,21.0,2020,1,31,22
2,3,25,25,0,1,1,1,1,2020-01-31 22:56:06,1,IN_SERVICE,21.0,2020,1,31,22
3,4,8,8,0,10,1,1,1,2020-01-31 22:58:09,1,IN_SERVICE,21.0,2020,1,31,22
4,5,34,33,1,4,1,1,1,2020-01-31 22:56:46,1,IN_SERVICE,21.0,2020,1,31,22


### Add capacity column by mergin the station information from the Informacio_Estacions_Bicing.csv

In [16]:
def merge_capacity_data(df):
    # Define the path to the capacity data file
    capacity_file_path = 'estaciones/Informacio_Estacions_Bicing.csv'

    # Read the capacity data
    capacity_df = dd.read_csv(capacity_file_path, dtype={
                              'station_id': 'Int64', 'capacity': 'Int64'})
    

    capacity_df = capacity_df[capacity_df['capacity'].notnull()].compute()
    # Merge the main DataFrame with the capacity DataFrame on 'station_id'
    df = df.merge(capacity_df, on='station_id', how='left')

    # Filter rows where 'capacity' is not null
    df = df[df['capacity'].notnull()]

    # Convert 'capacity' to float for any subsequent operations
    df['capacity'] = df['capacity'].astype(float)

    return df


df = merge_capacity_data(df)

df.persist()

df.head()

Unnamed: 0,station_id,num_bikes_available,num_bikes_available_types.mechanical,num_bikes_available_types.ebike,num_docks_available,is_installed,is_renting,is_returning,last_reported,is_charging_station_x,...,lon,altitude,address,post_code,capacity,is_charging_station_y,nearby_distance,_ride_code_support,rental_uris,cross_street
0,1,27,25,2,17,1,1,1,2020-01-31 22:59:52,1,...,2.180107,16.0,"GRAN VIA CORTS CATALANES, 760",8013.0,45.0,True,1000.0,True,,
1,2,0,0,0,26,1,1,1,2020-01-31 22:56:48,1,...,2.177198,17.0,"C/ ROGER DE FLOR, 126",8013.0,29.0,True,1000.0,True,,
2,3,25,25,0,1,1,1,1,2020-01-31 22:56:06,1,...,2.181331,11.0,"C/ NÀPOLS, 82",8013.0,27.0,True,1000.0,True,,
3,4,8,8,0,10,1,1,1,2020-01-31 22:58:09,1,...,2.181248,8.0,"C/ RIBES, 13",8013.0,21.0,True,1000.0,True,,
4,5,34,33,1,4,1,1,1,2020-01-31 22:56:46,1,...,2.180176,7.0,"PG. LLUIS COMPANYS, 11 (ARC TRIOMF)",8018.0,39.0,True,1000.0,True,,


### Add percentage dock availability column

In [17]:
import numpy as np


def add_percentage_docks_available(df):
    # Ensure both columns are in float format and handle missing values
    df['num_docks_available'] = df['num_docks_available'].astype(float)

    # Compute the percentage of docks available
    df['percentage_docks_available'] = df['num_docks_available'] / df['capacity']

    # Handle division by zero or missing values by replacing them with zero
    df['percentage_docks_available'] = df['percentage_docks_available'].fillna(
        0)
    df['percentage_docks_available'] = df['percentage_docks_available'].replace(
        [np.inf, -np.inf], 0)

    return df



df = add_percentage_docks_available(df)

df = df.persist()


### Group by station and date. Calculate mean value

In [18]:
df = df.drop_duplicates()

df.dropna()

  # Select only the necessary columns for now
def get_selected_columns(df):
    my_var = ['station_id', 'year', 'month', 'day', 'hour', 'percentage_docks_available']
    df_2 = df[my_var]

    df_2.head()
    return df_2


def groupby_station_id_and_date(df):
    # Group by 'station_id' and compute the mean of the 'percentage_docks_available' column
    df_2 = df.groupby(['station_id', 'year', 'month', 'day', 'hour']).mean(numeric_only=True).reset_index()

    return df_2

In [19]:

df = get_selected_columns(df)
df_grouped_by = groupby_station_id_and_date(df)

df_grouped_by = df_grouped_by.persist()
df_grouped_by.head()

Unnamed: 0,station_id,year,month,day,hour,percentage_docks_available
0,1,2019,12,31,22,0.511111
1,1,2019,12,31,23,0.492593
2,1,2020,1,1,0,0.459259
3,1,2020,1,1,1,0.394444
4,1,2020,1,1,2,0.346296


In [20]:
df_sorted_values = df_grouped_by.sort_values(
    by=['station_id', 'year', 'month', 'day', 'hour']).reset_index(drop=True)

df_sorted_values.head()

Unnamed: 0,station_id,year,month,day,hour,percentage_docks_available
0,1,2019,12,31,22,0.511111
1,1,2019,12,31,23,0.492593
2,1,2020,1,1,0,0.459259
3,1,2020,1,1,1,0.394444
4,1,2020,1,1,2,0.346296


### Calculate ctx variables




In [21]:
def calculate_context_variables(df):
    # Número de desplazamientos
    max_shift = 4

    # Lista para almacenar las filas con las variables de contexto calculadas
    context_rows = []

    # Iterar sobre cada estación única
    for station_id in df['station_id'].unique():
        # Filtrar el DataFrame por la estación actual
        station_df = df[df['station_id'] ==
                        station_id].copy().reset_index(drop=True)

        # Iterar sobre cada fila del DataFrame de la estación actual
        for i in range(max_shift, len(station_df), max_shift + 1):
            current_row = station_df.iloc[i]
            context_values = []

            # Iterar sobre los desplazamientos para calcular las variables de contexto
            for shift in range(1, max_shift + 1):
                context_index = i - shift

                # Verificar si el índice calculado está dentro del rango del DataFrame de la estación actual
                if 0 <= context_index < len(station_df):
                    context_value = station_df.iloc[context_index]['percentage_docks_available']
                    context_values.append(context_value)
                else:
                    context_values.append(None)

            # Agregar los datos de la fila actual y las variables de contexto a la lista
            context_row = list(current_row) + context_values
            context_rows.append(context_row)

    # Convertir la lista de filas con variables de contexto en un DataFrame
    df_ctx = pd.DataFrame(context_rows, columns=list(
        df.columns) + [f'ctx-{shift}' for shift in range(1, max_shift + 1)])

    return df_ctx

In [24]:
pd_sorted_values = df_sorted_values.compute()

df_ctx = calculate_context_variables(pd_sorted_values)


df_ctx.to_csv('data_cleaned/bicing_data_cleaned.csv', index=False)

In [25]:
df_ctx.head()

Unnamed: 0,station_id,year,month,day,hour,percentage_docks_available,ctx-1,ctx-2,ctx-3,ctx-4
0,1.0,2020.0,1.0,1.0,2.0,0.346296,0.394444,0.459259,0.492593,0.511111
1,1.0,2020.0,1.0,1.0,7.0,0.235185,0.233333,0.298148,0.364815,0.283333
2,1.0,2020.0,1.0,1.0,12.0,0.385185,0.337037,0.342593,0.262963,0.248148
3,1.0,2020.0,1.0,1.0,17.0,0.177778,0.211111,0.244444,0.335185,0.381481
4,1.0,2020.0,1.0,1.0,22.0,0.45,0.492593,0.418519,0.209259,0.07963
