# Pre processing information

### Pre-process the bicing station information

In [48]:
import dask.dataframe as dd
import pandas as pd
import glob
from datetime import  datetime

# Define the path to the directory containing the CSV files
csv_files = glob.glob('../data/*.csv')

# Read all files into a single Dask DataFrame, treating all columns as strings initially
df = dd.read_csv(csv_files, usecols=['station_id', 'num_docks_available', 'num_bikes_available', 'last_reported'],
                 assume_missing=True, dtype={'last_reported': 'float64'})

In [49]:
column_types = df.dtypes
print(column_types)

station_id             float64
num_bikes_available    float64
num_docks_available    float64
last_reported          float64
dtype: object


Let's keep for now just the variables I will use to extract the % of docks available and context variables:

In [50]:
df.head()

Unnamed: 0,station_id,num_bikes_available,num_docks_available,last_reported
0,1.0,9.0,35.0,1590962000.0
1,2.0,22.0,3.0,1590962000.0
2,3.0,12.0,15.0,1590962000.0
3,4.0,9.0,12.0,1590962000.0
4,5.0,31.0,7.0,1590962000.0


In [51]:
# Get all unique columns across all CSV files
all_columns = set(df.columns)
# Ensure all columns are present in the DataFrame
for col in all_columns:
    if col not in df.columns:
        df[col] = None

In [52]:
df.head()

Unnamed: 0,station_id,num_bikes_available,num_docks_available,last_reported
0,1.0,9.0,35.0,1590962000.0
1,2.0,22.0,3.0,1590962000.0
2,3.0,12.0,15.0,1590962000.0
3,4.0,9.0,12.0,1590962000.0
4,5.0,31.0,7.0,1590962000.0


In [53]:
df.shape

(Delayed('int-1ab60aaf-8374-4212-b795-ea6a81572917'), 4)

In [54]:
# Eliminate NaNs
df = df.dropna().persist()

# Eliminate last_reported = 0.0 because it ends up being 01/01/1970
df = df[df['last_reported'] != 0.0].persist()

# Display the first few rows of the cleaned DataFrame
df.head()

  df = reader(bio, **kwargs)


Unnamed: 0,station_id,num_bikes_available,num_docks_available,last_reported
0,1.0,9.0,35.0,1590962000.0
1,2.0,22.0,3.0,1590962000.0
2,3.0,12.0,15.0,1590962000.0
3,4.0,9.0,12.0,1590962000.0
4,5.0,31.0,7.0,1590962000.0


In [55]:
df.shape

(Delayed('int-d21e56f1-2049-43a1-8e47-5815b752f343'), 4)

In [56]:
df_kaggle_metadata = pd.read_csv('../metadata_sample_submission_2024.csv')
df_kaggle_metadata.head()

Unnamed: 0,index,station_id,month,day,hour,ctx-4,ctx-3,ctx-2,ctx-1
0,0,1,1,1,5,0.781481,0.677778,0.696296,0.75
1,1,1,1,1,10,0.737374,0.711111,0.711111,0.731624
2,2,1,1,1,15,0.827778,0.896296,0.901852,0.883333
3,3,1,1,1,20,0.825926,0.874074,0.927778,0.918519
4,4,2,1,1,3,0.592593,0.341954,0.275862,0.54023


In [57]:
unique_station_ids = df_kaggle_metadata['station_id'].unique()

In [58]:
df_kaggle_metadata['station_id'].nunique()

399

In [59]:
# Filter the Dask DataFrame to keep rows with station IDs present in unique_station_ids
df = df[df['station_id'].isin(unique_station_ids)].persist()

In [60]:
df.head()

Unnamed: 0,station_id,num_bikes_available,num_docks_available,last_reported
0,1.0,9.0,35.0,1590962000.0
1,2.0,22.0,3.0,1590962000.0
2,3.0,12.0,15.0,1590962000.0
3,4.0,9.0,12.0,1590962000.0
4,5.0,31.0,7.0,1590962000.0


In [61]:
df.shape

(Delayed('int-a40a1cb8-389b-4ba5-a3c5-9bbbfb1a58bc'), 4)

In [62]:
df['station_id'].nunique().compute()

399

In [63]:
df.shape

(Delayed('int-f54d1e12-a5d5-4cfd-a7d7-a910e84ea183'), 4)

In [64]:
# Check for missing values in the entire DataFrame
missing_values_count = df.isna().sum().compute()

# Display the count of missing values in each column
print(missing_values_count)

station_id             0
num_bikes_available    0
num_docks_available    0
last_reported          0
dtype: int64


### Add day, month, hour and year columns to our df

In [65]:
# Function to convert Unix timestamp to datetime
def convert_unix_to_datetime(s):
    # Convert to numeric, coercing errors to NaN
    s = pd.to_numeric(s, errors='coerce')
    # Convert numeric values to datetime, coercing errors
    return pd.to_datetime(s, unit='s', errors='coerce')

def define_last_reported_column(df):
   # Apply the conversion function to the last_reported column
    df['last_reported'] = df['last_reported'].map_partitions(
    convert_unix_to_datetime, meta=('last_reported', 'datetime64[ns]'))

    return df

# Function to extract datetime components
def extract_datetime_components(df):
    df['year'] = df['last_reported'].dt.year
    df['month'] = df['last_reported'].dt.month
    df['day'] = df['last_reported'].dt.day
    df['hour'] = df['last_reported'].dt.hour
    return df


df = define_last_reported_column(df)
df = df.map_partitions(extract_datetime_components)
df.head()

Unnamed: 0,station_id,num_bikes_available,num_docks_available,last_reported,year,month,day,hour
0,1.0,9.0,35.0,2020-05-31 21:58:55,2020,5,31,21
1,2.0,22.0,3.0,2020-05-31 21:56:31,2020,5,31,21
2,3.0,12.0,15.0,2020-05-31 22:00:04,2020,5,31,22
3,4.0,9.0,12.0,2020-05-31 22:00:10,2020,5,31,22
4,5.0,31.0,7.0,2020-05-31 21:59:53,2020,5,31,21


In [66]:
df.shape

(Delayed('int-fce7fe8b-8d62-4aec-baa8-e78ee0339c4d'), 8)

### Add capacity column by mergin the station information from the Informacio_Estacions_Bicing.csv

In [67]:
def merge_capacity_data(df):
    # Define the path to the capacity data file
    capacity_file_path = '../Informacio_Estacions_Bicing.csv'

    # Read the capacity data
    capacity_df = dd.read_csv(capacity_file_path, usecols=['station_id', 'capacity'], dtype={
                              'station_id': 'float64', 'capacity': 'float64'})
    

    # capacity_df = capacity_df[capacity_df['capacity'].notnull()].compute() quito esto para que no vaya tan lento
    # Merge the main DataFrame with the capacity DataFrame on 'station_id'
    df = df.merge(capacity_df, on='station_id', how='left')

    # Filter rows where 'capacity' is not null
    df = df[df['capacity'].notnull()]

    # Convert 'capacity' to float for any subsequent operations
    df['capacity'] = df['capacity'].astype(float)

    return df

In [68]:
df = merge_capacity_data(df).persist()
df.head()

Unnamed: 0,station_id,num_bikes_available,num_docks_available,last_reported,year,month,day,hour,capacity
0,1.0,9.0,35.0,2020-05-31 21:58:55,2020,5,31,21,45.0
1,2.0,22.0,3.0,2020-05-31 21:56:31,2020,5,31,21,29.0
2,3.0,12.0,15.0,2020-05-31 22:00:04,2020,5,31,22,27.0
3,4.0,9.0,12.0,2020-05-31 22:00:10,2020,5,31,22,21.0
4,5.0,31.0,7.0,2020-05-31 21:59:53,2020,5,31,21,39.0


In [69]:
df.shape

(Delayed('int-e712b0e5-39b5-4a7d-9374-f75fb14e6b2d'), 9)

### Add percentage dock availability column

In [70]:
import numpy as np


def add_percentage_docks_available(df):
    # Ensure both columns are in float format and handle missing values
    df['num_docks_available'] = df['num_docks_available'].astype(float)

    # Compute the percentage of docks available
    df['percentage_docks_available'] = df['num_docks_available'] / df['capacity']

    # Handle division by zero or missing values by replacing them with zero
    df['percentage_docks_available'] = df['percentage_docks_available'].fillna(
        0)
    df['percentage_docks_available'] = df['percentage_docks_available'].replace(
        [np.inf, -np.inf], 0)

    return df



df = add_percentage_docks_available(df).persist()

In [71]:
df.head()

Unnamed: 0,station_id,num_bikes_available,num_docks_available,last_reported,year,month,day,hour,capacity,percentage_docks_available
0,1.0,9.0,35.0,2020-05-31 21:58:55,2020,5,31,21,45.0,0.777778
1,2.0,22.0,3.0,2020-05-31 21:56:31,2020,5,31,21,29.0,0.103448
2,3.0,12.0,15.0,2020-05-31 22:00:04,2020,5,31,22,27.0,0.555556
3,4.0,9.0,12.0,2020-05-31 22:00:10,2020,5,31,22,21.0,0.571429
4,5.0,31.0,7.0,2020-05-31 21:59:53,2020,5,31,21,39.0,0.179487


In [72]:
df.shape

(Delayed('int-f5c6bdb1-a498-42b7-ad9d-d769e9e33bdf'), 10)

### Group by station and date. Calculate mean value

In [73]:
# Select only the necessary columns for now
def get_selected_columns(df):
    my_var = ['station_id', 'year', 'month', 'day', 'hour', 'percentage_docks_available']
    df_2 = df[my_var]

    df_2.head()
    return df_2


def groupby_station_id_and_date(df):
    # Group by 'station_id' and compute the mean of the 'percentage_docks_available' column
    df_2 = df.groupby(['station_id', 'year', 'month', 'day', 'hour']).mean(numeric_only=True).reset_index()

    return df_2

In [74]:
df_selected_columns = get_selected_columns(df).persist()
df_grouped_by = groupby_station_id_and_date(df_selected_columns).persist()

df_grouped_by.head()

Unnamed: 0,station_id,year,month,day,hour,percentage_docks_available
0,1.0,2020,5,31,21,0.809259
1,1.0,2020,5,31,22,0.806061
2,1.0,2020,5,31,23,0.798291
3,1.0,2020,6,1,0,0.805556
4,1.0,2020,6,1,1,0.8


In [75]:
df_grouped_by.shape

(Delayed('int-10168609-7324-4dba-8028-402ab10909f2'), 6)

In [76]:
df_sorted_values = df_grouped_by.sort_values(
    by=['station_id', 'year', 'month', 'day', 'hour']).reset_index(drop=True).compute().reset_index(drop=True)

df_sorted_values.head()

Unnamed: 0,station_id,year,month,day,hour,percentage_docks_available
0,1.0,1970,1,1,0,35559710.0
1,1.0,2019,12,31,22,0.5111111
2,1.0,2019,12,31,23,0.4925926
3,1.0,2020,1,1,0,0.4592593
4,1.0,2020,1,1,1,0.3944444


In [77]:
df_sorted_values.shape

(5408731, 6)

### Calculate ctx variables




In [78]:
def calculate_context_variables(df):
    # Número de desplazamientos
    max_shift = 4

    # Lista para almacenar las filas con las variables de contexto calculadas
    context_rows = []

    # Iterar sobre cada estación única
    for station_id in df['station_id'].unique():
        # Filtrar el DataFrame por la estación actual
        station_df = df[df['station_id'] ==
                        station_id].copy().reset_index(drop=True)

        # Iterar sobre cada fila del DataFrame de la estación actual
        for i in range(max_shift, len(station_df), max_shift + 1):
            current_row = station_df.iloc[i]
            context_values = []

            # Iterar sobre los desplazamientos para calcular las variables de contexto
            for shift in range(1, max_shift + 1):
                context_index = i - shift

                # Verificar si el índice calculado está dentro del rango del DataFrame de la estación actual
                if 0 <= context_index < len(station_df):
                    context_value = station_df.iloc[context_index]['percentage_docks_available']
                    context_values.append(context_value)
                else:
                    context_values.append(None)

            # Agregar los datos de la fila actual y las variables de contexto a la lista
            context_row = list(current_row) + context_values
            context_rows.append(context_row)

    # Convertir la lista de filas con variables de contexto en un DataFrame
    df_ctx = pd.DataFrame(context_rows, columns=list(
        df.columns) + [f'ctx-{shift}' for shift in range(1, max_shift + 1)])

    return df_ctx

In [79]:
df_ctx = calculate_context_variables(df_sorted_values)

df_ctx.to_csv('../bicing_data_cleaned_nur.csv', index=False)

In [80]:
df_ctx.head()

Unnamed: 0,station_id,year,month,day,hour,percentage_docks_available,ctx-1,ctx-2,ctx-3,ctx-4
0,1.0,2020.0,1.0,1.0,1.0,0.394444,0.459259,0.492593,0.511111,35559710.0
1,1.0,2020.0,1.0,1.0,6.0,0.233333,0.298148,0.364815,0.283333,0.3462963
2,1.0,2020.0,1.0,1.0,11.0,0.337037,0.342593,0.262963,0.248148,0.2351852
3,1.0,2020.0,1.0,1.0,16.0,0.211111,0.244444,0.335185,0.381481,0.3851852
4,1.0,2020.0,1.0,1.0,21.0,0.492593,0.418519,0.209259,0.07963,0.1777778


In [81]:
df_ctx.shape

(1081584, 10)

In [82]:
#shere = pd.read_csv('../shere/all_mean_hour_ctx.csv')
#shere.head()

Unnamed: 0,station_id,Year,Month,Day,Hour,availability_percentage,ctx-1,ctx-2,ctx-3,ctx-4
0,1.0,2020.0,1.0,1.0,2.0,34.62963,39.444444,45.925926,49.259259,51.111111
1,1.0,2020.0,1.0,1.0,7.0,23.518519,23.333333,29.814815,36.481481,28.333333
2,1.0,2020.0,1.0,1.0,12.0,38.518519,33.703704,34.259259,26.296296,24.814815
3,1.0,2020.0,1.0,1.0,17.0,17.777778,21.111111,24.444444,33.518519,38.148148
4,1.0,2020.0,1.0,1.0,22.0,45.0,49.259259,41.851852,20.925926,7.962963


In [83]:
#shere.shape

(3275489, 10)