In [None]:
!pip install dask


In [None]:
import dask.dataframe as dd
import pandas as pd
import glob

# Collect the CSV files to load. glob returns paths in arbitrary,
# filesystem-dependent order, so sort them to keep the partition order (and
# therefore the output of head()) reproducible between runs.
csv_files = sorted(glob.glob('bicing_data/*.csv'))

# Read all files into a single Dask DataFrame. Every column is read as a
# string (dtype=str); cleaning and casting happen explicitly further down.
df = dd.read_csv(csv_files, assume_missing=True, dtype=str)

# Column names of the combined DataFrame.
# NOTE(review): these are the columns dask reports for the collection as a
# whole. Checking them against df.columns again can never find a missing
# column, so no per-column back-filling is done here. If individual files can
# have differing headers, they need to be reconciled while reading --
# TODO confirm against the actual data.
all_columns = set(df.columns)


def parse_dates(df):
    """Convert the timestamp columns of ``df`` to datetimes.

    The columns ``last_updated`` and ``last_reported`` are replaced by the
    result of ``dd.to_datetime(..., errors='coerce')``. The frame passed in
    is modified and also returned.
    """
    date_columns = ('last_updated', 'last_reported')
    for name in date_columns:
        parsed = dd.to_datetime(df[name], errors='coerce')
        df[name] = parsed
    return df


def clean_is_installed(value):
    """Normalize a raw ``is_installed`` value to a number.

    Args:
        value: Raw cell value. Strings are compared case-insensitively
            against ``'true'``/``'false'``; anything else is passed to
            ``float()``.

    Returns:
        ``1`` for ``'true'``, ``0`` for ``'false'``, otherwise the ``float``
        conversion of ``value``. ``None`` is returned when the value cannot
        be converted (unparseable string, ``None``, or any other object that
        ``float()`` rejects).
    """
    if isinstance(value, str):
        value = value.lower()
        if value == 'true':
            return 1
        if value == 'false':
            return 0
    try:
        return float(value)
    except (TypeError, ValueError):
        # float() raises ValueError for unparseable strings but TypeError for
        # non-numeric objects such as None; both mean "no usable value".
        return None

def to_bool(x):
    """Interpret ``x`` as a boolean flag.

    Missing values (as detected by ``pd.isna``) are ``False``. Strings are
    ``True`` only when they equal ``'true'`` (case-insensitively) or ``'1'``.
    Every other value is converted with ``bool()``.
    """
    if pd.isna(x):
        return False
    if not isinstance(x, str):
        return bool(x)
    return x.lower() in ('true', '1')

def clean_is_renting_returning(value):
    """Map the strings ``'0'`` and ``'1'`` to the integers 0 and 1.

    Any other value (including numbers, other strings and ``None``) yields
    ``None``.
    """
    if value not in ('0', '1'):
        return None
    return int(value)


def clean_num_docks_available(value):
    """Convert a raw ``num_docks_available`` value to a float.

    Args:
        value: Raw cell value; anything ``float()`` accepts is converted.

    Returns:
        The ``float`` conversion of ``value``, or ``None`` when the value
        cannot be converted (unparseable string, ``None``, or any other
        object that ``float()`` rejects).
    """
    try:
        return float(value)
    except (TypeError, ValueError):
        # float() raises ValueError for unparseable strings but TypeError for
        # non-numeric objects such as None; both mean "no usable value".
        return None


# Run each column through its element-wise cleaning function. The meta
# argument tells dask the name and dtype to expect for the resulting series.
# NOTE(review): the cleaners return plain Python numbers or None -- confirm
# that the declared 'Int64' meta matches what the partitions really produce.
for column, cleaner in (
    ('is_installed', clean_is_installed),
    ('is_renting', clean_is_renting_returning),
    ('is_returning', clean_is_renting_returning),
    ('num_docks_available', clean_num_docks_available),
):
    df[column] = df[column].map(cleaner, meta=(column, 'Int64'))


# Turn 'is_charging_station' into booleans by applying to_bool to every
# element of each partition.
charging_station_raw = df['is_charging_station']
df['is_charging_station'] = charging_station_raw.map_partitions(
    lambda partition: partition.map(to_bool))

# NOTE: date parsing is currently disabled; the call is kept for reference.
# df = df.map_partitions(parse_dates)

# Cast the columns to their final dtypes. All counters and flags (including
# the boolean 'is_charging_station' flag) are stored as nullable integers.
nullable_int_columns = [
    'station_id',
    'num_bikes_available',
    'num_bikes_available_types.mechanical',
    'num_bikes_available_types.ebike',
    'num_docks_available',
    'is_installed',
    'is_renting',
    'is_returning',
    'is_charging_station',
]
target_dtypes = dict.fromkeys(nullable_int_columns, 'Int64')
target_dtypes['status'] = 'object'
target_dtypes['ttl'] = 'float64'
df = df.astype(target_dtypes)

# Look at the first rows while rows with missing values are still present
sample = df.head(10)
print("Sample data before dropping NaNs:", sample, sep="\n")

# Count the missing values of every column
missing_counts_lazy = df.isna().sum()
missing_values = missing_counts_lazy.compute()
print("Missing values per column:", missing_values, sep="\n")

# Example strategy: discard rows that lack a value in any critical column
critical_columns = ['station_id', 'num_bikes_available', 'is_installed']
df = df.dropna(subset=critical_columns)