In [1]:
# Attach Google Drive to the Colab VM so that files kept in Drive can be read
# through the local filesystem under /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [2]:
# Path of the crash-records CSV inside the mounted Google Drive; the Pandas,
# Dask, Ray and Modin read cells all load this same file.
file_path = '/content/drive/My Drive/Chicago_Traffic_Crashes.csv'

In [3]:
import pandas as pd
import time

# Measure time taken to read with Pandas.
# pd.read_csv parses the whole file into an in-memory DataFrame before it
# returns, so the measured interval covers the complete load.
# time.perf_counter() is a monotonic, high-resolution clock meant for timing
# intervals; time.time() follows the wall clock and can jump if the system
# clock is adjusted while the read is running.
start = time.perf_counter()
df_pandas = pd.read_csv(file_path)
pandas_time = time.perf_counter() - start

print(f"Pandas read time: {pandas_time:.2f} seconds")

Pandas read time: 17.25 seconds


In [4]:
import dask.dataframe as dd

# Measure time taken to read with Dask.
# time.perf_counter() is used because it is monotonic and intended for
# interval timing, unlike the wall-clock time.time().
# NOTE(review): dd.read_csv is lazy - it samples the file and builds a task
# graph, and rows are only parsed when a computation (e.g. .compute()) is
# triggered. The time printed by this cell is therefore graph-construction
# time and is not directly comparable with the eager Pandas read; confirm
# whether a fully materialised timing is wanted here.
start = time.perf_counter()
df_dask = dd.read_csv(file_path)
dask_time = time.perf_counter() - start

print(f"Dask read time: {dask_time:.2f} seconds")

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



Dask read time: 0.22 seconds


In [9]:
!pip install ray  # Install Ray
import ray
import ray.data

ray.init(ignore_reinit_error=True)

# Measure time taken to read with Ray
start = time.time()
df_ray = ray.data.read_csv(file_path)
ray_time = time.time() - start

print(f"Ray read time: {ray_time:.2f} seconds")

ray.shutdown()


Collecting ray
  Downloading ray-2.40.0-cp310-cp310-manylinux2014_x86_64.whl.metadata (17 kB)
Downloading ray-2.40.0-cp310-cp310-manylinux2014_x86_64.whl (66.8 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 66.8/66.8 MB 10.9 MB/s eta 0:00:00
Installing collected packages: ray
Successfully installed ray-2.40.0


2024-12-27 23:53:16,282	INFO worker.py:1821 -- Started a local Ray instance.


Ray read time: 1.05 seconds


In [12]:
!pip install modin
import modin.pandas as mpd
import time

# Measure time taken to read with Modin
start = time.time()
df_modin = mpd.read_csv(file_path)
modin_time = time.time() - start

print(f"Modin read time: {modin_time:.2f} seconds")



2024-12-27 23:55:01,762	INFO worker.py:1821 -- Started a local Ray instance.


Modin read time: 26.12 seconds


In [13]:
# Report the four measured read times side by side, one line per library,
# in the order Pandas, Dask, Modin, Ray.
print("Summary of File Reading Performance:")
print("===================================")
for label, seconds in (
    ("Pandas", pandas_time),
    ("Dask", dask_time),
    ("Modin", modin_time),
    ("Ray", ray_time),
):
    print(f"{label} read time: {seconds:.2f} seconds")

Summary of File Reading Performance:
Pandas read time: 17.25 seconds
Dask read time: 0.22 seconds
Modin read time: 26.12 seconds
Ray read time: 1.05 seconds


In [14]:
# Clean column names: trim surrounding whitespace, then collapse every run of
# characters other than letters and digits (spaces, punctuation, ...) into a
# single underscore.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, so without it the character class never matches
# and the names are left untouched.
# Whitespace is stripped *before* the substitution; afterwards it would already
# have been turned into underscores and strip() would have nothing to remove.
df_pandas.columns = (
    df_pandas.columns
    .str.strip()
    .str.replace(r'[^A-Za-z0-9]+', '_', regex=True)
)
print("Cleaned Column Names:")
print(df_pandas.columns)

Cleaned Column Names:
Index(['CRASH_RECORD_ID', 'CRASH_DATE_EST_I', 'CRASH_DATE',
       'POSTED_SPEED_LIMIT', 'TRAFFIC_CONTROL_DEVICE', 'DEVICE_CONDITION',
       'WEATHER_CONDITION', 'LIGHTING_CONDITION', 'FIRST_CRASH_TYPE',
       'TRAFFICWAY_TYPE', 'LANE_CNT', 'ALIGNMENT', 'ROADWAY_SURFACE_COND',
       'ROAD_DEFECT', 'REPORT_TYPE', 'CRASH_TYPE', 'INTERSECTION_RELATED_I',
       'NOT_RIGHT_OF_WAY_I', 'HIT_AND_RUN_I', 'DAMAGE', 'DATE_POLICE_NOTIFIED',
       'PRIM_CONTRIBUTORY_CAUSE', 'SEC_CONTRIBUTORY_CAUSE', 'STREET_NO',
       'STREET_DIRECTION', 'STREET_NAME', 'BEAT_OF_OCCURRENCE',
       'PHOTOS_TAKEN_I', 'STATEMENTS_TAKEN_I', 'DOORING_I', 'WORK_ZONE_I',
       'WORK_ZONE_TYPE', 'WORKERS_PRESENT_I', 'NUM_UNITS',
       'MOST_SEVERE_INJURY', 'INJURIES_TOTAL', 'INJURIES_FATAL',
       'INJURIES_INCAPACITATING', 'INJURIES_NON_INCAPACITATING',
       'INJURIES_REPORTED_NOT_EVIDENT', 'INJURIES_NO_INDICATION',
       'INJURIES_UNKNOWN', 'CRASH_HOUR', 'CRASH_DAY_OF_WEEK', 'CRASH_MONTH

In [15]:
!pip install pyyaml
import yaml

# Define schema
schema = {
    'separator': '|',
    'columns': list(df_pandas.columns)
}

# Save schema to a YAML file
with open('schema.yaml', 'w') as file:
    yaml.dump(schema, file)

print("Schema file 'schema.yaml' created successfully.")


Schema file 'schema.yaml' created successfully.


In [16]:
# Load the schema back from disk.
# yaml.safe_load only builds plain YAML types (strings, lists, mappings, ...),
# which is everything this schema contains, and cannot be made to construct
# arbitrary Python objects the way the fuller loaders can.
with open('schema.yaml') as file:
    schema = yaml.safe_load(file)

# Validate column names: the DataFrame's columns must equal the schema's
# column list, in the same order.
# NOTE(review): schema.yaml is written from df_pandas.columns earlier in this
# notebook, so within a single run this check can only fail if the columns
# were changed in between.
assert list(df_pandas.columns) == schema['columns'], "Column names do not match!"
print("Column names match the schema.")


Column names match the schema.


In [17]:
# Export the DataFrame as a gzip-compressed, pipe-delimited text file.
# The row index is left out so the file holds only the data columns.
df_pandas.to_csv(
    path_or_buf='Chicago_Traffic_Crashes_cleaned.txt.gz',
    sep='|',
    compression='gzip',
    index=False,
)

print("Dataset saved successfully in pipe-separated gzip format.")


Dataset saved successfully in pipe-separated gzip format.


In [18]:
import os

# Dimensions of the in-memory DataFrame.
rows = df_pandas.shape[0]
columns = df_pandas.shape[1]

# Size in bytes of the compressed export on disk.
file_size = os.stat('Chicago_Traffic_Crashes_cleaned.txt.gz').st_size

# Report the dimensions and the on-disk size converted from bytes to MiB.
print(
    "Summary of the Cleaned Dataset:",
    f"Total Rows: {rows}",
    f"Total Columns: {columns}",
    f"File Size: {file_size / (1024 * 1024):.2f} MB",
    sep="\n",
)


Summary of the Cleaned Dataset:
Total Rows: 904032
Total Columns: 48
File Size: 130.55 MB
