In [None]:
from google.colab import files

In [None]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the sobhanmoosavi/us-accidents dataset archive (us-accidents.zip)
# into the current working directory using the kaggle CLI.
!kaggle datasets download -d sobhanmoosavi/us-accidents

Dataset URL: https://www.kaggle.com/datasets/sobhanmoosavi/us-accidents
License(s): CC-BY-NC-SA-4.0
Downloading us-accidents.zip to /content
 96% 629M/653M [00:06<00:00, 44.7MB/s]
100% 653M/653M [00:06<00:00, 101MB/s] 


In [None]:
!unzip us-accidents.zip

Archive:  us-accidents.zip
  inflating: US_Accidents_March23.csv  


In [None]:
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

Cloning into 'rapidsai-csp-utils'...
remote: Enumerating objects: 597, done.[K
remote: Counting objects: 100% (163/163), done.[K
remote: Compressing objects: 100% (81/81), done.[K
remote: Total 597 (delta 128), reused 82 (delta 82), pack-reused 434 (from 3)[K
Receiving objects: 100% (597/597), 196.59 KiB | 15.12 MiB/s, done.
Resolving deltas: 100% (302/302), done.
Installing RAPIDS remaining 25.04 libraries
Using Python 3.11.13 environment at: /usr
Resolved 175 packages in 12.10s
Downloading libcuvs-cu12 (1.1GiB)
Downloading rmm-cu12 (1.5MiB)
Downloading ucx-py-cu12 (2.2MiB)
Downloading librmm-cu12 (2.9MiB)
Downloading libcuspatial-cu12 (31.1MiB)
Downloading pylibcudf-cu12 (26.4MiB)
Downloading nvidia-nvcomp-cu12 (44.1MiB)
Downloading cucim-cu12 (5.6MiB)
Downloading libcugraph-cu12 (1.4GiB)
Downloading cuspatial-cu12 (4.1MiB)
Downloading cuml-cu12 (9.4MiB)
Downloading raft-dask-cu12 (274.9MiB)
Downloading pylibcugraph-cu12 (2.0MiB)
Downloading cuproj-cu12 (1.1MiB)
Downloading libkv

In [None]:
import cudf    # pandas
import cupy    # numpy

In [None]:
# Read the extracted accidents CSV into a cuDF DataFrame and print its
# column/dtype summary.
df = cudf.read_csv(filepath_or_buffer='/content/US_Accidents_March23.csv')
df.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 7728394 entries, 0 to 7728393
Data columns (total 46 columns):
 #   Column                 Dtype
---  ------                 -----
 0   ID                     object
 1   Source                 object
 2   Severity               int64
 3   Start_Time             object
 4   End_Time               object
 5   Start_Lat              float64
 6   Start_Lng              float64
 7   End_Lat                float64
 8   End_Lng                float64
 9   Distance(mi)           float64
 10  Description            object
 11  Street                 object
 12  City                   object
 13  County                 object
 14  State                  object
 15  Zipcode                object
 16  Country                object
 17  Timezone               object
 18  Airport_Code           object
 19  Weather_Timestamp      object
 20  Temperature(F)         float64
 21  Wind_Chill(F)          float64
 22  Humidity(%)            float64
 23  

In [None]:
# Drop columns that are not needed for the rest of the notebook.
cols_to_drop = ['ID', 'Airport_Code', 'Zipcode', 'County']
# errors='ignore' makes the cell safe to re-run: without it, a second
# execution raises because the columns have already been removed.
df.drop(columns=cols_to_drop, inplace=True, errors='ignore')

df.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 7728394 entries, 0 to 7728393
Data columns (total 42 columns):
 #   Column                 Dtype
---  ------                 -----
 0   Source                 object
 1   Severity               int64
 2   Start_Time             object
 3   End_Time               object
 4   Start_Lat              float64
 5   Start_Lng              float64
 6   End_Lat                float64
 7   End_Lng                float64
 8   Distance(mi)           float64
 9   Description            object
 10  Street                 object
 11  City                   object
 12  State                  object
 13  Country                object
 14  Timezone               object
 15  Weather_Timestamp      object
 16  Temperature(F)         float64
 17  Wind_Chill(F)          float64
 18  Humidity(%)            float64
 19  Pressure(in)           float64
 20  Visibility(mi)         float64
 21  Wind_Direction         object
 22  Wind_Speed(mph)        float64
 2

In [None]:
# Show the distinct Severity values before choosing a narrower integer type.
print(cupy.unique(df['Severity']))

# Store Severity as an unsigned 8-bit integer, then re-check the dtypes.
df['Severity'] = df.Severity.astype('uint8')
df.info()

[1 2 3 4]
<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 7728394 entries, 0 to 7728393
Data columns (total 42 columns):
 #   Column                 Dtype
---  ------                 -----
 0   Source                 object
 1   Severity               uint8
 2   Start_Time             object
 3   End_Time               object
 4   Start_Lat              float64
 5   Start_Lng              float64
 6   End_Lat                float64
 7   End_Lng                float64
 8   Distance(mi)           float64
 9   Description            object
 10  Street                 object
 11  City                   object
 12  State                  object
 13  Country                object
 14  Timezone               object
 15  Weather_Timestamp      object
 16  Temperature(F)         float64
 17  Wind_Chill(F)          float64
 18  Humidity(%)            float64
 19  Pressure(in)           float64
 20  Visibility(mi)         float64
 21  Wind_Direction         object
 22  Wind_Speed(mph)        

In [None]:
# Collect every float64 column and convert each one to float32
# (half the bytes per value, at reduced precision).
float_cols = df.select_dtypes(include='float64').columns
for col in float_cols:
    df[col] = df[col].astype('float32')

df.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 7728394 entries, 0 to 7728393
Data columns (total 42 columns):
 #   Column                 Dtype
---  ------                 -----
 0   Source                 object
 1   Severity               uint8
 2   Start_Time             object
 3   End_Time               object
 4   Start_Lat              float32
 5   Start_Lng              float32
 6   End_Lat                float32
 7   End_Lng                float32
 8   Distance(mi)           float32
 9   Description            object
 10  Street                 object
 11  City                   object
 12  State                  object
 13  Country                object
 14  Timezone               object
 15  Weather_Timestamp      object
 16  Temperature(F)         float32
 17  Wind_Chill(F)          float32
 18  Humidity(%)            float32
 19  Pressure(in)           float32
 20  Visibility(mi)         float32
 21  Wind_Direction         object
 22  Wind_Speed(mph)        float32
 2

In [None]:
# Columns to convert to the 'category' dtype.
category_cols = [
    'Source',
    'City',
    'State',
    'Timezone',
    'Wind_Direction',
    'Weather_Condition',
    'Sunrise_Sunset',
    'Civil_Twilight',
    'Nautical_Twilight',
    'Astronomical_Twilight',
]

# Convert one column at a time, replacing each column in place on df.
for col in category_cols:
    df[col] = df[col].astype('category')

# Print the dtype summary to confirm the conversions.
df.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 7728394 entries, 0 to 7728393
Data columns (total 42 columns):
 #   Column                 Dtype
---  ------                 -----
 0   Source                 category
 1   Severity               uint8
 2   Start_Time             object
 3   End_Time               object
 4   Start_Lat              float32
 5   Start_Lng              float32
 6   End_Lat                float32
 7   End_Lng                float32
 8   Distance(mi)           float32
 9   Description            object
 10  Street                 object
 11  City                   category
 12  State                  category
 13  Country                object
 14  Timezone               category
 15  Weather_Timestamp      object
 16  Temperature(F)         float32
 17  Wind_Chill(F)          float32
 18  Humidity(%)            float32
 19  Pressure(in)           float32
 20  Visibility(mi)         float32
 21  Wind_Direction         category
 22  Wind_Speed(mph)        