# Getting Started

In [12]:
import pandas as pd
import numpy as np

import joblib # to save the model

This training notebook relies heavily on cuML, a GPU-accelerated ML library with a scikit-learn-like API.

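If you do not have a GPU, replace every occurrence of "cuml" with "sklearn". Note that the cuDF conversions and the `using_device_type` context manager have no scikit-learn equivalent and must be removed as well.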

In [2]:
import cudf
# Smoke test: build and print a tiny cuDF Series to confirm that cudf imports and runs.
print(cudf.Series([1, 2, 3]))

0    1
1    2
2    3
dtype: int64


In [3]:
from cuml.svm import SVR
from cuml.svm import LinearSVR
from cuml.model_selection import train_test_split
from cuml.common.device_selection import using_device_type
from cuml.preprocessing import StandardScaler
from cuml.metrics import mean_absolute_error, mean_squared_error

## Dataset Loading - Dummify Categorical Variables

In [4]:
# Read the cleaned dataset and drop the Start_Time, End_Time and Zipcode columns.
df = pd.read_csv('./cleaned_data.csv').drop(
    columns=['Start_Time', 'End_Time', 'Zipcode']
)

# Mark the discrete columns as pandas categoricals in a single cast.
df = df.astype({
    'Severity': 'category',
    'month': 'category',
    'year': 'category',
    'day_of_week': 'category',
    'Sunrise_Sunset': 'category',
})

df

Unnamed: 0,Severity,Distance(mi),Temperature(F),Wind_Chill(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Speed(mph),Precipitation(in),Sunrise_Sunset,Day_Night,duration_minutes,month,year,day_of_week,zipcode_population
0,3,0.010,37.9,35.5,97.0,29.63,7.0,3.5,0.03,Day,1,30.000000,2,2016,1,64746.0
1,3,0.010,37.4,33.8,100.0,29.62,3.0,4.6,0.02,Day,1,30.000000,2,2016,1,64746.0
2,3,0.010,37.4,33.8,100.0,29.62,3.0,4.6,0.02,Day,1,30.000000,2,2016,1,58916.0
3,2,0.010,37.4,33.8,100.0,29.62,3.0,4.6,0.02,Day,1,30.000000,2,2016,1,36724.0
4,2,0.000,33.8,29.6,100.0,29.62,2.0,4.6,0.01,Day,1,30.000000,2,2016,1,58079.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5227101,2,0.543,86.0,86.0,40.0,28.92,10.0,13.0,0.00,Day,1,28.600000,8,2019,5,22136.0
5227102,2,0.338,70.0,70.0,73.0,29.39,10.0,6.0,0.00,Day,1,26.883333,8,2019,5,24201.0
5227103,2,0.561,73.0,73.0,64.0,29.74,10.0,10.0,0.00,Day,1,28.466667,8,2019,5,14873.0
5227104,2,0.772,71.0,71.0,81.0,29.62,10.0,8.0,0.00,Day,1,29.350000,8,2019,5,32005.0


In [5]:
# One-hot encode the listed categorical columns; drop_first=True omits the first
# level of each encoded column.
# NOTE(review): 'year' is cast to category earlier but is not listed here, so this
# call does not dummy-encode it — confirm that is intentional.
df = pd.get_dummies(df, columns=['Severity', 'month', 'day_of_week', 'Sunrise_Sunset'], drop_first=True)

In [6]:
# Row count before rows with missing values are dropped.
len(df)

5227106

In [7]:
# Discard every row that contains at least one missing value.
df = df.dropna()

In [8]:
# Row count after rows with missing values have been dropped.
len(df)

5225448

# Training

## Data Preprocessing

In [9]:
def split_data(df, test_size=0.2, random_state=42, return_scaler=False):
    """Build standardized train/test splits (as cuDF objects) from a pandas frame.

    The 'duration_minutes' column is the regression target and every other
    column is treated as a feature. Features and target are cast to float32,
    converted to cuDF, split, and the features are standardized with a scaler
    fitted on the training portion only.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'duration_minutes' column; all columns must be castable
        to float32.
    test_size : float, default 0.2
        Forwarded to train_test_split.
    random_state : int, default 42
        Forwarded to train_test_split.
    return_scaler : bool, default False
        When True, the fitted StandardScaler is returned as a fifth element so
        the identical scaling can be re-applied to new data (e.g. persisted
        next to the trained model). The default keeps the original 4-tuple.

    Returns
    -------
    tuple
        (X_train, X_test, y_train, y_test), plus the fitted scaler as a fifth
        element when return_scaler is True.
    """
    target_col = 'duration_minutes'

    features = df.drop(columns=[target_col]).astype('float32')
    target = df[target_col].astype('float32')

    # Convert to cuDF
    X = cudf.DataFrame.from_pandas(features)
    y = cudf.Series.from_pandas(target)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Standardize the data: fit on the training features only, then apply the
    # same transform to the test features.
    scaler = StandardScaler()

    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    if return_scaler:
        return X_train, X_test, y_train, y_test, scaler
    return X_train, X_test, y_train, y_test

In [83]:
# Split the cleaned dataset (duration outliers still included) into
# standardized train/test sets.
X_train, X_test, y_train, y_test = split_data(df)

## (skip) Estimate Training Time by Training on a Subsampled Dataset

In [72]:
# NOTE(review): disabled cell. As written it references `X` and `y`, which in the
# visible notebook exist only as locals inside split_data(); uncommenting it
# without defining them first would raise NameError.

# X_sample = X.sample(frac=0.001, random_state=42)
# y_sample = y.loc[X_sample.index]  # Select corresponding labels for the sampled indices

# X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42)

# scaler = StandardScaler()

# # Fit on X_train and transform X_train and X_test
# X_train = scaler.fit_transform(X_train)
# X_test  = scaler.transform(X_test)

## Linear SVR - Raw

In [16]:
def train_model(X_train, y_train, X_test=None, y_test=None):
    """Fit a LinearSVR on the training data and report MAE/MSE on a test set.

    Parameters
    ----------
    X_train, y_train
        Training features and target passed to ``LinearSVR.fit``.
    X_test, y_test : optional
        Held-out features and target used for evaluation. Pass them explicitly
        to avoid depending on notebook state. When omitted, the notebook-level
        variables ``X_test`` / ``y_test`` are used, which keeps existing
        two-argument calls working.

    Returns
    -------
    The fitted LinearSVR model.

    Raises
    ------
    NameError
        If a test set is neither passed in nor defined at notebook level.
    """
    # Resolve the evaluation set: explicit arguments win, otherwise fall back to
    # the notebook globals that the two-argument form has always relied on.
    if X_test is None or y_test is None:
        notebook_scope = globals()
        try:
            if X_test is None:
                X_test = notebook_scope['X_test']
            if y_test is None:
                y_test = notebook_scope['y_test']
        except KeyError as missing:
            raise NameError(
                f"name {missing.args[0]!r} is not defined; pass X_test and y_test "
                "to train_model() or define them at notebook level"
            ) from missing

    print("X_train.shape:", X_train.shape)

    model = LinearSVR(epsilon=0.1, C=1.0, fit_intercept=True, tol=0.0001, max_iter=1000, verbose=4)

    with using_device_type('gpu'):
        model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    # Calculate mae, mse
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    print(f'Mean Absolute Error: {mae}')
    print(f'Mean Squared Error: {mse}')

    return model

In [89]:
# Baseline: fit on the split built from the full dataset. train_model() evaluates
# against the notebook-level X_test / y_test, so those must come from the same
# split_data() call as X_train / y_train.
train_model(X_train, y_train)

Mean Absolute Error: 513.8905029296875
Mean Squared Error: 239963072.0


## Linear SVR - Remove Duration Outliers

### Remove top 10\% of duration

In [10]:
# 90th-percentile value of the target, used as the outlier cut-off.
threshold = df['duration_minutes'].quantile(q=0.90)

# Keep only the rows whose duration is at or below that cut-off.
df_filtered = df.loc[df['duration_minutes'].le(threshold)]

In [11]:
# Re-split using the outlier-filtered frame. This rebinds X_train / X_test /
# y_train / y_test, replacing the split made from the full dataset.
X_train, X_test, y_train, y_test = split_data(df_filtered)

In [17]:
# Fit on the outlier-filtered split and persist the returned model to disk.
# NOTE(review): the StandardScaler fitted inside split_data() is not saved with the
# model, so new data would need the same scaling before predict() — confirm how
# inference is expected to scale its inputs.
joblib.dump(train_model(X_train, y_train), 'linear_svr.joblib')

X_train.shape: (3762341, 32)
[W] [15:21:21.137053] L-BFGS line search failed (code 4); stopping at the last valid step
Mean Absolute Error: 52.656349182128906
Mean Squared Error: 3782.899658203125


['linear_svr.joblib']