In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from hyperopt import hp

In [2]:
from insolver.transforms import (
    TransformExp,
    InsolverTransform,
    TransformAge,
    TransformMapValues,
    TransformPolynomizer,
    TransformAgeGender,
)
from insolver.wrappers import InsolverGLMWrapper, InsolverGBMWrapper

In [3]:
# Relative path to the US Accidents CSV; the file must be present in the
# notebook's working directory.
# Dataset source: https://smoosavi.org/datasets/us_accidents
file_path = 'US_Accidents_June20.csv'

In [4]:
# Load the whole CSV into memory. low_memory=False makes pandas parse the file
# in one pass instead of internal chunks, avoiding mixed-dtype column inference.
df = pd.read_csv(file_path, low_memory=False)
# Show (rows, columns) as a quick sanity check of the load.
df.shape

(3513617, 49)

In [5]:
# Down-sample to 1% of the rows.
# `frac` derives the sample size from the frame itself (the previous hard-coded
# row count, 3513740, did not match the 3513617 rows actually loaded), and
# `random_state` makes the sample identical across kernel restarts.
df = df.sample(frac=0.01, random_state=0)

In [6]:
# Remove the ID, Source and end-coordinate columns, then discard every column
# that contains no values at all.
df = (
    df
    .drop(columns=['ID', 'Source', 'End_Lat', 'End_Lng'])
    .dropna(how='all', axis=1)
)

In [7]:
# Create the GLM wrapper on the H2O backend with a gamma family and log link.
# As the cell output shows, instantiation connects to a local H2O server and
# starts one when none is running.
# NOTE(review): the target used below ('Severity') looks like a small ordinal
# scale — confirm that a gamma response is the intended choice.
iglm = InsolverGLMWrapper(backend='h2o', family='gamma', link='log')

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.9.1" 2020-11-04; OpenJDK Runtime Environment (build 11.0.9.1+1-Ubuntu-0ubuntu1.18.04); OpenJDK 64-Bit Server VM (build 11.0.9.1+1-Ubuntu-0ubuntu1.18.04, mixed mode, sharing)
  Starting server from /home/alex/new_env/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpbyv6wxhd
  JVM stdout: /tmp/tmpbyv6wxhd/h2o_alex_started_from_python.out
  JVM stderr: /tmp/tmpbyv6wxhd/h2o_alex_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,Europe/Moscow
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.1.3
H2O_cluster_version_age:,1 month and 23 days
H2O_cluster_name:,H2O_from_python_alex_knm6it
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,29.97 Gb
H2O_cluster_total_cores:,120
H2O_cluster_allowed_cores:,120


In [8]:
# Wrap the frame in an InsolverTransform with an empty list of transformation steps.
InsTransforms = InsolverTransform(df, [])
# Run the (currently empty) list of transformations.
InsTransforms.ins_transform()
# Save the transforms object to 'transforms.pkl'; the same file name is passed
# to the serving command at the end of the notebook (`-transforms transforms.pkl`).
InsTransforms.save('transforms.pkl')

In [9]:
# Split the data into train / validation / test parts: 15% validation and 15%
# test (presumably the remaining 70% is used for training — confirm against
# split_frame). Shuffling with a fixed random_state keeps the split repeatable.
train, valid, test = InsTransforms.split_frame(val_size=0.15, test_size=0.15, random_state=0, shuffle=True)

In [10]:
# Predictor columns fed to the model. Dataset columns deliberately left out:
#   Number, Street, Side, City, County, State, Zipcode, Country, Timezone,
#   Airport_Code, Wind_Chill(F), Humidity(%), Pressure(in), Visibility(mi),
#   Wind_Direction, Wind_Speed(mph), Precipitation(in), Weather_Condition,
#   Amenity, Bump, Crossing, Give_Way, Junction, No_Exit, Railway, Roundabout,
#   Station, Stop, Traffic_Calming, Traffic_Signal, Turning_Loop,
#   Sunrise_Sunset, Civil_Twilight, Nautical_Twilight.
features = [
    'Start_Time', 'End_Time',
    'Start_Lat', 'Start_Lng',
    'Distance(mi)',
    'Weather_Timestamp', 'Temperature(F)',
    'Astronomical_Twilight',
]

# Column the model is trained to predict.
target = 'Severity'

In [11]:
# Separate predictors (x_*) from the target column (y_*) for each data split.
x_train, y_train = train[features], train[target]
x_valid, y_valid = valid[features], valid[target]
x_test, y_test = test[features], test[target]

In [12]:
# Candidate hyperparameter values passed to optimize_hyperparam below.
# For the H2O GLM, `lambda` is the regularization strength and `alpha` the
# elastic-net mixing weight between L1 and L2 penalties.
params = {
    # Regularization strengths from strong (1) down to none (0).
    'lambda': [1, 0.5, 0.1, 0.01, 0.001, 0.0001, 0.00001, 0],
    # Eleven evenly spaced values: 0.0, 0.1, ..., 1.0 (up to float rounding).
    'alpha': [step * 0.1 for step in range(11)],
}

In [13]:
# Tune the GLM over the candidate values in `params`, fitting on the training
# split and passing the validation split for evaluation. The cell output shows
# the selected combination.
iglm.optimize_hyperparam(params, x_train, y_train, X_valid=x_valid, y_valid=y_valid)

{'alpha': 0.0, 'lambda': 0.0}

In [14]:
# Save the fitted model under the name that the serving command at the end of
# the notebook refers to (`-model insolver_glm_h2o_US_Accidents`).
iglm.save_model(name='insolver_glm_h2o_US_Accidents')

In [15]:
# Score the held-out test predictors with the tuned GLM.
predict_glm = iglm.predict(x_test)

In [16]:
# Display the predictions (the output shows an abbreviated array of floats).
predict_glm

array([2.20857704, 2.18323429, 2.3304208 , ..., 2.37043065, 2.32745266,
       2.34453192])

In [17]:
# Write a single row of the test split to JSON (presumably as an example
# request payload for the served model — confirm the expected request format).
# random_state pins the chosen row so the file is identical on every re-run,
# consistent with the seeded split above.
test.sample(1, random_state=0).to_json('request_example.json')

In [None]:
# Launch the insolver serving CLI with the saved model and transforms file,
# using the flask service option. The cell has no recorded execution count
# (In [None]); NOTE(review): presumably this runs until interrupted — confirm.
!insolver_serving -model insolver_glm_h2o_US_Accidents -transforms transforms.pkl -service flask