In [None]:
# pip install hopsworks[python]

import hopsworks
import pandas as pd
import numpy as np
import os

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Connect to Hopsworks.
# Connection settings are read from the environment so that no host or API key
# is stored in the notebook itself:
#   HOPSWORKS_HOST    - hostname or IP of the Hopsworks cluster (required)
#   HOPSWORKS_PORT    - port of the Hopsworks REST API (optional, defaults to 443)
#   HOPSWORKS_API_KEY - API key used to authenticate (required)
# A missing required variable raises KeyError naming that variable.
project = hopsworks.login(
    host=os.environ["HOPSWORKS_HOST"],
    port=int(os.environ.get("HOPSWORKS_PORT", "443")),
    api_key_value=os.environ["HOPSWORKS_API_KEY"],
)

# Get the feature store
fs = project.get_feature_store()

2024-11-27 22:04:12,550 INFO: Initializing external client
2024-11-27 22:04:12,550 INFO: Base URL: https://15.235.46.114:28181
2024-11-27 22:04:14,777 INFO: Python Engine initialized.

Logged in to project, explore it here https://15.235.46.114:28181/p/119


In [None]:
# 2. Generate the very important data
def generate_data(n_samples=1000, seed=None):
    """Build a synthetic DataFrame of procrastination features plus a label.

    Every column is drawn independently and uniformly at random from a fixed
    integer range, so the label carries no real signal from the features.

    Args:
        n_samples: Number of rows to generate.
        seed: Optional integer seed. When given, a private
            ``np.random.RandomState(seed)`` is used, so the result is
            reproducible and NumPy's global random state is left untouched.
            When ``None`` (the default), the global ``np.random`` state is
            used, exactly as before this parameter existed.

    Returns:
        pandas.DataFrame with ``n_samples`` rows and the integer columns
        ``procrastination_level`` (1-10), ``coffee_cups`` (0-7),
        ``last_minute_panic`` (0 or 1), ``zodiac_sign`` (1-12) and
        ``task_completion`` (0-100).
    """
    # The np.random module and RandomState expose the same randint signature
    # (upper bound exclusive), so either can serve as the generator.
    rng = np.random if seed is None else np.random.RandomState(seed)
    data = pd.DataFrame({
        'procrastination_level': rng.randint(1, 11, n_samples),    # 1-10 scale
        'coffee_cups': rng.randint(0, 8, n_samples),               # Number of coffee cups
        'last_minute_panic': rng.randint(0, 2, n_samples),         # 1 for True, 0 for False
        'zodiac_sign': rng.randint(1, 13, n_samples),              # Zodiac sign as integer (1-12)
        'task_completion': rng.randint(0, 101, n_samples)          # Task completion as integer (0-100%)
    })
    return data


# A fixed seed makes "Restart & Run All" regenerate exactly the same rows.
data = generate_data(seed=42)

     procrastination_level  coffee_cups  last_minute_panic  zodiac_sign  \
0                        1            7                  0            3   
1                        3            2                  1            4   
2                       10            3                  0            2   
3                        7            3                  0            1   
4                        3            3                  1           11   
..                     ...          ...                ...          ...   
995                      3            7                  0            6   
996                      9            6                  0           11   
997                      1            1                  1            4   
998                      7            7                  0            8   
999                      2            4                  1            3   

     task_completion  
0                 74  
1                 46  
2                 62  
3      

In [3]:
# 3. Create feature group
# The four predictor columns together act as the composite primary key.
# NOTE(review): the rows are randomly generated, so this key can repeat across
# rows - confirm how duplicate keys are handled when the data is inserted.
feature_group_key = ['procrastination_level', 'coffee_cups', 'last_minute_panic', 'zodiac_sign']

fg = fs.get_or_create_feature_group(
    name='procrastinator_integer',
    version=1,
    description='Features for predicting task completion based on procrastination',
    primary_key=feature_group_key,
    online_enabled=False,
)

# Upload the generated rows; wait=True keeps the cell busy until the
# materialization job has finished.
fg.insert(data, wait=True)

Uploading Dataframe: 100.00% |██████████| Rows 1000/1000 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: procrastinator_integer_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://15.235.46.114:28181/p/119/jobs/named/procrastinator_integer_1_offline_fg_materialization/executions
2024-11-27 22:04:36,852 INFO: Waiting for execution to finish. Current state: INITIALIZING. Final status: UNDEFINED
2024-11-27 22:04:40,010 INFO: Waiting for execution to finish. Current state: RUNNING. Final status: UNDEFINED


%6|1732741516.956|FAIL|rdkafka#producer-1| [thrd:ssl://15.235.47.52:9093/bootstrap]: ssl://15.235.47.52:9093/0: Disconnected (after 54059ms in state UP)
%6|1732741568.047|FAIL|rdkafka#producer-1| [thrd:ssl://15.235.47.52:9093/bootstrap]: ssl://15.235.47.52:9093/0: Disconnected (after 50001ms in state UP, 1 identical error(s) suppressed)


2024-11-27 22:06:17,689 INFO: Waiting for execution to finish. Current state: AGGREGATING_LOGS. Final status: SUCCEEDED
2024-11-27 22:06:17,819 INFO: Waiting for log aggregation to finish.
2024-11-27 22:06:26,592 INFO: Execution finished successfully.


(Job('procrastinator_integer_1_offline_fg_materialization', 'SPARK'), None)

In [4]:
# 4. Create a Feature view (mapping of the features for the model)
# The view selects every column of the feature group and declares
# 'task_completion' as the label column.
all_columns_query = fg.select_all()

fv = fs.get_or_create_feature_view(
    name='procrastinator_integer_view2',
    version=1,
    query=all_columns_query,
    labels=['task_completion'],
    description='Feature view for procrastination prediction',
)

Feature view created successfully, explore it at 
https://15.235.46.114:28181/p/119/fs/67/fv/procrastinator_integer_view2/version/1


In [5]:
# 5. Get training data
# Read the feature view's data and split it into train and test parts, with
# test_size=0.2 requesting a 20% test share.
# Presumably X_* hold the feature columns and y_* the 'task_completion' label
# declared on the feature view - verify against the returned objects.
# NOTE(review): no seed is passed here, so the split may differ between runs - confirm.
X_train, X_test, y_train, y_test = fv.train_test_split(test_size=0.2)

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (0.53s) 



In [6]:
# 6. Train model
# fit() returns the fitted estimator, so construction and training are chained.
# random_state=42 pins the forest's own randomness for a given training set.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# 7. Evaluate model
# Score the held-out test rows with mean squared error (lower is better).
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")


Mean Squared Error: 1057.4717509677419


In [7]:
import os
from hsml.schema import Schema
from hsml.model_schema import ModelSchema
from joblib import dump

# 8. Save model
# Serialize the fitted estimator into a local directory; that directory is
# what gets uploaded to the model registry at the end of this cell.
model_dir = "procrastinator_integers_model2"
os.makedirs(model_dir, exist_ok=True)
model_file = os.path.join(model_dir, 'model.joblib')
dump(model, model_file)

# Describe the model's inputs and outputs using the training frames.
input_schema = Schema(X_train)
output_schema = Schema(y_train)
model_schema = ModelSchema(
    input_schema=input_schema,
    output_schema=output_schema,
)

# Register the model with its metric, schema and one randomly sampled test
# row as the input example, then upload the local directory.
mr = project.get_model_registry()
input_example = X_test.sample().to_numpy()
procrastination_model = mr.sklearn.create_model(
    name="procrastinator_integers_model2",
    metrics={"mse": mse},
    model_schema=model_schema,
    input_example=input_example,
    description="Predicts task completion based on procrastination",
)
procrastination_model.save(model_dir)


  0%|          | 0/6 [00:00<?, ?it/s]

Uploading: 0.000%|          | 0/5610753 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/14 elapsed<00:00 remaining<?

Uploading: 0.000%|          | 0/493 elapsed<00:00 remaining<?

Model created, explore it at https://15.235.46.114:28181/p/119/models/procrastinator_integers_model2/1


Model(name: 'procrastinator_integers_model2', version: 1)

In [8]:
# 9. Create model serving endpoint
# NOTE(review): `ms` is not used by any of the visible cells - confirm whether
# this handle is still needed.
ms = project.get_model_serving()
# Create a deployment for the registered model. This only creates it; the
# deployment is started separately with `.start()` before predictions are made.
# NOTE(review): the deployment name is fixed, so re-running this cell may
# clash with an existing deployment of the same name - confirm.
deployment = procrastination_model.deploy(name="procrastinator40002k")

Deployment created, explore it at https://15.235.46.114:28181/p/119/deployments/1
Before making predictions, start the deployment by using `.start()`


In [9]:
# Start the deployment created above so that it can answer `.predict()` calls.
deployment.start()

  0%|          | 0/5 [00:00<?, ?it/s]

Start making predictions by using `.predict()`


In [11]:
# 10. Try to predict something.
# The request carries a single instance with one value per model input feature.
# The variable is deliberately not called `input`, which would shadow the
# Python builtin of that name for the rest of the kernel session.
request_payload = {
    "instances": [
        [
            9,   # procrastination_level
            10,  # coffee_cups (the generated training data only spans 0-7)
            0,   # last_minute_panic
            1,   # zodiac_sign
        ]
    ]
}

# Make the prediction
predictions = deployment.predict(request_payload)
# One instance was sent, so the first entry is the prediction for it.
prediction_value = predictions['predictions'][0]

# Output the prediction result
print(f"Predicted task completion: {prediction_value:.2f}%")


Predicted task completion: 56.28%
