### --- Step 1: Setup and Imports ---


In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, f1_score, confusion_matrix
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
import hopsworks
import exclude.key

# Hopsworks
HOPSWORKS_API_KEY = exclude.key.HOPSWORKS_API_KEY
FEATURE_GROUP_NAME = "bars_near_london_bridge"
FEATURE_GROUP_VERSION = 1

# Connect to Hopsworks
project = hopsworks.login(api_key_value=HOPSWORKS_API_KEY)
fs = project.get_feature_store()

Connection closed.
Connected. Call `.close()` to terminate connection gracefully.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/25749
Connected. Call `.close()` to terminate connection gracefully.


### --- Step 2: Load Data from the Feature group ---


In [2]:
# Retrieve feature groups.
fg = fs.get_feature_group(
    name=FEATURE_GROUP_NAME, 
    version=FEATURE_GROUP_VERSION,
)

# Select features for training datasets
df = fg.read()
df.head(5)


Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (1.02s) 


Unnamed: 0,venue_name,venue_address,day,hour,busyness,last_updated
0,All Bar One Tower Of London,"14 Byward St, Greater, London EC3R 5BA, United...",Monday,19,Average,2024-05-31 09:05:59.051218+00:00
1,The Old School Yard,"111 Long Ln, London SE1 4PH, United Kingdom",Tuesday,19,Low,2024-05-31 09:05:59.051218+00:00
2,The Two Bridges Ale House & Kitchen,"186 Tooley St, London SE1 2TZ, United Kingdom",Saturday,0,Average,2024-05-31 09:05:59.051218+00:00
3,The Kings Arms,"251 Tooley St, London SE1 2JX, United Kingdom",Sunday,3,Closed,2024-05-31 09:05:59.051218+00:00
4,The Market Porter,"9 Stoney St, London SE1 9AA, United Kingdom",Sunday,2,Closed,2024-05-31 09:05:59.051218+00:00


### --- Step 3: Process the Data ---

In [3]:
# Map categorical values of 'busyness' to numeric values
busyness_mapping = {'Closed': 0, 'Low': 1, 'Below average':2, 'Average': 3, 'Above average':4, 'High': 5}
df['busyness'] = df['busyness'].map(busyness_mapping)

# Load transformation functions from the feature store
min_max_scaler = fs.get_transformation_function(name="min_max_scaler")
label_encoder = fs.get_transformation_function(name="label_encoder")

# Define numerical and categorical features
numerical_features = ["hour"]
categorical_features = ["venue_name", "venue_address", "day"]

# Map features to transformation functions
transformation_functions = {}
for feature in numerical_features:
    transformation_functions[feature] = min_max_scaler
for feature in categorical_features:
    transformation_functions[feature] = label_encoder

df.head(5)

Unnamed: 0,venue_name,venue_address,day,hour,busyness,last_updated
0,All Bar One Tower Of London,"14 Byward St, Greater, London EC3R 5BA, United...",Monday,19,3,2024-05-31 09:05:59.051218+00:00
1,The Old School Yard,"111 Long Ln, London SE1 4PH, United Kingdom",Tuesday,19,1,2024-05-31 09:05:59.051218+00:00
2,The Two Bridges Ale House & Kitchen,"186 Tooley St, London SE1 2TZ, United Kingdom",Saturday,0,3,2024-05-31 09:05:59.051218+00:00
3,The Kings Arms,"251 Tooley St, London SE1 2JX, United Kingdom",Sunday,3,0,2024-05-31 09:05:59.051218+00:00
4,The Market Porter,"9 Stoney St, London SE1 9AA, United Kingdom",Sunday,2,0,2024-05-31 09:05:59.051218+00:00


### --- Step 4: Create the feature view ---

In [4]:
feature_view = fs.get_or_create_feature_view(
    name='busyness_feature_view',
    version=1,
    query=fg.select_all(),
    labels=["busyness"],
    transformation_functions=transformation_functions,
)

Feature view created successfully, explore it at 
https://c.app.hopsworks.ai:443/p/25749/fs/25669/fv/busyness_feature_view/version/1


### --- Step 5: Create the training dataset ---

In [7]:
# Split data into training, validation, and test sets
X_train, X_val, X_test, y_train, y_val, y_test = feature_view.train_validation_test_split(
    validation_size=0.2,
    test_size=0.1,
)

# Drop unnecessary columns
X_train.drop(['venue_address', 'last_updated'], axis=1, inplace=True)
X_val.drop(['venue_address', 'last_updated'], axis=1, inplace=True)
X_test.drop(['venue_address', 'last_updated'], axis=1, inplace=True)

# Add the popularity_score feature to the training, validation, and test sets
X_train['popularity_score'] = X_train['busyness'].rolling(window=3, min_periods=1).mean()
X_val['popularity_score'] = X_val['busyness'].rolling(window=3, min_periods=1).mean()
X_test['popularity_score'] = X_test['busyness'].rolling(window=3, min_periods=1).mean()




Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (3.39s) 



KeyError: 'busyness'

### --- Step 6: Train amd evaluate the model ---

In [None]:
from sklearn.metrics import mean_squared_error, f1_score, confusion_matrix

# Create an instance of the XGBClassifier 
model = xgb.XGBClassifier()
model.fit(X_train, y_train)

# Predict and evaluate
y_pred_train = model.predict(X_train)
y_pred_val = model.predict(X_val)
y_pred_test = model.predict(X_test)

# Compute metrics
metrics = {"f1_score": f1_score(y_test, y_pred_test, average='macro')}
print(metrics)

# Confusion matrix
results = confusion_matrix(y_test, y_pred_test, labels=[0, 1])
df_cm = pd.DataFrame(results, ['True Low', 'True High'], ['Pred Low', 'Pred High'])
cm = sns.heatmap(df_cm, annot=True)
cm.get_figure().show()


NameError: name 'X_train' is not defined