In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV, GroupKFold, train_test_split
from sklearn.preprocessing import OneHotEncoder

# Load the dataset from the CSV file.
df = pd.read_csv('230611_GMM.csv')

# Parse 'deviceTime' into datetimes, then derive a numeric seconds-since-epoch
# column from it.
df['deviceTime'] = pd.to_datetime(df['deviceTime'])
df['unix_timestamp'] = df['deviceTime'].apply(lambda x: x.timestamp())

# One-hot encode the 'label' column; the encoder returns a sparse matrix, so
# densify it before wrapping it in a DataFrame.
encoder = OneHotEncoder()
encoded = encoder.fit_transform(df[['label']]).toarray()

# OneHotEncoder.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; get_feature_names_out() is its replacement and produces the
# same 'label_<category>' names. Fall back to the old method only on
# installations that predate the new one.
if hasattr(encoder, 'get_feature_names_out'):
    encoded_columns = encoder.get_feature_names_out(['label'])
else:
    encoded_columns = encoder.get_feature_names(['label'])

# Build the encoded frame on df's own index so the column-wise concat below
# aligns row-for-row even if df does not carry a default 0..n-1 index.
encoded_df = pd.DataFrame(encoded, columns=encoded_columns, index=df.index)
df = pd.concat([df, encoded_df], axis=1)

# Remove the original 'label' column (now represented by the one-hot columns)
# together with the other columns that are dropped at this point.
df = df.drop(
    columns=['date', 'label', 'vehicleId', 'ending', 'starting', 'gnssPDOP', 'gnssHDOP']
)

from sklearn.preprocessing import MinMaxScaler

# Columns to rescale: every float/int column of df except the regression
# target 'fuel_diff', which is left in its original units.
numeric_cols = df.select_dtypes(include=['float', 'int']).columns.drop('fuel_diff')

# Fit the min-max scaler on those columns and write the rescaled values into a
# copy, so that df itself keeps the unscaled data.
# NOTE(review): the scaler is fitted on all rows, including the ones that are
# later assigned to the validation and test sets — confirm this is intended.
scaler = MinMaxScaler().fit(df[numeric_cols])
df_scaled = df.copy()
df_scaled[numeric_cols] = scaler.transform(df[numeric_cols])

# Columns outside numeric_cols (e.g. object columns) are carried over untouched.

# Split the data into training, validation, and test sets by vehicle: the
# vehicles listed below are held out entirely, every other row is training data.
VAL_VEHICLE_IDS = ['v1', 'v7', 'v10', 'v5', 'v17']
TEST_VEHICLE_IDS = ['v3', 'v8', 'v28', 'v32', 'v2']

# Columns removed from the feature matrices, and the regression target.
NON_FEATURE_COLS = ['new_vehicle_id', 'deviceTime', 'fuel_diff', 'fuel_level']
TARGET_COL = 'fuel_diff'

val_mask = df_scaled['new_vehicle_id'].isin(VAL_VEHICLE_IDS)
test_mask = df_scaled['new_vehicle_id'].isin(TEST_VEHICLE_IDS)

# val_mask and test_mask are boolean row masks, not lists of vehicle ids, so
# they must be combined with boolean logic. Passing them to isin() compares the
# id strings against True/False, matches nothing, and would put every row --
# including all validation and test rows -- into the training set.
train_mask = ~(val_mask | test_mask)

# The three subsets must partition the rows: no overlap, nothing missing.
assert not (val_mask & test_mask).any(), "a vehicle is in both the validation and test lists"
assert train_mask.any(), "no rows left for training"
assert train_mask.sum() + val_mask.sum() + test_mask.sum() == len(df_scaled)


def _features_and_target(row_mask):
    """Return (X, y) for the rows of df_scaled selected by the boolean row_mask.

    X is the selected rows without NON_FEATURE_COLS; y is their TARGET_COL.
    """
    rows = df_scaled.loc[row_mask, :]
    return rows.drop(columns=NON_FEATURE_COLS), rows[TARGET_COL]


X_train, y_train = _features_and_target(train_mask)
X_val, y_val = _features_and_target(val_mask)
X_test, y_test = _features_and_target(test_mask)



In [None]:
# Baseline model: a 100-tree random forest with a fixed seed, fitted on the
# training split. fit() returns the estimator itself, so construction and
# training are chained.
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict on the validation split and report the mean squared error there.
predictions = rf_regressor.predict(X_val)
mse = mean_squared_error(y_true=y_val, y_pred=predictions)
print("Mean Squared Error:", mse)

In [None]:
# Base estimator for the search; the seed is fixed so every candidate is
# reproducible.
rf_regressor = RandomForestRegressor(random_state=42)

# Hyperparameter grid to search (3 x 3 x 3 = 27 candidates).
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}

# The validation and test sets hold out whole vehicles, so cross-validation is
# made to do the same: GroupKFold keeps all rows of a vehicle in a single fold.
# A plain row-wise K-fold (cv=5) would score each candidate on vehicles it was
# also fitted on. The vehicle ids are looked up through X_train's index
# because the id column itself is not part of the feature matrix.
train_groups = df_scaled.loc[X_train.index, 'new_vehicle_id']
n_splits = min(5, train_groups.nunique())  # GroupKFold needs n_splits <= number of groups

# n_jobs=-1 runs the candidate/fold fits in parallel on all cores; the results
# are unchanged. No `scoring` is given, so candidates are ranked by the
# regressor's default score (R^2).
grid_search = GridSearchCV(
    estimator=rf_regressor,
    param_grid=param_grid,
    cv=GroupKFold(n_splits=n_splits),
    n_jobs=-1,
)
grid_search.fit(X_train, y_train, groups=train_groups)

# Print the best hyperparameters and the corresponding mean cross-validated score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)