In [2]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder

df = pd.read_csv('230611_GMM.csv')
# Convert to datetime object
df['deviceTime'] = pd.to_datetime(df['deviceTime'])
# Convert to Unix timestamp
df['unix_timestamp'] = df['deviceTime'].apply(lambda x: x.timestamp())

# initialize OneHotEncoder
encoder = OneHotEncoder()

# fit and transform the data
encoded = encoder.fit_transform(df[['label']]).toarray()

# create new columns in the original dataframe with the encoded values
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names(['label']))
df = pd.concat([df, encoded_df], axis=1
df = df.drop(['date', 'label', 'vehicleId', 'ending', 'starting', 'gnssPDOP', 'gnssHDOP'], axis = 1)

from sklearn.preprocessing import MinMaxScaler

# Select only the columns that have numeric data types and exclude target
numeric_cols = df.select_dtypes(include=['float', 'int']).columns
numeric_cols = numeric_cols.drop('fuel_diff')


# Apply MinMaxScaler only on the numeric columns
scaler = MinMaxScaler()
df_scaled = df.copy()
df_scaled[numeric_cols] = scaler.fit_transform(df_scaled[numeric_cols])
print(scaler.min_)
print(scaler.data_max_)
# Export scaler
pickle.dump(scaler, open("20230614.scaler.pkl", 'wb'))

# The object columns remain unchanged

# Split the data into training, validation, and test sets
val_mask = df_scaled['new_vehicle_id'].isin(['v1', 'v7', 'v10', 'v5', 'v17'])
test_mask = df_scaled['new_vehicle_id'].isin(['v3', 'v8', 'v28', 'v32', 'v2'])
train_mask = ~df_scaled['new_vehicle_id'].isin(np.concatenate((val_mask, test_mask)))
X_train = df_scaled.loc[train_mask, :].drop(columns=['new_vehicle_id', 'deviceTime', 'fuel_diff' ,'fuel_level'])
y_train = df_scaled.loc[train_mask, 'fuel_diff']
X_val = df_scaled.loc[val_mask, :].drop(columns=['new_vehicle_id', 'deviceTime', 'fuel_diff', 'fuel_level'])
y_val = df_scaled.loc[val_mask, 'fuel_diff']
X_test = df_scaled.loc[test_mask, :].drop(columns=['new_vehicle_id', 'deviceTime', 'fuel_diff','fuel_level'])
y_test = df_scaled.loc[test_mask, 'fuel_diff']


# Define the LightGBM dataset
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val)

# Define the hyperparameters to search
params = {
    'objective': ['regression', 'huber'],
    'metric': ['l2'],
    'num_leaves': [10, 20, 30],
    'learning_rate': [0.01, 0.05, 0.1],
    'feature_fraction': [0.5, 0.8, 1.0]
}

# Use GridSearchCV to find the best hyperparameters
gbm = lgb.LGBMRegressor()
grid_search = GridSearchCV(gbm, params, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters found
print(f'Best parameters: {grid_search.best_params_}')


SyntaxError: invalid syntax (<ipython-input-2-550b326a2855>, line 24)

In [17]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder

df = pd.read_csv('230620_GMM.csv')
# Convert to datetime object
df['deviceTime'] = pd.to_datetime(df['deviceTime'])
# Convert to Unix timestamp
df['unix_timestamp'] = df['deviceTime'].apply(lambda x: x.timestamp())

# initialize OneHotEncoder
#encoder = OneHotEncoder()

# fit and transform the data
#encoded = encoder.fit_transform(df[['label']]).toarray()

# create new columns in the original dataframe with the encoded values
#encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names(['label']))
#df = pd.concat([df, encoded_df], axis=1)

df = df.drop(['date', 'label', 'vehicleId', 'ending', 'starting', 'gnssPDOP', 'gnssHDOP'], axis = 1)

from sklearn.preprocessing import MinMaxScaler

# Select only the columns that have numeric data types and exclude target
numeric_cols = df.select_dtypes(include=['float', 'int']).columns
numeric_cols = numeric_cols.drop('fuel_diff')

# Apply MinMaxScaler only on the numeric columns
scaler = MinMaxScaler()
df_scaled = df.copy()
df_scaled[numeric_cols] = scaler.fit_transform(df_scaled[numeric_cols])

# The object columns remain unchanged

# Split the data into training, validation, and test sets
val_mask = df_scaled['new_vehicle_id'].isin(['v1', 'v7', 'v10', 'v5', 'v17'])
test_mask = df_scaled['new_vehicle_id'].isin(['v3', 'v8', 'v28', 'v32', 'v2'])
train_mask = ~df_scaled['new_vehicle_id'].isin(np.concatenate((val_mask, test_mask)))
X_train = df_scaled.loc[train_mask, :].drop(columns=['new_vehicle_id', 'deviceTime', 'fuel_diff' ,'fuel_level'])
y_train = df_scaled.loc[train_mask, 'fuel_diff']
X_val = df_scaled.loc[val_mask, :].drop(columns=['new_vehicle_id', 'deviceTime', 'fuel_diff', 'fuel_level'])
y_val = df_scaled.loc[val_mask, 'fuel_diff']
X_test = df_scaled.loc[test_mask, :].drop(columns=['new_vehicle_id', 'deviceTime', 'fuel_diff','fuel_level'])
y_test = df_scaled.loc[test_mask, 'fuel_diff']

In [18]:
# Define the LightGBM dataset
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val)

# Define the hyperparameters to search
params = {
    'objective': ['regression'],
    'metric': ['l2'],
    'num_leaves': [10, 20, 30],
    'learning_rate': [0.01, 0.05, 0.1],
    'feature_fraction': [0.5, 0.8, 1.0]
}

# Use GridSearchCV to find the best hyperparameters
gbm = lgb.LGBMRegressor()
grid_search = GridSearchCV(gbm, params, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters found
print(f'Best parameters: {grid_search.best_params_}')

Fitting 5 folds for each of 27 candidates, totalling 135 fits




Best parameters: {'feature_fraction': 0.8, 'learning_rate': 0.1, 'metric': 'l2', 'num_leaves': 10, 'objective': 'regression'}


In [33]:
# Train a LightGBM model with the best hyperparameters found
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
best_params = grid_search.best_params_
model = lgb.train(best_params, train_data, valid_sets=[train_data, val_data], num_boost_round=1000, early_stopping_rounds=20, verbose_eval=10)

# Use the trained model to make predictions on the training and test sets
y_train_pred = model.predict(X_train)
y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)

# Calculate the training and test errors
train_mse = mean_squared_error(y_train, y_train_pred)
val_mse = mean_squared_error(y_val, y_val_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
val_mae = mean_absolute_error(y_val, y_val_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
train_r2 = r2_score(y_train, y_train_pred)
val_r2 = r2_score(y_val, y_val_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f'Train MSE: {train_mse:.4f}, MAE: {train_mae:.4f}, RMSE: {train_rmse:.4f}, R2: {train_r2:.4f}')
print(f'Validation MSE: {val_mse:.4f}, MAE: {val_mae:.4f}, RMSE: {val_rmse:.4f}, R2: {val_r2:.4f}')
print(f'Test MSE: {test_mse:.4f}, MAE: {test_mae:.4f}, RMSE: {test_rmse:.4f}, R2: {test_r2:.4f}')



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2820
[LightGBM] [Info] Number of data points in the train set: 1250024, number of used features: 12
[LightGBM] [Info] Start training from score 0.000710
Training until validation scores don't improve for 20 rounds
[10]	training's l2: 32.7884	valid_1's l2: 25.4599
[20]	training's l2: 31.5266	valid_1's l2: 24.4318
[30]	training's l2: 29.9271	valid_1's l2: 21.0028
[40]	training's l2: 28.9152	valid_1's l2: 19.3734
[50]	training's l2: 28.3252	valid_1's l2: 18.5763
[60]	training's l2: 27.8204	valid_1's l2: 18.0934
[70]	training's l2: 27.3224	valid_1's l2: 17.5528
[80]	training's l2: 26.8921	valid_1's l2: 17.2047
[90]	training's l2: 26.5693	valid_1's l2: 16.9314
[100]	training's l2: 26.2834	valid_1's l2: 16.7819
[110]	training's l2: 25.9859	valid_1's l2: 16.448
[120]	training's l2: 25.777	valid_1's l2: 16.2718
[130]	training's l2: 25.5942	valid

In [32]:
X_train.columns

Index(['speed_generate', 'batteryVoltage', 'altitude', 'heading', 'lat', 'lon',
       'axisX', 'axisY', 'axisZ', 'distance (km)', 'Fuel Capacity (L)',
       'unix_timestamp'],
      dtype='object')

In [10]:
# Train a LightGBM model with the best hyperparameters found
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
best_params = grid_search.best_params_
model = lgb.train(best_params, train_data, valid_sets=[train_data, val_data], num_boost_round=1000, early_stopping_rounds=20, verbose_eval=10)

# Use the trained model to make predictions on the training and test sets
y_train_pred = model.predict(X_train)
y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)

# Calculate the training and test errors
train_mse = mean_squared_error(y_train, y_train_pred)
val_mse = mean_squared_error(y_val, y_val_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
val_mae = mean_absolute_error(y_val, y_val_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
train_r2 = r2_score(y_train, y_train_pred)
val_r2 = r2_score(y_val, y_val_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f'Train MSE: {train_mse:.4f}, MAE: {train_mae:.4f}, RMSE: {train_rmse:.4f}, R2: {train_r2:.4f}')
print(f'Validation MSE: {val_mse:.4f}, MAE: {val_mae:.4f}, RMSE: {val_rmse:.4f}, R2: {val_r2:.4f}')
print(f'Test MSE: {test_mse:.4f}, MAE: {test_mae:.4f}, RMSE: {test_rmse:.4f}, R2: {test_r2:.4f}')



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2571
[LightGBM] [Info] Number of data points in the train set: 1250024, number of used features: 14
[LightGBM] [Info] Start training from score 0.000710
Training until validation scores don't improve for 20 rounds
[10]	training's l2: 27.7883	valid_1's l2: 24.5676
[20]	training's l2: 25.3616	valid_1's l2: 23.1221
[30]	training's l2: 22.0465	valid_1's l2: 17.0709
[40]	training's l2: 19.9249	valid_1's l2: 13.8052
[50]	training's l2: 18.5565	valid_1's l2: 12.3539
[60]	training's l2: 17.7201	valid_1's l2: 11.3437
[70]	training's l2: 17.0433	valid_1's l2: 10.6857
[80]	training's l2: 16.5326	valid_1's l2: 10.3529
[90]	training's l2: 16.1231	valid_1's l2: 10.1746
[100]	training's l2: 15.8241	valid_1's l2: 9.86474
[110]	training's l2: 15.6208	valid_1's l2: 9.79459
[120]	training's l2: 15.2795	valid_1's l2: 9.44484
[130]	training's l2: 15.0603	val

In [29]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder

df = pd.read_csv('230611_GMM.csv')
# Convert to datetime object
df['deviceTime'] = pd.to_datetime(df['deviceTime'])
# Convert to Unix timestamp
df['unix_timestamp'] = df['deviceTime'].apply(lambda x: x.timestamp())

# initialize OneHotEncoder
#encoder = OneHotEncoder()

# fit and transform the data
#encoded = encoder.fit_transform(df[['label']]).toarray()

# create new columns in the original dataframe with the encoded values
#encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names(['label']))
#df = pd.concat([df, encoded_df], axis=1)

df = df.drop(['date', 'label', 'vehicleId', 'ending', 'starting', 'gnssPDOP', 'gnssHDOP'], axis = 1)

from sklearn.preprocessing import MinMaxScaler

# Select only the columns that have numeric data types and exclude target
numeric_cols = df.select_dtypes(include=['float', 'int']).columns
numeric_cols = numeric_cols.drop('fuel_diff')

# Apply MinMaxScaler only on the numeric columns
scaler = MinMaxScaler()
df_scaled = df.copy()
df_scaled[numeric_cols] = scaler.fit_transform(df_scaled[numeric_cols])

# The object columns remain unchanged

# Split the data into training, validation, and test sets
val_mask = df_scaled['new_vehicle_id'].isin(['v1', 'v7', 'v10', 'v5', 'v17'])
test_mask = df_scaled['new_vehicle_id'].isin(['v3', 'v8', 'v28', 'v32', 'v2'])
train_mask = ~df_scaled['new_vehicle_id'].isin(np.concatenate((val_mask, test_mask)))
X_train = df_scaled.loc[train_mask, :].drop(columns=['new_vehicle_id', 'deviceTime', 'fuel_diff' ,'fuel_level'])
y_train = df_scaled.loc[train_mask, 'fuel_diff']
X_val = df_scaled.loc[val_mask, :].drop(columns=['new_vehicle_id', 'deviceTime', 'fuel_diff', 'fuel_level'])
y_val = df_scaled.loc[val_mask, 'fuel_diff']
X_test = df_scaled.loc[test_mask, :].drop(columns=['new_vehicle_id', 'deviceTime', 'fuel_diff','fuel_level'])
y_test = df_scaled.loc[test_mask, 'fuel_diff']

In [30]:
# Define the LightGBM dataset
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val)

# Define the hyperparameters to search
params = {
    'objective': ['regression'],
    'metric': ['l2'],
    'num_leaves': [10, 20, 30],
    'learning_rate': [0.01, 0.05, 0.1],
    'feature_fraction': [0.5, 0.8, 1.0]
}

# Use GridSearchCV to find the best hyperparameters
gbm = lgb.LGBMRegressor()
grid_search = GridSearchCV(gbm, params, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters found
print(f'Best parameters: {grid_search.best_params_}')

Fitting 5 folds for each of 27 candidates, totalling 135 fits




Best parameters: {'feature_fraction': 0.8, 'learning_rate': 0.1, 'metric': 'l2', 'num_leaves': 10, 'objective': 'regression'}


In [31]:
# Train a LightGBM model with the best hyperparameters found
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
best_params = grid_search.best_params_
model = lgb.train(best_params, train_data, valid_sets=[train_data, val_data], num_boost_round=1000, early_stopping_rounds=20, verbose_eval=10)

# Use the trained model to make predictions on the training and test sets
y_train_pred = model.predict(X_train)
y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)

# Calculate the training and test errors
train_mse = mean_squared_error(y_train, y_train_pred)
val_mse = mean_squared_error(y_val, y_val_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
val_mae = mean_absolute_error(y_val, y_val_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
train_r2 = r2_score(y_train, y_train_pred)
val_r2 = r2_score(y_val, y_val_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f'Train MSE: {train_mse:.4f}, MAE: {train_mae:.4f}, RMSE: {train_rmse:.4f}, R2: {train_r2:.4f}')
print(f'Validation MSE: {val_mse:.4f}, MAE: {val_mae:.4f}, RMSE: {val_rmse:.4f}, R2: {val_r2:.4f}')
print(f'Test MSE: {test_mse:.4f}, MAE: {test_mae:.4f}, RMSE: {test_rmse:.4f}, R2: {test_r2:.4f}')



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2820
[LightGBM] [Info] Number of data points in the train set: 1250024, number of used features: 12
[LightGBM] [Info] Start training from score 0.000710
Training until validation scores don't improve for 20 rounds
[10]	training's l2: 32.7884	valid_1's l2: 25.4599
[20]	training's l2: 31.5266	valid_1's l2: 24.4318
[30]	training's l2: 29.9271	valid_1's l2: 21.0028
[40]	training's l2: 28.9152	valid_1's l2: 19.3734
[50]	training's l2: 28.3252	valid_1's l2: 18.5763
[60]	training's l2: 27.8204	valid_1's l2: 18.0934
[70]	training's l2: 27.3224	valid_1's l2: 17.5528
[80]	training's l2: 26.8921	valid_1's l2: 17.2047
[90]	training's l2: 26.5693	valid_1's l2: 16.9314
[100]	training's l2: 26.2834	valid_1's l2: 16.7819
[110]	training's l2: 25.9859	valid_1's l2: 16.448
[120]	training's l2: 25.777	valid_1's l2: 16.2718
[130]	training's l2: 25.5942	valid

In [None]:
# Train a LightGBM model with the best hyperparameters found
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
best_params = grid_search.best_params_
model = lgb.train(best_params, train_data, valid_sets=[train_data, val_data], num_boost_round=1000, early_stopping_rounds=20, verbose_eval=10)

# Use the trained model to make predictions on the training and test sets
y_train_pred = model.predict(X_train)
y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)

# Calculate the training and test errors
train_mse = mean_squared_error(y_train, y_train_pred)
val_mse = mean_squared_error(y_val, y_val_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
val_mae = mean_absolute_error(y_val, y_val_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
train_r2 = r2_score(y_train, y_train_pred)
val_r2 = r2_score(y_val, y_val_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f'Train MSE: {train_mse:.4f}, MAE: {train_mae:.4f}, RMSE: {train_rmse:.4f}, R2: {train_r2:.4f}')
print(f'Validation MSE: {val_mse:.4f}, MAE: {val_mae:.4f}, RMSE: {val_rmse:.4f}, R2: {val_r2:.4f}')
print(f'Test MSE: {test_mse:.4f}, MAE: {test_mae:.4f}, RMSE: {test_rmse:.4f}, R2: {test_r2:.4f}')

# Attribute 2

In [24]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder

df = pd.read_csv('230620_GMM.csv')
# Convert to datetime object
df['deviceTime'] = pd.to_datetime(df['deviceTime'])
# Convert to Unix timestamp
df['unix_timestamp'] = df['deviceTime'].apply(lambda x: x.timestamp())

# initialize OneHotEncoder
encoder = OneHotEncoder()

# fit and transform the data
encoded = encoder.fit_transform(df[['label']]).toarray()

# create new columns in the original dataframe with the encoded values
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names(['label']))
df = pd.concat([df, encoded_df], axis=1)

df = df.drop(['date', 'label', 'vehicleId', 'ending', 'starting', 'gnssPDOP', 'gnssHDOP'], axis = 1)

from sklearn.preprocessing import MinMaxScaler

# Select only the columns that have numeric data types
numeric_cols = df.select_dtypes(include=['float', 'int']).columns
numeric_cols = numeric_cols.drop('fuel_diff')

# Apply MinMaxScaler only on the numeric columns
scaler = MinMaxScaler()
df_scaled = df.copy()
df_scaled[numeric_cols] = scaler.fit_transform(df_scaled[numeric_cols])

# The object columns remain unchanged

# Split the data into training, validation, and test sets
val_mask = df_scaled['new_vehicle_id'].isin(['v1', 'v7', 'v10', 'v5', 'v17'])
test_mask = df_scaled['new_vehicle_id'].isin(['v3', 'v8', 'v28', 'v32', 'v2'])
train_mask = ~df_scaled['new_vehicle_id'].isin(np.concatenate((val_mask, test_mask)))
X_train = df_scaled.loc[train_mask, :].drop(columns=['new_vehicle_id', 'deviceTime', 'fuel_diff' ,'fuel_level', 'batteryVoltage', 'axisX', 'axisY', 'axisZ'])
y_train = df_scaled.loc[train_mask, 'fuel_diff']
X_val = df_scaled.loc[val_mask, :].drop(columns=['new_vehicle_id', 'deviceTime', 'fuel_diff', 'fuel_level', 'batteryVoltage', 'axisX', 'axisY', 'axisZ'])
y_val = df_scaled.loc[val_mask, 'fuel_diff']
X_test = df_scaled.loc[test_mask, :].drop(columns=['new_vehicle_id', 'deviceTime', 'fuel_diff','fuel_level', 'batteryVoltage', 'axisX', 'axisY', 'axisZ'])
y_test = df_scaled.loc[test_mask, 'fuel_diff']


# Define the LightGBM dataset
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val)

# Define the hyperparameters to search
params = {
    'objective': ['regression', 'huber'],
    'metric': ['l2'],
    'num_leaves': [10, 20, 30],
    'learning_rate': [0.01, 0.05, 0.1],
    'feature_fraction': [0.5, 0.8, 1.0]
}

# Use GridSearchCV to find the best hyperparameters
gbm = lgb.LGBMRegressor()
grid_search = GridSearchCV(gbm, params, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters found
print(f'Best parameters: {grid_search.best_params_}')




Fitting 5 folds for each of 54 candidates, totalling 270 fits








Best parameters: {'feature_fraction': 1.0, 'learning_rate': 0.1, 'metric': 'l2', 'num_leaves': 20, 'objective': 'regression'}


In [25]:
# Train a LightGBM model with the best hyperparameters found
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
best_params = grid_search.best_params_
model = lgb.train(best_params, train_data, valid_sets=[train_data, val_data], num_boost_round=1000, early_stopping_rounds=20, verbose_eval=10)

# Use the trained model to make predictions on the training and test sets
y_train_pred = model.predict(X_train)
y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)

# Calculate the training and test errors
train_mse = mean_squared_error(y_train, y_train_pred)
val_mse = mean_squared_error(y_val, y_val_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
val_mae = mean_absolute_error(y_val, y_val_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
train_r2 = r2_score(y_train, y_train_pred)
val_r2 = r2_score(y_val, y_val_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f'Train MSE: {train_mse:.4f}, MAE: {train_mae:.4f}, RMSE: {train_rmse:.4f}, R2: {train_r2:.4f}')
print(f'Validation MSE: {val_mse:.4f}, MAE: {val_mae:.4f}, RMSE: {val_rmse:.4f}, R2: {val_r2:.4f}')
print(f'Test MSE: {test_mse:.4f}, MAE: {test_mae:.4f}, RMSE: {test_rmse:.4f}, R2: {test_r2:.4f}')



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1806
[LightGBM] [Info] Number of data points in the train set: 1250024, number of used features: 11
[LightGBM] [Info] Start training from score 0.000710
Training until validation scores don't improve for 20 rounds
[10]	training's l2: 27.8082	valid_1's l2: 24.188
[20]	training's l2: 24.0607	valid_1's l2: 19.6773
[30]	training's l2: 21.9212	valid_1's l2: 16.5353
[40]	training's l2: 20.1721	valid_1's l2: 14.0601
[50]	training's l2: 19.1326	valid_1's l2: 12.8612
[60]	training's l2: 18.0719	valid_1's l2: 11.5133
[70]	training's l2: 17.5117	valid_1's l2: 10.816
[80]	training's l2: 16.9345	valid_1's l2: 10.4204
[90]	training's l2: 16.597	valid_1's l2: 10.1116
[100]	training's l2: 16.2903	valid_1's l2: 9.84715
[110]	training's l2: 16.0532	valid_1's l2: 9.72121
[120]	training's l2: 15.7546	valid_1's l2: 9.47355
[130]	training's l2: 15.5652	valid_

In [22]:
# Train a LightGBM model with the best hyperparameters found
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
best_params = grid_search.best_params_
model = lgb.train(best_params, train_data, valid_sets=[train_data, val_data], num_boost_round=1000, early_stopping_rounds=20, verbose_eval=10)

# Use the trained model to make predictions on the training and test sets
y_train_pred = model.predict(X_train)
y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)

# Calculate the training and test errors
train_mse = mean_squared_error(y_train, y_train_pred)
val_mse = mean_squared_error(y_val, y_val_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
val_mae = mean_absolute_error(y_val, y_val_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
train_r2 = r2_score(y_train, y_train_pred)
val_r2 = r2_score(y_val, y_val_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f'Train MSE: {train_mse:.4f}, MAE: {train_mae:.4f}, RMSE: {train_rmse:.4f}, R2: {train_r2:.4f}')
print(f'Validation MSE: {val_mse:.4f}, MAE: {val_mae:.4f}, RMSE: {val_rmse:.4f}, R2: {val_r2:.4f}')
print(f'Test MSE: {test_mse:.4f}, MAE: {test_mae:.4f}, RMSE: {test_rmse:.4f}, R2: {test_r2:.4f}')

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2361
[LightGBM] [Info] Number of data points in the train set: 1134675, number of used features: 15
[LightGBM] [Info] Start training from score 0.000514
Training until validation scores don't improve for 20 rounds




[10]	training's l2: 2.85867	valid_1's l2: 2.61402
[20]	training's l2: 2.48187	valid_1's l2: 2.21161
[30]	training's l2: 2.2048	valid_1's l2: 1.90592
[40]	training's l2: 2.09168	valid_1's l2: 1.82358
[50]	training's l2: 2.03528	valid_1's l2: 1.76956
[60]	training's l2: 1.97988	valid_1's l2: 1.73791
[70]	training's l2: 1.93687	valid_1's l2: 1.7005
[80]	training's l2: 1.89651	valid_1's l2: 1.67396
[90]	training's l2: 1.86105	valid_1's l2: 1.65157
[100]	training's l2: 1.8316	valid_1's l2: 1.63101
[110]	training's l2: 1.81026	valid_1's l2: 1.61449
[120]	training's l2: 1.78049	valid_1's l2: 1.53326
[130]	training's l2: 1.75676	valid_1's l2: 1.51065
[140]	training's l2: 1.73295	valid_1's l2: 1.47737
[150]	training's l2: 1.71992	valid_1's l2: 1.47349
[160]	training's l2: 1.70591	valid_1's l2: 1.45245
[170]	training's l2: 1.68458	valid_1's l2: 1.44819
[180]	training's l2: 1.67178	valid_1's l2: 1.43509
[190]	training's l2: 1.6496	valid_1's l2: 1.41971
[200]	training's l2: 1.62909	valid_1's l2: 1

In [7]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 70092 entries, 212505 to 663557
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   speed_generate  70092 non-null  float64
 1   batteryVoltage  70092 non-null  float64
 2   altitude        70092 non-null  float64
 3   gnssHDOP        70092 non-null  float64
 4   gnssPDOP        70092 non-null  float64
 5   heading         70092 non-null  float64
 6   lat             70092 non-null  float64
 7   lon             70092 non-null  float64
 8   axisX           70092 non-null  float64
 9   axisY           70092 non-null  float64
 10  axisZ           70092 non-null  float64
 11  distance (km)   70092 non-null  float64
 12  unix_timestamp  70092 non-null  float64
 13  label_abnormal  70092 non-null  float64
 14  label_normal    70092 non-null  float64
 15  label_refuel    70092 non-null  float64
dtypes: float64(16)
memory usage: 9.1 MB


In [12]:
print(f'Train MSE: {train_mse:.4f}, MAE: {train_mae:.4f}, RMSE: {train_rmse:.4f}, R2: {train_r2:.4f}')
print(f'Validation MSE: {val_mse:.4f}, MAE: {val_mae:.4f}, RMSE: {val_rmse:.4f}, R2: {val_r2:.4f}')
print(f'Test MSE: {test_mse:.4f}, MAE: {test_mae:.4f}, RMSE: {test_rmse:.4f}, R2: {test_r2:.4f}')

Train MSE: 0.0000, MAE: 0.0017, RMSE: 0.0071, R2: 0.6645
Validation MSE: 0.0000, MAE: 0.0016, RMSE: 0.0066, R2: 0.6943
Test MSE: 0.0000, MAE: 0.0016, RMSE: 0.0067, R2: 0.6888


In [13]:
print("dataset:\n", df['vehicleId'].value_counts())

dataset:
 v33    308041
v30    245087
v14    214091
v9     201734
v16    168400
v28    133861
v27    123365
v7     119186
v1     109914
v31     97093
v29     96586
v4      89473
v3      85924
v17     77302
v10     75396
v26     74425
v21     74315
v32     73166
v5      71219
v23     67269
v11     63071
v8      61825
v18     51266
v25     50913
v13     50361
v6      50240
v20     48820
v15     45540
v12     43041
v2      40024
Name: vehicleId, dtype: int64


In [18]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3010948 entries, 0 to 3010947
Data columns (total 24 columns):
 #   Column           Dtype  
---  ------           -----  
 0   batteryVoltage   float64
 1   altitude         float64
 2   gnssStatus       float64
 3   engineStatus     float64
 4   speed            float64
 5   gnssHDOP         float64
 6   isMoving         float64
 7   gnssPDOP         float64
 8   createdAt        float64
 9   heading          float64
 10  gpsStatus        float64
 11  externalVoltage  float64
 12  lat              float64
 13  lon              float64
 14  axisX            float64
 15  axisY            float64
 16  axisZ            float64
 17  distance (km)    float64
 18  time_diff        int64  
 19  engine           int64  
 20  BDM              int64  
 21  BGK              int64  
 22  BTM              int64  
 23  fuel_filter      float64
dtypes: float64(19), int64(5)
memory usage: 574.3 MB


# Swapping attribute 1

In [2]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder

df = pd.read_csv('230611_GMM.csv')
# Convert to datetime object
df['deviceTime'] = pd.to_datetime(df['deviceTime'])
# Convert to Unix timestamp
df['unix_timestamp'] = df['deviceTime'].apply(lambda x: x.timestamp())

# initialize OneHotEncoder
encoder = OneHotEncoder()

# fit and transform the data
encoded = encoder.fit_transform(df[['label']]).toarray()

# create new columns in the original dataframe with the encoded values
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names(['label']))
df = pd.concat([df, encoded_df], axis=1)

df = df.drop(['date', 'label', 'vehicleId', 'ending', 'starting', 'gnssPDOP', 'gnssHDOP'], axis = 1)

from sklearn.preprocessing import MinMaxScaler

# Select only the columns that have numeric data types
numeric_cols = df.select_dtypes(include=['float', 'int']).columns
numeric_cols = numeric_cols.drop('fuel_diff')

# Apply MinMaxScaler only on the numeric columns
scaler = MinMaxScaler()
df_scaled = df.copy()
df_scaled[numeric_cols] = scaler.fit_transform(df_scaled[numeric_cols])

# The object columns remain unchanged

# Split the data into training, validation, and test sets
test_mask = df_scaled['new_vehicle_id'].isin(['v1', 'v7', 'v10', 'v5', 'v17'])
val_mask = df_scaled['new_vehicle_id'].isin(['v3', 'v8', 'v28', 'v32', 'v2'])
train_mask = ~df_scaled['new_vehicle_id'].isin(np.concatenate((val_mask, test_mask)))
X_train = df_scaled.loc[train_mask, :].drop(columns=['new_vehicle_id', 'deviceTime', 'fuel_diff' ,'fuel_level'])
y_train = df_scaled.loc[train_mask, 'fuel_diff']
X_val = df_scaled.loc[val_mask, :].drop(columns=['new_vehicle_id', 'deviceTime', 'fuel_diff', 'fuel_level'])
y_val = df_scaled.loc[val_mask, 'fuel_diff']
X_test = df_scaled.loc[test_mask, :].drop(columns=['new_vehicle_id', 'deviceTime', 'fuel_diff','fuel_level'])
y_test = df_scaled.loc[test_mask, 'fuel_diff']


# Define the LightGBM dataset
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val)

# Define the hyperparameters to search
params = {
    'objective': ['regression', 'huber'],
    'metric': ['l2'],
    'num_leaves': [10, 20, 30],
    'learning_rate': [0.01, 0.05, 0.1],
    'feature_fraction': [0.5, 0.8, 1.0]
}

# Use GridSearchCV to find the best hyperparameters
gbm = lgb.LGBMRegressor()
grid_search = GridSearchCV(gbm, params, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters found
print(f'Best parameters: {grid_search.best_params_}')




Fitting 5 folds for each of 54 candidates, totalling 270 fits








Best parameters: {'feature_fraction': 0.8, 'learning_rate': 0.1, 'metric': 'l2', 'num_leaves': 30, 'objective': 'regression'}


In [3]:
# Train a LightGBM model with the best hyperparameters found
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
best_params = grid_search.best_params_
model = lgb.train(best_params, train_data, valid_sets=[train_data, val_data], num_boost_round=1000, early_stopping_rounds=20, verbose_eval=10)

# Use the trained model to make predictions on the training and test sets
y_train_pred = model.predict(X_train)
y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)

# Calculate the training and test errors
train_mse = mean_squared_error(y_train, y_train_pred)
val_mse = mean_squared_error(y_val, y_val_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
val_mae = mean_absolute_error(y_val, y_val_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
train_r2 = r2_score(y_train, y_train_pred)
val_r2 = r2_score(y_val, y_val_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f'Train MSE: {train_mse:.4f}, MAE: {train_mae:.4f}, RMSE: {train_rmse:.4f}, R2: {train_r2:.4f}')
print(f'Validation MSE: {val_mse:.4f}, MAE: {val_mae:.4f}, RMSE: {val_rmse:.4f}, R2: {val_r2:.4f}')
print(f'Test MSE: {test_mse:.4f}, MAE: {test_mae:.4f}, RMSE: {test_rmse:.4f}, R2: {test_r2:.4f}')



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2826
[LightGBM] [Info] Number of data points in the train set: 1250024, number of used features: 15
[LightGBM] [Info] Start training from score 0.000710
Training until validation scores don't improve for 20 rounds
[10]	training's l2: 25.9101	valid_1's l2: 37.3233
[20]	training's l2: 22.8571	valid_1's l2: 32.2348
[30]	training's l2: 20.5038	valid_1's l2: 26.4532
[40]	training's l2: 18.0481	valid_1's l2: 18.6738
[50]	training's l2: 16.8774	valid_1's l2: 15.7999
[60]	training's l2: 16.1015	valid_1's l2: 14.0706
[70]	training's l2: 15.2989	valid_1's l2: 11.8934
[80]	training's l2: 14.7363	valid_1's l2: 10.3041
[90]	training's l2: 14.3576	valid_1's l2: 9.5799
[100]	training's l2: 14.1053	valid_1's l2: 9.35743
[110]	training's l2: 13.8441	valid_1's l2: 8.66387
[120]	training's l2: 13.5918	valid_1's l2: 8.40012
[130]	training's l2: 13.4107	valid_1's l2: 8.11608
[140]	training's l2: 13.2478	valid_1's l2: 8.0

In [2]:
# Train a LightGBM model with the best hyperparameters found
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
best_params = grid_search.best_params_
model = lgb.train(best_params, train_data, valid_sets=[train_data, val_data], num_boost_round=1000, early_stopping_rounds=20, verbose_eval=10)

# Use the trained model to make predictions on the training and test sets
y_train_pred = model.predict(X_train)
y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)

# Calculate the training and test errors
train_mse = mean_squared_error(y_train, y_train_pred)
val_mse = mean_squared_error(y_val, y_val_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
val_mae = mean_absolute_error(y_val, y_val_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
train_r2 = r2_score(y_train, y_train_pred)
val_r2 = r2_score(y_val, y_val_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f'Train MSE: {train_mse:.4f}, MAE: {train_mae:.4f}, RMSE: {train_rmse:.4f}, R2: {train_r2:.4f}')
print(f'Validation MSE: {val_mse:.4f}, MAE: {val_mae:.4f}, RMSE: {val_rmse:.4f}, R2: {val_r2:.4f}')
print(f'Test MSE: {test_mse:.4f}, MAE: {test_mae:.4f}, RMSE: {test_rmse:.4f}, R2: {test_r2:.4f}')



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3320
[LightGBM] [Info] Number of data points in the train set: 1134675, number of used features: 16
[LightGBM] [Info] Start training from score 0.000514
Training until validation scores don't improve for 20 rounds
[10]	training's l2: 3.06046	valid_1's l2: 3.92086
[20]	training's l2: 2.64228	valid_1's l2: 2.98782
[30]	training's l2: 2.43698	valid_1's l2: 2.4687
[40]	training's l2: 2.28433	valid_1's l2: 2.04446
[50]	training's l2: 2.13158	valid_1's l2: 1.72616
[60]	training's l2: 2.04082	valid_1's l2: 1.63936
[70]	training's l2: 1.96908	valid_1's l2: 1.54192
[80]	training's l2: 1.91595	valid_1's l2: 1.50072
[90]	training's l2: 1.87295	valid_1's l2: 1.47344
[100]	training's l2: 1.83901	valid_1's l2: 1.41699
[110]	training's l2: 1.80473	valid_1's l2: 1.36703
[120]	training's l2: 1.77336	valid_1's l2: 1.30031
[130]	training's l2: 1.74916	valid_1's l2: 1.26796
[140]	training's l2: 1.72394	valid_1's l2: 1.2

# Swap attribute 2

In [1]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder

df = pd.read_csv('230611_GMM.csv')
# Convert to datetime object
df['deviceTime'] = pd.to_datetime(df['deviceTime'])
# Convert to Unix timestamp
df['unix_timestamp'] = df['deviceTime'].apply(lambda x: x.timestamp())

# initialize OneHotEncoder
encoder = OneHotEncoder()

# fit and transform the data
encoded = encoder.fit_transform(df[['label']]).toarray()

# create new columns in the original dataframe with the encoded values
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names(['label']))
df = pd.concat([df, encoded_df], axis=1)

df = df.drop(['date', 'label', 'vehicleId', 'ending', 'starting', 'gnssPDOP', 'gnssHDOP'], axis = 1)

from sklearn.preprocessing import MinMaxScaler

# Select only the columns that have numeric data types
numeric_cols = df.select_dtypes(include=['float', 'int']).columns
numeric_cols = numeric_cols.drop('fuel_diff')

# Apply MinMaxScaler only on the numeric columns
scaler = MinMaxScaler()
df_scaled = df.copy()
df_scaled[numeric_cols] = scaler.fit_transform(df_scaled[numeric_cols])

# The object columns remain unchanged

# Split the data into training, validation, and test sets
test_mask = df_scaled['new_vehicle_id'].isin(['v1', 'v7', 'v10', 'v5', 'v17'])
val_mask = df_scaled['new_vehicle_id'].isin(['v3', 'v8', 'v28', 'v32', 'v2'])
train_mask = ~df_scaled['new_vehicle_id'].isin(np.concatenate((val_mask, test_mask)))
X_train = df_scaled.loc[train_mask, :].drop(columns=['new_vehicle_id', 'deviceTime', 'fuel_diff' ,'fuel_level', 'batteryVoltage', 'axisX', 'axisY', 'axisZ'])
y_train = df_scaled.loc[train_mask, 'fuel_diff']
X_val = df_scaled.loc[val_mask, :].drop(columns=['new_vehicle_id', 'deviceTime', 'fuel_diff', 'fuel_level', 'batteryVoltage', 'axisX', 'axisY', 'axisZ'])
y_val = df_scaled.loc[val_mask, 'fuel_diff']
X_test = df_scaled.loc[test_mask, :].drop(columns=['new_vehicle_id', 'deviceTime', 'fuel_diff','fuel_level', 'batteryVoltage', 'axisX', 'axisY', 'axisZ'])
y_test = df_scaled.loc[test_mask, 'fuel_diff']


# Define the LightGBM dataset
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val)

# Define the hyperparameters to search
params = {
    'objective': ['regression', 'huber'],
    'metric': ['l2'],
    'num_leaves': [10, 20, 30],
    'learning_rate': [0.01, 0.05, 0.1],
    'feature_fraction': [0.5, 0.8, 1.0]
}

# Use GridSearchCV to find the best hyperparameters
gbm = lgb.LGBMRegressor()
grid_search = GridSearchCV(gbm, params, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters found
print(f'Best parameters: {grid_search.best_params_}')




Fitting 5 folds for each of 54 candidates, totalling 270 fits








Best parameters: {'feature_fraction': 1.0, 'learning_rate': 0.1, 'metric': 'l2', 'num_leaves': 20, 'objective': 'regression'}


In [2]:
# Train a LightGBM model with the best hyperparameters found
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
best_params = grid_search.best_params_
model = lgb.train(best_params, train_data, valid_sets=[train_data, val_data], num_boost_round=1000, early_stopping_rounds=20, verbose_eval=10)

# Use the trained model to make predictions on the training and test sets
y_train_pred = model.predict(X_train)
y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)

# Calculate the training and test errors
train_mse = mean_squared_error(y_train, y_train_pred)
val_mse = mean_squared_error(y_val, y_val_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
val_mae = mean_absolute_error(y_val, y_val_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
train_r2 = r2_score(y_train, y_train_pred)
val_r2 = r2_score(y_val, y_val_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f'Train MSE: {train_mse:.4f}, MAE: {train_mae:.4f}, RMSE: {train_rmse:.4f}, R2: {train_r2:.4f}')
print(f'Validation MSE: {val_mse:.4f}, MAE: {val_mae:.4f}, RMSE: {val_rmse:.4f}, R2: {val_r2:.4f}')
print(f'Test MSE: {test_mse:.4f}, MAE: {test_mae:.4f}, RMSE: {test_rmse:.4f}, R2: {test_r2:.4f}')



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1806
[LightGBM] [Info] Number of data points in the train set: 1250024, number of used features: 11
[LightGBM] [Info] Start training from score 0.000710
Training until validation scores don't improve for 20 rounds
[10]	training's l2: 27.8082	valid_1's l2: 40.5848
[20]	training's l2: 24.0607	valid_1's l2: 31.8698
[30]	training's l2: 21.9212	valid_1's l2: 26.4924
[40]	training's l2: 20.1721	valid_1's l2: 21.9321
[50]	training's l2: 19.1326	valid_1's l2: 19.1659
[60]	training's l2: 18.0719	valid_1's l2: 14.3719
[70]	training's l2: 17.5117	valid_1's l2: 12.8622
[80]	training's l2: 16.9345	valid_1's l2: 11.2951
[90]	training's l2: 16.597	valid_1's l2: 10.6507
[100]	training's l2: 16.2903	valid_1's l2: 10.2167
[110]	training's l2: 16.0532	valid_1's l2: 10.1186
[120]	training's l2: 15.7546	valid_1's l2: 9.52073
[130]	training's l2: 15.5652	vali

In [4]:
# Train a LightGBM model with the best hyperparameters found
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
best_params = grid_search.best_params_
model = lgb.train(best_params, train_data, valid_sets=[train_data, val_data], num_boost_round=1000, early_stopping_rounds=20, verbose_eval=10)

# Use the trained model to make predictions on the training and test sets
y_train_pred = model.predict(X_train)
y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)

# Calculate the training and test errors
train_mse = mean_squared_error(y_train, y_train_pred)
val_mse = mean_squared_error(y_val, y_val_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
val_mae = mean_absolute_error(y_val, y_val_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
train_r2 = r2_score(y_train, y_train_pred)
val_r2 = r2_score(y_val, y_val_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f'Train MSE: {train_mse:.4f}, MAE: {train_mae:.4f}, RMSE: {train_rmse:.4f}, R2: {train_r2:.4f}')
print(f'Validation MSE: {val_mse:.4f}, MAE: {val_mae:.4f}, RMSE: {val_rmse:.4f}, R2: {val_r2:.4f}')
print(f'Test MSE: {test_mse:.4f}, MAE: {test_mae:.4f}, RMSE: {test_rmse:.4f}, R2: {test_r2:.4f}')



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1791
[LightGBM] [Info] Number of data points in the train set: 1134675, number of used features: 10
[LightGBM] [Info] Start training from score 0.000514
Training until validation scores don't improve for 20 rounds
[10]	training's l2: 2.73009	valid_1's l2: 3.20999
[20]	training's l2: 2.32998	valid_1's l2: 2.37626
[30]	training's l2: 2.16294	valid_1's l2: 2.02766
[40]	training's l2: 2.05503	valid_1's l2: 1.77
[50]	training's l2: 1.98408	valid_1's l2: 1.62092
[60]	training's l2: 1.92806	valid_1's l2: 1.5177
[70]	training's l2: 1.89856	valid_1's l2: 1.4878
[80]	training's l2: 1.87714	valid_1's l2: 1.48419
[90]	training's l2: 1.85375	valid_1's l2: 1.47028
[100]	training's l2: 1.82747	valid_1's l2: 1.45306
[110]	training's l2: 1.81014	valid_1's l2: 1.44796
[120]	training's l2: 1.79555	valid_1's l2: 1.42842
[130]	training's l2: 1.77707	valid_1'

# Original dataset

In [1]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder
df = pd.read_csv('df_clean.csv')
df = df.dropna()

In [2]:
# Convert to datetime object
df['deviceTime'] = pd.to_datetime(df['deviceTime'])
# Convert to Unix timestamp
df['unix_timestamp'] = df['deviceTime'].apply(lambda x: x.timestamp())

from sklearn.preprocessing import MinMaxScaler

# Select only the columns that have numeric data types
numeric_cols = df.select_dtypes(include=['float', 'int']).columns
numeric_cols = numeric_cols.drop('fuel_diff')
 
# Apply MinMaxScaler only on the numeric columns
scaler = MinMaxScaler()
df_scaled = df.copy()
df_scaled[numeric_cols] = scaler.fit_transform(df_scaled[numeric_cols])

# The object columns remain unchanged

# Split the data into training, validation, and test sets
val_mask = df_scaled['new_vehicle_id'].isin(['v1', 'v7', 'v10', 'v5', 'v17'])
test_mask = df_scaled['new_vehicle_id'].isin(['v3', 'v8', 'v28', 'v32', 'v2'])
train_mask = ~df_scaled['new_vehicle_id'].isin(np.concatenate((val_mask, test_mask)))
X_train = df_scaled.loc[train_mask, :].drop(columns=['new_vehicle_id', 'deviceTime', 'fuel_diff' ,'fuel', 'time_diff'])
y_train = df_scaled.loc[train_mask, 'fuel_diff']
X_val = df_scaled.loc[val_mask, :].drop(columns=['new_vehicle_id', 'deviceTime', 'fuel_diff' ,'fuel', 'time_diff'])
y_val = df_scaled.loc[val_mask, 'fuel_diff']
X_test = df_scaled.loc[test_mask, :].drop(columns=['new_vehicle_id', 'deviceTime', 'fuel_diff' ,'fuel', 'time_diff'])
y_test = df_scaled.loc[test_mask, 'fuel_diff']

In [4]:
import lightgbm as lgb
# Define the LightGBM dataset
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val)

# Define the hyperparameters to search
params = {
    'objective': ['regression', 'huber'],
    'metric': ['l2'],
    'num_leaves': [10, 20, 30],
    'learning_rate': [0.01, 0.05, 0.1],
    'feature_fraction': [0.5, 0.8, 1.0]
}

# Use GridSearchCV to find the best hyperparameters
gbm = lgb.LGBMRegressor()
grid_search = GridSearchCV(gbm, params, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters found
print(f'Best parameters: {grid_search.best_params_}')


Fitting 5 folds for each of 54 candidates, totalling 270 fits








Best parameters: {'feature_fraction': 0.5, 'learning_rate': 0.05, 'metric': 'l2', 'num_leaves': 30, 'objective': 'huber'}


In [5]:
# Train a LightGBM model with the best hyperparameters found
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
best_params = grid_search.best_params_
model = lgb.train(best_params, train_data, valid_sets=[train_data, val_data], num_boost_round=1000, early_stopping_rounds=20, verbose_eval=10)

# Use the trained model to make predictions on the training and test sets
y_train_pred = model.predict(X_train)
y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)

# Calculate the training and test errors
train_mse = mean_squared_error(y_train, y_train_pred)
val_mse = mean_squared_error(y_val, y_val_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
val_mae = mean_absolute_error(y_val, y_val_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
train_r2 = r2_score(y_train, y_train_pred)
val_r2 = r2_score(y_val, y_val_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f'Train MSE: {train_mse:.4f}, MAE: {train_mae:.4f}, RMSE: {train_rmse:.4f}, R2: {train_r2:.4f}')
print(f'Validation MSE: {val_mse:.4f}, MAE: {val_mae:.4f}, RMSE: {val_rmse:.4f}, R2: {val_r2:.4f}')
print(f'Test MSE: {test_mse:.4f}, MAE: {test_mae:.4f}, RMSE: {test_rmse:.4f}, R2: {test_r2:.4f}')



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2964
[LightGBM] [Info] Number of data points in the train set: 3134927, number of used features: 14
[LightGBM] [Info] Start training from score -0.015254
Training until validation scores don't improve for 20 rounds
[10]	training's l2: 0.02574	valid_1's l2: 0.0149983
[20]	training's l2: 0.0256655	valid_1's l2: 0.0149737
[30]	training's l2: 0.0256137	valid_1's l2: 0.0149602
[40]	training's l2: 0.025583	valid_1's l2: 0.0149507
[50]	training's l2: 0.02556	valid_1's l2: 0.0149412
[60]	training's l2: 0.0255413	valid_1's l2: 0.0149354
[70]	training's l2: 0.0255278	valid_1's l2: 0.014933
[80]	training's l2: 0.0255164	valid_1's l2: 0.0149296
[90]	training's l2: 0.0255047	valid_1's l2: 0.0149254
[100]	training's l2: 0.0254933	valid_1's l2: 0.0149226
[110]	training's l2: 0.0254837	valid_1's l2: 0.0149187
[120]	training's l2: 0.0254745	valid_1's l2: 0.0149148
[130]	training's l2: 0.0254655	valid_1's l2: 0.014911

# Export model

In [11]:
# Import pickle
import pickle
filename='230614_LGBM.sav'
pickle.dump(model, open(filename, 'wb'))

Use this to run the model

In [1]:
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, Y_test)
print(result)

NameError: name 'pickle' is not defined

# OSRM data

In [41]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder

df = pd.read_csv('230707_osrm.csv')
# Convert to datetime object
df['deviceTime'] = pd.to_datetime(df['deviceTime'])
# Convert to Unix timestamp
df['unix_timestamp'] = df['deviceTime'].apply(lambda x: x.timestamp())

# initialize OneHotEncoder
encoder = OneHotEncoder()

# fit and transform the data
encoded = encoder.fit_transform(df[['fuelStatus']]).toarray()

# create new columns in the original dataframe with the encoded values
encoded_df = pd.DataFrame(encoded, columns=encoder.get_feature_names(['label']))
df = pd.concat([df, encoded_df], axis=1)
df.drop(['date', 'fuelStatus', 'vehicleId', 'ending', 'starting', 'gnssPDOP', 'gnssHDOP', 'haversineSpeedGenerate',
             'haversineDistance', 'provider'], axis = 1, inplace = True)

from sklearn.preprocessing import MinMaxScaler

# Select only the columns that have numeric data types and exclude target
numeric_cols = df.select_dtypes(include=['float', 'int']).columns
numeric_cols = numeric_cols.drop('fuelDiff')

scaler = MinMaxScaler()
df_scaled = df.copy()
df_scaled[numeric_cols] = scaler.fit_transform(df_scaled[numeric_cols])
# The object columns remain unchanged

# Split the data into training, validation, and test sets
val_mask = df_scaled['new_vehicle_id'].isin(['v1', 'v7', 'v10', 'v5', 'v17'])
test_mask = df_scaled['new_vehicle_id'].isin(['v3', 'v8', 'v28', 'v32', 'v2'])
train_mask = ~df_scaled['new_vehicle_id'].isin(np.concatenate((val_mask, test_mask)))
X_train = df_scaled.loc[train_mask, :].drop(columns=['new_vehicle_id', 'deviceTime', 'fuelDiff' ,'fuel'])
y_train = df_scaled.loc[train_mask, 'fuelDiff']
X_val = df_scaled.loc[val_mask, :].drop(columns=['new_vehicle_id', 'deviceTime', 'fuelDiff', 'fuel'])
y_val = df_scaled.loc[val_mask, 'fuelDiff']
X_test = df_scaled.loc[test_mask, :].drop(columns=['new_vehicle_id', 'deviceTime', 'fuelDiff','fuel'])
y_test = df_scaled.loc[test_mask, 'fuelDiff']


# Define the LightGBM dataset
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val)

# Define the hyperparameters to search
params = {
    'objective': ['regression', 'huber'],
    'metric': ['l2'],
    'num_leaves': [10, 20, 30],
    'learning_rate': [0.01, 0.05, 0.1],
    'feature_fraction': [0.5, 0.8, 1.0]
}

# Use GridSearchCV to find the best hyperparameters
gbm = lgb.LGBMRegressor()
grid_search = GridSearchCV(gbm, params, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters found
print(f'Best parameters: {grid_search.best_params_}')




Fitting 5 folds for each of 54 candidates, totalling 270 fits








Best parameters: {'feature_fraction': 0.5, 'learning_rate': 0.05, 'metric': 'l2', 'num_leaves': 20, 'objective': 'regression'}


In [42]:
# Train a LightGBM model with the best hyperparameters found
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
best_params = grid_search.best_params_
model = lgb.train(best_params, train_data, valid_sets=[train_data, val_data], num_boost_round=1000, early_stopping_rounds=20, verbose_eval=10)

# Use the trained model to make predictions on the training and test sets
y_train_pred = model.predict(X_train)
y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)

# Calculate the training and test errors
train_mse = mean_squared_error(y_train, y_train_pred)
val_mse = mean_squared_error(y_val, y_val_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
val_mae = mean_absolute_error(y_val, y_val_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
train_r2 = r2_score(y_train, y_train_pred)
val_r2 = r2_score(y_val, y_val_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f'Train MSE: {train_mse:.4f}, MAE: {train_mae:.4f}, RMSE: {train_rmse:.4f}, R2: {train_r2:.4f}')
print(f'Validation MSE: {val_mse:.4f}, MAE: {val_mae:.4f}, RMSE: {val_rmse:.4f}, R2: {val_r2:.4f}')
print(f'Test MSE: {test_mse:.4f}, MAE: {test_mae:.4f}, RMSE: {test_rmse:.4f}, R2: {test_r2:.4f}')



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3066
[LightGBM] [Info] Number of data points in the train set: 1247621, number of used features: 15
[LightGBM] [Info] Start training from score 0.000690
Training until validation scores don't improve for 20 rounds
[10]	training's l2: 32.9442	valid_1's l2: 26.7061
[20]	training's l2: 30.4108	valid_1's l2: 23.9198
[30]	training's l2: 28.7489	valid_1's l2: 21.7447
[40]	training's l2: 27.2822	valid_1's l2: 19.7268
[50]	training's l2: 26.57	valid_1's l2: 19.1251
[60]	training's l2: 25.7168	valid_1's l2: 17.7688
[70]	training's l2: 24.9023	valid_1's l2: 16.2299
[80]	training's l2: 24.2899	valid_1's l2: 15.0829
[90]	training's l2: 23.8792	valid_1's l2: 14.881
[100]	training's l2: 23.4628	valid_1's l2: 14.4968
[110]	training's l2: 22.9573	valid_1's l2: 14.0991
[120]	training's l2: 22.6149	valid_1's l2: 13.7081
[130]	training's l2: 22.2576	valid_