In [212]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import onnxmltools
from onnxconverter_common import *
from skl2onnx import to_onnx
from onnxmltools.convert import convert_lightgbm

In [213]:
data_df = pd.read_csv('emissions_data/CO2_Emissions_Canada.csv')

In [214]:
data_df.head()

Unnamed: 0,Make,Model,Vehicle Class,Engine Size(L),Cylinders,Transmission,Fuel Type,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),CO2 Emissions(g/km)
0,ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196
1,ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221
2,ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,136
3,ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255
4,ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244


In [215]:
X = data_df.drop('CO2 Emissions(g/km)', axis=1)
y = data_df['CO2 Emissions(g/km)'].to_frame()

In [216]:
# Convert object type to category type because LGBM needs it
X['Make'] = X['Make'].astype('category')
X['Model'] = X['Model'].astype('category')
X['Vehicle Class'] = X['Vehicle Class'].astype('category')
X['Transmission'] = X['Transmission'].astype('category')
X['Fuel Type'] = X['Fuel Type'].astype('category')

In [217]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=0)

In [218]:
model2 = lgb.LGBMRegressor()
model2.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000288 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1599
[LightGBM] [Info] Number of data points in the train set: 7015, number of used features: 11
[LightGBM] [Info] Start training from score 250.627655


In [219]:
# Lets identify the string columns and store them in an array
string_columns = [col for col in X_train.columns if X_train[col].dtype == 'object']

In [220]:
# Now lets create a lgb dataset
train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=string_columns)

In [221]:
# Define the parameters for the lgb algo
params = {'objective':'regression', 'metric':'mean_squared_error'}

# Train the model
model = lgb.train(params, train_data)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000330 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 935
[LightGBM] [Info] Number of data points in the train set: 7015, number of used features: 11
[LightGBM] [Info] Start training from score 250.627655


In [222]:
predictions = model.predict(X_test)

mse = mean_squared_error(y_test, predictions)
print(mse)


7.695593589284788


In [223]:
X_test.head(1)

Unnamed: 0,Make,Model,Vehicle Class,Engine Size(L),Cylinders,Transmission,Fuel Type,Fuel Consumption City (L/100 km),Fuel Consumption Hwy (L/100 km),Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg)
6307,PORSCHE,911 Carrera Cabriolet,MINICOMPACT,3.0,6,M7,Z,12.0,8.2,10.3,27


In [224]:
input_df = pd.DataFrame([["PORSCHE", "911 Carrera Cabriolet", "MINICOMPACT", 3.0, 6, "M7", "Z", 12.0, 8.2, 10.3, 27]], columns=['Make', 'Model', 'Vehicle Class', 'Engine Size(L)', 'Cylinders',
       'Transmission', 'Fuel Type', 'Fuel Consumption City (L/100 km)',
       'Fuel Consumption Hwy (L/100 km)', 'Fuel Consumption Comb (L/100 km)',
       'Fuel Consumption Comb (mpg)'])

input_df['Make'] = input_df['Make'].astype('category')
input_df['Model'] = input_df['Model'].astype('category')
input_df['Vehicle Class'] = input_df['Vehicle Class'].astype('category')
input_df['Transmission'] = input_df['Transmission'].astype('category')
input_df['Fuel Type'] = input_df['Fuel Type'].astype('category')

model.predict(input_df)
# X_test.columns

array([239.21120647])

In [225]:
model2.predict(input_df)

array([240.4379798])

In [226]:
# print(X[:1])
x_0 = X_test[0:1]

In [227]:
import onnx

In [228]:
onnxfile = 'lgbm-regressor2.onnx'
initial_type = [('float_input', FloatTensorType([None, X.shape[1]]))]
onnx_model = convert_lightgbm(model2, initial_types=initial_type, target_opset=1)
onnx.save(onnx_model, onnxfile)

# Get the input names from the ONNX model
input_names = [input.name for input in onnx_model.graph.input]

# Print the input names
for input_name in input_names:
    print("Input node:", input_name)

Input node: float_input


In [229]:
# onnxfile = 'lgbm-regressor.onnx'
# initial_type = [('float_input', FloatTensorType([None, X.shape[1]]))]
# onnx_model = convert_lightgbm(model2, initial_types=initial_type, target_opset=7)
# with open(onnxfile, "wb") as f:
#     f.write(onnx_model.SerializeToString())

The maximum opset needed by this model is only 1.


In [230]:
# # Convert model to onnx and save it
# features_count = len(X.columns)
# onnx_model = onnxmltools.convert_lightgbm(model, name='LightGBM', initial_types=[['input', FloatTensorType([0, features_count])]]) # , initial_types=[['input', FloatTensorType([0, features_count])]]
# with open("lgb.onnx", "wb") as f:
#     f.write(onnx_model.SerializeToString())
# onnxmltools.utils.save_model(onnx_model, 'emi_lgb.onnx')

The maximum opset needed by this model is only 8.
