In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBRegressor
import joblib

In [2]:
df = pd.read_csv('data_model.csv')
df = df[['Carrosserie', 'masse_ordma_min', 'masse_ordma_max', 'co2']]

In [3]:
le = LabelEncoder()
df['Carrosserie'] = le.fit_transform(df['Carrosserie'])
df['Carrosserie'].value_counts()

Carrosserie
6     46107
0      4411
1      1226
10     1080
5       875
2       552
9       319
3       250
7       145
8        59
4         4
Name: count, dtype: int64

In [4]:
# Get the original value of 6
original_value = le.inverse_transform([6])#[0]
original_value

array(['MINIBUS'], dtype=object)

In [5]:
df['Carrosserie'].unique()

array([ 0,  1,  5,  2, 10,  3,  7,  9,  8,  6,  4])

In [6]:
le.inverse_transform(df['Carrosserie'].unique())

array(['BERLINE', 'BREAK', 'COUPE', 'CABRIOLET', 'TS TERRAINS/CHEMINS',
       'COMBISPACE', 'MINISPACE', 'MONOSPACE COMPACT', 'MONOSPACE',
       'MINIBUS', 'COMBISPCACE'], dtype=object)

In [7]:
X = df.drop(['co2'] ,axis =1)
y = df['co2']

In [8]:
# Extract the column as a numpy array
column1 = X['masse_ordma_min'].values.reshape(-1, 1)
column2 = X['masse_ordma_max'].values.reshape(-1, 1)


scaler = StandardScaler()

X['masse_ordma_min'] = scaler.fit_transform(column1)
X['masse_ordma_max'] = scaler.fit_transform(column2)

print(X.sample(3))

       Carrosserie  masse_ordma_min  masse_ordma_max
27088            6         0.178884        -0.368863
46308            6        -0.252470         1.117833
17580            6        -0.092835        -0.628445


In [9]:
# Create an instance of the XGBRegressor class with the specified hyperparameters
top_model = XGBRegressor(learning_rate=0.1, max_depth=5, n_estimators=50)

# Train the top_model on the training data and labels
top_model.fit(X, y)

print("R2 score: {:.4f}".format(top_model.score(X, y)))

R2 score: 0.6411


In [10]:
# Create a dictionary containing all the objects
objects = {
    'X': X,
    'y': y,
    'Encoder': le,
    'Scaler': scaler,
    'Model': top_model
}

# Save the dictionary to a .pkl file
joblib.dump(objects, 'model.pkl')

['model.pkl']

In [11]:
# Load the dictionary from the .pkl file
objects = joblib.load('model.pkl')

# Access the individual objects
X = objects['X']
y = objects['y']
le = objects['Encoder']
scaler = objects['Scaler']
top_model = objects['Model']

In [None]:
################################  TESTS  ################################

In [12]:
# Create a new DataFrame with feature names
# ['MINIBUS', 2186.0, 2185.0]  >>>    203.0
new_data = pd.DataFrame([['MINIBUS', 2076.0, 2185.0]], columns=['Carrosserie', 'masse_ordma_min', 'masse_ordma_max'])

# Print the new DataFrame
print(new_data)

  Carrosserie  masse_ordma_min  masse_ordma_max
0     MINIBUS           2076.0           2185.0


In [13]:
# Encode the Carrosserie column
new_data['Carrosserie'] = le.transform(new_data['Carrosserie'])

# Scale the masse_ordma_min column
column = new_data['masse_ordma_min'].values.reshape(-1, 1)
scaled_column = scaler.transform(column)
new_data['masse_ordma_min'] = scaled_column

# Scale the masse_ordma_max column
column2 = new_data['masse_ordma_max'].values.reshape(-1, 1)
scaled_column2 = scaler.transform(column2)
new_data['masse_ordma_max'] = scaled_column2

print(new_data.head())

   Carrosserie  masse_ordma_min  masse_ordma_max
0            6        -0.626085        -0.368863


In [14]:
predictions = top_model.predict(new_data)

# Print the predictions
print(predictions)

[240.91302]
