## Building the model and storing it as a pickle file

In [36]:
# Not required now
#!pip install -U scikit-learn==0.24

In [37]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
import pprint
import os

In [38]:
# Display the installed xgboost version for the record.
# NOTE(review): no cell visible in this notebook uses xgboost -- the regressor
# trained below is scikit-learn's GradientBoostingRegressor. Confirm whether
# this import is still needed.
import xgboost
xgboost.__version__

'1.5.2'

In [39]:
# Display the installed scikit-learn version; the same value is passed as
# model_framework_version when the model is registered at the end of the notebook.
import sklearn
sklearn.__version__

'1.5.1'

## Loading the dataset: Used Car Price Prediction

In [40]:
!pip install azure-ai-ml
!pip install azure-identity



In [41]:
import pandas as pd
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# Build the workspace client from the local config file, authenticating with
# whatever credential DefaultAzureCredential resolves.
ml_client = MLClient.from_config(
    credential=DefaultAzureCredential(),
)

# Look up version "1" of the "UsedCarDS" data asset and read the CSV found at
# its path into a DataFrame.
data_asset = ml_client.data.get("UsedCarDS", version="1")
cars_df = pd.read_csv(filepath_or_buffer=data_asset.path)

Found the config file in: /config.json


In [42]:
#cars_df = pd.read_csv('./used_car.csv')

In [43]:
# Preview the first five rows of the loaded data.
cars_df.head(5)

Unnamed: 0,Location,Fuel_Type,Transmission,Owner_Type,Seats,Price,mileage_new,engine_new,power_new,age,make,model,KM_Driven
0,Chennai,Petrol,Manual,First,5.0,4.5,18.2,1199,88.7,9,honda,jazz,46
1,Chennai,Diesel,Manual,First,7.0,6.0,20.77,1248,88.76,8,maruti,ertiga,87
2,Jaipur,Diesel,Manual,First,5.0,3.5,23.08,1461,63.1,7,nissan,micra,86
3,Chennai,Diesel,Manual,Second,5.0,1.95,22.3,1248,74.0,8,tata,indica,65
4,Jaipur,Diesel,Manual,First,5.0,5.6,25.2,1248,74.0,5,maruti,swift,64


In [44]:
# Predictor columns fed to the pipeline. Note that the car's 'model' column IS
# part of this list; it is also listed among the categorical features, so it is
# one-hot encoded together with them.
x_columns = [
    'KM_Driven', 'Fuel_Type', 'age',
    'Transmission', 'Owner_Type', 'Seats',
    'make', 'mileage_new', 'engine_new', 'model',
    'power_new', 'Location',
]

In [45]:
# Shape of the loaded frame before column selection and missing-value filtering.
cars_df.shape

(3092, 13)

In [46]:
# Restrict the frame to the predictors plus the 'Price' target, then discard
# every row that has a missing value in any of those columns.
cars_df = cars_df[[*x_columns, 'Price']].dropna()

In [47]:
# Shape after keeping only the modelling columns and dropping rows with missing values.
cars_df.shape

(3091, 13)

## Identifying numerical and categorical features

In [48]:
# Columns treated as categorical; they are routed to the one-hot encoder in the
# preprocessing step.
cat_features = [
    'Fuel_Type',
    'Transmission',
    'Owner_Type',
    'model',
    'make',
    'Location',
]

In [49]:
# Numerical features are every predictor that is not categorical. A list
# comprehension is used instead of a set difference because set iteration order
# for strings can change between kernel sessions; this keeps the x_columns
# order, so the column order given to the ColumnTransformer is identical on
# every run.
num_features = [col for col in x_columns if col not in cat_features]

## Split the dataset

In [50]:
# Split predictors and the 'Price' target into train/test parts: 80% of the
# rows go to training, and the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    cars_df[x_columns],
    cars_df['Price'],
    train_size=0.8,
    random_state=100,
)

## Creating the pipeline for the deployment

In [51]:
# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones. handle_unknown='ignore' lets prediction proceed when a
# category that was absent from the training split appears at predict time.
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[('onehot',
                                           OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_features),
        ('cat', categorical_transformer, cat_features),
    ])

# random_state pins the estimator's internal randomness so that re-running the
# notebook reproduces the same fitted model and the same RMSE.
params = { "n_estimators": 400,
           "max_depth": 4,
           "random_state": 100 }

# NOTE: despite the variable name, this is scikit-learn's
# GradientBoostingRegressor, not an XGBoost model.
xgb_regressor = GradientBoostingRegressor(**params)

# Single deployable object: preprocessing followed by the regressor.
reg = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', xgb_regressor)])

reg.fit(x_train, y_train)

# Root-mean-squared error of the fitted pipeline on the held-out test split.
rmse = np.sqrt(mean_squared_error(y_test, reg.predict(x_test)))

In [52]:
# Show the first row of the test predictors as an example of the input the pipeline receives.
x_test[0:1]

Unnamed: 0,KM_Driven,Fuel_Type,age,Transmission,Owner_Type,Seats,make,mileage_new,engine_new,model,power_new,Location
1588,75,Diesel,9,Manual,Second,5.0,maruti,21.1,1248,ritz,73.9,Mumbai


In [53]:
# RMSE of the fitted pipeline on the held-out test split, computed in the training cell.
print(rmse)

0.6427824527624755


## Creating the Pickle File and Storing It

In [54]:
!rm carmodel.pkl

In [55]:
from joblib import dump

In [56]:
# Serialise the fitted pipeline (preprocessing + regressor) with joblib.
# The target directory is created first because dump fails if ./model is missing.
os.makedirs("./model", exist_ok=True)
dump(reg, "./model/carmodel.pkl")

['./model/carmodel.pkl']

In [57]:
# List the working directory (including hidden entries) to confirm the model folder is present.
!ls -al 

total 473
drwxrwxrwx 2 root root      0 Apr 29  2024  .
drwxrwxrwx 2 root root      0 Apr 29  2024  ..
-rwxrwxrwx 1 root root    315 Jul 31 04:50  .amlignore
-rwxrwxrwx 1 root root    315 Jul 31 04:50  .amlignore.amltmp
drwxrwxrwx 2 root root      0 Jul 31 04:50  .ipynb_aml_checkpoints
-rwxrwxrwx 1 root root   3143 Jul 31 16:02 'Azure ML Invoke 1.0.ipynb'
-rwxrwxrwx 1 root root  41517 Nov 21 00:37 'Azure Model Used Car Sklearn.ipynb'
-rwxrwxrwx 1 root root  34606 Nov 21 00:55 'Model Build.ipynb'
-rwxrwxrwx 1 root root  27519 Nov 21 01:12 'Model Deploy.ipynb'
-rwxrwxrwx 1 root root  15655 Nov 21 00:58 'Model Register.ipynb'
-rwxrwxrwx 1 root root  10981 Nov 21 00:20 'Usedcar Model Deploy.ipynb'
-rwxrwxrwx 1 root root   3143 Jul 31 16:02 'azure ml invoke 1.0.ipynb.amltmp'
-rwxrwxrwx 1 root root  41517 Nov 21 00:37 'azure model used car sklearn.ipynb.amltmp'
drwxrwxrwx 2 root root      0 Nov 21 00:57  model
-rwxrwxrwx 1 root root  34606 Nov 21 00:55 'model build.ipynb.amltm

In [58]:
import sklearn

from azureml.core import Model
from azureml.core.workspace import Workspace
from azureml.core.resource_configuration import ResourceConfiguration

# Connect to the Azure ML workspace described by the local config file.
ws = Workspace.from_config()

# Upload the serialised pipeline and register it in the workspace. The
# description names the estimator that was actually trained (scikit-learn's
# GradientBoostingRegressor) so the registry metadata matches the artefact.
model = Model.register(workspace=ws,
                       model_name='usedcarprice_gbm',                # Name of the registered model in your workspace.
                       model_path='./model/carmodel.pkl',  # Local file to upload and register as a model.
                       model_framework=Model.Framework.SCIKITLEARN,  # Framework used to create the model.
                       model_framework_version=sklearn.__version__,  # Version of scikit-learn used to create the model.
                       description='Gradient Boosting regression model (scikit-learn) to predict the price of a used car.',
                       tags={'area': 'usedcar', 'type': 'regression'})

Registering model usedcarprice_gbm
