# Serving your scikit-learn Model as a Prediction API

[Advertising Dataset](https://www.kaggle.com/datasets/ashydv/advertising-dataset): Use the advertising dataset given in ISLR to analyse the relationship between the advertising channels (TV, Radio, Newspaper) and sales.

In [1]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
# Single random seed reused by every train/test split and every forest below,
# so a Restart & Run All reproduces the same numbers.
SEED = 42

## Data Preprocessing

In [3]:
# Load the advertising data and preview the first five rows.
adv_df = pd.read_csv(
    'dataset/advertising.csv',
)
adv_df.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,9.3
3,151.5,41.3,58.5,18.5
4,180.8,10.8,58.4,12.9


|  | TV | Radio | Newspaper | Sales |
| -- | -- | -- | -- | -- |
| 0 | 230.1 | 37.8 | 69.2 | 22.1 |
| 1 | 44.5 | 39.3 | 45.1 | 10.4 |
| 2 | 17.2 | 45.9 | 69.3 | 9.3 |
| 3 | 151.5 | 41.3 | 58.5 | 18.5 |
| 4 | 180.8 | 10.8 | 58.4 | 12.9 |

In [4]:
# Summary statistics per column; the Sales mean and std are the yardsticks
# the MAE / RMSE values are compared against further down.
adv_df.describe()

Unnamed: 0,TV,Radio,Newspaper,Sales
count,200.0,200.0,200.0,200.0
mean,147.0425,23.264,30.554,14.0225
std,85.854236,14.846809,21.778621,5.217457
min,0.7,0.0,0.3,1.6
25%,74.375,9.975,12.75,10.375
50%,149.75,22.9,25.75,12.9
75%,218.825,36.525,45.1,17.4
max,296.4,49.6,114.0,27.0


|  | TV | Radio | Newspaper | Sales |
| -- | -- | -- | -- | -- |
| count | 200.000000 | 200.000000 | 200.000000 | 200.000000 |
| mean | 147.042500 | 23.264000 | 30.554000 | 14.022500 |
| std | 85.854236 | 14.846809 | 21.778621 | 5.217457 |
| min | 0.700000 | 0.000000 | 0.300000 | 1.600000 |
| 25% | 74.375000 | 9.975000 | 12.750000 | 10.375000 |
| 50% | 149.750000 | 22.900000 | 25.750000 | 12.900000 |
| 75% | 218.825000 | 36.525000 | 45.100000 | 17.400000 |
| max | 296.400000 | 49.600000 | 114.000000 | 27.000000 |

In [5]:
# Separate the target column (Sales) from the predictor columns.
y = adv_df['Sales']
X = adv_df.drop(columns=['Sales'])

In [6]:
# Two-stage split: first hold out 30% of the rows, then halve that hold-out
# into a validation set and a test set.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=SEED)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=SEED)

# Row counts: full data, train, validation, test.
print(*map(len, (X, X_train, X_val, X_test)))
# 200 140 30 30

200 140 30 30


## Model Training

In [7]:
# Baseline: a small forest of 3 trees, seeded for reproducibility.
model = RandomForestRegressor(n_estimators=3, random_state=SEED)

In [8]:
# Fit the baseline on the training split only; the validation and test rows stay unseen.
model.fit(X_train, y_train)

### Model Validation

In [9]:
# Predict on the validation split; these predictions are scored in the next cell
# and serve as the baseline when tuning hyper-parameters.
val_preds = model.predict(X_val)

In [10]:
# Validation error of the baseline model (RMSE is the square root of the MSE).
val_mae = mean_absolute_error(y_val, val_preds)
val_rmse = mean_squared_error(y_val, val_preds) ** 0.5
print(val_mae)
print(val_rmse)
# 0.7566666666666663 MAE should be small compared to MEAN 14.022500
# 0.9330753611738063 RMSE should be small compared to STD 5.217457

# Show the Sales statistics next to the errors for comparison.
adv_df['Sales'].describe()

0.7566666666666663
0.9330753611738063


count    200.000000
mean      14.022500
std        5.217457
min        1.600000
25%       10.375000
50%       12.900000
75%       17.400000
max       27.000000
Name: Sales, dtype: float64

| | |
| -- | -- |
| count    | 200.000000 |
| mean      | 14.022500 |
| std        | 5.217457 |
| min        | 1.600000 |
| 25%       | 10.375000 |
| 50%       | 12.900000 |
| 75%       | 17.400000 |
| max       | 27.000000 |
_Name: Sales, dtype: float64_

In [11]:
# try to improve the model by adding estimators (3 -> 30 trees)
model2 = RandomForestRegressor(n_estimators=30, random_state=SEED).fit(X_train, y_train)
val_preds2 = model2.predict(X_val)

# Score on the same validation split as the baseline so the numbers are comparable.
val_mae2 = mean_absolute_error(y_val, val_preds2)
val_rmse2 = mean_squared_error(y_val, val_preds2) ** 0.5
print(val_mae2)
print(val_rmse2)
# 0.483111111111111 MAE should be small compared to MEAN 14.022500
# 0.6177971619660723 RMSE should be small compared to STD 5.217457

0.483111111111111
0.6177971619660723


### Model Testing

In [12]:
# Final check of the tuned model (model2) on the test split, which was used
# neither for fitting nor for choosing n_estimators.
test_preds = model2.predict(X_test)

In [13]:
# Test-set error of the tuned model.
test_mae = mean_absolute_error(y_test, test_preds)
test_rmse = mean_squared_error(y_test, test_preds) ** 0.5
print(test_mae)
print(test_rmse)
# 0.5649999999999998 MAE should be small compared to MEAN 14.022500
# 0.6758333675845999 RMSE should be small compared to STD 5.217457

0.5649999999999998
0.6758333675845999


## Model Saving

In [14]:
# Production model: same hyper-parameters as model2, but trained on every
# available row (train + validation + test) instead of the training split alone.
production_model = RandomForestRegressor(n_estimators=30, random_state=SEED)

# fit production model to entire dataset
production_model.fit(X, y)

In [15]:
# save model for deployment
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists; otherwise a fresh checkout fails on Restart & Run All.
Path('models').mkdir(parents=True, exist_ok=True)
joblib.dump(production_model, 'models/production_model.pkl')

['models/production_model.pkl']

In [16]:
# Feature names, in the column order the model was trained on.
X.columns.tolist()
# ['TV', 'Radio', 'Newspaper']

['TV', 'Radio', 'Newspaper']

In [17]:
# Persist the training column order next to the model so whoever loads the
# model can rebuild inputs with matching feature names.
joblib.dump(X.columns.tolist(), 'models/production_model_column_names.pkl')

['models/production_model_column_names.pkl']

## Model Loading

In [18]:
# Reload the saved feature names to confirm the artifact round-trips.
# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.
column_names = joblib.load('models/production_model_column_names.pkl')
column_names
# ['TV', 'Radio', 'Newspaper']

['TV', 'Radio', 'Newspaper']

In [19]:
# NOTE: joblib.load unpickles the file and can therefore run arbitrary code;
# only load model artifacts produced by this notebook or another trusted source.
loaded_model = joblib.load('models/production_model.pkl')

# The model was fitted on a DataFrame, so predict on DataFrames with the same
# column names. Passing bare lists makes scikit-learn warn that "X does not
# have valid feature names" and skips its feature-name consistency check.
known_sample = X.iloc[[42]]  # double brackets keep a one-row DataFrame
new_sample = pd.DataFrame([[180.8, 10.8, 58.4]], columns=column_names)

print(loaded_model.predict(known_sample))
print(loaded_model.predict(new_sample))
# [20.68666667] TRUE 20.7
# [13.28] TRUE 12.9

[20.68666667]
[13.28]


