# Saving Models

In [13]:
# Initially, let's create one scikit-learn model. 
# We'll use a Logistic Regression model and the Iris dataset. 
# Let's import the needed libraries, load the data, and split it in training and test sets.

import numpy as np
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from pathlib import Path

data_path = './data'

# Load and split data
data = load_iris()
Xtrain, Xtest, Ytrain, Ytest = train_test_split(data.data, data.target, test_size=0.3, random_state=4)

# Create a model
model = LogisticRegression(C=0.1, 
                           max_iter=20, 
                           fit_intercept=True, 
                           n_jobs=3, 
                           solver='liblinear')
model.fit(Xtrain, Ytrain)


# And our resulting model
LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
    intercept_scaling=1, max_iter=20, multi_class='ovr', n_jobs=3,
    penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
    verbose=0, warm_start=False)



LogisticRegression(C=0.1, max_iter=20, multi_class='ovr', n_jobs=3,
                   solver='liblinear')

## Pickle

In [14]:
import pickle

# Save to file in the current working directory
pkl_filename = "pickle_model_logistic_regression.pkl"
with open(Path(data_path) / pkl_filename, 'wb') as file:
    pickle.dump(model, file)

# Load from file
with open(Path(data_path) / pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)
    
# Calculate the accuracy score and predict target values
score = pickle_model.score(Xtest, Ytest)
print("Test score: {0:.2f} %".format(100 * score))
Ypredict = pickle_model.predict(Xtest)

Test score: 91.11 %


## Joblib

In [15]:
import joblib

# Save to file in the current working directory
joblib_file = "joblib_model.pkl"
joblib.dump(model, Path(data_path) / joblib_file)

# Load from file
joblib_model = joblib.load(joblib_file)

# Calculate the accuracy and predictions
score = joblib_model.score(Xtest, Ytest)
print("Test score: {0:.2f} %".format(100 * score))
Ypredict = pickle_model.predict(Xtest)

Test score: 91.11 %


### New

In [16]:
data_test = np.random.randn(1000000).reshape(250000,4)
data_target_test = np.random.randint(0,2,250000)

In [17]:
Xtrain, Xtest, ytrain, ytest = train_test_split(data_test, data_target_test, test_size=0.3)
print(Xtrain.shape, Xtest.shape, ytrain.shape, ytest.shape)

(175000, 4) (75000, 4) (175000,) (75000,)


In [18]:
model_test = LogisticRegression(C=0.1, 
                           max_iter=20, 
                           fit_intercept=True, 
                           n_jobs=1, 
                           solver='liblinear')

model_test.fit(Xtrain, ytrain)

LogisticRegression(C=0.1, max_iter=20, n_jobs=1, solver='liblinear')

##### Pickle

In [19]:
pkl_test = 'pkl_test.pkl'
with open(pkl_test, 'wb') as file:
    pickle.dump(model_test, Path(data_path) / file)

In [20]:
with open(Path(data_path) / pkl_test, 'rb') as file:
    pickle_test_model = pickle.load(file)

In [21]:
# Calculate the accuracy and predictions
score = pickle_test_model.score(Xtest, ytest)
print("Test score: {0:.2f} %".format(100 * score))
Ypredict = pickle_test_model.predict(Xtest)

Test score: 34.36 %


##### Joblib

In [22]:
# Save to file in the current working directory
joblib_test = "job_lib.pkl"
joblib.dump(model_test, Path(data_path) / joblib_test)

['job_lib.pkl']

In [23]:
# Load from file
joblib_test_model = joblib.load(Path(data_path) / joblib_test)

In [24]:
# Calculate the accuracy and predictions
score = joblib_test_model.score(Xtest, ytest)
print("Test score: {0:.2f} %".format(100 * score))
Ypredict = joblib_test_model.predict(Xtest)

Test score: 50.33 %


### Pipelines

#### Construction

In [1]:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.decomposition import PCA
estimators = [('reduce_dim', PCA()), ('clf', SVC())]
pipe = Pipeline(estimators)
pipe

Pipeline(steps=[('reduce_dim', PCA()), ('clf', SVC())])

In [2]:
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import Binarizer
make_pipeline(Binarizer(), MultinomialNB())

Pipeline(steps=[('binarizer', Binarizer()), ('multinomialnb', MultinomialNB())])

#### Accessing Steps

In [3]:
pipe.steps[0]
pipe[0]
pipe['reduce_dim']

PCA()

In [4]:
pipe.named_steps.reduce_dim is pipe['reduce_dim']

True

In [5]:
pipe[:1]
pipe[-1:]

Pipeline(steps=[('clf', SVC())])

#### Nested Parameters

In [6]:
pipe.set_params(clf__C=10)

Pipeline(steps=[('reduce_dim', PCA()), ('clf', SVC(C=10))])

In [7]:
from sklearn.model_selection import GridSearchCV
param_grid = dict(reduce_dim__n_components=[2, 5, 10],
                  clf__C=[0.1, 10, 100])
grid_search = GridSearchCV(pipe, param_grid=param_grid)

In [8]:
from sklearn.linear_model import LogisticRegression
param_grid = dict(reduce_dim=['passthrough', PCA(5), PCA(10)],
                  clf=[SVC(), LogisticRegression()],
                  clf__C=[0.1, 10, 100])
grid_search = GridSearchCV(pipe, param_grid=param_grid)

In [9]:
pipe[0]

PCA()

In [10]:
pipe['reduce_dim']

PCA()

In [11]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
iris = load_iris()
pipe = Pipeline(steps=[
   ('select', SelectKBest(k=2)),
   ('clf', LogisticRegression())])
pipe.fit(iris.data, iris.target)

pipe[:-1].get_feature_names_out()

array(['x2', 'x3'], dtype=object)

In [12]:
pipe[:-1].get_feature_names_out(iris.feature_names)

array(['petal length (cm)', 'petal width (cm)'], dtype=object)