In [26]:
import random

# Seed the global generator so the draw below is the same on every run.
random.seed(42)

# First draw after seeding; displayed as the cell's output.
random.random()

0.6394267984578837

Models need to start from the same random state for:

- fair comparison
- **reproducibility**


In [33]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression   # <-- regression
from sklearn.linear_model import LogisticRegression # <-- classification

# Toy training set: 2 samples (rows), 3 features (columns).
X = [
    [1, 2, 3],
    [11, 12, 13],
]
y = [0, 1]  # class label for each row of X

# Fixed random_state so repeated fits start from the same random state.
clf = LogisticRegression(random_state=0)
clf.fit(X, y)

In [44]:
# Two unseen samples with the same three features as the training rows.
x_new1, x_new2 = [4, 5, 6], [14, 15, 16]

# predict takes a 2-D batch, so the samples are wrapped in a list.
y_pred = clf.predict([x_new1, x_new2])
y_pred

array([0, 1])

In [60]:
from sklearn.preprocessing import StandardScaler

X = [[0, 15], [1, -10]]

# Long form: learn the scaling values from X, then apply them to X.
X_scaled = StandardScaler().fit(X).transform(X)
# Short form: fit_transform performs both steps in a single call.
X_scaled = StandardScaler().fit_transform(X)
X_scaled

array([[-1.,  1.],
       [ 1., -1.]])

Question: Why are `fit` and `transform` separate?

Answer: To prevent data leakage. We should fit on the training data only, and then use the fitted parameters to transform the test data. If we instead fit on the entire dataset (including the test data), we leak information from the test set into the model.

In [56]:
# 2x2 toy matrix, echoed so the raw (unscaled) values are visible.
X = [[0, 15], [1, -10]]
X

[[0, 15], [1, -10]]

In [57]:
# NOTE(review): `scaler` is not defined in any visible cell, so it presumably
# comes from an earlier or removed cell. The recorded output below looks like
# min-max scaling rather than standardization -- TODO confirm which fitted
# scaler is intended and define it explicitly before this cell.
scaler.transform(X)

array([[0., 1.],
       [1., 0.]])

In [67]:
import pandas as pd

# Small mixed-type table: one categorical column, one free-text column,
# and three numeric columns.
X = pd.DataFrame({
    'city': ['London', 'London', 'Paris', 'Sallisaw'],
    'title': [
        "His Last Bow",
        "How Watson Learned the Trick",
        "A Moveable Feast",
        "The Grapes of Wrath",
    ],
    'expert_rating': [5, 3, 4, 5],
    'user_rating': [4, 5, 4, 3],
    'area': [10, 5, 10, 20],
})

X

Unnamed: 0,city,title,expert_rating,user_rating,area
0,London,His Last Bow,5,4,10
1,London,How Watson Learned the Trick,3,5,5
2,Paris,A Moveable Feast,4,4,10
3,Sallisaw,The Grapes of Wrath,5,3,20


In [70]:
# Keep only the numeric columns (drops the text columns 'city' and 'title').
X.select_dtypes(include=['number'])

Unnamed: 0,expert_rating,user_rating,area
0,5,4,10
1,3,5,5
2,4,4,10
3,5,3,20


In [72]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Each entry is (name, transformer, column selection). Columns that no entry
# selects are discarded because remainder='drop'.
column_trans = ColumnTransformer(
    transformers=[
        ('categories', OneHotEncoder(dtype='int'), ['city']),
        # 'title' is passed as a bare string (not a list): the text
        # vectorizer is given a single 1-D column of documents.
        ('title_bow', CountVectorizer(), 'title'),
        ('area_scaled', StandardScaler(), ['area']),
    ],
    remainder='drop',
    verbose_feature_names_out=False,
)

column_trans.fit(X)

column_trans.get_feature_names_out()

# Only this last expression is displayed as the cell's output.
column_trans.transform(X)

array([[ 1.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        , -0.22941573],
       [ 1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  1.        ,  1.        ,  1.        ,
         0.        , -1.14707867],
       [ 0.        ,  1.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        , -0.22941573],
       [ 0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  1.        ,  0.        ,  0.        ,
         1.        

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score

In [3]:
# Chain the scaler and the classifier into one estimator object.
pipe = make_pipeline(StandardScaler(), LogisticRegression())

<img src="https://media.licdn.com/dms/image/D4D12AQF5vivFTAdZjQ/article-cover_image-shrink_600_2000/0/1700911428185?e=2147483647&v=beta&t=RaJufpE5-ZMvIMZFVTy4dNtvnKHVgmThtTORx-_qu6Q" height="150">

In [10]:
# load the iris dataset as a (features, labels) pair; the train/test split
# itself is done in a separate cell
X, y = load_iris(return_X_y=True)
# preview the first five samples and their labels
X[:5], y[:5]

(array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2]]),
 array([0, 0, 0, 0, 0]))

In [11]:
# Hold out 25% of the samples for testing; fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)


In [17]:
# fit the whole pipeline, using the training split only
pipe.fit(X_train, y_train)

In [30]:
# predict labels for the held-out test split with the fitted pipeline
y_pred = pipe.predict(X_test)

In [31]:
# Predictions on the first line, true labels on the second, for eyeballing.
print(y_pred[:35], y_test[:35], sep="\n")


[2 1 0 2 0 2 0 1 1 1 2 1 1 1 1 0 1 1 0 0 2 1 0 0 2 0 0 1 1 0 2 1 0 2 2]
[2 1 0 2 0 2 0 1 1 1 2 1 1 1 1 0 1 1 0 0 2 1 0 0 2 0 0 1 1 0 2 1 0 2 2]


In [32]:
# we can now use it like any other estimator
# accuracy_score's signature is (y_true, y_pred): ground truth first,
# predictions second. Accuracy itself is symmetric, so the value is unchanged,
# but keeping the documented order avoids bugs with asymmetric metrics.
accuracy_score(y_test, y_pred)

0.9736842105263158

In [38]:
from sklearn.datasets import make_regression

# Synthetic regression problem: 100 samples, 3 features, fixed seed.
X, y = make_regression(
    n_samples=100,
    n_features=3,
    random_state=0,
)

# Peek at the first five rows and their targets.
X[:5], y[:5]

(array([[-0.63432209,  0.3024719 , -0.36274117],
        [ 1.78587049,  0.01050002,  0.12691209],
        [ 1.71334272, -0.06824161, -0.74475482],
        [-0.31932842,  0.14195316,  0.69153875],
        [ 1.53277921, -0.18718385,  1.46935877]]),
 array([-8.52601181, 72.61144649, 59.30316246,  0.54111782, 58.51399448]))

In [39]:
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate

lr = LinearRegression()

# Larger synthetic dataset (1000 samples), same fixed seed.
X, y = make_regression(n_samples=1000, random_state=0)

# cv is left at its default, which is 5-fold cross-validation.
result = cross_validate(estimator=lr, X=X, y=y)

# Mean R^2 across folds; it is high because the dataset is easy.
result['test_score'].mean()

1.0

In [40]:
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from scipy.stats import randint

In [41]:
# Load the data as a (features, target) pair, then hold out a test split
# with a fixed seed so the split is repeatable.
X, y = fetch_california_housing(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0
)

In [43]:
# Search space: each hyperparameter is drawn from an integer distribution
# (scipy's randint excludes the upper bound).
param_distributions = {
    'n_estimators': randint(1, 5),
    'max_depth': randint(5, 10),
}

# Build the randomized search around a seeded forest, trying 5 sampled
# configurations, then fit it on the training split.
search = RandomizedSearchCV(
    RandomForestRegressor(random_state=0),
    param_distributions,
    n_iter=5,
    random_state=0,
)
search.fit(X_train, y_train)

# Best configuration found by the search.
search.best_params_

{'max_depth': 9, 'n_estimators': 4}

In [49]:
# the search object now acts like a normal random forest estimator
# with max_depth=9 and n_estimators=4
# Evaluate it on the held-out split: for a regressor, score() returns R^2.
# (The recorded output of this cell is a single float, i.e. a score, not the
# array that predict() would return.)
search.score(X_test, y_test)

0.735363411343253