In [None]:
#pip install scikit-learn

In [9]:
from sklearn import preprocessing

Encoding categorical data with scikit-learn

In [6]:
# Two toy observations; every column is one categorical feature.
X = [
    ['male', 'from US', 'uses Coinbase'],
    ['female', 'from UK', 'uses Binance'],
]

# fit() returns the estimator itself, so construction and fitting are chained.
encode = preprocessing.OrdinalEncoder().fit(X)

# Each value is replaced by the integer code learned for it in its column.
encode.transform([['male', 'from UK', 'uses Coinbase']])

array([[1., 0., 1.]])

In [8]:
# Learn the categories of every column of X while building the encoder.
one_hot = preprocessing.OneHotEncoder().fit(X)

# transform() yields a sparse matrix; toarray() makes it dense for display.
one_hot.transform(
    [
        ['male', 'from UK', 'uses Coinbase'],
        ['female', 'from US', 'uses Binance'],
    ]
).toarray()

array([[0., 1., 1., 0., 0., 1.],
       [1., 0., 0., 1., 1., 0.]])

In [10]:
# Categories learned for each input column during fit, listed in column order.
one_hot.categories_

[array(['female', 'male'], dtype=object),
 array(['from UK', 'from US'], dtype=object),
 array(['uses Binance', 'uses Coinbase'], dtype=object)]

Scaling data with scikit-learn

In [10]:
import numpy as np

# A seeded generator makes the sample matrix identical on every
# Restart & Run All, in line with the random_state=42 used elsewhere here.
rng = np.random.default_rng(42)

scaler = preprocessing.StandardScaler()
X = rng.random((3, 4))  # 3 samples x 4 features, uniform on [0, 1)
X

array([[0.72478401, 0.22285347, 0.94969243, 0.81965177],
       [0.08764589, 0.6394406 , 0.37524133, 0.05781259],
       [0.51633396, 0.11856665, 0.26239836, 0.41984248]])

In [33]:
# Learn the per-column statistics first, then standardise X with them.
scaler.fit(X)
X_scaled = scaler.transform(X)
X_scaled

array([[-0.20176096,  1.41156087, -0.94728207, -0.68150144],
       [-1.11133624, -0.6308009 , -0.43575025, -0.73240669],
       [ 1.3130972 , -0.78075997,  1.38303232,  1.41390814]])

In [34]:
# After standardisation each column should have mean ~0 (up to floating-point
# error) and variance 1. var() is used so that the printed "variance" label
# matches the statistic that is actually computed.
print(f'The scaled mean is: {X_scaled.mean(axis=0)}\nThe scaled variance is: {X_scaled.var(axis=0)}')

The scaled mean is: [ 0.00000000e+00  1.85037171e-16  2.22044605e-16 -7.40148683e-17]
The scaled variance is: [1. 1. 1. 1.]


In [36]:
# Normalizer rescales each row (sample) to unit norm; "l2" is its default.
# The estimator is stateless, so transform() works without a prior fit().
norm = preprocessing.Normalizer(norm="l2")

X_norm = norm.transform(X)
X_norm

array([[0.25424326, 0.84816742, 0.40149539, 0.23403812],
       [0.17532072, 0.5339978 , 0.74954022, 0.34971197],
       [0.3618432 , 0.20896422, 0.47882843, 0.77209248]])

Treating Missing Values with scikit-learn

In [3]:
from sklearn.impute import SimpleImputer

# Replace every NaN with the mean of the observed values in its column.
imputer = SimpleImputer(strategy="mean", missing_values=np.nan)
imputer.fit_transform(
    [
        [10, np.nan],
        [2, 4],
        [10, 9],
    ]
)

array([[10. ,  6.5],
       [ 2. ,  4. ],
       [10. ,  9. ]])

In [6]:
import pandas as pd

# Two categorical columns, each with one missing entry.
df = pd.DataFrame(
    [
        ['i', 'g'],
        ['o', 'r'],
        ['i', np.nan],
        [np.nan, 'r'],
    ],
    dtype='category',
)

# 'most_frequent' fills each gap with the column's most common value,
# which makes it usable on non-numeric data.
imputer = SimpleImputer(strategy='most_frequent')
imputer.fit_transform(df)

array([['i', 'g'],
       ['o', 'r'],
       ['i', 'r'],
       ['i', 'r']], dtype=object)

In [8]:
from sklearn.impute import MissingIndicator

# Imagine the 3's were imputed by the SimpleImputer()
Y = np.array([[3, 1], [5, 3], [9, 4], [3, 7]])

# Treat the value 3 as the "missing" marker and build a boolean mask of it.
missing = MissingIndicator(missing_values=3)
missing.fit_transform(Y)

array([[ True, False],
       [False,  True],
       [False, False],
       [ True, False]])

In [10]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Missing entries are estimated from the other feature, for at most 15
# imputation rounds; random_state pins the result.
imputer = IterativeImputer(random_state=42, max_iter=15)
imputer.fit_transform(
    [
        [1, 5],
        [4, 6],
        [2, np.nan],
        [np.nan, 8],
    ]
)

array([[1.        , 5.        ],
       [4.        , 6.        ],
       [2.        , 6.33291692],
       [2.42391423, 8.        ]])

Splitting data into training and testing sets with scikit-learn

In [5]:
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split

# Create a random dataset (seeded so the same blobs are generated on every run)
X, y = make_blobs(n_samples=1500, random_state=42)

# Split the data: 80% for training, 20% for testing.
# The seed fixes which rows land in which set, keeping the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

print(f'X training set {X_train.shape}\nX testing set {X_test.shape}\ny training set {y_train.shape}\ny testing set {y_test.shape}')

X training set (1200, 2)
X testing set (300, 2)
y training set (1200,)
y testing set (300,)


In [7]:
from sklearn.datasets import make_classification
from collections import Counter

# Create an imbalanced dataset: weights=[0.95] puts about 95% of samples in class 0
X, y = make_classification(
    n_samples=1000,
    weights=[0.95],
    flip_y=0,
    random_state=42,
)
print(f'Number of y before splitting is {Counter(y)}')

# Split the data the usual way (no stratification), so the class ratio
# is free to drift between the training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)
print(f'Number of y in the training set after splitting is {Counter(y_train)}')
print(f'Number of y in the testing set after splitting is {Counter(y_test)}')

Number of y before splitting is Counter({0: 950, 1: 50})
Number of y in the training set after splitting is Counter({0: 757, 1: 43})
Number of y in the testing set after splitting is Counter({0: 193, 1: 7})


In [8]:
# Split the data by stratification: stratify=y keeps the class proportions
# of y the same in the training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    stratify=y,
    random_state=42,
)
print(f'Number of y in the training set after splitting is {Counter(y_train)}')
print(f'Number of y in the testing set after splitting is {Counter(y_test)}')

Number of y in the training set after splitting is Counter({0: 760, 1: 40})
Number of y in the testing set after splitting is Counter({0: 190, 1: 10})
