In [1]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

## Create dataset

In [2]:
# Generate a tiny synthetic binary-classification dataset: 10 samples, 3 features
# (2 informative, 0 redundant). Without random_state the data would differ on every
# run; fixing it to 42 keeps the notebook reproducible.
data = make_classification(n_samples=10, n_features=3, n_informative=2, n_redundant=0, n_classes=2, random_state=42) # fixed seed for reproducibility

In [3]:
# Display the raw return value: a (features, labels) tuple of arrays
data

(array([[-0.58723065, -1.97171753, -1.05771093],
        [ 1.06833894, -0.97007347,  0.2088636 ],
        [-1.14021544, -0.83879234,  0.82254491],
        [-0.9382051 , -0.54304815, -1.22084365],
        [ 1.72725924, -1.18582677, -1.95967012],
        [-2.8953973 ,  1.97686236,  0.19686124],
        [-1.96287438, -0.99225135, -1.32818605],
        [ 1.89969252,  0.83444483,  0.17136828],
        [-0.72063436, -0.96059253, -0.01349722],
        [ 1.77736657,  1.51157598,  0.73846658]]),
 array([0, 1, 0, 1, 1, 0, 0, 1, 0, 1]))

In [4]:
# Container type, number of elements in the tuple, and number of rows in the feature array
type(data), len(data), len(data[0])

(tuple, 2, 10)

In [5]:
# The first tuple element is the feature matrix, stored as a NumPy array
data[0], type(data[0])

(array([[-0.58723065, -1.97171753, -1.05771093],
        [ 1.06833894, -0.97007347,  0.2088636 ],
        [-1.14021544, -0.83879234,  0.82254491],
        [-0.9382051 , -0.54304815, -1.22084365],
        [ 1.72725924, -1.18582677, -1.95967012],
        [-2.8953973 ,  1.97686236,  0.19686124],
        [-1.96287438, -0.99225135, -1.32818605],
        [ 1.89969252,  0.83444483,  0.17136828],
        [-0.72063436, -0.96059253, -0.01349722],
        [ 1.77736657,  1.51157598,  0.73846658]]),
 numpy.ndarray)

### Preprocess data
- Extract Features & Labels
- Train test split
- Data Normalization (Min-Max Scaling --> range [0,1])

In [6]:
# Unpack the (features, labels) tuple returned by make_classification
X, y = data

In [7]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Shapes of the four splits, in (X_train, X_test, y_train, y_test) order
tuple(split.shape for split in (X_train, X_test, y_train, y_test))

((8, 3), (2, 3), (8,), (2,))

In [9]:
# Data Normalization
# MinMaxScaler is imported in the notebook's top import cell so that every dependency
# is visible in one place. With default settings it rescales each feature to [0, 1].
sc = MinMaxScaler()

#### Transform train data

In [10]:
# Approach 1: fit and transform as two separate steps
sc.fit(X_train) # learns each feature's min and max from the training data for later scaling

In [11]:
# Global min and max over all entries of X_train (not per column). The scaled output
# has a 0 and a 1 in every column, so the scaler works column by column;
# X_train.min(axis=0) / X_train.max(axis=0) would show the per-feature values it uses.
X_train.min(), X_train.max()

(np.float64(-2.8953973017246817), np.float64(1.9768623561397665))

In [12]:
# Apply the scaling learned by fit() to the training data
X_train_scaled_1 = sc.transform(X_train)
X_train_scaled_1

array([[0.        , 1.        , 0.77511311],
       [0.48136046, 0.        , 0.32418745],
       [1.        , 0.71067636, 0.76595029],
       [0.36603733, 0.28691966, 1.        ],
       [0.97448933, 0.88216362, 0.96978007],
       [0.96403961, 0.19903124, 0.        ],
       [0.40816591, 0.36181853, 0.26555333],
       [0.19447455, 0.2480553 , 0.2269717 ]])

In [13]:
# Sanity check: the scaled training data spans exactly [0, 1]
X_train_scaled_1.min(), X_train_scaled_1.max()

(np.float64(0.0), np.float64(1.0))

In [14]:
# Approach 2: fit_transform() does the fit and the transform in a single call;
# the displayed result matches X_train_scaled_1 obtained with fit() + transform()
X_train_scaled_2 = sc.fit_transform(X_train)
X_train_scaled_2

array([[0.        , 1.        , 0.77511311],
       [0.48136046, 0.        , 0.32418745],
       [1.        , 0.71067636, 0.76595029],
       [0.36603733, 0.28691966, 1.        ],
       [0.97448933, 0.88216362, 0.96978007],
       [0.96403961, 0.19903124, 0.        ],
       [0.40816591, 0.36181853, 0.26555333],
       [0.19447455, 0.2480553 , 0.2269717 ]])

In [15]:
# Sanity check: same [0, 1] range as with the two-step approach
X_train_scaled_2.min(), X_train_scaled_2.max()

(np.float64(0.0), np.float64(1.0))

#### Transform Test data

In [16]:
# Only transform() is called on the test data: the scaler is fitted on the training
# data alone, so no test-set statistics leak into preprocessing
X_test_scaled = sc.transform(X_test)
X_test_scaled

array([[0.45353956, 0.25607308, 0.69950485],
       [0.82662398, 0.25367198, 0.77942707]])

### MinMax Scaling in range [-2,2]

In [17]:
# A second scaler that targets the custom output range [-2, 2]
sc1 = MinMaxScaler(feature_range=(-2, 2))

In [18]:
# Fit on the training data and scale it to [-2, 2] in one step
X_train_sc1 = sc1.fit_transform(X_train)
X_train_sc1

array([[-2.        ,  2.        ,  1.10045246],
       [-0.07455815, -2.        , -0.7032502 ],
       [ 2.        ,  0.84270542,  1.06380115],
       [-0.5358507 , -0.85232137,  2.        ],
       [ 1.89795732,  1.52865447,  1.8791203 ],
       [ 1.85615846, -1.20387503, -2.        ],
       [-0.36733636, -0.55272587, -0.93778667],
       [-1.22210181, -1.00777879, -1.0921132 ]])

In [19]:
# Check the range of the scaled training data; the maximum may print as
# 1.9999999999999998 rather than 2.0 because of floating-point rounding
X_train_sc1.min(), X_train_sc1.max()

(np.float64(-2.0), np.float64(1.9999999999999998))

In [20]:
# Reuse the scaler fitted on the training data; the test set is transformed only, never fitted
X_test_sc1 = sc1.transform(X_test)
X_test_sc1

array([[-0.18584175, -0.9757077 ,  0.79801938],
       [ 1.30649592, -0.9853121 ,  1.11770829]])

In [21]:
# Test values need not reach the -2 / 2 bounds, since the scaling parameters come from the training data
X_test_sc1.min(), X_test_sc1.max()

(np.float64(-0.9853120962695808), np.float64(1.3064959224574983))