# Normalize

In [6]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

MinMax Normalization

In [7]:
# Min-max scaling maps the smallest sample to 0 and the largest to 1:
#     x' = (x - min) / (max - min)
# The samples are arranged as a single-feature column (shape (4, 1)).
X = np.array([10, 20, 40, 100]).reshape(-1, 1)
scaler = MinMaxScaler(feature_range=(0, 1))
# Learn min/max from X first, then rescale the same data.
scaler.fit(X)
X_transformed = scaler.transform(X)
print(X_transformed)

[[0.        ]
 [0.11111111]
 [0.33333333]
 [1.        ]]


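Z-score Normalization

Note: `StandardScaler` (z-score standardization) is imported below, but the worked example that follows demonstrates decimal scaling normalization: every value is divided by 10**j so that the largest absolute value falls below 1.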

In [8]:
from sklearn.preprocessing import StandardScaler

In [9]:
def decimal_scaling_exponent(values):
    """Return the decimal-scaling exponent for ``values``.

    Decimal scaling divides every value by ``10**j``, where ``j`` is the
    smallest non-negative integer such that ``max(|values|) / 10**j < 1``.

    The exponent is found by numeric comparison rather than by counting the
    characters of ``str(max_abs)``; the string approach is only correct for
    integer dtypes (for floats the decimal point and fractional digits are
    counted too, e.g. 100.5 would yield 5 instead of 3).

    Parameters
    ----------
    values : array-like of int or float
        Data to be scaled. Any shape is accepted.

    Returns
    -------
    int
        The exponent ``j``. Returns 0 for empty input and when every
        absolute value is already below 1.

    Raises
    ------
    ValueError
        If the largest absolute value is infinite (no finite exponent exists).
    """
    values = np.asarray(values)
    if values.size == 0:
        return 0
    # .item() yields a plain Python scalar so the comparison against the
    # arbitrary-precision integer 10**exponent is exact.
    largest = np.max(np.abs(values)).item()
    if largest == float('inf'):
        raise ValueError('cannot decimal-scale data containing infinite values')
    exponent = 0
    while largest >= 10 ** exponent:
        exponent += 1
    return exponent


# Decimal scaling normalization: shift the decimal point far enough that the
# largest magnitude (here |-100|) ends up strictly below 1.
X = np.array([[10], [20], [30], [40], [-100]])
j = decimal_scaling_exponent(X)
X_transformed = X / 10**j
print(X_transformed)

[[ 0.01]
 [ 0.02]
 [ 0.03]
 [ 0.04]
 [-0.1 ]]


Feature Encoding
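 Types of categorical features:
     Nominal: categories with no inherent order (e.g. city names)
     Ordinal: categories with a natural order (e.g. poor < fair < good)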

In [10]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
cities = ['Kathmandu', 'Pokhara', 'Biratnagar']
encoded = le.fit_transform(cities)
print(encoded)

[1 2 0]


One Hot Encoding

In [11]:
import pandas as pd

# One-hot encoding: each distinct value in 'source' becomes its own
# indicator column named 'source_<value>'.
water_sources = ['river', 'lake', 'well']
df = pd.DataFrame({'source': water_sources})
df_encoded = pd.get_dummies(data=df, columns=['source'])
print(df_encoded)

   source_lake  source_river  source_well
0        False          True        False
1         True         False        False
2        False         False         True


Ordinal Encoding

In [13]:
from sklearn.preprocessing import OrdinalEncoder

# Ordinal encoding with an explicit category order, so the codes respect
# the ranking poor (0) < fair (1) < good (2).
quality = [['poor'], ['fair'], ['good']]
encoder = OrdinalEncoder(categories=[['poor', 'fair', 'good']])
encoder.fit(quality)
encoded = encoder.transform(quality)
print(encoded)

[[0.]
 [1.]
 [2.]]


# Memory Optimization & Efficient Data Processing

In [20]:
import numpy as np
import time

# Baseline measurement: add 2 to each of 10 million values using an explicit
# element-by-element Python loop, and report how long the loop takes.
X = np.random.random(10000000)
print(X[:10])
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() is wall-clock time, which can be coarse and
# can jump if the system clock is adjusted.
start = time.perf_counter()
for i in range(X.shape[0]):
    X[i] += 2
end = time.perf_counter()
print(X[:10])
print(f'Time taken: {end - start} secs')

[0.3408985  0.04159077 0.41991129 0.48335644 0.49168937 0.66037556
 0.04142384 0.95709229 0.5109502  0.01257733]
[2.3408985  2.04159077 2.41991129 2.48335644 2.49168937 2.66037556
 2.04142384 2.95709229 2.5109502  2.01257733]
Time taken: 3.626986503601074 secs


In [21]:
import numpy as np
import time

# Vectorised measurement: add 2 to all 10 million values with a single
# in-place NumPy operation, and report how long it takes.
X = np.random.random(10000000)
print(X[:10])
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; this matters here because the measured interval is only
# a few milliseconds, which time.time() may not resolve reliably.
start = time.perf_counter()
X += 2
end = time.perf_counter()
print(X[:10])
print(f'Time taken: {end - start} secs')

[0.70458889 0.28324216 0.33032251 0.90790362 0.45430656 0.37620327
 0.13956399 0.92917268 0.31308972 0.53964101]
[2.70458889 2.28324216 2.33032251 2.90790362 2.45430656 2.37620327
 2.13956399 2.92917268 2.31308972 2.53964101]
Time taken: 0.02102971076965332 secs
