<a href="https://colab.research.google.com/github/MohammadrezaPourreza/Scikit-learn-tutorial/blob/main/preprocessing(normalizing%2Cscaling).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Dataset loading

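scikit-learn ships with small built-in datasets for different purposes:

1. Classification
   - iris
   - digits
2. Regression
   - Boston house prices (note: `load_boston` was removed in scikit-learn 1.2; `load_diabetes` or `fetch_california_housing` are the usual replacements)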

In [2]:
from sklearn.datasets import load_iris

# Load the bundled iris dataset and unpack the pieces used in later cells.
iris = load_iris()
X, Y = iris.data, iris.target
feature_names, target_names = iris.feature_names, iris.target_names

# Show the metadata first, then the first five samples and their labels.
print(f"feature names are {feature_names}")
print(f"target names are {target_names}")
print(f"features are {X[:5]}")
print(f"targets are {Y[:5]}")

feature names are ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
target names are ['setosa' 'versicolor' 'virginica']
features are [[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]
targets are [0 0 0 0 0]


Splitting the dataset

In [8]:
from sklearn.model_selection import train_test_split

# random_state seeds the random generator used for shuffling. With a fixed seed
# the train/test split (and every number derived from it, such as the accuracy
# below) is identical on every "Restart & Run All"; with random_state=None the
# split is different each time the cell runs.
SPLIT_SEED = 42

X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,             # hold out 30% of the samples for evaluation
    random_state=SPLIT_SEED,
    shuffle=True,              # shuffle is True by default; spelled out for clarity
)

# Sanity-check the sizes of the four resulting arrays.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(105, 4)
(45, 4)
(105,)
(45,)


KNN classifier to classify the data

In [9]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# Fit a 3-nearest-neighbours classifier on the training split
# (fit() returns the estimator itself, so the call can be chained).
knn_classifier = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = knn_classifier.predict(X_test)
accuracy = metrics.accuracy_score(y_test, y_pred)
print(f"accuracy is {accuracy}")


# Classify two hand-written samples and map the predicted class indices
# back to species names.
sample = [[5, 5, 3, 2], [2, 4, 3, 5]]
preds = knn_classifier.predict(sample)
pred_species = [iris.target_names[class_idx] for class_idx in preds]
print("Predictions:", pred_species)

accuracy is 0.9555555555555556
Predictions: ['setosa', 'virginica']


Save and load a model

In [11]:
import joblib

# Persist the fitted classifier to a file in the current working directory.
# Left as the cell's last expression so the list of written files is displayed.
joblib.dump(value=knn_classifier, filename='iris_classifier_knn.joblib')

['iris_classifier_knn.joblib']

In [13]:
# Load from the same relative path that joblib.dump wrote to, instead of the
# Colab-only absolute path (/content/...), so the cell also works outside Colab.
# NOTE: joblib files are pickle-based -- only load files from a trusted source.
knn_classifier = joblib.load('iris_classifier_knn.joblib')

Preprocessing the data

In [19]:
#binarization

import numpy as np
from sklearn import preprocessing
# Small 4x3 demo matrix reused by the preprocessing cells below.
Input_data = np.array(
    [
        [2.1, -1.9, 5.5],
        [-1.5, 2.4, 3.5],
        [0.5, -7.9, 5.6],
        [5.9, 2.3, -5.8],
    ]
)

# Values strictly greater than the threshold become 1, everything else 0.
binarizer = preprocessing.Binarizer(threshold=0.5)
binarized = binarizer.transform(Input_data)
print(f"binarization :  \n {binarized}")

binarization :  
 [[1. 0. 1.]
 [0. 1. 1.]
 [0. 0. 1.]
 [1. 1. 0.]]


In [21]:
#mean removal 

# Per-column statistics of the raw data (axis=0 reduces over the rows).
print(f"the mean of the data {Input_data.mean(axis=0)}")
print(f"the standard deviation of the data {Input_data.std(axis=0)}")

# Standardise each column: subtract its mean and divide by its standard
# deviation, so the printed column means are ~0 and the standard deviations 1.
data_scaled = preprocessing.scale(Input_data)
print("Mean_removed =", data_scaled.mean(axis=0))
print("Stddeviation_removed =", data_scaled.std(axis=0))

the mean of the data [ 1.75  -1.275  2.2  ]
the mean of the data [2.71431391 4.20022321 4.69414529]
Mean_removed = [1.11022302e-16 0.00000000e+00 0.00000000e+00]
Stddeviation_removed = [1. 1. 1.]


In [24]:
# Rescale every column linearly so its values lie in the range [0, 1].

data_scaler_minmax = preprocessing.MinMaxScaler(feature_range=(0, 1))
# Learn the per-column min/max, then apply the scaling to the same data.
data_scaled_minmax = data_scaler_minmax.fit(Input_data).transform(Input_data)
print("\nMin max scaled data:\n", data_scaled_minmax)


Min max scaled data:
 [[0.48648649 0.58252427 0.99122807]
 [0.         1.         0.81578947]
 [0.27027027 0.         1.        ]
 [1.         0.99029126 0.        ]]


Normalization

In [26]:
# L1 normalisation: scale each row so its absolute values sum to 1.
data_normalized_l1 = preprocessing.normalize(X=Input_data, norm="l1")
print("L1 normalized data:\n", data_normalized_l1)

L1 normalized data:
 [[ 0.22105263 -0.2         0.57894737]
 [-0.2027027   0.32432432  0.47297297]
 [ 0.03571429 -0.56428571  0.4       ]
 [ 0.42142857  0.16428571 -0.41428571]]


In [29]:
# L2 normalisation: scale each row to unit Euclidean length.
data_normalized_l2 = preprocessing.normalize(Input_data, norm='l2')
# The label now says L2 (it previously said "L1", copied from the cell above).
print("L2 normalized data:\n", data_normalized_l2)

L1 normalized data:
 [[ 0.33946114 -0.30713151  0.88906489]
 [-0.33325106  0.53320169  0.7775858 ]
 [ 0.05156558 -0.81473612  0.57753446]
 [ 0.68706914  0.26784051 -0.6754239 ]]


In [30]:
# Max normalisation: divide each row by its largest absolute value.
# Stored under its own name so the L2-normalised result from the previous
# cell is not silently overwritten.
data_normalized_max = preprocessing.normalize(Input_data, norm='max')
print("max normalized data:\n", data_normalized_max)

max normalized data:
 [[ 0.38181818 -0.34545455  1.        ]
 [-0.42857143  0.68571429  1.        ]
 [ 0.06329114 -1.          0.70886076]
 [ 1.          0.38983051 -0.98305085]]
