<h1 style="color:black" align='center'>Data preprocessing</h1>

In [1]:
# importing packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Importing our Data using pandas

In [3]:
Dataset = pd.read_csv('unprocessed_data.csv')
Dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


# seperating the data to inputs and output

In [4]:
#seperating the features "inputs"
features_matrix = Dataset.iloc[:,:-1].values #passing the data to a numpy array called features_matrix
features_matrix
print(type(features_matrix))

<class 'numpy.ndarray'>


In [5]:
#seperating the output -> "purchased or not"
goal_vector = Dataset.iloc[:,-1].values
goal_vector

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

# Handling the missing data by replacing it with the mean

In [6]:
# import the Imputer module which can handle the missing data
from sklearn.preprocessing import Imputer
# creating a new object
imputer = Imputer(missing_values='NaN', strategy='mean',axis=0) #we want to replace a row -> axis=0 , with it's mean -> strategy='mean'
features_matrix[:, 1:3] = imputer.fit_transform(features_matrix[:, 1:3])#apply changes to all rows and cols from 1 to 3 
features_matrix

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

## after all in training a model we must use number so transforming a categorical data to numbers is an important step

In [7]:
from sklearn.preprocessing import LabelEncoder #importing LabelEncoder class 
encoder = LabelEncoder() #creating a new object
features_matrix[:, 0] = encoder.fit_transform(features_matrix[:, 0]) #transforming the categorical data to numbers
features_matrix

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

# don't forget to transform the output if it's a boolean one

In [8]:
goal_vector = encoder.fit_transform(goal_vector)
goal_vector

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1], dtype=int64)

## in Dealing with numbers "2" is greater than "0" which might affect the model as it will be more biased towards "2"

In [9]:
# import the oneHotEncoder class
from sklearn.preprocessing import OneHotEncoder
oneHotEncoder = OneHotEncoder(categorical_features=[0]) #creating a new object and sending the targeted data to it
features_matrix = oneHotEncoder.fit_transform(features_matrix).toarray()#transforming numbers
features_matrix

array([[1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.40000000e+01,
        7.20000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.70000000e+01,
        4.80000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 3.00000000e+01,
        5.40000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.80000000e+01,
        6.10000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01,
        6.37777778e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.50000000e+01,
        5.80000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.87777778e+01,
        5.20000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80000000e+01,
        7.90000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 5.00000000e+01,
        8.30000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.70000000e+01,
        6.70000000e+04]])

# think about training The model 

## we must have a training set and a test set 

In [10]:
from sklearn.cross_validation import train_test_split
x_train, x_test, y_train, y_test = train_test_split(features_matrix, goal_vector, train_size = 0.8, random_state = 0)
print(len(x_train))
print(len(x_test))
print(len(features_matrix))
# see the size percentage !
# note :
## the random state just to have the same result each time you run
## the train size is changable and it depend on several things[the data size, the problem itself, ...]

8
2
10




# a large diversity in numbers leads to slowing down the computation 
## it's better to scale your Data 

In [11]:
# import the library we need from our beloved sklearn !
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# take a look how the values was before and after the scalling 
print('before scalling, max is %d and min is %d'%(np.max(x_train), np.min(x_train)))
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
print('after scalling, max is %d and min is %d'%(np.max(x_train), np.min(x_train)))
x_train

before scalling, max is 79000 and min is 0
after scalling, max is 2 and min is -1


array([[-1.        ,  2.64575131, -0.77459667,  0.26306757,  0.12381479],
       [ 1.        , -0.37796447, -0.77459667, -0.25350148,  0.46175632],
       [-1.        , -0.37796447,  1.29099445, -1.97539832, -1.53093341],
       [-1.        , -0.37796447,  1.29099445,  0.05261351, -1.11141978],
       [ 1.        , -0.37796447, -0.77459667,  1.64058505,  1.7202972 ],
       [-1.        , -0.37796447,  1.29099445, -0.0813118 , -0.16751412],
       [ 1.        , -0.37796447, -0.77459667,  0.95182631,  0.98614835],
       [ 1.        , -0.37796447, -0.77459667, -0.59788085, -0.48214934]])

In [12]:
from sklearn import preprocessing
X_scaled = preprocessing.scale(x_train)
X_scaled
X_scaled.mean(axis=0) # mean is zero
X_scaled.std(axis=0) # variance is 1

array([1., 1., 1., 1., 1.])

# Scaling the data to a specific range

In [13]:
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1)) #setting the range
X_train_minmax = min_max_scaler.fit_transform(x_train)
X_train_minmax

array([[-1.        ,  1.        , -1.        ,  0.23809524,  0.01792115],
       [ 1.        , -1.        , -1.        , -0.04761905,  0.22580645],
       [-1.        , -1.        ,  1.        , -1.        , -1.        ],
       [-1.        , -1.        ,  1.        ,  0.12169312, -0.74193548],
       [ 1.        , -1.        , -1.        ,  1.        ,  1.        ],
       [-1.        , -1.        ,  1.        ,  0.04761905, -0.16129032],
       [ 1.        , -1.        , -1.        ,  0.61904762,  0.5483871 ],
       [ 1.        , -1.        , -1.        , -0.23809524, -0.35483871]])

# Binarization

In [14]:
binarizer = preprocessing.Binarizer().fit(X_train_minmax)  # fit does nothing
binarizer.transform(X_train_minmax)

array([[0., 1., 0., 1., 1.],
       [1., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 1., 0.],
       [1., 0., 0., 1., 1.],
       [0., 0., 1., 1., 0.],
       [1., 0., 0., 1., 1.],
       [1., 0., 0., 0., 0.]])