# Data Preprocessing 

### Importing the libraries

In [35]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): with no category given this also hides deprecation notices
# from pandas / scikit-learn, so API removals can go unnoticed.
warnings.filterwarnings('ignore')

## Importing the dataset

In [36]:
# Read the raw dataset (path is relative to the working directory) and
# preview at most the first 14 rows.
df = pd.read_csv('data.csv')
print(df.head(n=14))

   Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes
5   France  35.0  58000.0       Yes
6    Spain   NaN  52000.0        No
7   France  48.0  79000.0       Yes
8  Germany  50.0  83000.0        No
9   France  37.0  67000.0       Yes


In [37]:
# Number of missing entries per column (isna is the canonical spelling of isnull).
df.isna().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

### Check for missing values and handle them with pandas

In [38]:
# Fill each missing value with the next valid observation below it (backward fill).
# DataFrame.bfill() replaces fillna(method='bfill'): the `method` keyword is deprecated
# in recent pandas releases. Re-assigning instead of inplace=True keeps the cell
# safe to re-run and produces the same values.
# NOTE: a NaN in the last row has no later value to copy and would stay NaN.
df = df.bfill()

In [39]:
# x: every column except the last (features); y: the last column (target).
x, y = df.iloc[:, :-1].values, df.iloc[:, -1].values

In [40]:
# Inspect the feature matrix built from every column except the last.
print(x)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 58000.0]
 ['France' 35.0 58000.0]
 ['Spain' 48.0 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [41]:
# Inspect the target vector taken from the last column.
print(y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## Encoding categorical data


### Encoding the Independent Variable
A common first attempt at encoding a categorical column is the `LabelEncoder` class of scikit-learn.
This class encodes each category as an integer.
But in our example the Country column has three categories (France, Germany and Spain), which label encoding would turn into the values 0, 1 and 2. From these values a machine learning model can assume that there is some order or numeric relationship between the countries, and hence it can produce wrong output. So to overcome this problem we will use dummy (one-hot) encoding.
Dummy variables are variables that take the value 0 or 1. A value of 1 indicates that the row belongs to that category, and all the other dummy columns are 0 for that row. With dummy encoding the number of new columns equals the number of categories. Our Country column has three categories, so it will produce three columns containing the values 0 and 1. For dummy encoding we will use the `OneHotEncoder` class of the `sklearn.preprocessing` module.

In [42]:
# Encoding the Independent Variable
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0 and pass the remaining columns through unchanged.
# sparse_threshold=0 forces a dense result: with the default (0.3), a column with
# many categories makes ColumnTransformer return a SciPy sparse matrix, and
# np.array() would wrap that in a 0-d object array instead of a 2-D feature matrix.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
x = np.array(ct.fit_transform(x))

In [43]:
# Inspect the feature matrix after column 0 was one-hot encoded.
print(x)

[[1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 58000.0]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 48.0 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


### Encoding the Dependent Variable
`y` holds the Purchased column, which contains labels like ['Yes', 'No', 'No', 'Yes']. `LabelEncoder` will convert them to [1, 0, 0, 1], where 'No' is mapped to 0 and 'Yes' to 1.

In [44]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from y, then replace y with its integer codes.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)
print(y)

[0 1 0 0 1 1 0 1 0 1]


# Split the Dataset for Training and Testing

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [12]:
# Inspect the training features returned by train_test_split.
print(x_train)

[[0.0 0.0 1.0 48.0 52000.0]
 [0.0 1.0 0.0 40.0 58000.0]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]


In [13]:
# Inspect the held-out test features returned by train_test_split.
print(x_test)

[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


In [14]:
# Inspect the training labels returned by train_test_split.
print(y_train)

[0 1 0 0 1 1 0 1]


In [15]:
# Inspect the held-out test labels returned by train_test_split.
print(y_test)

[0 1]


# Feature Scaling
If our data contains some categorical features (which are usually text or labels) and some numerical features (which are continuous values), we do not scale the categorical features. There is no need to scale them because they already take a limited set of discrete values (such as 'Male', 'Female', 'Yes', 'No') and do not benefit from scaling.
Scaling a categorical feature would have no practical meaning. Therefore, scaling is applied only to the numerical features (here Age and Salary).

In [16]:
from sklearn.preprocessing import StandardScaler

# Standardise only the columns from index 3 onward; columns 0-2 are left untouched.
sc = StandardScaler()
sc.fit(x_train[:, 3:])  # scaling statistics are learned from the training split only
x_train[:, 3:] = sc.transform(x_train[:, 3:])
x_test[:, 3:] = sc.transform(x_test[:, 3:])

In [17]:
# Inspect the training features after columns 3 onward were standardised.
print(x_train)

[[0.0 0.0 1.0 0.9212281118121696 -0.9987833872617508]
 [0.0 1.0 0.0 -0.1705977984837351 -0.4941349389610767]
 [1.0 0.0 0.0 0.3753151566642172 0.6833781070738295]
 [0.0 0.0 1.0 -0.44355427605771125 -0.24181071481073965]
 [0.0 0.0 1.0 -1.9448149027145802 -1.3352156861288669]
 [1.0 0.0 0.0 0.9212281118121696 1.2721346300912826]
 [0.0 1.0 0.0 1.1941845893861458 1.6085669289583986]
 [1.0 0.0 0.0 -0.8529889924186755 -0.4941349389610767]]


In [18]:
# Inspect the test features after columns 3 onward were transformed with the fitted scaler.
print(x_test)

[[0.0 1.0 0.0 -1.5353801863536158 -0.8305672378281928]
 [1.0 0.0 0.0 -0.5800325148446993 0.2628377334899344]]
