# 1. Importing Dataset and Data Pre-Processing

In [1]:
# Import library for reading the dataset
import pandas as pd

## Loading the dataset. Make sure that the dataset file is in the same directory as the notebook

In [2]:
# Read the CSV file into a pandas DataFrame and store it in a variable named `dataset`.
# 'Data.csv' is a relative path, so the file must sit in the notebook's working directory.
dataset = pd.read_csv('Data.csv')

## A peek at the top/head of the dataset

In [3]:
# head() shows the first five rows of the DataFrame.
dataset.head()
# Notice that a null / missing value is shown as NaN (Salary column of the 5th row, index 4).

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


## How to check whether the dataset has missing values?

In [4]:
# isnull().any() returns one boolean per column: True if that column contains at least one null value.
dataset.isnull().any()
# Result: the Age and Salary columns contain null values.

Country      False
Age           True
Salary        True
Purchased    False
dtype: bool

## How to know how many missing values there are in each column of the dataset?

In [5]:
# isnull().sum() counts the null values in each column.
dataset.isnull().sum()
# Result: the Age and Salary columns have one missing value each.

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

## Observe the missing values in the Age and Salary columns

In [6]:
# Display the whole DataFrame so the NaN entries can be located by eye.
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


# 2. Taking care of missing data
## Using imputer class that is used for handling missing data

In [7]:
#import required library
from sklearn.preprocessing import Imputer

## Create imputer object: Argeuments of imputer     
### 1. missing_values= 'NaN'              (as we saw earlier missing value is stored as NaN in the dataset variable)
### 2. strategy= 'mean'                         (method used for handling missing value is by replacing it with the mean)
### 3. axis=0                                          (mean is computed along x-axis)

In [8]:
#imputer object creation
imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)

## Apply the imputer object's fit function to learn the replacement value (the mean) of each column
## Arguments of fit function: the columns containing missing values (Age & Salary in our example)

In [9]:
# Apply the imputer object's fit function.
# fit() only learns the statistic for each selected column; the data itself is not
# modified until transform() is called.
imputer = imputer.fit(dataset[['Age','Salary']])

## Replace the columns containing missing values with the same columns free of missing values, using the imputer object's transform function

In [10]:
# transform() returns the two columns with every missing entry filled in using the
# values learned by fit(); the result is written back into the same columns.
dataset[['Age','Salary']] = imputer.transform(dataset[['Age','Salary']])

## Now check the dataset for missing values

In [11]:
# Repeat the per-column null check after imputation.
dataset.isnull().any()

Country      False
Age          False
Salary       False
Purchased    False
dtype: bool

## False for every column indicates no missing values. Now the dataset is free of missing values!

In [12]:
# Display the whole DataFrame again to see the values that replaced the NaN entries.
dataset

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


# 3. Separating the independent and dependent (target) features
## iloc[ :, [ cols ] ] (: selects all rows, [cols] is the list of required column positions)
## Independent features stored in NumPy array X


In [13]:
# Feature matrix: every row, first three columns (Country, Age, Salary), as a NumPy array.
X = dataset.iloc[:, 0:3].values

In [14]:
# Display the feature array (object dtype, because it mixes strings and numbers).
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

## Dependent (target) feature stored in NumPy array y

In [15]:
# Target: every row of column 3 (Purchased).
# Selecting with a list ([3]) keeps the result two-dimensional, so y is a column of shape (n_rows, 1).
y=dataset.iloc[:,[3]].values

In [16]:
# Display the target array ('Yes' / 'No' strings, one per row).
y

array([['No'],
       ['Yes'],
       ['No'],
       ['No'],
       ['Yes'],
       ['Yes'],
       ['No'],
       ['Yes'],
       ['No'],
       ['Yes']], dtype=object)

# 4. Categorical encoding

In [17]:
# Import required library
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

## 0th column of X has country as non ordinal categorical variable

In [18]:
# Column 0 of X: the country names, still stored as strings.
X[:,0]

array(['France', 'Spain', 'Germany', 'Spain', 'Germany', 'France',
       'Spain', 'France', 'Germany', 'France'], dtype=object)

## first label encode 0th column of X

In [19]:
# Step 1: label-encode column 0 (country) of X.
# Learn the mapping from country name to integer code, then overwrite the column with the codes.
labelencoder_X = LabelEncoder().fit(X[:, 0])
X[:, 0] = labelencoder_X.transform(X[:, 0])

In [20]:
# Column 0 of X after label encoding: each country is now an integer code.
X[:,0]

array([0, 2, 1, 2, 1, 0, 2, 0, 1, 0], dtype=object)

In [21]:
# Display X: the country code followed by Age and Salary.
X

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

## now one-hot encode 0th column of X

In [22]:
# The `categorical_features` argument of OneHotEncoder was deprecated in scikit-learn 0.20
# and removed in 0.22; ColumnTransformer is the supported way to encode selected columns.
from sklearn.compose import ColumnTransformer

onehotencoder = ColumnTransformer(
    transformers=[('country', OneHotEncoder(), [0])],  # one-hot encode column 0 only
    remainder='passthrough',  # keep Age and Salary unchanged, placed after the dummy columns
    sparse_threshold=0,       # always return a dense NumPy array
)
# The untouched columns come back with object dtype, so cast the result to float.
X = onehotencoder.fit_transform(X).astype(float)

In [23]:
# Column 0 of X is now the first dummy column: 1.0 on the rows whose country code was 0 (France).
X[:,0]

array([1., 0., 0., 0., 0., 1., 0., 1., 0., 1.])

In [24]:
# Display X: three country dummy columns followed by Age and Salary.
X

array([[1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.40000000e+01,
        7.20000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.70000000e+01,
        4.80000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 3.00000000e+01,
        5.40000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.80000000e+01,
        6.10000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01,
        6.37777778e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.50000000e+01,
        5.80000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.87777778e+01,
        5.20000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80000000e+01,
        7.90000000e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 5.00000000e+01,
        8.30000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.70000000e+01,
        6.70000000e+04]])

## Categorical encoding for the target variable

In [25]:
# Display the target before encoding ('Yes' / 'No' strings).
y

array([['No'],
       ['Yes'],
       ['No'],
       ['No'],
       ['Yes'],
       ['Yes'],
       ['No'],
       ['Yes'],
       ['No'],
       ['Yes']], dtype=object)

In [26]:
# Encoding the Dependent Variable
# LabelEncoder expects a 1-D array of labels. y was selected as an (n_rows, 1) column,
# so flatten it with ravel() to avoid the DataConversionWarning.
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y.ravel())

  y = column_or_1d(y, warn=True)


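## A DataConversionWarning is raised when y is passed to LabelEncoder as a column vector of shape (n, 1) instead of a 1-D array. It is harmless here and can be avoided by passing y.ravel()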

In [27]:
# Display the encoded target: 'No' became 0 and 'Yes' became 1.
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1], dtype=int64)

# 5. Splitting the data into test and train set

In [28]:
# Splitting the dataset into the Training set and Test set
# train_test_split lives in sklearn.model_selection (available since scikit-learn 0.18);
# the old sklearn.cross_validation module was removed in 0.20.
from sklearn.model_selection import train_test_split



## Using train_test_split function
## Arguments:
### 1. X (independent features)
### 2. y (dependent/target feature)
### 3. test_size (fraction of the samples placed in the test set)
### 4. random_state (seed for the random shuffle: any fixed integer, e.g. 0, reproduces the same split every time you run the program; leaving it as None gives a different random split on each run)

## The function performs the split and returns 4 NumPy arrays: X_train, X_test, y_train, y_test

In [29]:
# Hold out 20% of the rows as the test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [30]:
# Display the training features.
X_train

array([[0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01,
        6.37777778e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.70000000e+01,
        6.70000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.70000000e+01,
        4.80000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.87777778e+01,
        5.20000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80000000e+01,
        7.90000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.80000000e+01,
        6.10000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.40000000e+01,
        7.20000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.50000000e+01,
        5.80000000e+04]])

In [31]:
# Display the test features.
X_test

array([[0.0e+00, 1.0e+00, 0.0e+00, 3.0e+01, 5.4e+04],
       [0.0e+00, 1.0e+00, 0.0e+00, 5.0e+01, 8.3e+04]])

In [32]:
# Display the training labels.
y_train

array([1, 1, 1, 0, 1, 0, 0, 1], dtype=int64)

In [33]:
# Display the test labels.
y_test

array([0, 0], dtype=int64)

## Total samples in dataset

In [34]:
# Number of rows (samples) in the full feature matrix.
len(X)

10

## Number of samples in Train set

In [35]:
# Number of rows (samples) in the training set.
len(X_train)

8

## Number of samples in Test set

In [36]:
# Number of rows (samples) in the test set.
len(X_test)

2

# 6. Feature Scaling (using standardization)

In [37]:
#import required library
from sklearn.preprocessing import StandardScaler

## Scaling the independent features

In [38]:
# Learn the per-column mean and standard deviation from the training set only,
# then apply that same scaling to both the training and the test features.
sc_X = StandardScaler().fit(X_train)
X_train, X_test = sc_X.transform(X_train), sc_X.transform(X_test)

## After Scaling

In [39]:
# Display the standardized training features.
X_train

array([[-1.        ,  2.64575131, -0.77459667,  0.26306757,  0.12381479],
       [ 1.        , -0.37796447, -0.77459667, -0.25350148,  0.46175632],
       [-1.        , -0.37796447,  1.29099445, -1.97539832, -1.53093341],
       [-1.        , -0.37796447,  1.29099445,  0.05261351, -1.11141978],
       [ 1.        , -0.37796447, -0.77459667,  1.64058505,  1.7202972 ],
       [-1.        , -0.37796447,  1.29099445, -0.0813118 , -0.16751412],
       [ 1.        , -0.37796447, -0.77459667,  0.95182631,  0.98614835],
       [ 1.        , -0.37796447, -0.77459667, -0.59788085, -0.48214934]])

In [40]:
# Display the test features, scaled with the statistics learned from the training set.
X_test

array([[-1.        ,  2.64575131, -0.77459667, -1.45882927, -0.90166297],
       [-1.        ,  2.64575131, -0.77459667,  1.98496442,  2.13981082]])

## Scaling the dependent feature
### Not actually required for this dataset: y is a class label that already takes only the values 0 and 1
### It is performed here purely as an example of how a target would be scaled

In [41]:
# StandardScaler works on 2-D input, so view y_train as a single column before scaling.
sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train.reshape(-1, 1))



## After Scaling

In [42]:
# Display the standardized training labels (a single column).
y_train

array([[ 0.77459667],
       [ 0.77459667],
       [ 0.77459667],
       [-1.29099445],
       [ 0.77459667],
       [-1.29099445],
       [-1.29099445],
       [ 0.77459667]])

# Inverse or reverse Scaling!
## If you feed the ML algorithm a scaled target feature, it will learn to predict the scaled version of the target feature.
## In order to get back values on the original scale, inverse / reverse scaling is performed as shown below

In [43]:
# Here y_train is reverse-scaled as an example; in your own program you would reverse-scale y_pred.
# inverse_transform undoes the scaling that sc_y applied.
y_train=sc_y.inverse_transform(y_train)

In [44]:
# Observe that we got back the original 1s and 0s (now as floats in a single column).
y_train

array([[1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.]])

# Data Pre-Processing Completed !