
# Import the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Import the dataset

In [2]:
# Load the raw table from the CSV file that sits next to the notebook
dataset = pd.read_csv(filepath_or_buffer='Data.csv')

In [3]:
# Sanity check: confirm that read_csv gave us a pandas DataFrame
type(dataset)

pandas.core.frame.DataFrame

In [4]:
# Print the whole table; it has only 10 rows, so the NaN cells can be spotted by eye
print(dataset)

   Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
4  Germany  40.0      NaN       Yes
5   France  35.0  58000.0       Yes
6    Spain   NaN  52000.0        No
7   France  48.0  79000.0       Yes
8  Germany  50.0  83000.0        No
9   France  37.0  67000.0       Yes


# Make a matrix of features by splitting the dependent and independent variables
## The dependent variable in this case is the last column

In [27]:
# Matrix of features (independent variables):
# all rows, every column except the last one, converted to a NumPy array
X = dataset.iloc[:,:-1].values

In [28]:
# Inspect the feature matrix; the dtype is object because strings and numbers are mixed
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, nan],
       ['France', 35.0, 58000.0],
       ['Spain', nan, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [29]:
# Dependent variable (target): all rows of the last column, converted to a NumPy array
y = dataset.iloc[:,-1].values
# Note: iloc[:,-1] and iloc[:,3] select the same column here, because the table has four columns

In [30]:
# Inspect the target vector ('Yes'/'No' strings at this point)
y

array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
      dtype=object)

In [31]:
# .values turned the DataFrame slice into a plain NumPy array
type(X)

numpy.ndarray

### From the table above you may notice that there are two cells with missing data: one in 'Salary' and the other in 'Age'
#### You can handle this in two possible ways:
#### 1) by deleting the rows with missing data. However, this is dangerous, since these rows may contain crucial data
#### 2) by replacing each missing value with the mean of its column
### We will try the second option

### For this purpose we import the Imputer class from the sklearn library

In [32]:
# Taking care of missing data
from sklearn.preprocessing import Imputer

### and we also need to make an instance of the class
### In order to make this object we have to take care of the parameters for the Imputer() class. In Jupyter you can press Shift+Tab when pointing at brackets () and take a look at the pop-up window showing you the possible parameters to input
### you can also find the classes documentation by following this link: http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Imputer.html

In [43]:
imputer = Imputer(missing_values='NaN', strategy= 'mean', axis = 0)

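### Now it is time to fit the object we created to our data by using the fit() method. Notice that we apply it not to the whole X matrix, but only to its numeric columns ('Age' and 'Salary', i.e. columns 1 and 2), because a mean cannot be computed for the 'Country' strings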

In [44]:
# Fit on columns 1 and 2 only (the slice 1:3 excludes index 3); column 0 holds strings.
# The result of fit() is assigned back to `imputer`.
imputer = imputer.fit(X[:, 1:3])

### To actually replace the missing values in X, use the transform() method

In [45]:
# Fill the gaps in the two numeric columns, then write the result back into X in place
numeric_imputed = imputer.transform(X[:, 1:3])
X[:, 1:3] = numeric_imputed

In [46]:
# Inspect X again: the two former NaN cells should now hold the column means
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

### now please check if the library was right in computing the mean (either in Excel or manually)

## We are done with missing data. Now let's move on towards the categorical data

In [47]:
# Encoding the categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [49]:
# make an object of the class
labelEncoderX = LabelEncoder()

In [51]:
# Learn the distinct country names first, then overwrite column 0 with their integer codes
labelEncoderX.fit(X[:, 0])
X[:, 0] = labelEncoderX.transform(X[:, 0])

In [52]:
# Inspect X: column 0 now holds integer codes instead of country names
X

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

### Now you can see that we got rid of the string values in 'Country', so the column is now numeric. However, these values may cause a problem. Can you guess what kind? Take a look at the values... (Hint: the codes 0, 1 and 2 suggest an ordering of the countries that does not really exist.)

### This problem can be tackled by a different encoding. There are several types of encoding: dummy variables, one-hot encoding, etc.

In [53]:
onehotEncoder = OneHotEncoder(categorical_features = [0])

In [54]:
X

array([[0, 44.0, 72000.0],
       [2, 27.0, 48000.0],
       [1, 30.0, 54000.0],
       [2, 38.0, 61000.0],
       [1, 40.0, 63777.77777777778],
       [0, 35.0, 58000.0],
       [2, 38.77777777777778, 52000.0],
       [0, 48.0, 79000.0],
       [1, 50.0, 83000.0],
       [0, 37.0, 67000.0]], dtype=object)

In [55]:
X = onehotEncoder.fit_transform(X).toarray()

In [57]:
# Print the encoded matrix: three 0/1 country columns followed by age and salary
print(X)

[[1.00000000e+00 0.00000000e+00 0.00000000e+00 4.40000000e+01
  7.20000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 2.70000000e+01
  4.80000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 3.00000000e+01
  5.40000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.80000000e+01
  6.10000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 4.00000000e+01
  6.37777778e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 3.50000000e+01
  5.80000000e+04]
 [0.00000000e+00 0.00000000e+00 1.00000000e+00 3.87777778e+01
  5.20000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 4.80000000e+01
  7.90000000e+04]
 [0.00000000e+00 1.00000000e+00 0.00000000e+00 5.00000000e+01
  8.30000000e+04]
 [1.00000000e+00 0.00000000e+00 0.00000000e+00 3.70000000e+01
  6.70000000e+04]]


### Now let's encode 'y' with the label encoder. We can use a label encoder here because the output is binary

In [64]:
# Learn the two class labels, then replace them with integer codes ('No' -> 0, 'Yes' -> 1)
labelEncoderY = LabelEncoder().fit(y)
y = labelEncoderY.transform(y)

In [65]:
# Inspect the encoded target vector
y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1], dtype=int64)

# Now we should split the dataset

In [60]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20;
# train_test_split now lives in `sklearn.model_selection`
from sklearn.model_selection import train_test_split



In [66]:
# random_state is the seed for the shuffling, so the same split is produced on every run;
# test_size = 0.2 holds out 20% of the rows as the test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [67]:
# Number of rows that ended up in the training set
print(X_train.shape[0])

8


In [63]:
# Number of rows that ended up in the test set
print(X_test.shape[0])

2


In [68]:
# Inspect the training features (still unscaled at this point)
X_train

array([[0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 4.00000000e+01,
        6.37777778e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.70000000e+01,
        6.70000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 2.70000000e+01,
        4.80000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.87777778e+01,
        5.20000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.80000000e+01,
        7.90000000e+04],
       [0.00000000e+00, 0.00000000e+00, 1.00000000e+00, 3.80000000e+01,
        6.10000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 4.40000000e+01,
        7.20000000e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, 3.50000000e+01,
        5.80000000e+04]])

In [69]:
# Inspect the training labels that belong to the rows of X_train
y_train

array([1, 1, 1, 0, 1, 0, 0, 1], dtype=int64)

# Now let's do the Feature Scaling on numerical values like 'Salary' and 'Age'

In [70]:
from sklearn.preprocessing import StandardScaler

In [71]:
# Scaler object for the feature matrix; it is fitted in a later cell
sc_X = StandardScaler()

### The scaler must be fitted on the training set only. For the test set we do not fit it again; we only transform it with the already fitted scaler

In [84]:
# Learn the scaling parameters from the training rows only,
# then apply that same fitted scaler to both splits.
# Both arrays are overwritten, so re-running this cell on its own would scale
# already-scaled data; restart the kernel and run all cells instead.
sc_X.fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

### The important question here is: do we need to scale the dummy variables? If you google it, you'll find several answers, both for and against scaling them

In [85]:
# Print the scaled training features
print(X_train)

[[-1.          2.64575131 -0.77459667  0.26306757  0.12381479]
 [ 1.         -0.37796447 -0.77459667 -0.25350148  0.46175632]
 [-1.         -0.37796447  1.29099445 -1.97539832 -1.53093341]
 [-1.         -0.37796447  1.29099445  0.05261351 -1.11141978]
 [ 1.         -0.37796447 -0.77459667  1.64058505  1.7202972 ]
 [-1.         -0.37796447  1.29099445 -0.0813118  -0.16751412]
 [ 1.         -0.37796447 -0.77459667  0.95182631  0.98614835]
 [ 1.         -0.37796447 -0.77459667 -0.59788085 -0.48214934]]


In [86]:
# Print the scaled test features.
# NOTE(review): the saved output contains only ~0 and +/-1 values, which looks like the
# result of scaling the two test rows with statistics computed from those same two rows
# (probably left over from re-running cells out of order). Re-run from a fresh kernel
# to confirm that the test set is transformed with the training-set statistics.
print(X_test)

[[ 0.00000000e+00 -1.38777878e-16 -8.32667268e-17 -1.00000000e+00
  -1.00000000e+00]
 [ 0.00000000e+00 -1.38777878e-16 -8.32667268e-17  1.00000000e+00
   1.00000000e+00]]


## We don't need to apply feature scaling to 'y' because it is a binary categorical variable; for regression problems, however, scaling the target may be needed

In [88]:
# Mean over ALL entries of X (every column pooled together), not a per-column mean
X.mean()

12763.511111111111

In [89]:
# Standard deviation over ALL entries of X (every column pooled together)
X.std()

25974.702949144506

In [67]:
# First entry of X (row 0, column 0)
X[0, 0]

1.0

In [None]:
# z-score of the first entry of X: (value - mean) / standard deviation.
# The parentheses matter: without them only X.mean() would be divided.
# Note: this uses the mean/std of the whole matrix, as computed in the cells above,
# whereas StandardScaler standardises each column with that column's own statistics.
z = (X[0][0] - X.mean()) / X.std()