In [111]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [112]:
# Load the raw data set from the CSV file that sits next to the notebook.
csv_path = 'test_data.csv'
dataset = pd.read_csv(csv_path)

In [113]:
# Feature matrix: every column except the last one, as a NumPy array.
feature_frame = dataset.iloc[:, :-1]
X = feature_frame.values

In [114]:
# Inspect the raw feature matrix: the first column holds strings and
# each of the two numeric columns contains one nan entry.
print(X)

[['Cat' 4.0 72000.0]
 ['Dog' 17.0 48000.0]
 ['Moose' 6.0 54000.0]
 ['Dog' 8.0 61000.0]
 ['Moose' 4.0 nan]
 ['Cat' 15.0 58000.0]
 ['Dog' nan 52000.0]
 ['Cat' 12.0 79000.0]
 ['Moose' 5.0 83000.0]
 ['Cat' 7.0 67000.0]]


In [115]:
# Target vector: the last column of the data set.
# Selecting it with -1 (instead of a hard-coded 3) mirrors the `:-1` slice
# used for the features, so the features and the target can never overlap
# if the number of columns in the CSV changes.
Y = dataset.iloc[:, -1].values

In [116]:
# Inspect the raw target labels (still 'Yes' / 'No' strings at this point).
print(Y)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


In [117]:
# `sklearn.preprocessing.Imputer` was deprecated and later removed from
# scikit-learn; `SimpleImputer` is its replacement. It always imputes
# column-wise, which is what the old `axis = 0` argument requested.
from sklearn.impute import SimpleImputer

# Replace every nan with the mean of the column it appears in.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

In [118]:
# Learn the per-column statistics from the two numeric columns
# (indices 1 and 2); the string column at index 0 is left out.
numeric_block = X[:, 1:3]
imputer = imputer.fit(numeric_block)

In [119]:
# Fitting did not modify X: the nan entries are still present in the output.
print(X)

[['Cat' 4.0 72000.0]
 ['Dog' 17.0 48000.0]
 ['Moose' 6.0 54000.0]
 ['Dog' 8.0 61000.0]
 ['Moose' 4.0 nan]
 ['Cat' 15.0 58000.0]
 ['Dog' nan 52000.0]
 ['Cat' 12.0 79000.0]
 ['Moose' 5.0 83000.0]
 ['Cat' 7.0 67000.0]]


In [120]:
# Fill the missing values in the numeric columns and write the result
# back into the same two columns of X.
imputed_numeric = imputer.transform(X[:, 1:3])
X[:, 1:3] = imputed_numeric

In [121]:
# The former nan entries now hold the mean of their respective column.
print(X)

[['Cat' 4.0 72000.0]
 ['Dog' 17.0 48000.0]
 ['Moose' 6.0 54000.0]
 ['Dog' 8.0 61000.0]
 ['Moose' 4.0 63777.77777777778]
 ['Cat' 15.0 58000.0]
 ['Dog' 8.666666666666666 52000.0]
 ['Cat' 12.0 79000.0]
 ['Moose' 5.0 83000.0]
 ['Cat' 7.0 67000.0]]


Next, encode the string categories ('Cat', 'Dog', and 'Moose') as numbers.

In [122]:
from sklearn.preprocessing import LabelEncoder

# Map each distinct string in the first column to an integer code and
# store the codes back into that column.
labelencoder_X = LabelEncoder()
category_codes = labelencoder_X.fit_transform(X[:, 0])
X[:, 0] = category_codes

In [123]:
# The first column now holds integer codes instead of strings
# (Cat -> 0, Dog -> 1, Moose -> 2 in the printed output).
print(X)

[[0 4.0 72000.0]
 [1 17.0 48000.0]
 [2 6.0 54000.0]
 [1 8.0 61000.0]
 [2 4.0 63777.77777777778]
 [0 15.0 58000.0]
 [1 8.666666666666666 52000.0]
 [0 12.0 79000.0]
 [2 5.0 83000.0]
 [0 7.0 67000.0]]


In [127]:
# The `categorical_features` argument of OneHotEncoder was deprecated and
# later removed from scikit-learn. The supported way to one-hot encode only
# selected columns is to wrap the encoder in a ColumnTransformer.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0 and pass the remaining columns through unchanged.
# The encoded columns come first in the output, followed by the passthrough
# columns. `sparse_threshold=1.0` keeps the result a sparse matrix, so a
# later `.toarray()` call on the transformed data keeps working.
onehotencoder = ColumnTransformer(
    transformers=[('category', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=1.0,
)

In [125]:
# Fit the encoder on X and replace X with its one-hot encoded form.
encoded_X = onehotencoder.fit_transform(X)
X = encoded_X

In [131]:
# X is a sparse matrix here, so print lists the stored entries as
# (row, column) value pairs rather than a dense grid.
print(X)

  (0, 0)	1.0
  (1, 1)	1.0
  (2, 2)	1.0
  (3, 1)	1.0
  (4, 2)	1.0
  (5, 0)	1.0
  (6, 1)	1.0
  (7, 0)	1.0
  (8, 2)	1.0
  (9, 0)	1.0
  (0, 3)	4.0
  (0, 4)	72000.0
  (1, 3)	17.0
  (1, 4)	48000.0
  (2, 3)	6.0
  (2, 4)	54000.0
  (3, 3)	8.0
  (3, 4)	61000.0
  (4, 3)	4.0
  (4, 4)	63777.7777778
  (5, 3)	15.0
  (5, 4)	58000.0
  (6, 3)	8.66666666667
  (6, 4)	52000.0
  (7, 3)	12.0
  (7, 4)	79000.0
  (8, 3)	5.0
  (8, 4)	83000.0
  (9, 3)	7.0
  (9, 4)	67000.0


In [132]:
# Convert the sparse matrix into a regular dense NumPy array.
dense_X = X.toarray()
X = dense_X

In [100]:
# Dense view: columns 0-2 are the one-hot indicators, columns 3-4 are the
# two numeric features.
print(X)

[[  1.00000000e+00   0.00000000e+00   0.00000000e+00   4.00000000e+00
    7.20000000e+04]
 [  0.00000000e+00   1.00000000e+00   0.00000000e+00   1.70000000e+01
    4.80000000e+04]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   6.00000000e+00
    5.40000000e+04]
 [  0.00000000e+00   1.00000000e+00   0.00000000e+00   8.00000000e+00
    6.10000000e+04]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   4.00000000e+00
    6.37777778e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   1.50000000e+01
    5.80000000e+04]
 [  0.00000000e+00   1.00000000e+00   0.00000000e+00   8.66666667e+00
    5.20000000e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   1.20000000e+01
    7.90000000e+04]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   5.00000000e+00
    8.30000000e+04]
 [  1.00000000e+00   0.00000000e+00   0.00000000e+00   7.00000000e+00
    6.70000000e+04]]


Use label encoding for our Y column (Yes or No)

In [101]:
# Encode the string target labels as integers: learn the label set first,
# then map every label to its integer code.
labelencoder_y = LabelEncoder()
labelencoder_y.fit(Y)
Y = labelencoder_y.transform(Y)

In [102]:
# Encoded target: 'No' -> 0 and 'Yes' -> 1.
print(Y)

[0 1 0 0 1 1 0 1 0 1]


Split into testing and training data

In [103]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

Do some feature scaling

In [104]:
from sklearn.preprocessing import StandardScaler

# Scaler for the feature matrix: centre each column on its mean and scale
# it to unit variance (these are the estimator's default settings).
sc_X = StandardScaler(with_mean=True, with_std=True)

In [105]:
# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits so no test-set information is used
# during fitting.
sc_X.fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

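I might need to reshape the target before scaling it: scikit-learn warns that passing a 1-D array is deprecated and will stop working in a later release.
Reshaping it into a single column with "Y.reshape(-1,1)" should resolve this.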

In [109]:
sc_y = StandardScaler()
# StandardScaler expects a 2-D array of shape (n_samples, n_features), so
# present the 1-D target as a single float column, scale it, and flatten
# the result back to 1-D.
# NOTE(review): y_train holds label-encoded Yes/No classes; scaling a class
# label is usually only wanted for regression targets - confirm it is needed.
y_train = sc_y.fit_transform(y_train.reshape(-1, 1).astype(float)).ravel()



In [133]:
# Standardised training features (8 of the 10 rows).
print(X_train)

[[-1.         -0.77459667  2.64575131 -1.21135429  0.12381479]
 [ 1.         -0.77459667 -0.37796447 -0.54557178  0.46175632]
 [-1.          1.29099445 -0.37796447  1.67370326 -1.53093341]
 [-1.          1.29099445 -0.37796447 -0.17569261 -1.11141978]
 [ 1.         -0.77459667 -0.37796447  0.56406574  1.7202972 ]
 [-1.          1.29099445 -0.37796447 -0.32364428 -0.16751412]
 [ 1.         -0.77459667 -0.37796447 -1.21135429  0.98614835]
 [ 1.         -0.77459667 -0.37796447  1.22984825 -0.48214934]]


In [134]:
# Test features (2 rows), scaled with the statistics fitted on the training rows.
print(X_test)

[[-1.         -0.77459667  2.64575131 -0.76749928 -0.90166297]
 [-1.         -0.77459667  2.64575131 -0.98942679  2.13981082]]
