# IMPORT LIBRARIES

In [12]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # Legacy location, kept as a fallback for very old scikit-learn installs.
    from sklearn.cross_validation import train_test_split

try:
    # Legacy imputer class; no longer shipped by current scikit-learn.
    # Guarded so this import cell does not crash where it is missing.
    from sklearn.preprocessing import Imputer
except ImportError:
    pass

# GET THE DATA

In [13]:
dataset = pd.read_csv('Data.csv')
# Features: every column except the last one (the independent variables).
X = dataset.iloc[:, :-1].values
# Target: the last column (the dependent variable). Using -1 instead of a
# hard-coded position keeps X and Y complementary whatever the column count,
# so the target can never silently be one of the feature columns.
Y = dataset.iloc[:, -1].values

# HANDLE MISSING DATA

In [14]:
# Replace every missing value (NaN) by the mean of its column.
# SimpleImputer is the current scikit-learn imputer: it always works
# column-wise (so there is no `axis` argument) and treats NaN as the missing
# marker by default.
imputer = SimpleImputer(strategy='mean')
# Fit on the second and third column (1:3 because the upper bound is excluded).
# NOTE(review): assumes these two columns are numeric - confirm against Data.csv.
imputer = imputer.fit(X[:, 1:3])
# Write the imputed values back into the same two columns of X.
X[:, 1:3] = imputer.transform(X[:, 1:3])

# ENCODE CATEGORICAL DATA

In [15]:
label_encoder_X = LabelEncoder()
# Apply Label Encoder to the first column -> integer codes.
# LabelEncoder numbers the classes in sorted order.
X[:, 0] = label_encoder_X.fit_transform(X[:, 0])

label_encoder_Y = LabelEncoder()
Y = label_encoder_Y.fit_transform(Y)

# Dummy Encoding, BECAUSE ML-Algorithms would interpret: SPAIN (2) is "better" than France (0)
# `OneHotEncoder(categorical_features=...)` is no longer available, so a
# ColumnTransformer selects the column to encode instead:
#   - column 0 is one-hot encoded,
#   - remainder='passthrough' keeps all other columns, appended after the dummies,
#   - sparse_threshold=0 forces a dense array, so no .toarray() is needed.
one_hot_encoder = ColumnTransformer(
    transformers=[('first_column', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
# The passthrough columns come from an object array, so cast the result to float.
X = one_hot_encoder.fit_transform(X).astype(float)
# --> The leading dummy columns follow the sorted label order:
#     France (0), Germany (1), Spain (2)   (1 = true, 0 = false)
# No need to do the same for Y (Last column), because it is only {0,1}

# SPLIT DATASET INTO TRAINING SET AND TEST SET

In [16]:
# Hold out 20% of the rows as a test set: the model learns on one part of the
# data and is evaluated on a different but similar part, which exposes overfitting.
# random_state=0 makes the shuffle reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

# FEATURE SCALING

In [17]:
# For example: Standardisation or Normalisation
# -->   1. No Variable is dominated by another
#       2. Algorithms run a lot faster on scaled variables
# (StandardScaler is imported in the import cell at the top of the notebook.)
standard_scaler_X = StandardScaler()
# Fit on the training set only, then reuse the same fitted scaler on the test
# set, so that no statistics of the test data leak into the preprocessing.
X_train = standard_scaler_X.fit_transform(X_train)
X_test = standard_scaler_X.transform(X_test)
# IMPORTANT: Do we have to fit and scale the dummy variables?!
# --> Depends on context! It is not strictly required, but note that the code
#     above scales EVERY column of X, including the dummy columns.
#     To leave the dummies untouched, fit/transform only the non-dummy columns.