In [None]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd 

In [None]:
# Load the raw data and separate it into features and target.
#   x - feature matrix (independent variables): every column except the last
#   y - target vector (dependent variable): the last column
# iloc selects by integer position and its slices are half-open, so ":-1"
# stops just before the last column, while a bare "-1" picks the last column.

dataset = pd.read_csv('Data.csv')
x, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values

In [None]:
# Fill in missing values
# SimpleImputer can replace a missing entry with the column mean, the median
# or the most frequent value; here the mean is used.
# The imputer only returns the repaired columns, so the result has to be
# written back into x for the replacement to take effect.

from sklearn.impute import SimpleImputer

# Entries equal to NaN are treated as missing and replaced by the column mean.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Only columns 1 and 2 are passed in: the mean strategy needs numeric input.
# fit_transform learns the column means and applies them in a single call.
x[:, 1:3] = imputer.fit_transform(x[:, 1:3])
print(x)

In [None]:
# Encoding the independent variables
# Column 0 is one-hot encoded; every other column is passed through unchanged.
# ColumnTransformer emits the encoded (dummy) columns first, followed by the
# passthrough columns in their original order.
# NOTE: this cell reassigns x, so re-running it without first re-running the
# cells above would try to encode an already-encoded column.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',  # leave the non-encoded columns as they are
    # OneHotEncoder produces a sparse matrix, and ColumnTransformer keeps the
    # stacked result sparse when its density is below sparse_threshold
    # (default 0.3). np.array() cannot turn a sparse matrix into a regular
    # 2-D array, so force a dense result regardless of the data.
    sparse_threshold=0,
)
x = np.array(ct.fit_transform(x))
print(x)

In [9]:
# Encode the dependent variable
# LabelEncoder maps each distinct class label in y to an integer code.
# The fitted encoder is kept in `le` so the original labels stay recoverable.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
encoded_target = le.fit_transform(y)
y = encoded_target
print(encoded_target)

[0 1 0 0 1 1 0 1 0 1]


In [None]:
# Split the features and target into a training set and a test set.
# test_size=0.2 holds out 20% of the rows for testing; the fixed random_state
# makes the shuffle, and therefore the split, reproducible between runs.

from sklearn.model_selection import train_test_split

split = train_test_split(x, y, test_size=0.2, random_state=1)
x_train, x_test, y_train, y_test = split

# Show each part in the same order as before: features first, then targets.
for part in (x_train, x_test, y_train, y_test):
    print(part)

In [None]:
# Feature scaling (standardisation), done after the train/test split.
# The scaler is fitted on the training rows only and the same fitted scaler
# is then applied to the test rows, so both sets share one set of statistics.

from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# Dummy (one-hot) columns are not scaled. They are assumed to occupy column
# indices 0-2, so scaling starts at index 3 (the fourth column).
# NOTE(review): the 3 depends on how many categories the encoder produced -
# confirm against the data.
numeric_cols = slice(3, None)

x_train[:, numeric_cols] = sc.fit_transform(x_train[:, numeric_cols])  # learn mean/std and scale
x_test[:, numeric_cols] = sc.transform(x_test[:, numeric_cols])        # reuse the training statistics

for scaled in (x_train, x_test):
    print(scaled)
