In [None]:
import numpy as np 
import matplotlib.pyplot as plt
import pandas as pd 
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

Creating a dataset from a data frame

In [None]:
# Load the raw table and split it into the feature matrix (x) and the target vector (y).
# Features (independent variables) are the columns used to predict the result;
# the dependent variable is the value being predicted and sits in the last column.
# iloc selects by position as [rows, columns]; slices include the lower bound and exclude the upper bound.

# Forward slashes resolve on Windows, macOS and Linux alike. A backslash-separated literal only
# works on Windows, and sequences such as "\d" are invalid string escapes that newer Python versions warn about.
dataset = pd.read_csv('../data/Data.csv')
x = dataset.iloc[:, :-1].values  # all rows, every column except the last one
y = dataset.iloc[:, -1].values   # all rows, only the last column
print(x)
print(y)

Handling missing data

In [None]:
# Missing entries can be filled with the mean, the median or the most frequent value of a column.
# Only columns 1 and 2 are imputed (presumably the numeric ones; column 0 is one-hot encoded further down).
# The slice is written 1:3 because the upper bound of a Python slice is excluded; 1:2 would select column 1 only.
# The imputer returns a new array, so the result is assigned back into x to keep the replacement.

imputer = SimpleImputer(strategy='mean', missing_values=np.nan)  # NaN entries are replaced by the column mean
x[:, 1:3] = imputer.fit_transform(x[:, 1:3])  # learn the column means and fill the gaps in a single call
print(x)

Encoding categorical data

In [None]:
# Categorical data is information grouped into categories rather than expressed as numbers,
# such as gender or education level. It appears in almost every real-life dataset.
# One-hot encoding represents a categorical variable as binary vectors: each category gets its own
# column, and a row holds a 1 in the column of its category and 0 in all the others.

# Encoding the independent variable
# Column 0 is one-hot encoded; remainder='passthrough' keeps the other columns unchanged.
# sparse_threshold=0 forces a dense result: OneHotEncoder emits a sparse matrix by default, and the
# ColumnTransformer would return a sparse matrix whenever the stacked output is mostly zeros
# (density below the default threshold of 0.3). np.array() on a sparse matrix does not densify it;
# it yields a 0-d object array, which would break every later slicing step.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
x = np.array(ct.fit_transform(x))  # dummy columns come first, followed by the passed-through columns
print(x)

In [None]:
# Encoding the dependent variable
# LabelEncoder maps every distinct label to an integer code
# (0 and 1 when the target has two classes, as assumed here; confirm against the data).

le = LabelEncoder()
le.fit(y)            # learn the set of distinct labels
y = le.transform(y)  # replace each label with its integer code
print(y)

Splitting data into Training and Test sets

In [None]:
# Hold back part of the data so the model can later be evaluated on rows it has never seen.
# test_size=0.2 puts 20% of the rows in the test set and the remaining 80% in the training set;
# random_state=1 fixes the shuffle so the same split is produced on every run.

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=1,
    test_size=0.2,
)

# Show the four parts, one after the other
print(x_train, x_test, y_train, y_test, sep='\n')

Feature scaling

In [None]:
# Feature scaling is applied after the split, so the scaler only learns from the training rows.
# Scaling brings independent variables with very different ranges onto a comparable scale,
# e.g. age (18-100 years), salary (25,000-75,000 euros) and height (1-2 metres) end up centred
# around 0 or inside (0, 1), depending on the technique.
# https://towardsdatascience.com/all-about-feature-scaling-bcc0ad75cb35

# Standardisation and normalisation are the two common scaling techniques; StandardScaler standardises.

sc = StandardScaler()
# Dummy variables are not scaled, so only the columns from index 3 onwards are touched.
# NOTE(review): assumes the one-hot encoding produced exactly three dummy columns (indices 0-2); confirm against the data.
sc.fit(x_train[:, 3:])                         # learn mean and standard deviation from the training set only
x_train[:, 3:] = sc.transform(x_train[:, 3:])  # all rows, columns from index 3 onwards
x_test[:, 3:] = sc.transform(x_test[:, 3:])    # reuse the training statistics so both sets share one scale
print(x_train, x_test, sep='\n')


  Steps to follow
- Create a dataset from a data frame
- Handle missing data
- Encode categorical data
- Split data into Training and Test sets
- Feature scaling