In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# 1) Getting the data ready

Three of the main steps you'll often have to take are:

1. splitting the data into features (usually X) and labels (usually y)
2. Filling data (also called imputing) or disregarding missing values - (Cleaning up the data)
3. Converting non-numerical values to numerical values (also called feature encoding) - (Cleaning up the data)

### We will learn how to put it all together later

For now, we will dive into separate sections and focus on them individually.

In [3]:
# Load the heart-disease dataset from the local data/ folder and preview its first rows.
heart_df = pd.read_csv("data/heart-disease.csv")
heart_df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [4]:
# Features (X): every column except the "target" label column.
# drop(columns=...) is the keyword spelling of drop(..., axis=1);
# axis=1 refers to columns, axis=0 to rows.
X = heart_df.drop(columns="target")
X.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [5]:
# Labels (y): the "target" column, i.e. the value to be predicted from the features.
y = heart_df["target"]
y.head()

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

In [6]:
# Most important principle: in ML, never evaluate or test your models on data they have learned from.
# That's why we split the data.

In [7]:
X.shape, heart_df.shape[0]  # feature-matrix shape and total number of rows in the data frame

((303, 13), 303)

## Split data into training and test sets  

after converting it to numerical form and cleaning the data

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set. train_test_split returns four objects:
# train/test features followed by train/test labels, split on the same rows.
# random_state pins the shuffle so Restart & Run All gives the same split every time.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# test_size=0.2 => the test set is 20% of the rows (61 of 303 here),
# leaving the other 242 rows for training; y is split on the same rows.

In [9]:
# Row/column counts of each piece of the split:
# the feature sets are 2-D (rows, columns), the label sets are 1-D (rows,).
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((242, 13), (61, 13), (242,), (61,))

In [10]:
# Clean data --> Transform (manipulate) data --> Reduce data

# Clean data --> remove and replace data (when data or labels is missing)
# remove empty row/column or fill it with average values

# Transform data --> convert information into numbers the computer can understand
# e.g. convert colors into numbers - transform data between 0 and 1

# Reduce data --> The more data we have, the more CPU or energy needed to run our computation, costs more money
# So, sometimes having more data is not necessarily good 
# This process of data reduction is known as dimensionality reduction or column reduction
# remove irrelevant data. Change it into a form that is useful for you.

# Remember(IMP): - Don't assume that all data will be perfect or needed.  

### 1.1) Make sure all the data is numerical - conversion

In [11]:
# Load the extended car-sales dataset from the local data/ folder and preview its first rows.
car_df = pd.read_csv("data/car-sales-extended.csv")
car_df.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [12]:
car_df.shape[0]  # number of rows in the data frame

1000

In [13]:
car_df.dtypes # dtype of each column: the object columns hold text, the int64 columns hold integers

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

In [14]:
# use the first 4 columns (X) to predict prices (y) 
# --> although we need more information in real life

# build a ml model that will train on the training data, and predict on the test data

# But a ml model cannot deal with strings, only numerical data

In [15]:
# Split into features (X) and the label to predict (y), then into train/test sets.
X = car_df.drop("Price", axis=1)
y = car_df["Price"]

# random_state fixes the shuffle so the split is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [16]:
# Shapes of the train/test features and labels, in that order.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((800, 4), (200, 4), (800,), (200,))

In [17]:
car_df["Doors"].value_counts()
# Doors is stored as an integer but only takes a few distinct values (3, 4, 5),
# so it can be treated as a category as well

4    856
5     79
3     65
Name: Doors, dtype: int64

In [18]:
# One-hot encode the categorical columns (Make, Colour and Doors) so that
# every feature is numerical.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()

# remainder="passthrough" keeps the columns NOT listed in categorical_features
# (here Odometer (KM)) unchanged and appends them after the encoded columns,
# instead of dropping them.
transformer = ColumnTransformer(
    transformers=[("one_hot", one_hot, categorical_features)],
    remainder="passthrough",
)

transformed_X = transformer.fit_transform(X)
transformed_X

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [19]:
X.head() # X is the full feature frame (not split, not yet encoded), shown for comparison

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431,4
1,BMW,Blue,192714,5
2,Honda,White,84714,4
3,Toyota,White,154365,4
4,Nissan,Blue,181577,3


In [20]:
pd.DataFrame(transformed_X) # all data is now numerical
# Each categorical column has been expanded into 0/1 indicator columns;
# the last column is the untouched Odometer (KM) value

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0
996,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,155144.0
997,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0
998,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,215883.0


In [21]:
# Another way to one-hot encode the data: pandas' pd.get_dummies()

# By default get_dummies only encodes object/string/category columns, so Doors
# (an int column) has to be cast to object first. The cast is applied to a
# selected copy rather than assigned back, so car_df keeps its original dtypes
# and re-running this cell does not change state that other cells displayed.
dummies = pd.get_dummies(
    car_df[["Make", "Colour", "Doors"]].astype({"Doors": object})
)
dummies # this could then be used as the encoded categorical part of X

Unnamed: 0,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White,Doors_3,Doors_4,Doors_5
0,0,1,0,0,0,0,0,0,1,0,1,0
1,1,0,0,0,0,1,0,0,0,0,0,1
2,0,1,0,0,0,0,0,0,1,0,1,0
3,0,0,0,1,0,0,0,0,1,0,1,0
4,0,0,1,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0,1,1,0,0,0,0,0,1,0
996,0,0,1,0,0,0,0,0,1,1,0,0
997,0,0,1,0,0,1,0,0,0,0,1,0
998,0,1,0,0,0,0,0,0,1,0,1,0


In [22]:
# Number of rows for each car make in the feature frame.
X["Make"].value_counts()

Toyota    398
Honda     304
Nissan    198
BMW       100
Name: Make, dtype: int64

In [23]:
# Finally, redo the train/test split on the encoded (all-numerical) features.
# No model is fitted here - this only re-splits the data.
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    transformed_X, y, test_size=0.2, random_state=42
)

In [24]:
# Shapes after re-splitting: the feature sets now have the one-hot encoded columns.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((800, 13), (200, 13), (800,), (200,))

In [25]:
# -----------------------------------------------------------

### Feature Scaling

Once your data is all in numerical format, the next step is making sure all of your numerical data is on the same scale.

For example, say you were trying to predict the sale price of cars and the number of kilometres on their odometers varies from 6,000 to 345,000 but the median previous repair cost varies from 100 to 1,700. A machine learning algorithm may have trouble finding patterns in these wide-ranging variables.

To fix this, there are two main types of feature scaling.

* Normalization (also called min-max scaling) - This rescales all the numerical values to between 0 and 1, with the lowest previous value mapped to 0 and the highest previous value mapped to 1. Scikit-Learn provides functionality for this in the [MinMaxScaler class](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html)

* Standardization - This subtracts the mean value from all of the features (so the resulting features have 0 mean). It then scales the features to unit variance (by dividing the feature by the standard deviation). Scikit-Learn provides functionality for this in the [StandardScaler class](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)

A couple of things to note.

* Feature scaling usually isn't required for your target variable.

* Feature scaling is usually not required with tree-based models (e.g. Random Forest) since they can handle varying features.

Extra reading

* [Feature Scaling with Scikit-Learn](https://benalexkeen.com/feature-scaling-with-scikit-learn/) by Ben Alex Keen

* [Feature Scaling for Machine Learning: Understanding the Difference Between Normalization vs. Standardization](https://www.analyticsvidhya.com/blog/2020/04/feature-scaling-machine-learning-normalization-standardization/) by Aniruddha Bhandari

In [30]:
transformed_pd = pd.DataFrame(transformed_X) # transformed_X holds the encoded features for ALL rows of X (not only the training split)
transformed_pd

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0
996,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,155144.0
997,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0
998,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,215883.0


## Min-Max Scaler

The MinMaxScaler is probably the most famous scaling algorithm.

If the distribution is not Gaussian or the standard deviation is very small, the min-max scaler works better.

However, it is sensitive to outliers

### Methods to know

* fit(X,y) - Compute the minimum and maximum to be used for later scaling.

* fit_transform(X,y) - Fit to data, then transform it.

* transform(X) - Scale features of X according to feature_range

In [33]:
from sklearn import preprocessing

# Rescale every column to the 0-1 range with min-max scaling, then wrap the
# resulting array back into a DataFrame so it displays as a table.
# NOTE(review): the scaler is fitted on all rows of transformed_pd, including
# rows that a train/test split would hold out - confirm this is intended.
scaler = preprocessing.MinMaxScaler()
scaled_df = pd.DataFrame(scaler.fit_transform(transformed_pd))

In [36]:
# The one-hot columns (already 0/1) are unchanged; the last column (odometer) now lies between 0 and 1.
scaled_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.105472
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.761606
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.311065
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.601626
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.715146
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.107095
996,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.604876
997,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.235516
998,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.858259


In [None]:
# -----------------------------------------------------------------------------------------------