In [None]:
# Standard imports
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

#### 1. Getting our data ready to be used with machine learning

Three main things we have to do (this list is not exhaustive):
1. Split the data into features and labels (usually X & y)
2. Filling (also called imputing) or disregarding missing values
3. Converting non-numerical values to numerical values (also called feature encoding)

##### 1.1 Split the data

In [None]:
# Read the heart-disease CSV into a DataFrame and preview its first row
heart_disease = pd.read_csv(filepath_or_buffer="data/heart-disease.csv")
heart_disease.head(n=1)

In [None]:
# Separate the feature columns (x) from the label column (y)
x = heart_disease.drop(columns="target")
y = heart_disease["target"]

In [None]:
# Split the data into training and test sets (30% of the rows are held out for testing)
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
)

In [None]:
# Dimensions of the full (unsplit) feature frame, as (rows, columns)
x.shape

In [None]:
# Shapes of the four pieces produced by the split, in the order
# (x_train, x_test, y_train, y_test)
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

##### 1.2 Make sure all the features are numerical

In [None]:
# Read the extended car-sales CSV into a DataFrame and preview its first row
car_sales = pd.read_csv(filepath_or_buffer="data/car-sales-extended.csv")
car_sales.head(n=1)

In [None]:
# Number of rows in the car-sales data
len(car_sales)

In [None]:
# Inspect each column's dtype to spot the non-numerical features
car_sales.dtypes

In [None]:
# Separate the feature columns (x) from the label column (y)
x = car_sales.drop(columns="Price")
y = car_sales["Price"]

In [None]:
# Split the data into training and test sets (20% of the rows are held out for testing)
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
)

In [None]:
# Why is Doors treated as categorical? Count how often each distinct Doors value occurs.
# Note: the trailing `;` suppresses this cell's output.
car_sales.Doors.value_counts();

In [None]:
# 1. Turn the categories into numbers - using sklearn
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Columns to one-hot encode; every other column is passed through unchanged
categorial_features = ["Make", "Colour", "Doors"]

one_hot = OneHotEncoder()
transformer = ColumnTransformer(
    transformers=[("one_hot", one_hot, categorial_features)],
    remainder="passthrough",
)

# Learn the categories from x and encode it in one step
transformed_x = transformer.fit_transform(x)

In [None]:
# Preview the encoded features; wrapping the transformer output in a DataFrame
# without `columns=` gives default integer column labels
pd.DataFrame(transformed_x).head()

In [None]:
# 2. Turn the categories into numbers - using pandas.get_dummies
categorial_features = ["Make", "Colour", "Doors"]

# Name the columns to encode explicitly. By default get_dummies only converts
# object/string/category columns, so a numeric "Doors" column would be passed
# through unencoded instead of being one-hot encoded like the sklearn version.
dummies = pd.get_dummies(car_sales[categorial_features], columns=categorial_features)
dummies.head()

##### 1.3 What if there were missing values?

1. Fill them with some value (also known as imputation). 
2. Remove the samples with missing data altogether.

Option 1: Fill missing data with Pandas

In [None]:
# Read the car-sales CSV that contains missing values and look at its first ten rows
# (the trailing `;` suppresses the preview output)
car_sales_missing = pd.read_csv(filepath_or_buffer="data/car-sales-extended-missing-data.csv")
car_sales_missing.head(n=10);

In [None]:
# Count the missing values in each column
car_sales_missing.isna().sum()

In [None]:
# Separate the features (X) from the label (y).
# NOTE(review): this snapshot is taken before the fill step below, so X keeps
# any missing values that car_sales_missing has at this point.
X = car_sales_missing.drop(columns="Price")
y = car_sales_missing["Price"]

In [None]:
# Fill the missing data (make sure you fill the train and test data separately - here it's done together)

# One replacement value per column, applied in a single in-place pass:
#   "Make" and "Colour" -> the placeholder category "missing"
#   "Odometer (KM)"     -> the mean of the column's existing values
#   "Doors"             -> 4
replacements = {
    "Make": "missing",
    "Colour": "missing",
    "Odometer (KM)": car_sales_missing["Odometer (KM)"].mean(),
    "Doors": 4,
}
car_sales_missing.fillna(replacements, inplace=True)

In [None]:
# Re-count the missing values per column after filling
# (the trailing `;` suppresses this cell's output)
car_sales_missing.isna().sum();

In [None]:
# Dropping the rows with missing data:

# Remove only the rows whose Price (the label) is missing; restricting the drop
# to that column keeps the code in line with the stated intent even if another
# column still holds a missing value
car_sales_missing.dropna(subset=["Price"], inplace=True)

In [None]:
# Re-count the missing values per column after dropping rows
# (the trailing `;` suppresses this cell's output)
car_sales_missing.isna().sum();

In [None]:
# Number of rows remaining in car_sales_missing (output suppressed by the trailing `;`)
len(car_sales_missing);

In [None]:
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorial_features = ["Make", "Colour", "Doors"]

# Build the features from the frame that was filled and cleaned in this section.
# Before this assignment `x` still held the car_sales features from section 1.2,
# so the encoder would have been fitted on the wrong data set.
x = car_sales_missing.drop("Price", axis=1)

# One-hot encode the categorical columns; the remaining columns pass through unchanged
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot, categorial_features)], remainder="passthrough")

transformed_x = transformer.fit_transform(x)

In [None]:
# Preview the encoded features as a DataFrame with default integer column labels
# (the trailing `;` suppresses this cell's output)
pd.DataFrame(transformed_x).head();

Option 2: Filling missing data and transforming categorical data with Scikit-Learn

The main takeaways:

1. Split your data first (into train/test)
2. Fill/transform the training set and test sets separately

In [None]:
# Reload the raw missing-data CSV so this option starts from an unmodified frame
# (the trailing `;` suppresses the preview output)
car_sales_missing = pd.read_csv(filepath_or_buffer="data/car-sales-extended-missing-data.csv")
car_sales_missing.head(n=5);

In [None]:
# Count the missing values per column in the reloaded frame
# (the trailing `;` suppresses this cell's output)
car_sales_missing.isna().sum();

In [None]:
# Keep only the rows that have a label (Price), then re-count the missing values
# per column (the trailing `;` suppresses the count output)
car_sales_missing = car_sales_missing.dropna(subset=["Price"])
car_sales_missing.isna().sum();

In [None]:
# Separate the feature columns (x) from the label column (y)
x = car_sales_missing.drop(columns="Price")
y = car_sales_missing["Price"]

In [None]:
# Split data into train and test (20% of the rows are held out for testing)
from sklearn.model_selection import train_test_split

# Seed NumPy's global random generator right before the split
np.random.seed(42)
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
)

In [None]:
# Count the missing values per feature column of x
# (the trailing `;` suppresses this cell's output)
x.isna().sum();

In [None]:
# Fill missing values with Scikit-Learn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Fill categorical values with 'missing', Doors with 4 & numerical values with the mean
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")
door_imputer = SimpleImputer(strategy="constant", fill_value=4)
num_imputer = SimpleImputer(strategy="mean")

# Define columns
cat_features = ["Make", "Colour"]
door_feature = ["Doors"]
num_features = ["Odometer (KM)"]

# Create an imputer (something that fills missing data).
# The output columns follow the order of the transformers listed here:
# Make, Colour, Doors, Odometer (KM).
imputer = ColumnTransformer([
    ("cat_imputer", cat_imputer, cat_features),
    ("door_imputer", door_imputer, door_feature),
    ("num_imputer", num_imputer, num_features)
])

# Fill train and test values separately: the fill statistics are learned from the
# training set only and then re-used on the test set.
# The split above names its outputs x_train / x_test (lowercase), so those are used here.
filled_X_train = imputer.fit_transform(x_train)
filled_X_test = imputer.transform(x_test)

In [None]:
# Preview the filled training array as a DataFrame; no `columns=` is given, so the
# columns get default integer labels (the trailing `;` suppresses this cell's output)
pd.DataFrame(filled_X_train).head();

In [None]:
# Wrap the imputed arrays in DataFrames again, giving the columns their names back
filled_columns = ["Make", "Colour", "Doors", "Odometer (KM)"]

car_sales_filled_train = pd.DataFrame(filled_X_train, columns=filled_columns)
car_sales_filled_test = pd.DataFrame(filled_X_test, columns=filled_columns)

In [None]:
# Count the missing values per column in the filled training set
# (the trailing `;` suppresses this cell's output)
car_sales_filled_train.isna().sum();

In [None]:
# Count the missing values per column in the original frame, which the imputer did
# not modify (the trailing `;` suppresses this cell's output)
car_sales_missing.isna().sum();

In [None]:
# One-hot encode the categorical columns, set up the same way as before;
# every column not listed here is passed through unchanged
categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer(
    transformers=[("one_hot", one_hot, categorical_features)],
    remainder="passthrough",
)

In [None]:
# Encode train and test values separately: fit the transformer on the training
# set only, then apply the already-fitted transformer to the test set
transformed_X_train = transformer.fit_transform(car_sales_filled_train)
transformed_X_test = transformer.transform(car_sales_filled_test)

In [None]:
# Convert the transformed and filled training features to a dense array for inspection
# NOTE(review): `.toarray()` assumes the transformer returned a sparse matrix - confirm
# (the trailing `;` suppresses this cell's output)
transformed_X_train.toarray();

In [None]:
# Now that the features are filled and one-hot encoded, fit a model on them
from sklearn.ensemble import RandomForestRegressor

# Seed NumPy's global random generator before the model is built and fitted
np.random.seed(42)

model = RandomForestRegressor()

# Train on the transformed training data, then score on the transformed test data
model.fit(transformed_X_train, y_train)
model.score(transformed_X_test, y_test)

In [None]:
# Compare the total row count of the transformed (filled and one-hot encoded)
# train + test sets with the row count of the car_sales frame
n_transformed_rows = transformed_X_train.shape[0] + transformed_X_test.shape[0]
n_transformed_rows, len(car_sales)