- Handle missing values and categorical variables
- Design pipelines
- Use cross-validation
- Avoid data leakage

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the Melbourne housing data
data = pd.read_csv('melb_data.csv')

# The sale price is the prediction target
y = data['Price']

# To keep things simple, keep only the predictors whose dtype is not object
melb_predictors = data.drop(columns=['Price'])
X = melb_predictors.select_dtypes(exclude=['object'])

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    """Fit a 10-tree random forest and return its validation MAE.

    Args:
        X_train: Training features passed to ``fit``.
        X_valid: Validation features passed to ``predict``.
        y_train: Training targets passed to ``fit``.
        y_valid: Validation targets the predictions are compared against.

    Returns:
        The mean absolute error between ``y_valid`` and the predictions
        made for ``X_valid``.
    """
    # Fixed seed so that different preprocessing approaches are compared fairly
    forest = RandomForestRegressor(n_estimators=10, random_state=0)
    forest.fit(X_train, y_train)
    return mean_absolute_error(y_valid, forest.predict(X_valid))

# Missing Values
- Drop columns with missing values
- Imputation: fill in the missing values with some number
- Extension to imputation: impute, and also add a new column that shows the location of the imputed entries

In [6]:
# Approach 1: discard every column that contains a missing value

# Names of the training columns holding at least one missing entry
cols_with_missing = X_train.columns[X_train.isnull().any()].tolist()

# Remove those columns from both splits so their features stay aligned
reduced_X_train = X_train.drop(columns=cols_with_missing)
reduced_X_valid = X_valid.drop(columns=cols_with_missing)

print("MAE from Approach 1 (Drop columns with missing values):")
print(score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid))

MAE from Approach 1 (Drop columns with missing values):
183550.22137772635


In [7]:
# Approach 2: Imputation
from sklearn.impute import SimpleImputer

# Fit the imputer on the training split only, then reuse it on the validation
# split. The imputer returns bare arrays, so the column names are restored
# while rebuilding each DataFrame.
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train),
                               columns=X_train.columns)
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid),
                               columns=X_valid.columns)

print("MAE from Approach 2 (Imputation): ")
print(score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))

MAE from Approach 2 (Imputation): 
178166.46269899711


In [8]:
# Approach 3: imputation plus "was missing" indicator columns
# Work on copies so the original splits stay untouched
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()

# For every column with gaps, record which entries are about to be imputed
for col in cols_with_missing:
    missing_flag = col + '_was_missing'
    X_train_plus[missing_flag] = X_train_plus[col].isna()
    X_valid_plus[missing_flag] = X_valid_plus[col].isna()

# Impute (fit on training data only); the imputer returns bare arrays, so the
# column names are restored while rebuilding each DataFrame
my_imputer = SimpleImputer()
imputed_X_train_plus = pd.DataFrame(my_imputer.fit_transform(X_train_plus),
                                    columns=X_train_plus.columns)
imputed_X_valid_plus = pd.DataFrame(my_imputer.transform(X_valid_plus),
                                    columns=X_valid_plus.columns)

print("MAE from Approach 3 (An extension to Imputation):")
print(score_dataset(imputed_X_train_plus, imputed_X_valid_plus, y_train, y_valid))

MAE from Approach 3 (An extension to Imputation):
178927.503183954


In [9]:
# Dimensions of the training data as (num_rows, num_columns)
print(X_train.shape)

# Per-column count of missing values, showing only the columns that have any
missing_val_count_by_column = X_train.isnull().sum()
print(missing_val_count_by_column.loc[missing_val_count_by_column > 0])

(10864, 12)
Car               49
BuildingArea    5156
YearBuilt       4307
dtype: int64


# Categorical Variables
Three approaches:
- Drop categorical variables
- Ordinal encoding: map each unique value to a different integer
- One-hot encoding: create a new column for each possible value -> does not work well when a variable takes a large number of values


In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the Melbourne housing data
data = pd.read_csv('melb_data.csv')

# Split the target off from the predictors
y = data['Price']
X = data.drop(columns=['Price'])

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

# Simplest treatment of missing values: drop every column that has any
cols_with_missing = [col for col in X_train_full.columns if X_train_full[col].isnull().any()]
X_train_full = X_train_full.drop(columns=cols_with_missing)
X_valid_full = X_valid_full.drop(columns=cols_with_missing)

# "Cardinality" means the number of unique values in a column.
# Keep object-typed columns with fewer than 10 distinct values
# (a convenient but arbitrary threshold)
low_cardinality_cols = [
    cname for cname in X_train_full.columns
    if X_train_full[cname].dtype == "object" and X_train_full[cname].nunique() < 10
]

# Numerical columns: those stored as int64 or float64
numerical_cols = [
    cname for cname in X_train_full.columns
    if X_train_full[cname].dtype in ['int64', 'float64']
]

# Restrict both splits to the selected columns
my_cols = low_cardinality_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_train.head()

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
12167,u,S,Southern Metropolitan,1,5.0,3182.0,1.0,1.0,0.0,-37.85984,144.9867,13240.0
6524,h,SA,Western Metropolitan,2,8.0,3016.0,2.0,2.0,193.0,-37.858,144.9005,6380.0
8413,h,S,Western Metropolitan,3,12.6,3020.0,3.0,1.0,555.0,-37.7988,144.822,3755.0
2919,u,SP,Northern Metropolitan,3,13.0,3046.0,3.0,1.0,265.0,-37.7083,144.9158,8870.0
6043,h,S,Western Metropolitan,3,13.3,3020.0,3.0,1.0,673.0,-37.7623,144.8272,4217.0


In [11]:
# Build the list of categorical variables:
# a boolean mask over the columns that is True where the dtype is object
s = X_train.dtypes == 'object'
object_cols = s[s].index.tolist()

print("Categorical variables:")
print(object_cols)

Categorical variables:
['Type', 'Method', 'Regionname']


- Approach 1: Drop Categorical Variables

In [12]:
# Keep only the columns whose dtype is not object
drop_X_train = X_train.select_dtypes(exclude='object')
drop_X_valid = X_valid.select_dtypes(exclude='object')

print("MAE from Approach 1 (Drop Categorical variables):")
print(score_dataset(drop_X_train, drop_X_valid, y_train, y_valid))

MAE from Approach 1 (Drop Categorical variables):
183550.22137772635


- Approach 2: Ordinal Encoding

In [13]:
from sklearn.preprocessing import OrdinalEncoder

# Work on copies so the original splits stay untouched
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

# Learn the category -> number mapping from the training split only, then
# apply that same mapping to the categorical columns of both splits
ordinal_encoder = OrdinalEncoder()
ordinal_encoder.fit(X_train[object_cols])
label_X_train[object_cols] = ordinal_encoder.transform(X_train[object_cols])
label_X_valid[object_cols] = ordinal_encoder.transform(X_valid[object_cols])

print("MAE from Approach 2 (Ordinal Encoding):")
print(score_dataset(label_X_train, label_X_valid, y_train, y_valid))

MAE from Approach 2 (Ordinal Encoding):
175062.2967599411


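- Approach 3: One-Hot Encoding
`handle_unknown='ignore'` -> avoids errors when the validation data contains classes that aren't represented in the training data
`sparse=False` -> ensures the encoded columns are returned as a NumPy array instead of a sparse matrix (the parameter is named `sparse_output` in scikit-learn 1.2+)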

In [14]:
from sklearn.preprocessing import OneHotEncoder

# Apply a one-hot encoder to each column with categorical data.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and later
# releases removed the old name, so try the new spelling first and fall back
# to the old one on releases that do not know it yet.
try:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]))

# One-hot encoding removed the index; put it back so the concat below aligns rows
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

# Remove the categorical columns (they are replaced by the one-hot columns)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

# Add the one-hot encoded columns to the numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

# The one-hot columns are labelled with integers; make every label a string
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)

print("MAE from Approach 3 (One-Hot Encoding):")
print(score_dataset(OH_X_train, OH_X_valid, y_train, y_valid))



MAE from Approach 3 (One-Hot Encoding):
176703.63810751104


# Pipelines
1. Cleaner code
2. Fewer Bugs
3. Easier to Productionize
4. More Options for Model Validation

In [18]:
# Step 1: Define Preprocessing Steps
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
# Categorical features: object-typed columns with fewer than 10 distinct values
categorical_cols = [
    cname for cname in X_train_full.columns
    if X_train_full[cname].dtype == "object" and X_train_full[cname].nunique() < 10
]

# Numerical data: fill missing entries with a constant
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical data: fill gaps with the most frequent value, then one-hot encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns through its own transformer
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [19]:
#Step 2: Define the Model
from sklearn.ensemble import RandomForestRegressor

# 10-tree random forest with a fixed seed so results are repeatable
model = RandomForestRegressor(
    n_estimators=10,
    random_state=0,
)

In [20]:
#Step 3: Create and Evaluate the Pipeline

from sklearn.metrics import mean_absolute_error

# Chain preprocessing and the model so they are fitted and applied together
my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model),
])

# Fit: preprocesses the training data, then trains the model
my_pipeline.fit(X_train, y_train)

# Predict: preprocesses the validation data, then generates predictions
preds = my_pipeline.predict(X_valid)

# Score the predictions with mean absolute error
score = mean_absolute_error(y_valid, preds)
print('MAE: ', score)

MAE:  176703.63810751104


# Cross Validation
- Run the modeling process on different subsets of the data to get multiple measures of model quality
- Gives a more accurate measure of model quality, but takes longer to run
-> Small dataset: run cross-validation
-> Larger dataset: a single validation set is sufficient



In [27]:
# Impute missing values, then fit a 50-tree random forest with a fixed seed
my_pipeline = Pipeline(
    steps=[
        ('preprocessor', SimpleImputer()),
        ('model', RandomForestRegressor(n_estimators=50, random_state=0)),
    ]
)


In [28]:
from sklearn.model_selection import cross_val_score
# Restrict the predictors to a handful of columns
cols_to_use = ['Rooms', 'Distance', 'Landsize', 'BuildingArea', 'YearBuilt']
X = data[cols_to_use]

# Select target
y = data['Price']

# The 'neg_mean_absolute_error' scorer reports negative MAE, so flip the sign
scores = -cross_val_score(my_pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')

print("MAE score:\n", scores)

MAE score:
 [301628.7893587  303164.4782723  287298.331666   236061.84754543
 260383.45111427]


In [30]:
# Average the per-fold MAE values into a single quality measure
print("Average MAE score (across experiments):", scores.mean(), sep="\n")

Average MAE score (across experiments):
277707.3795913405


# XGBoost
- Gradient Boosting
  - Initialize the ensemble with a single model
  - Start the cycle
    - Use the current ensemble to generate predictions for each observation in the dataset
    - Calculate a loss function (e.g. mean squared error)
    - Use the loss to fit a new model, which is added to the ensemble
- Extreme gradient boosting (XGBoost)


In [32]:
from xgboost import XGBRegressor
# Fix the seed so the split (and every score derived from it) is reproducible,
# matching the other train/validation splits in this notebook
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=0)

# Baseline XGBoost regressor with default settings
my_model = XGBRegressor()
my_model.fit(X_train, y_train)

In [34]:
from sklearn.metrics import mean_absolute_error

# Predict on the held-out split and score it; arguments follow the
# (y_true, y_pred) order used by the other MAE calls in this notebook
predictions = my_model.predict(X_valid)
print("Mean Absolute Error: " + str(mean_absolute_error(y_valid, predictions)))

Mean Absolute Error: 240293.5607154363


Parameter Tuning
- n_estimators: how many times to go through the modeling cycle, which equals the number of models in the ensemble.
  Typical values range from 100 to 1000.

In [35]:
# Run the boosting cycle 500 times
my_model = XGBRegressor(
    n_estimators=500,
)
my_model.fit(X_train, y_train)

- early_stopping_rounds: automatically finds the ideal value for n_estimators. Early stopping makes the model stop iterating when the validation score stops improving.

In [39]:
# Early stopping is configured on the estimator: XGBoost 1.6 deprecated passing
# `early_stopping_rounds` to fit(), and later releases removed that argument.
# Training stops once the score on eval_set has not improved for 5 rounds.
my_model = XGBRegressor(n_estimators=500, early_stopping_rounds=5)
my_model.fit(X_train, y_train,
             eval_set=[(X_valid, y_valid)],
             verbose=False)



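- learning_rate:
multiply the predictions from each model by a small number before adding them in
  -> lets us set higher values for n_estimators without overfitting
Default: older XGBoost releases set learning_rate = 0.1 in the scikit-learn wrapper; current releases fall back to the library default of 0.3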

In [40]:
# A smaller learning rate paired with more estimators; early stopping picks
# the actual number of rounds. `early_stopping_rounds` is set on the estimator
# because XGBoost 1.6 deprecated (and later releases removed) the fit() argument.
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05,
                        early_stopping_rounds=5)
my_model.fit(X_train, y_train,
             eval_set=[(X_valid, y_valid)],
             verbose=False)



- n_jobs: on larger datasets, set n_jobs to the number of cores on the machine so the model is built in parallel

In [41]:
# Same configuration as before, now requesting 4 parallel jobs.
# `early_stopping_rounds` is set on the estimator because XGBoost 1.6
# deprecated (and later releases removed) the fit() argument.
my_model = XGBRegressor(n_estimators=1000, learning_rate=0.05, n_jobs=4,
                        early_stopping_rounds=5)
my_model.fit(X_train, y_train,
             eval_set=[(X_valid, y_valid)],
             verbose=False)



# Data Leakage
- Happens when the training data contains information about the target, but similar data will not be available when the model is used for prediction
- Leads to high performance on the training set (and possibly the validation set), but the model performs poorly in production
- 2 types: target leakage and train-test contamination

## Data Leakage
-

In [43]:
import pandas as pd

# Load the credit-card data, parsing 'yes'/'no' cells as booleans
data = pd.read_csv(
    'AER_credit_card_data.csv',
    true_values=['yes'],
    false_values=['no'],
)

# Target: the `card` column
y = data['card']

# Predictors: every other column
X = data.drop(columns=['card'])

print("Number of rows in the dataset:", X.shape[0])
X.head()

Number of rows in the dataset: 1319


Unnamed: 0,reports,age,income,share,expenditure,owner,selfemp,dependents,months,majorcards,active
0,0,37.66667,4.52,0.03327,124.9833,True,False,3,54,1,12
1,0,33.25,2.42,0.005217,9.854167,False,False,3,34,1,13
2,0,33.66667,4.5,0.004156,15.0,True,False,4,58,1,5
3,0,30.5,2.54,0.065214,137.8692,False,False,0,25,1,7
4,0,32.16667,9.7867,0.067051,546.5033,True,False,2,64,1,5


In [45]:
# small dataset -> cross validation
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Since there is no preprocessing, we don't strictly need a pipeline
# (one is used anyway as a best practice).
# random_state pins the forest so the reported accuracy is reproducible.
my_pipeline = make_pipeline(RandomForestClassifier(n_estimators=100, random_state=0))
cv_scores = cross_val_score(my_pipeline, X, y, cv=5, scoring='accuracy')

print("Crossing-validation accuracy: %f" % cv_scores.mean())

Crossing-validation accuracy: 0.980292


In [46]:
# Split the expenditure column by whether the applicant received a card
expenditures_cardholders = X.expenditure[y]
expenditures_noncardholders = X.expenditure[~y]

# Use '%.2f' (two decimals); '%2f' would only set a minimum field width.
# The parentheses already continue the statement, so no backslash is needed.
print('Fraction of those who did not receive a card and had no expenditure: %.2f'
      % ((expenditures_noncardholders == 0).mean()))
print('Fraction of those who received a card and had no expenditures: %.2f'
      % ((expenditures_cardholders == 0).mean()))

Fraction of those who did not receive a card and had no expenditure: 1.000000
Fraction of those who received a card and had no expenditures: 0.020528


In [48]:
# Columns suspected of leaking the target; remove them from the predictors
potential_leaks = ['expenditure', 'share', 'active', 'majorcards']
X2 = X.drop(columns=potential_leaks)

# Re-run cross-validation without the suspect predictors
cv_scores = cross_val_score(my_pipeline, X2, y, cv=5, scoring='accuracy')

print("Cross-val accuracy: %f" % cv_scores.mean())

Cross-val accuracy: 0.830919
