Code based on Intermediate Machine Learning course

Imports:

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, r2_score
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.impute import SimpleImputer

Initialization

In [2]:
# Load the training and test tables, using the 'Id' column as the row index
X = pd.read_csv('train.csv', index_col='Id')
X_test_full = pd.read_csv('test.csv', index_col='Id')

# Discard rows whose target is missing, then split the target off the predictors
X = X.dropna(axis=0, subset=['SalePrice'])
y = X['SalePrice']
X = X.drop(columns=['SalePrice'])

# Possible data leakage: remove the sale-related columns from both tables
leaky_cols = ['MoSold', 'YrSold', 'SaleType', 'SaleCondition']
X = X.drop(columns=leaky_cols)
X_test_full = X_test_full.drop(columns=leaky_cols)

# Hold out 20% of the training rows for validation (fixed seed for repeatability)
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

Transforming columns

In [None]:
# Investigating cardinality
# Columns of the training split stored with the "object" dtype (categorical data)
object_cols = [
    col for col, dtype in X_train_full.dtypes.items() if dtype == "object"
]

# Number of distinct values found in each of those columns
object_nunique = [X_train_full[col].nunique() for col in object_cols]
d = dict(zip(object_cols, object_nunique))

# Show the (column, distinct-count) pairs from lowest to highest count
print(sorted(d.items(), key=lambda pair: pair[1]))

# Keep only the categorical columns with fewer than 10 distinct values
low_cardinality_cols = [col for col in object_cols if d[col] < 10]

# Numeric columns are those typed int64 or float64
numeric_cols = [
    col for col, dtype in X_train_full.dtypes.items()
    if dtype in ('int64', 'float64')
]

# Restrict every split to the selected columns, copying so the originals stay intact
my_cols = low_cardinality_cols + numeric_cols
X_train, X_valid, X_test = (
    frame[my_cols].copy() for frame in (X_train_full, X_valid_full, X_test_full)
)

# Report the training columns that still contain missing values
missing_val_count_by_column = X_train.isnull().sum()
print(missing_val_count_by_column.loc[missing_val_count_by_column > 0])

One Hot Encode and Simple Imputer

In [None]:
# One-hot encode the categorical columns of each split independently
X_train = pd.get_dummies(X_train)
X_valid = pd.get_dummies(X_valid)
X_test = pd.get_dummies(X_test)

# Put the validation and test frames on exactly the training columns.
# A left join keeps the training column set: dummy columns that only exist in
# validation/test are dropped, and training columns absent from them are added
# as missing values (which the imputer below then fills).
X_train, X_valid = X_train.align(X_valid, join='left', axis=1)
X_train, X_test = X_train.align(X_test, join='left', axis=1)

# Median imputation: the medians are learned from the training split only and
# then applied unchanged to the validation and test splits.
imput = SimpleImputer(strategy='median')

# The imputer returns bare arrays. Rebuild the frames with the original index
# and column labels so each row keeps its 'Id' and stays label-aligned with
# y_train / y_valid (a plain pd.DataFrame(array) would reset it to 0..n-1).
imputed_X_train = pd.DataFrame(imput.fit_transform(X_train),
                               index=X_train.index, columns=X_train.columns)
imputed_X_valid = pd.DataFrame(imput.transform(X_valid),
                               index=X_valid.index, columns=X_valid.columns)
imputed_X_test = pd.DataFrame(imput.transform(X_test),
                              index=X_test.index, columns=X_test.columns)

# Checking again if there are missing values (expected output: an empty Series)
missing_val_count_by_column = (imputed_X_train.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])

Fitting the XGBRegressor model

In [None]:
# Model: a large tree budget with early stopping, used only to discover how
# many boosting rounds are actually useful at this learning rate.
model_test = XGBRegressor(n_estimators=1000, learning_rate=0.05, early_stopping_rounds=5, n_jobs=4)

# Fit the model; the validation split is the early-stopping monitor, so
# training halts once its score has not improved for 5 consecutive rounds.
model_test.fit(imputed_X_train, y_train,
               eval_set=[(imputed_X_valid, y_valid)],
               verbose=False)

# best_iteration is a 0-based round index, while n_estimators (where this value
# is used afterwards) is a tree count, so the number of trees is one more.
best_estimators = model_test.best_iteration + 1
print(best_estimators)

Validation models


In [None]:
# Candidate regressors compared on the validation split.
# The order is significant: models[0] must remain the XGBRegressor because the
# later scoring, cross-validation and reporting cells refer to it by position.
models = [XGBRegressor(n_estimators=best_estimators, learning_rate=0.05, n_jobs=4)]

# Random forests of increasing size. They are seeded (same seed as the
# train/validation split) so their scores are reproducible from run to run.
models += [
    RandomForestRegressor(n_estimators=n_trees, random_state=0)
    for n_trees in (50, 150, 300, 450, 550)
]
models.append(RandomForestRegressor(max_depth=7, random_state=0))

# Baselines with default hyper-parameters
models += [LinearRegression(), KNeighborsRegressor(), SVR()]

print(models)

Mean Absolute Error

In [None]:
# Fit every candidate on the training split and report its mean absolute error
# on the validation split. Models are numbered from 1 in the printed output.
for i, model in enumerate(models):
    model.fit(imputed_X_train, y_train)
    prediction = model.predict(imputed_X_valid)
    # scikit-learn metrics are declared as (y_true, y_pred); keep that order so
    # the call stays correct if it is ever swapped for a non-symmetric metric.
    mae = mean_absolute_error(y_valid, prediction)
    print(f'Mean Absolute Error from model {i+1}: {mae}')

r2_score from the best model

In [None]:
# Score the first model in the list (the XGBRegressor) with R^2 on the
# validation split. It must already have been fitted before this cell runs.
xgb_model = models[0]
print(xgb_model)
pred = xgb_model.predict(imputed_X_valid)
xgb_r2 = r2_score(y_valid, pred)
print(f'r2_score from XGBRegressor model: {xgb_r2}')

Cross Validation


In [None]:
# 5-fold cross-validated MAE of the first model on the training split only.
# scikit-learn reports this metric negated, so the sign is flipped back.
score1 = -cross_val_score(
    models[0], imputed_X_train, y_train,
    cv=5, scoring='neg_mean_absolute_error',
)
print(f'Model with splitted train dataset: {score1.mean()}')

# Same evaluation after stacking the training and validation rows together
train_complete = pd.concat([imputed_X_train, imputed_X_valid])
y_complete = pd.concat([y_train, y_valid])
score2 = -cross_val_score(
    models[0], train_complete, y_complete,
    cv=5, scoring='neg_mean_absolute_error',
)
print(f'Model with full train dataset: {score2.mean()}')

Final Prediction

In [16]:
# Train a fresh XGBRegressor on all labelled rows (training + validation),
# then predict the sale price for every row of the test set.
final_params = dict(n_estimators=best_estimators, learning_rate=0.05, n_jobs=4)
final_model = XGBRegressor(**final_params)
final_model.fit(train_complete, y_complete)
preds_test = final_model.predict(imputed_X_test)

Saving results


In [17]:
# Save test predictions to file
output = pd.DataFrame({'Id': X_test.index,
                       'SalePrice': preds_test})
output.to_csv('submission.csv', index=False)