In [None]:
# This is the code for the Kaggle competition: House Prices - Advanced Regression Techniques.
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error

In [2]:
train_file_path = "data/train.csv"
dataset_df = pd.read_csv(train_file_path)
print("Full train dataset shape is {}".format(dataset_df.shape))

Full train dataset shape is (1460, 81)


In [3]:
dataset_df.head(3)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500


In [4]:
dataset_df = dataset_df.drop('Id', axis=1)
dataset_df.head(3)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500


In [5]:
dataset_df.info()

<class 'pandas.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   str    
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   str    
 5   Alley          91 non-null     str    
 6   LotShape       1460 non-null   str    
 7   LandContour    1460 non-null   str    
 8   Utilities      1460 non-null   str    
 9   LotConfig      1460 non-null   str    
 10  LandSlope      1460 non-null   str    
 11  Neighborhood   1460 non-null   str    
 12  Condition1     1460 non-null   str    
 13  Condition2     1460 non-null   str    
 14  BldgType       1460 non-null   str    
 15  HouseStyle     1460 non-null   str    
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuilt      1460

In [6]:
print(dataset_df['SalePrice'].describe())

count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64


In [7]:
# We see what type of data we have in the data set. We will handle them differently (onehotcoder for categorical, imputation for numerical).
list(set(dataset_df.dtypes.tolist()))

[<StringDtype(storage='python', na_value=nan)>,
 dtype('float64'),
 dtype('int64')]

In [8]:
# We separate the target variable and the features.
X = dataset_df.drop(columns=['SalePrice'])
y = dataset_df["SalePrice"]

#We know there are only three data types: int64, float64, and string (object).
# Select categorical columns
categorical_cols = [cname for cname in X.columns if 
                    X[cname].dtype in ["object", "category", "string"]]

# Select numerical columns
numerical_cols = [cname for cname in X.columns if 
                X[cname].dtype in ['int64', 'float64']]

# We obtain the numerical and categorical dataframes.
df_num = dataset_df[numerical_cols]
df_str = dataset_df[categorical_cols]

print(df_num.shape, df_str.shape)

(1460, 36) (1460, 43)


In [9]:
# we check that there are no missing values in the target
print("Number of missing values in target variable:", pd.isnull(dataset_df['SalePrice']).sum())
# Find missing values in the dataset 
na_counts_num = df_num.isna().sum().sort_values(ascending=False)
na_counts_str = df_str.isna().sum().sort_values(ascending=False)
#Display
missing = (na_counts_num[na_counts_num > 0]
           .to_frame("missing_count(num)"))
with pd.option_context("display.max_rows", None):
    display(missing)
missing = (na_counts_str[na_counts_str > 0]
           .to_frame("missing_count(str)"))
with pd.option_context("display.max_rows", None):
    display(missing)

# We can see that there are columns where missing values is near the total number of rows (1460)
# PoolQc: means no pool, MiscFeature: means no additional information, Alley: means no alley access, Fence...
# For all the string values, we could easily use the onehotencoder to specify the status of the feature (its either they have it or not) ()
# Except electrical, which doesn't meantion what does that mean exactly.
# We could drop columns with mostly Na values for trainning, such as PoolQc, or Alley. But they can have a noticeable effect on the final sale price.
# Therefore, we keep them using onehotencoder.
# For the numerical values missing, such as LoTFrontage, we could use imputation.

numerical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median"))
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="Doesn't apply")),
    ("onehot", OneHotEncoder(
        handle_unknown="ignore"
        ))
])

# We apply the transformations to each column in a single module
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

Number of missing values in target variable: 0


Unnamed: 0,missing_count(num)
LotFrontage,259
GarageYrBlt,81
MasVnrArea,8


Unnamed: 0,missing_count(str)
PoolQC,1453
MiscFeature,1406
Alley,1369
Fence,1179
MasVnrType,872
FireplaceQu,690
GarageFinish,81
GarageQual,81
GarageType,81
GarageCond,81


In [10]:
def split_dataset(dataset, test_ratio=0.30, random_state=1):
  np.random.seed(random_state)
  test_indices = np.random.rand(len(dataset)) < test_ratio
  return dataset[~test_indices], dataset[test_indices]

train_ds_pd, valid_ds_pd = split_dataset(dataset_df)
print("{} examples in training, {} examples in testing.".format(
    len(train_ds_pd), len(valid_ds_pd)))

# We create the dataframe we use:
X_train = train_ds_pd.drop(columns=["SalePrice"])
X_valid = valid_ds_pd.drop(columns=["SalePrice"])
y_train = train_ds_pd["SalePrice"]
y_valid = valid_ds_pd["SalePrice"]

# We now apply preprocessor to create usable datasets
# First fit it for X_train
preprocessor.fit(X_train)
# Now me transform both data set
X_train_prepro = preprocessor.transform(X_train)
X_valid_prepro = preprocessor.transform(X_valid)

1031 examples in training, 429 examples in testing.


In [11]:
# In order to determina whether we need to drop low correlated columns to improve the speed, lets first run it without dropping any feature.
# We select model. Here we compare two models, gradient boosting vs random forest
import xgboost as xgb
print(xgb.__version__)

from xgboost import XGBRegressor, callback

my_model_XGB = XGBRegressor(n_estimators=1000, learning_rate=0.05, random_state=0, n_jobs=-1, early_stopping_rounds=100)

my_model_XGB.fit(
    X_train_prepro, y_train,
    eval_set=[(X_valid_prepro, y_valid)],
    #callbacks=[xgb.callback.EarlyStopping(rounds=5, save_best=True)], #This line doesn't work
    verbose=False
)
# Get predictions
predictions_XGB = my_model_XGB.predict(X_valid_prepro) # Your code here
# Calculate MAE
mae_XGB = mean_absolute_error(predictions_XGB, y_valid) # Your code here
# Uncomment to print MAE
print("Mean Absolute Error for XGB model:" , mae_XGB)

# Now, with random forest:
from sklearn.ensemble import RandomForestRegressor
# Now, we can use cross validation to obtain MAE 
# The preprocessor is included in the pipeline

def get_score(n_estimators, max_depth, min_samples_leaf, min_samples_split, X=X, y=y):
    """Return the average MAE over 3 CV folds of random forest model.
    
    Keyword argument:
    n_estimators -- the number of trees in the forest
    """
    my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators= n_estimators, random_state=0, max_depth=max_depth, min_samples_leaf=min_samples_leaf, min_samples_split=min_samples_split))
    ])
        
    get_scores = -1 * cross_val_score(my_pipeline, X, y,
                              cv=3,
                              scoring='neg_mean_absolute_error')
    return get_scores.mean()
    
    pass

n_estimators = 100
results_MAE = get_score(n_estimators, None, 1, 2, X, y)
print("MAE of Random Forest model (n_estimators=100): {:.0f}".format(results_MAE))

n_estimators2 = 1000
results_MAE = get_score(n_estimators2, None, 1, 2, X, y)
print("MAE of Random Forest model (n_estimators=1000): {:.0f}".format(results_MAE))

print("We can see that increasing the number of trees in the random forest model from 100 to 1000 has a significant impact on the MAE, reducing it from {:.0f} to {:.0f}.".format(results_MAE, results_MAE))
print("However, XGBoost model has a better MAE than the random forest model, even with 1000 trees. This suggests that XGBoost may be a more effective algorithm for this particular dataset and problem.")

3.1.3
Mean Absolute Error for XGB model: 16784.248046875
MAE of Random Forest model (n_estimators=100): 18110
MAE of Random Forest model (n_estimators=1000): 17912
We can see that increasing the number of trees in the random forest model from 100 to 1000 has a significant impact on the MAE, reducing it from 17912 to 17912.
However, XGBoost model has a better MAE than the random forest model, even with 1000 trees. This suggests that XGBoost may be a more effective algorithm for this particular dataset and problem.


In [12]:
# Now we use test data in order to predict the sale price for the test set and submit it to Kaggle.
test_file_path = "data/test.csv"
test_ds_pd = pd.read_csv(test_file_path)
print("Full test dataset shape is {}".format(test_ds_pd.shape))
ids = test_ds_pd.pop('Id')


# We now preprocess it
X_test = test_ds_pd
X_test_prepro = preprocessor.transform(X_test)

#We now get the predictions
predictions_XGB_final = my_model_XGB.predict(X_test_prepro)
output = pd.DataFrame({'Id': ids,
                       'SalePrice': predictions_XGB_final.squeeze()})

output.head()


Full test dataset shape is (1459, 80)


Unnamed: 0,Id,SalePrice
0,1461,120685.507812
1,1462,168843.65625
2,1463,179406.1875
3,1464,191955.90625
4,1465,199249.328125
