In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor

# Data

In [80]:
# Load the competition train and test files, using the Id column as the row index
train_data, test_data = (
    pd.read_csv(csv_name, index_col='Id') for csv_name in ('train.csv', 'test.csv')
)

In [81]:
# Stack train and test rows into one frame so the cleaning steps below are
# applied to both; rows without a SalePrice are later split off as the test set.
df = pd.concat(objs=[train_data, test_data], axis=0)

# Data Processing

In [82]:
# Remove categorical features where a single value accounts for more than 96%
# of all rows.
cat_col = df.select_dtypes(include=['object']).columns
n_rows = len(df)
overfit_cat = [
    name for name in cat_col
    # value_counts() sorts by frequency (descending) and skips NaN, so
    # position 0 is the count of the most common non-null value.
    if df[name].value_counts().iloc[0] / n_rows * 100 > 96
]

df = df.drop(columns=overfit_cat)

In [83]:
# Same near-constant screen for the non-object (numerical) columns.
# MSSubClass is left out of the screen.
non_object_frame = df.select_dtypes(exclude=['object'])
num_col = non_object_frame.drop(columns=['MSSubClass']).columns


def _top_value_pct(column):
    """Return the percentage of all rows taken by the column's most frequent non-null value."""
    return df[column].value_counts().iloc[0] / len(df) * 100


overfit_num = [column for column in num_col if _top_value_pct(column) > 96]

df = df.drop(columns=overfit_num)

In [84]:
# Report which columns were removed by the two near-constant screens above
cat_message = "Categorical Features with >96% of the same value: "
num_message = "Numerical Features with >96% of the same value: "
print(cat_message, overfit_cat)
print(num_message, overfit_num)

Categorical Features with >96% of the same value:  ['Street', 'Utilities', 'Condition2', 'RoofMatl', 'Heating']
Numerical Features with >96% of the same value:  ['LowQualFinSF', '3SsnPorch', 'PoolArea', 'MiscVal']


# Feature Engineering

In [85]:
# df['TotalLot'] = df['LotFrontage'] + df['LotArea']
# df['TotalBsmtFin'] = df['BsmtFinSF1'] + df['BsmtFinSF2']
# df['TotalSF'] = df['TotalBsmtSF'] + df['2ndFlrSF']
# df['TotalBath'] = df['FullBath'] + df['HalfBath']
# df['TotalPorch'] = df['OpenPorchSF'] + df['EnclosedPorch'] + df['ScreenPorch']

# X and y

In [86]:
# Every remaining column except the target is a candidate feature
black_list = ["SalePrice"]
feats = []
for column in df.columns:
    if column not in black_list:
        feats.append(column)

In [87]:
# Rows with a known SalePrice form the training set; rows without one form the test set
has_price = df["SalePrice"].notnull()
df_train = df[has_price].copy()
df_test = df[~has_price].copy()

X, y = df_train[feats], df_train["SalePrice"]
X_test = df_test[feats]

In [88]:
# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = []
numerical_cols = []
for cname, col_dtype in X.dtypes.items():
    if col_dtype == "object":
        # Text columns qualify only when they have fewer than 10 distinct values
        if X[cname].nunique() < 10:
            categorical_cols.append(cname)
    elif col_dtype in ['int64', 'float64']:
        # Numerical columns are taken as they are
        numerical_cols.append(cname)

# Categorical columns first, then numerical ones
my_cols = [*categorical_cols, *numerical_cols]
# df = df[my_cols].copy()

In [89]:
# Break off a validation set from the training data: 80% of the labelled rows
# are used for fitting and 20% are held out for scoring. random_state is fixed
# so the split is the same on every run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

In [90]:
# columns = list(X_train.columns)
# removed = ['GarageYrBlt','TotRmsAbvGrd','1stFlrSF','GarageCars', 'Street', 'Utilities', 'Condition2', 
#            'RoofMatl', 'Heating', 'LowQualFinSF', '3SsnPorch', 'PoolArea', 'MiscVal']
# for column in removed:
#     print(column in columns)

# Pipeline

In [91]:
# Numerical columns: fill missing entries with the column median
numerical_transformer = SimpleImputer(strategy='median')

# Categorical columns: fill missing entries with a constant placeholder, then
# one-hot encode. handle_unknown='ignore' keeps categories that were not seen
# during fitting from raising at prediction time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns to its own transformer
column_transformers = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

# Gradient-boosted regressor: many shallow trees with a small learning rate
xgb_params = {
    'reg_lambda': 0.1,
    'reg_alpha': 0.1,
    'n_estimators': 3000,
    'min_child_weight': 1,
    'max_depth': 3,
    'learning_rate': 0.01,
}
model = XGBRegressor(**xgb_params)

In [92]:
# Chain preprocessing and the regressor so both are fitted and applied together
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', model),
]
my_pipeline = Pipeline(steps=pipeline_steps)

# Fit on the training split, then predict the held-out validation rows
# (fit() returns the fitted pipeline itself, so the calls can be chained)
preds = my_pipeline.fit(X_train, y_train).predict(X_valid)

# Score the validation predictions with mean absolute error
score = mean_absolute_error(y_valid, preds)
print('MAE:', score)

MAE: 16617.0908203125


# Submission

In [93]:
# read test data file using pandas
test_data = pd.read_csv('test.csv')

# make predictions which we will submit. 
test_preds = my_pipeline.predict(X_test)

In [94]:
# Save predictions in the format used for competition scoring.
# The Ids are taken from X_test's own index (the frames were loaded with
# index_col='Id'), so each Id is paired with the prediction made for that exact
# row rather than relying on a separately read file being in the same order.
output = pd.DataFrame({'Id': X_test.index,
                       'SalePrice': test_preds})
output.to_csv('submission.csv', index=False)