In [1]:
#Relevant libraries
import pandas as pd
from pathlib import Path


# Data Extraction
# WORKING_DIRECTORY records where the notebook was launched from.
# DATA_DIRECTORY is the single place that says where the CSV files live;
# edit it here (and only here) when the data is moved.
WORKING_DIRECTORY = Path.cwd()
DATA_DIRECTORY = "D:/PROJECTS/house-prices-advanced-regression-techniques/data"

# Build both file paths from DATA_DIRECTORY instead of repeating the literal,
# so the constant above and the files actually read can never drift apart.
data_dir = Path(DATA_DIRECTORY)
train_df = pd.read_csv(data_dir / "train.csv")
test_df = pd.read_csv(data_dir / "test.csv")

In [2]:
# Drop the 'Id' column: it is specific to each row and is not expected to
# influence house prices, so it should not be offered to the model as a feature.
# errors='ignore' makes this cell safe to re-run: train_df/test_df are
# overwritten in place, so on a second run 'Id' is already gone and a plain
# drop would raise KeyError.
train_df = train_df.drop(columns=['Id'], errors='ignore')
test_df = test_df.drop(columns=['Id'], errors='ignore')

# Generate the X_train, y_train and X_test subsets:
#   X_train_full - every training column except the target 'SalePrice'
#   y_train_full - the target 'SalePrice'
#   X_test_full  - an independent copy of the test frame
X_train_full = train_df.drop(columns=['SalePrice'])
y_train_full = train_df['SalePrice']
X_test_full = test_df.copy()

In [3]:
# Categorical variables are identified as the columns stored with dtype 'object'.

# Training predictors: boolean mask over the columns, then the names it selects
s_train = X_train_full.dtypes.eq('object')
train_object_cols = s_train.index[s_train].tolist()

# Test predictors: same procedure
s_test = X_test_full.dtypes.eq('object')
test_object_cols = s_test.index[s_test].tolist()

Categorical Variable Processing: One Hot Encoding

In [4]:
from sklearn.preprocessing import OneHotEncoder

# Apply one-hot encoder to each column with categorical data.
# The dense-output keyword was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old spelling was removed in 1.4, so try the new
# keyword first and fall back to the old one on older releases.
try:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Fit on the training categoricals only, then encode the test set using the
# SAME column list (and order) the encoder was fitted on. Using a separately
# derived test column list would break or misalign the encoding whenever a
# column's dtype differs between the two files.
OH_train_values = OH_encoder.fit_transform(X_train_full[train_object_cols])
OH_test_values = OH_encoder.transform(X_test_full[train_object_cols])

# Give the encoded columns descriptive string names. Leaving the default
# integer labels would mix int and str column names after the concat below,
# which scikit-learn 1.2+ rejects when a DataFrame is passed to fit().
if hasattr(OH_encoder, 'get_feature_names_out'):
    OH_feature_names = OH_encoder.get_feature_names_out(train_object_cols)
else:
    # Older scikit-learn releases only provide the previous method name
    OH_feature_names = OH_encoder.get_feature_names(train_object_cols)

# One-hot encoding returns plain arrays; restore the row index and column names
OH_cols_train = pd.DataFrame(OH_train_values, index=X_train_full.index, columns=OH_feature_names)
OH_cols_test = pd.DataFrame(OH_test_values, index=X_test_full.index, columns=OH_feature_names)

# Remove categorical columns (will replace with one-hot encoding).
# The same list is dropped from both frames so their columns stay aligned.
num_X_train = X_train_full.drop(train_object_cols, axis=1)
num_X_test = X_test_full.drop(train_object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_test = pd.concat([num_X_test, OH_cols_test], axis=1)

Missing Values: Imputation

In [5]:
from sklearn.impute import SimpleImputer

# Imputation: missing values are replaced using SimpleImputer's 'constant'
# strategy (no fill_value is passed, so the library default applies).
my_imputer = SimpleImputer(strategy='constant')

# The imputer is fitted on the training features and then applied unchanged to
# the test features. Its output carries no column names, so the original labels
# are supplied while rebuilding each DataFrame.
imputed_X_train = pd.DataFrame(
    my_imputer.fit_transform(OH_X_train),
    columns=OH_X_train.columns,
)
imputed_X_test = pd.DataFrame(
    my_imputer.transform(OH_X_test),
    columns=OH_X_test.columns,
)



Model building and Testing

In [7]:
from sklearn import linear_model
# Linear regression model
model = linear_model.LinearRegression()

# Fit on the encoded + imputed training features
model.fit(imputed_X_train, y_train_full)

# Predict for the encoded + imputed test features
preds = model.predict(imputed_X_test)

# Score the fitted model and report it as a percentage with two decimals.
# NOTE(review): the score is computed on the same rows the model was fitted on,
# so it is an in-sample figure, not a validation result - consider a hold-out
# split or cross-validation before relying on it.
# NOTE(review): `acc_random_forest` is a misnomer (the model is LinearRegression
# and score() is documented to return R^2, not accuracy); the name is kept so
# any later cell that reads it keeps working.
model_assessment = model.score(imputed_X_train, y_train_full)
acc_random_forest = round(model_assessment * 100, 2)
print(acc_random_forest, "%")



93.32 %


