In [122]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler

In [123]:
# Read the raw house-price table, then separate the target column (SalePrice)
# from the predictor columns.
df = pd.read_csv('A2_Data/House_Price.csv')
y = df['SalePrice']
X = df.drop(columns='SalePrice')

# Part 2.2.1: Preprocessing

In [124]:
# Column groups are chosen by dtype on the full feature table: 64-bit numeric
# columns go through the numeric pipeline, object columns through the
# categorical one.
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Hold out 30% of the rows for testing; the fixed seed (309) makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=309
)

# DATA PREPROCESSING -------------
# Numeric columns: fill missing values with the median, then min-max scale.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()),
])

# Categorical columns: missing values become their own 'missing' level, then
# one-hot encode. Levels not seen during fitting are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its pipeline. 'num' is listed first, so the numeric
# columns come first in the transformed output.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Fit on the training rows only, then apply the fitted transform to the test rows.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# The target is min-max scaled as well. The scaler is fitted on the training
# target and reused for the test target; it expects a 2-D column vector.
scaler_y = MinMaxScaler()
y_train_processed = scaler_y.fit_transform(y_train.to_numpy().reshape(-1, 1))
y_test_processed = scaler_y.transform(y_test.to_numpy().reshape(-1, 1))

### Saving Preprocessed Data


In [125]:

# Column labels for the transformed matrix: the numeric columns keep their
# original names, followed by the names generated by the fitted one-hot encoder.
onehot_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
numeric_feature_names = numeric_features.tolist()
categorical_feature_names = onehot_encoder.get_feature_names_out(categorical_features).tolist()
all_feature_names = numeric_feature_names + categorical_feature_names


def _processed_frame(matrix, row_index):
    """Wrap a transformed feature matrix in a labelled DataFrame.

    A sparse matrix (anything exposing ``toarray``) is densified first; a dense
    array is used as-is. Rows are labelled with ``row_index`` and columns with
    ``all_feature_names``.
    """
    dense = matrix.toarray() if hasattr(matrix, 'toarray') else matrix
    return pd.DataFrame(dense, columns=all_feature_names, index=row_index)


X_train_processed_df = _processed_frame(X_train_processed, X_train.index)
X_test_processed_df = _processed_frame(X_test_processed, X_test.index)

# Re-attach the target by row index. Note this is the original, unscaled
# SalePrice (y_train / y_test), not the min-max scaled y_*_processed arrays.
train_processed = X_train_processed_df.join(y_train)
test_processed = X_test_processed_df.join(y_test)

# Write both splits to disk; the row index is not written.
train_processed.to_csv('train_processed.csv', index=False)
test_processed.to_csv('test_processed.csv', index=False)

print("Preprocessing completed. (Part 2.2.1)")


Preprocessing completed. (Part 2.2.1)


# Part 2.2.2: Dimensionality Reduction

### PCA

In [126]:
# PCA is run on a dense array. ColumnTransformer only returns a sparse matrix
# when the stacked output is sparse enough (its `sparse_threshold` setting);
# otherwise it already returns a dense ndarray, which has no `.toarray()`.
# Densify only when needed, matching the guard used when saving the
# preprocessed data.
X_train_dense = (
    X_train_processed.toarray()
    if hasattr(X_train_processed, 'toarray')
    else X_train_processed
)
X_test_dense = (
    X_test_processed.toarray()
    if hasattr(X_test_processed, 'toarray')
    else X_test_processed
)

# PCA ---------------------------------------
# A fractional n_components keeps as many leading components as are needed to
# explain that share (here 95%) of the training variance. Adjust as needed.
pca = PCA(n_components=0.95)
# Fit the projection on the training rows only, then reuse it for the test rows.
X_train_pca = pca.fit_transform(X_train_dense)
X_test_pca = pca.transform(X_test_dense)

### RFE

In [127]:
# RFE with 5 features ----------------------
# Recursive feature elimination over the PCA components, using a plain linear
# regression as the ranking estimator. Note the inputs are principal
# components, not the original columns.
linear_model = LinearRegression()
rfe = RFE(linear_model, n_features_to_select=5)

# Fit the selector on the training split, then apply the same selection to both splits.
rfe.fit(X_train_pca, y_train_processed)
X_train_rfe = rfe.transform(X_train_pca)
X_test_rfe = rfe.transform(X_test_pca)

### Saving to File

In [128]:

# RFE was run on PCA output, so the surviving columns are principal components.
# Label them PC1, PC2, ... from their (0-based) position in the PCA matrix.
selected_feature_indices = rfe.get_support(indices=True)
selected_feature_names = ["PC" + str(idx + 1) for idx in selected_feature_indices]


def _selected_frame(values, row_index):
    """Wrap an RFE-reduced matrix in a DataFrame labelled with the selected PC names."""
    return pd.DataFrame(values, columns=selected_feature_names, index=row_index)


X_train_final_df = _selected_frame(X_train_rfe, X_train.index)
X_test_final_df = _selected_frame(X_test_rfe, X_test.index)

# Re-attach the target by row index (the original, unscaled SalePrice series).
train_final = X_train_final_df.join(y_train)
test_final = X_test_final_df.join(y_test)

# Write both splits to disk; the row index is not written.
train_final.to_csv('train_final.csv', index=False)
test_final.to_csv('test_final.csv', index=False)

print("Dimensionality Reduction and Feature Selection completed. (Part 2.2.2)")

Dimensionality Reduction and Feature Selection completed. (Part 2.2.2)


# Part 2.2.3: Data Mining


In [129]:
# Extra estimators and the error metric used by the modelling cells.
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

# The three regressors compared in this part. The same instances are re-fitted
# on each feature set, so each one only holds its most recent fit.
rf_reg = RandomForestRegressor(random_state=309)  # seeded so repeated runs match
ridge_reg = Ridge(alpha=0.5)                      # L2 penalty strength of 0.5
lin_reg = LinearRegression()                      # unregularised baseline

### Training models on PCA

In [130]:
# --- fit: each model is trained on the PCA-projected training features ---
lin_reg.fit(X_train_pca, y_train_processed)
ridge_reg.fit(X_train_pca, y_train_processed)
rf_reg.fit(X_train_pca, y_train_processed.ravel())  # the forest is given the flattened target

# --- predict: train and test predictions for each model ---
y_train_pred_pca_lin = lin_reg.predict(X_train_pca)
y_test_pred_pca_lin = lin_reg.predict(X_test_pca)
y_train_pred_pca_ridge = ridge_reg.predict(X_train_pca)
y_test_pred_pca_ridge = ridge_reg.predict(X_test_pca)
y_train_pred_pca_rf = rf_reg.predict(X_train_pca)
y_test_pred_pca_rf = rf_reg.predict(X_test_pca)

# --- score: MSE against the min-max scaled targets, so errors are in scaled units ---
mse_train_pca_lin = mean_squared_error(y_train_processed, y_train_pred_pca_lin)
mse_test_pca_lin = mean_squared_error(y_test_processed, y_test_pred_pca_lin)
mse_train_pca_ridge = mean_squared_error(y_train_processed, y_train_pred_pca_ridge)
mse_test_pca_ridge = mean_squared_error(y_test_processed, y_test_pred_pca_ridge)
mse_train_pca_rf = mean_squared_error(y_train_processed, y_train_pred_pca_rf)
mse_test_pca_rf = mean_squared_error(y_test_processed, y_test_pred_pca_rf)

# --- report ---
print(f"Linear Regression on PCA - Train MSE: {mse_train_pca_lin:.6f}, Test MSE: {mse_test_pca_lin:.6f}")
print(f"Ridge Regression on PCA - Train MSE: {mse_train_pca_ridge:.6f}, Test MSE: {mse_test_pca_ridge:.6f}")
print(f"Random Forest Regression on PCA - Train MSE: {mse_train_pca_rf:.6f}, Test MSE: {mse_test_pca_rf:.6f}")

Linear Regression on PCA - Train MSE: 0.001790, Test MSE: 0.001830
Ridge Regression on PCA - Train MSE: 0.001790, Test MSE: 0.001825
Random Forest Regression on PCA - Train MSE: 0.000494, Test MSE: 0.003086


### Training models on RFE

In [131]:
# --- fit: the same three estimators are re-fitted on the RFE-selected components ---
# (this replaces the fits they held from the PCA feature set)
lin_reg.fit(X_train_rfe, y_train_processed)
ridge_reg.fit(X_train_rfe, y_train_processed)
rf_reg.fit(X_train_rfe, y_train_processed.ravel())  # the forest is given the flattened target

# --- predict: train and test predictions for each model ---
y_train_pred_rfe_lin = lin_reg.predict(X_train_rfe)
y_test_pred_rfe_lin = lin_reg.predict(X_test_rfe)
y_train_pred_rfe_ridge = ridge_reg.predict(X_train_rfe)
y_test_pred_rfe_ridge = ridge_reg.predict(X_test_rfe)
y_train_pred_rfe_rf = rf_reg.predict(X_train_rfe)
y_test_pred_rfe_rf = rf_reg.predict(X_test_rfe)

# --- score: MSE against the min-max scaled targets, so errors are in scaled units ---
mse_train_rfe_lin = mean_squared_error(y_train_processed, y_train_pred_rfe_lin)
mse_test_rfe_lin = mean_squared_error(y_test_processed, y_test_pred_rfe_lin)
mse_train_rfe_ridge = mean_squared_error(y_train_processed, y_train_pred_rfe_ridge)
mse_test_rfe_ridge = mean_squared_error(y_test_processed, y_test_pred_rfe_ridge)
mse_train_rfe_rf = mean_squared_error(y_train_processed, y_train_pred_rfe_rf)
mse_test_rfe_rf = mean_squared_error(y_test_processed, y_test_pred_rfe_rf)

# --- report ---
print(f"Linear Regression on RFE - Train MSE: {mse_train_rfe_lin:.6f}, Test MSE: {mse_test_rfe_lin:.6f}")
print(f"Ridge Regression on RFE - Train MSE: {mse_train_rfe_ridge:.6f}, Test MSE: {mse_test_rfe_ridge:.6f}")
print(f"Random Forest Regression on RFE - Train MSE: {mse_train_rfe_rf:.6f}, Test MSE: {mse_test_rfe_rf:.6f}")

Linear Regression on RFE - Train MSE: 0.004304, Test MSE: 0.003931
Ridge Regression on RFE - Train MSE: 0.004304, Test MSE: 0.003930
Random Forest Regression on RFE - Train MSE: 0.000559, Test MSE: 0.003339


# Part 2.2.3: Further Analyses 

In [132]:
# Two additional regressors for the further analysis.
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

# Support vector regression with library-default settings.
svr_reg = SVR()
# Single decision tree, seeded so repeated runs match.
dt_reg = DecisionTreeRegressor(random_state=309)


### Training models on PCA

In [133]:
# --- fit: both extra models are trained on the PCA-projected training features ---
dt_reg.fit(X_train_pca, y_train_processed)
svr_reg.fit(X_train_pca, y_train_processed.ravel())  # SVR is given the flattened target

# --- predict: train and test predictions for each model ---
y_train_pred_pca_dt = dt_reg.predict(X_train_pca)
y_test_pred_pca_dt = dt_reg.predict(X_test_pca)
y_train_pred_pca_svr = svr_reg.predict(X_train_pca)
y_test_pred_pca_svr = svr_reg.predict(X_test_pca)

# --- score: MSE against the min-max scaled targets, so errors are in scaled units ---
mse_train_pca_dt = mean_squared_error(y_train_processed, y_train_pred_pca_dt)
mse_test_pca_dt = mean_squared_error(y_test_processed, y_test_pred_pca_dt)
mse_train_pca_svr = mean_squared_error(y_train_processed, y_train_pred_pca_svr)
mse_test_pca_svr = mean_squared_error(y_test_processed, y_test_pred_pca_svr)

# --- report ---
print(f"Decision Tree Regression on PCA - Train MSE: {mse_train_pca_dt:.6f}, Test MSE: {mse_test_pca_dt:.6f}")
print(f"Support Vector Regression on PCA - Train MSE: {mse_train_pca_svr:.6f}, Test MSE: {mse_test_pca_svr:.6f}")

Decision Tree Regression on PCA - Train MSE: 0.000000, Test MSE: 0.007691
Support Vector Regression on PCA - Train MSE: 0.003762, Test MSE: 0.004433


### Training models on RFE

In [134]:
# --- fit: both extra models are re-fitted on the RFE-selected components ---
# (this replaces the fits they held from the PCA feature set)
dt_reg.fit(X_train_rfe, y_train_processed.ravel())
svr_reg.fit(X_train_rfe, y_train_processed.ravel())

# --- predict: train and test predictions for each model ---
y_train_pred_rfe_dt = dt_reg.predict(X_train_rfe)
y_test_pred_rfe_dt = dt_reg.predict(X_test_rfe)
y_train_pred_rfe_svr = svr_reg.predict(X_train_rfe)
y_test_pred_rfe_svr = svr_reg.predict(X_test_rfe)

# --- score: MSE against the min-max scaled targets, so errors are in scaled units ---
mse_train_rfe_dt = mean_squared_error(y_train_processed, y_train_pred_rfe_dt)
mse_test_rfe_dt = mean_squared_error(y_test_processed, y_test_pred_rfe_dt)
mse_train_rfe_svr = mean_squared_error(y_train_processed, y_train_pred_rfe_svr)
mse_test_rfe_svr = mean_squared_error(y_test_processed, y_test_pred_rfe_svr)

# --- report ---
print(f"Decision Tree Regression on RFE - Train MSE: {mse_train_rfe_dt:.6f}, Test MSE: {mse_test_rfe_dt:.6f}")
print(f"Support Vector Regression on RFE - Train MSE: {mse_train_rfe_svr:.6f}, Test MSE: {mse_test_rfe_svr:.6f}")


Decision Tree Regression on RFE - Train MSE: 0.000000, Test MSE: 0.005993
Support Vector Regression on RFE - Train MSE: 0.003572, Test MSE: 0.003877
