In [11]:
import tensorflow as tf
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor



print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [162]:
df_train = pd.read_csv('../kagle_data/housing/train.csv', index_col='Id')
df_test = pd.read_csv('../kagle_data/housing/test.csv', index_col='Id')



In [31]:
clean_train = df_train.copy()
clean_train[217:220]

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
218,70,RM,57.0,9906,Pave,Grvl,Reg,Lvl,AllPub,Inside,...,0,,,,0,9,2006,WD,Family,107000
219,50,RL,,15660,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,5,2008,WD,Normal,311500
220,120,RL,43.0,3010,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,3,2006,New,Partial,167240


In [None]:
clean_train.info()

### Categorical features converted

In [None]:
clean_train.fillna({
    'PoolQC': 'None',
    'Fence': 'None',
    'Alley': 'None',
    'FireplaceQu': 'None',
    'GarageQual': 'None',
    'GarageCond': 'None',
    'GarageFinish': 'None',
    'GarageYrBlt': 0,
    'BsmtQual': 'None',
    'BsmtCond': 'None',
    'GarageType': 'None',
    'BsmtFinType1': 'None',
    'BsmtFinType2': 'None',
    'BsmtExposure': 'None',
    'MasVnrType': 'None',
}, inplace=True)

In [None]:
clean_train['Electrical'].fillna(clean_train['Electrical'].mode()[0], inplace=True)

In [None]:
missing = clean_train.isnull().sum().sort_values(ascending=False)
missing_percent = (missing / len(clean_train)) * 100
print(missing_percent[missing_percent > 0 ])

In [None]:
not_null_masvnr = clean_train[clean_train['MasVnrArea'].notnull()]
masvnr_nn = not_null_masvnr[['MasVnrArea','MasVnrType']]
masvnr_nn = pd.get_dummies(masvnr_nn, drop_first=True)

In [None]:
correlation_matrix = masvnr_nn.corr().round(3)

plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Feature Correlation Matrix to MasVnrArea')
plt.show()

In [None]:
clean_train[clean_train['MasVnrArea'].isnull()]


In [None]:
median_masvnr = not_null_masvnr.groupby('MasVnrType')['MasVnrArea'].median()
clean_train['MasVnrArea'] = clean_train.apply(
    lambda row: median_masvnr[row['MasVnrType']] if pd.isnull(row['MasVnrArea']) else row['MasVnrArea'],
    axis=1
)

In [None]:
import pandas as pd
poolqc_mapping = {
    'Ex': 5,  # Excellent
    'Gd': 4,  # Good
    'TA': 3,  # Typical/Average
    'Fa': 2,  # Fair
    'None': 1   # No Pool
}
clean_train['PoolQC_Num'] = clean_train['PoolQC'].map(poolqc_mapping)
correlation = clean_train[['PoolQC_Num', 'SalePrice']].corr().iloc[0, 1]
print(f"Correlation between PoolQC and SalePrice: {correlation}")

In [None]:
clean_train

In [None]:
fireplacequ_mapping ={
    'Ex': 5,
    'Gd': 4,
    'TA': 3,
    'Fa': 2,
    'Po': 1,
    'None': 0
}

clean_train['FireplaceQu_Num'] = clean_train['FireplaceQu'].map(fireplacequ_mapping)
correlation = clean_train[['FireplaceQu_Num', 'SalePrice']].corr().iloc[0, 1]
print(correlation)

In [None]:
clean_train['FireplaceQu_Num']

In [None]:
missing_lot_rows = clean_train[clean_train['LotFrontage'].isnull()]
missing_lot_rows

In [None]:
not_null_frontage = clean_train[clean_train['LotFrontage'].notnull()]
frontage_nn = not_null_frontage[['LotFrontage', 'LotArea', 'OverallQual', 'GrLivArea', 'GarageArea', 'LotShape', 'MSSubClass']]
frontage_nn = pd.get_dummies(frontage_nn, drop_first=True)

In [None]:
# correlation_matrix = frontage_nn.corr().round(3)
#
# plt.figure(figsize=(12, 8))
# sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
# plt.title('Feature Correlation Matrix to LotFrontage')
# plt.show()

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

impute_features = clean_train[['LotFrontage', 'LotArea', 'OverallQual', 'GrLivArea', 'GarageArea', 'LotShape', 'MSSubClass']]
impute_features = pd.get_dummies(impute_features, drop_first=True)

imputer = IterativeImputer(random_state=42)
clean_train['LotFrontage'] = imputer.fit_transform(impute_features)[:, 0]

In [None]:
clean_train['LotFrontage'] = np.log1p(clean_train['LotFrontage'])

In [None]:
check = clean_train['MiscFeature'].fillna('None')
check_dummies = pd.get_dummies(check, drop_first=True)

print(check.value_counts(normalize=True))

In [None]:
plt.figure(figsize=(8, 5))
sns.barplot(x=check, y=clean_train['SalePrice'])
plt.title('Average SalePrice by MiscFeature')
plt.xticks(rotation=45)
plt.show()

In [None]:
misc_feature_mapping = {
    'None': 1,
    'Shed': 1,
    'Gar2': 1,
    'Othr': 2,
    'TenC': 3
}

clean_train['MiscFeature_Cat'] = clean_train['MiscFeature'].map(misc_feature_mapping)

clean_train['MiscFeature_Cat'] = clean_train['MiscFeature_Cat'].astype('category')

clean_train.drop(['MiscFeature'], axis=1, inplace=True)

print(clean_train['MiscFeature_Cat'].value_counts())

In [None]:
clean_train = clean_train.drop(['MiscFeature_Cat'], axis=1)

In [None]:
clean_train.isnull().sum().sort_values(ascending=False)

In [None]:
clean_train.fillna({'GarageCond': 'None'})

In [None]:
clean_train['TotalSF'] = clean_train['1stFlrSF'] + clean_train['2ndFlrSF'] + clean_train['TotalBsmtSF']
clean_train['Bathrooms'] = clean_train['FullBath'] + (0.5 * clean_train['HalfBath'])


In [None]:
clean_train = clean_train.drop(['PoolQC_Num', 'PoolQC', 'PoolArea'], axis=1)

In [None]:
from sklearn import preprocessing
test = clean_train.copy()
label_encoder = LabelEncoder()
categorical_cols = test.select_dtypes(include=['object']).columns
for col in categorical_cols:
    test[col] = label_encoder.fit_transform(test[col])

In [None]:
X_clean_train = test.copy()
y_clean_train = X_clean_train.pop("SalePrice")

# label encoding for categorical features
for colname in X_clean_train.select_dtypes('object'):
    X_clean_train[colname], _ = X_clean_train[colname].factorize()

discrete_features = X_clean_train.dtypes == int

In [None]:
from sklearn.feature_selection import mutual_info_regression

def make_mi_scores(X, y, discrete_features):
    mi_score = mutual_info_regression(X, y, discrete_features=discrete_features)
    mi_score = pd.Series(mi_score, name='MI scores', index=X.columns)
    mi_score = mi_score.sort_values(ascending=False)
    return mi_score

In [None]:
mi_scores = make_mi_scores(X_clean_train, y_clean_train, discrete_features)
mi_scores

In [None]:
correlation_matrix = test.corr()

saleprice_correlation = correlation_matrix['SalePrice'].sort_values(ascending=False)

caped_saleprice_correlation = (saleprice_correlation < 0.2) & (saleprice_correlation > -0.2)
print(saleprice_correlation[caped_saleprice_correlation])

In [None]:
low_mi_columns = mi_scores[mi_scores < 0.1].index
low_corr_columns = saleprice_correlation[(saleprice_correlation < 0.2) & (saleprice_correlation > -0.2)].index

test = test.drop(columns=low_corr_columns)

In [None]:

X_train = test.drop(['SalePrice'], axis=1)
y_train = test['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [None]:
y_train_log = np.log(y_train)
y_test_log = np.log(y_test)

In [None]:
# lr_model = LinearRegression()
# lr_model.fit(X_train, y_train_log)
#
# y_pred_lr = lr_model.predict(X_test)
# rmse_lr = np.sqrt(mean_squared_error(y_test_log, y_pred_lr))
# comparison_rf = pd.DataFrame({
#     'Actual_SalePrice': (y_test),
#     'Predicted_SalePrice_RF': np.exp(y_pred_lr)
# })
# print(comparison_rf.head(10))
#
# print(f'Linear Regression RMSE (Log Scale): {rmse_lr:.5f}')

In [None]:
rf_model = RandomForestRegressor(n_estimators=500, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train_log)

y_pred_rf = rf_model.predict(X_test)
comparison_rf = pd.DataFrame({
    'Actual_SalePrice': (y_test),
    'Predicted_SalePrice_RF': np.exp(y_pred_rf)
})

print(comparison_rf.head(10))
rmse_rf = np.sqrt(mean_squared_error(y_test_log, y_pred_rf))



In [None]:
print(f'Random Forest RMSE (Log Scale): {rmse_rf:.5f}')


In [209]:
from sklearn.base import BaseEstimator, TransformerMixin
class FeatureEngineer(BaseEstimator, TransformerMixin):
    def __init__(self, threshold_mi=0.1, threshold_corr=0.2):
        self.ordinal_mappings = {
            'PoolQC': {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'None': 0},
            'FireplaceQu': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0},
        }
        self.columns_to_drop = ['PoolQC', 'PoolArea', 'MiscFeature']
        self.categorical_features_ = None
        self.threshold_mi = threshold_mi
        self.threshold_corr = threshold_corr

    def fit(self, X, y=None):
        X_transformed = self.transform(X)
        self.categorical_features_ = X_transformed.select_dtypes(include=['object']).columns.tolist()
        return self

    def transform(self, X, y=None):
        X = X.copy()

        for col, mapping in self.ordinal_mappings.items():
            X[col] = X[col].map(mapping).fillna(0).astype(int)

        X['TotalSF'] = X['1stFlrSF'] + X['2ndFlrSF'] + X['TotalBsmtSF']
        X['Bathrooms'] = X['FullBath'] + (0.5 * X['HalfBath'])

        impute_features = X[['LotFrontage', 'LotArea', 'OverallQual', 'GrLivArea', 'GarageArea', 'LotShape', 'MSSubClass']].copy()
        impute_features = pd.get_dummies(impute_features, drop_first=True)
        imputer = IterativeImputer(random_state=42)
        X['LotFrontage'] = imputer.fit_transform(impute_features)[:, 0]

        X = X.drop(columns=self.columns_to_drop, errors='ignore')

        if y is not None:
            mi_scores = mutual_info_regression(pd.get_dummies(X, drop_first=True), y)
            mi_scores = pd.Series(mi_scores, index=X.columns, name='MI scores')

            low_mi_columns = mi_scores[mi_scores < self.threshold_mi].index

            correlation_matrix = pd.concat([X, y], axis=1).corr()
            saleprice_correlation = correlation_matrix[y.name].sort_values(ascending=False)

            low_corr_columns = saleprice_correlation[
                (saleprice_correlation.abs() < self.threshold_corr)
            ].index

            self.columns_to_drop = list(set(low_mi_columns).union(low_corr_columns))
            X = X.drop(columns=self.columns_to_drop, errors='ignore')

        return X

In [186]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer

def create_preprocessor(categorical_features):
    numerical_features = [
        'LotFrontage', 'LotArea', 'OverallQual', 'GrLivArea',
        'GarageArea', 'TotalSF', 'Bathrooms'
    ]

    numerical_transformer = Pipeline(steps=[
        ('imputer', IterativeImputer(max_iter=10, random_state=42)),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='None')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)
        ],
        remainder='drop'
    )

    return preprocessor

In [224]:
def build_pipeline(categorical_features):
    return Pipeline(steps=[
        ('feature_engineer', FeatureEngineer(threshold_mi=0.1, threshold_corr=0.2)),
        ('preprocessor', create_preprocessor(categorical_features)),
        ('regressor', RandomForestRegressor(n_estimators=200, random_state=42))
    ])


In [188]:
def prepare_data(df, is_train=True):
    if is_train:
        X = df.drop('SalePrice', axis=1)
        y = df['SalePrice']
        return X, y
    return df

In [189]:
def get_preprocessed_data(pipeline, X):
    feature_engineer = pipeline.named_steps['feature_engineer']
    X_transformed = feature_engineer.transform(X)

    categorical_features = X_transformed.select_dtypes(include=['object']).columns.tolist()

    preprocessor = create_preprocessor(categorical_features)
    transformed_data = preprocessor.fit_transform(X_transformed)

    num_features = preprocessor.transformers_[0][2]
    cat_features = preprocessor.transformers_[1][1].named_steps['onehot'].get_feature_names_out(
        preprocessor.transformers_[1][2]  #
    )
    feature_names = list(num_features) + list(cat_features)

    return pd.DataFrame(transformed_data, columns=feature_names, index=X.index)

In [225]:
from sklearn.model_selection import cross_val_score
def submission(train_path, test_path=None):
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path) if test_path else None

    X_train, y_train = prepare_data(train_df)

    feature_engineer = FeatureEngineer()
    feature_engineer.fit(X_train)
    categorical_features = feature_engineer.categorical_features_

    pipeline = build_pipeline(categorical_features)
    pipeline.fit(X_train, y_train)

    # preprocessed_df = get_preprocessed_data(pipeline, X_train)
    # print("Preprocessed DataFrame:")
    # print(preprocessed_df.head())


    if test_df is not None:
        X_test = prepare_data(test_df, is_train=False)
        test_pred = pipeline.predict(X_test)
        neg_mse_scores = -cross_val_score(pipeline, X_train, np.log(y_train), cv=5, scoring='neg_mean_squared_error')
        rmse_scores = np.sqrt(neg_mse_scores)
        print(f"Cross-Validated RMSE: {rmse_scores}")
        submission_df = pd.DataFrame({'Id': test_df['Id'], 'SalePrice': test_pred})
        submission_df.to_csv('housing2.csv', index=False)
        print("Submission file 'housing2.csv' created successfully!")

    return pipeline

In [226]:
submission('../kagle_data/housing/train.csv', '../kagle_data/housing/test.csv')

Cross-Validated RMSE: [0.13693967 0.16131249 0.15324556 0.13817389 0.14767128]
Submission file 'housing2.csv' created successfully!
