In [None]:
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline

# Data
[House Prices - Advanced Regression Techniques](https://www.kaggle.com/competitions/house-prices-advanced-regression-techniques/data)

In [None]:
# Read the Kaggle training split and preview its first five rows.
df_train = pd.read_csv(filepath_or_buffer='/content/train.csv')
df_train.head(n=5)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


# Feature Engineering

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

def preprocess_data(data):
    """Impute, one-hot encode and standardise a raw feature table.

    Steps, in order:

    1. Missing values in ``int64``/``float64`` columns are replaced by the
       column mean.
    2. Missing values in ``object`` columns are replaced by the column's
       first mode (most frequent value).
    3. Categorical columns are one-hot encoded via ``pd.get_dummies`` with
       ``drop_first=True``.
    4. The numeric columns found in step 1 are rescaled with a newly fitted
       ``StandardScaler``.

    The caller's frame is never modified; all work happens on a copy.

    Note: the scaler and the dummy columns are derived from whichever frame
    is passed in, so separate calls (e.g. one for train, one for test) use
    their own statistics and may yield different column sets.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw table with numeric and/or categorical columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with imputed, encoded and scaled columns.
    """
    # Work on a copy so the imputation below never leaks into the caller's frame.
    data = data.copy()

    # Fill numerical missing values with the mean
    num_vars = data.select_dtypes(include=['int64', 'float64']).columns
    if len(num_vars) > 0:
        data[num_vars] = data[num_vars].fillna(data[num_vars].mean())

    # Fill categorical missing values with the mode
    cat_vars = data.select_dtypes(include=['object']).columns
    if len(cat_vars) > 0:
        modes = data[cat_vars].mode()
        # mode() has no rows when the frame itself is empty; nothing to fill then.
        if not modes.empty:
            data[cat_vars] = data[cat_vars].fillna(modes.iloc[0])

    # Encode categorical variables
    data = pd.get_dummies(data, drop_first=True)

    # Feature scaling (skipped when there is nothing to fit the scaler on)
    if len(num_vars) > 0 and len(data) > 0:
        scaler = StandardScaler()
        data[num_vars] = scaler.fit_transform(data[num_vars])

    return data

In [None]:
# Run the preprocessing function on the training frame.
train_preprocessed_data = preprocess_data(data=df_train)

In [None]:
from sklearn.model_selection import train_test_split

# Features: everything except the target. 'Id' is only a row identifier, so it
# is removed as well to keep the models from fitting on row order.
X = (
    train_preprocessed_data
    .drop('SalePrice', axis=1)
    .drop(columns='Id', errors='ignore')
)
# Target: taken from the raw frame rather than the preprocessed one,
# where 'SalePrice' has been passed through the scaler.
y = df_train['SalePrice']
# Hold out 20% of the rows for evaluation; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# RandomForestRegressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Random forest of 100 trees, seeded with random_state=42.
rf = RandomForestRegressor(random_state=42, n_estimators=100)

# Fit on the training split; the fitted estimator is this cell's output.
rf.fit(X=X_train, y=y_train)

In [None]:
# Predict prices for the held-out split with the fitted forest.
y_pred = rf.predict(X=X_test)

# Mean squared error between the held-out targets and the predictions.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 889062684.1743206


In [None]:
# Read the unlabeled test split that predictions are generated for.
data = pd.read_csv(filepath_or_buffer='/content/test.csv')

In [None]:
# Send the test frame through the same preprocessing function as the training data.
test_preprocessed_data = preprocess_data(data=data)

In [None]:
# Preview the first five preprocessed test rows.
test_preprocessed_data.head(n=5)


Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,-1.730864,-0.874711,0.555587,0.363929,-0.751101,0.400766,-0.340945,-1.072885,-0.570108,0.063295,...,0,0,0,0,1,0,0,0,1,0
1,-1.72849,-0.874711,0.604239,0.897861,-0.054877,0.400766,-0.439695,-1.214908,0.041273,1.063392,...,0,0,0,0,1,0,0,0,1,0
2,-1.726115,0.061351,0.263676,0.809646,-0.751101,-0.497418,0.844059,0.678742,-0.570108,0.773254,...,0,0,0,0,1,0,0,0,1,0
3,-1.723741,0.061351,0.458284,0.032064,-0.054877,0.400766,0.876976,0.678742,-0.456889,0.357829,...,0,0,0,0,1,0,0,0,1,0
4,-1.721367,1.465443,-1.244533,-0.971808,1.337571,-0.497418,0.679475,0.394694,-0.570108,-0.387298,...,0,0,0,0,1,0,0,0,1,0


In [None]:
# Show both column indexes so differences between train and test are visible.
for label, frame in (("Training features: ", X), ("Test features: ", test_preprocessed_data)):
    print(label, frame.columns)

Training features:  Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       ...
       'SaleType_ConLI', 'SaleType_ConLw', 'SaleType_New', 'SaleType_Oth',
       'SaleType_WD', 'SaleCondition_AdjLand', 'SaleCondition_Alloca',
       'SaleCondition_Family', 'SaleCondition_Normal',
       'SaleCondition_Partial'],
      dtype='object', length=246)
Test features:  Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       ...
       'SaleType_ConLI', 'SaleType_ConLw', 'SaleType_New', 'SaleType_Oth',
       'SaleType_WD', 'SaleCondition_AdjLand', 'SaleCondition_Alloca',
       'SaleCondition_Family', 'SaleCondition_Normal',
       'SaleCondition_Partial'],
      dtype='object', length=228)


In [None]:
# Columns the model was trained on that the test frame lacks are added
# as all-zero columns.
missing_cols = set(X.columns).difference(test_preprocessed_data.columns)
for column_name in missing_cols:
    test_preprocessed_data[column_name] = 0

In [None]:
# Keep only the training columns, in the training column order.
test_preprocessed_data = test_preprocessed_data.loc[:, X.columns]

In [None]:
# Random-forest predictions for the aligned test features.
predictions = rf.predict(X=test_preprocessed_data)

In [None]:
# Pair each test 'Id' with its random-forest prediction and write the result
# to CSV without the index column.
submission = pd.DataFrame(data={'Id': data['Id'], 'SalePrice': predictions})
submission.to_csv(path_or_buf='sample_submission.csv', index=False)

# Neural network regression model

In [None]:
# Seed TensorFlow's random generator before building the model.
tf.random.set_seed(42)

# Two hidden ReLU layers (100 and 50 units), each followed by 20% dropout,
# and a single linear output unit for the predicted price.
house_price_model = tf.keras.Sequential([
    tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(50, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1)
])

# Compile the model: mean absolute error as the loss (also tracked as a metric)
# with a default-configured Adam optimizer.
house_price_model.compile(loss=tf.keras.losses.mae,
                          optimizer=tf.keras.optimizers.Adam(),
                          metrics=['mae'])

# Fit the model.
# The features are cast to float32 first: when the one-hot columns are boolean
# (the pandas >= 2.0 default for get_dummies) the mixed bool/float frame becomes
# an object array, which Keras cannot convert to a tensor.
house_price_model.fit(X_train.astype('float32'), y_train, epochs=100, batch_size=32, verbose=0)


<keras.src.callbacks.History at 0x7fe845c9f220>

In [None]:
# Predict with the neural network. The frame is cast to float32 because a mix of
# boolean dummy columns and float columns would otherwise convert to an object
# array, which Keras cannot turn into a tensor.
model_prediction = house_price_model.predict(test_preprocessed_data.astype('float32'))



In [None]:
# Collapse the prediction array to one dimension (row-major copy).
model_prediction = model_prediction.flatten(order='C')

# RandomForestRegressor is more accurate than the neural network regression model

In [None]:
# Pair each test 'Id' with its neural-network prediction and write the result
# to CSV without the index column.
submission = pd.DataFrame(data={'Id': data['Id'], 'SalePrice': model_prediction})
submission.to_csv(path_or_buf='sample_submission_model.csv', index=False)