In [None]:
import numpy as np
import pandas as pd

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

In [None]:
# Both CSV files are read from the same competition input folder.
dataset_dir = '/kaggle/input/california-house-prices'
train_data_path = dataset_dir + '/train.csv'
test_data_path = dataset_dir + '/test.csv'

In [None]:
# Read the raw training and test tables from the paths configured above.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_data_path, test_data_path)
)

In [None]:
#
# Drop columns
# 

# Keep the test Ids aside for the submission file. The explicit copy makes this
# an independent frame, so adding the prediction column to it later does not
# raise pandas' SettingWithCopyWarning.
test_df_result = test_df[['Id']].copy()

# Columns that are removed from both tables and therefore never reach the model.
columns = [
    'Id', 'Address', 'Summary', 'Cooling', 'Heating features', 'Cooling features',
    'Elementary School', 'Middle School', 'High School',
    'Flooring', 'Appliances included', 'Laundry features', 'Parking features', 'State',
    'Listed On', 'Last Sold On'
]
# The training table must contain every listed column (a missing one raises).
train_df = train_df.drop(columns = columns)
# errors = 'ignore' lets the test table lack some of the listed columns.
test_df = test_df.drop(columns = columns, errors = 'ignore')

In [None]:
#
# Type
#

def process_type(df):
    """One-hot encode the 'Type' column into a fixed set of indicator columns.

    'MultiFamily' is folded into 'Townhouse' and 'Single Family' into
    'SingleFamily'; every value outside the recognised list (including
    missing values) becomes 'Others'.

    The encoded column is given a fixed category list, so the result always
    contains the same seven ``Type_*`` columns in the same (alphabetical)
    order, even when a category does not occur in ``df``. This keeps frames
    that are encoded separately (e.g. train and test) column-compatible.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Type' column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame in which 'Type' is replaced by integer 0/1 ``Type_*`` columns.
    """
    valid_types = ['SingleFamily', 'Condo', 'Townhouse', 'MobileManufactured', 'VacantLand', 'Apartment']
    # Sorted so the column order matches what get_dummies produces when every
    # category is present in the data.
    categories = sorted(valid_types + ['Others'])

    types = df['Type'].astype(object)
    types = types.mask(types == 'MultiFamily', 'Townhouse')
    types = types.mask(types == 'Single Family', 'SingleFamily')
    # NaN is not in valid_types, so missing values also end up as 'Others'.
    types = types.where(types.isin(valid_types), 'Others')

    # Work on a new frame (assign) instead of writing into the caller's frame.
    df_typed = df.assign(Type = pd.Categorical(types, categories = categories))
    df_encoded = pd.get_dummies(df_typed, columns = ['Type'], drop_first = False, dtype = int)

    return df_encoded

# Encode 'Type' in both tables with the same function.
train_df, test_df = map(process_type, (train_df, test_df))

In [None]:
#
# Heating
#

def process_heating(df):
    """Replace the free-text 'Heating' column with keyword indicator columns.

    For each entry of ``keywords`` a 0/1 integer column named after the key is
    added; it is 1 when the regular expression matches anywhere in the
    'Heating' text (case-insensitive). Missing 'Heating' values are treated as
    an empty string, so they produce 0 in every indicator column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Heating' column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame without 'Heating' and with the indicator columns appended.
    """
    # NOTE(review): the first pattern only matches the word order "Air Forced";
    # text written as "Forced air" does not match — confirm against the data.
    keywords = {
        'Air forced': r'Air\s+Forced',
        'Central': r'Central',
        'Gas': r'Gas',
        'Furnace': r'Furnace',
        'Wall': r'Wall',
        'Electric': r'Electric'
    }

    heating = df['Heating'].fillna('')
    # Vectorised regex search; case = False mirrors re.IGNORECASE.
    indicators = {
        name: heating.str.contains(pattern, case = False, regex = True).astype(int)
        for name, pattern in keywords.items()
    }

    # drop + assign build a new frame, leaving the caller's frame unchanged.
    return df.drop(columns = ['Heating']).assign(**indicators)

# Derive the heating indicator columns for both tables.
train_df, test_df = (process_heating(frame) for frame in (train_df, test_df))

In [None]:
# 
# Parking
#

def process_parking(df):
    """Replace the free-text 'Parking' column with keyword indicator columns.

    For each keyword a 0/1 integer column of the same name is added; it is 1
    when the keyword occurs anywhere in the 'Parking' text (case-insensitive).
    Missing 'Parking' values are treated as an empty string and therefore give
    0 in every indicator column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Parking' column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame without 'Parking' and with the indicator columns appended.
    """
    keywords = ['Attached', 'Detached', 'Covered', 'Carport', 'Driveway', 'Basement']

    parking = df['Parking'].fillna('')
    indicators = {
        keyword: parking.str.contains(keyword, case = False).astype(int)
        for keyword in keywords
    }

    # drop + assign build a new frame, leaving the caller's frame unchanged.
    return df.drop(columns = ['Parking']).assign(**indicators)

# Derive the parking indicator columns for both tables.
train_df, test_df = map(process_parking, (train_df, test_df))

In [None]:
#
# Bedrooms
#

def process_bedrooms(df, fill_value = None):
    """Convert 'Bedrooms' to numbers and impute the values that are missing.

    Entries that cannot be parsed as a number become NaN and are then filled
    together with the values that were missing to begin with.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Bedrooms' column. It is not modified.
    fill_value : float, optional
        Value used for the imputation. When omitted, the median of the numeric
        'Bedrooms' values of ``df`` itself is used. Passing the training
        median here lets another table be imputed with the same statistic.

    Returns
    -------
    pandas.DataFrame
        New frame whose 'Bedrooms' column is numeric with missing values filled.
    """
    bedrooms = pd.to_numeric(df['Bedrooms'], errors = 'coerce')
    if fill_value is None:
        fill_value = bedrooms.median()
    # assign returns a new frame instead of writing into the caller's frame.
    return df.assign(Bedrooms = bedrooms.fillna(fill_value))

# Make 'Bedrooms' numeric and gap-free in both tables.
train_df, test_df = (process_bedrooms(frame) for frame in (train_df, test_df))

In [None]:
#
# Region, Zip, City
#

def target_encoding(train_df, test_df, target_column, categorical_columns):
    """Replace categorical columns by the per-category median of the target.

    For every column ``c`` in ``categorical_columns`` a new column
    ``f'{c}_target'`` is added to both frames. It holds the median of
    ``target_column`` over the training rows sharing that category. Categories
    without a training median (unseen in training, or missing) receive the
    overall training median instead. The original categorical columns are
    dropped from both results.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training frame containing ``target_column`` and the categorical
        columns. It is not modified.
    test_df : pandas.DataFrame
        Frame containing the categorical columns. It is not modified.
    target_column : str
        Name of the target column in ``train_df``.
    categorical_columns : list of str
        Columns to encode.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_encoded, test_encoded)``.
    """
    # NOTE(review): each training row's own target contributes to its encoded
    # value; an out-of-fold encoding would avoid that if it matters here.
    categorical_columns = list(categorical_columns)

    # Copies keep the caller's frames free of the added *_target columns.
    train_encoded = train_df.copy()
    test_encoded = test_df.copy()

    # Independent of the column being encoded, so computed once.
    overall_median = train_df[target_column].median()

    for categorical_column in categorical_columns:
        target_median = train_df.groupby(categorical_column)[target_column].median()
        encoded_name = f'{categorical_column}_target'

        train_encoded[encoded_name] = train_df[categorical_column].map(target_median).fillna(overall_median)
        test_encoded[encoded_name] = test_df[categorical_column].map(target_median).fillna(overall_median)

    train_encoded = train_encoded.drop(columns = categorical_columns)
    test_encoded = test_encoded.drop(columns = categorical_columns)

    return train_encoded, test_encoded

# Encode the location columns by the median sale price of each category.
train_df, test_df = target_encoding(
    train_df,
    test_df,
    target_column = 'Sold Price',
    categorical_columns = ['Region', 'Zip', 'City'],
)

In [None]:
#
# Year built and other general numerical columns
#

def process_numerical_columns(df):
    """Fill missing values of every numeric column with that column's median.

    A column counts as numeric when its dtype kind is one of 'b', 'i', 'u',
    'f' or 'c'; all other columns are passed through unchanged. Returns a new
    frame.
    """
    def _fill_with_median(column):
        # Non-numeric columns are returned as they are.
        if column.dtype.kind not in 'biufc':
            return column
        return column.fillna(column.median())

    return df.apply(_fill_with_median)

# Median-impute the remaining numeric gaps in both tables.
train_df, test_df = map(process_numerical_columns, (train_df, test_df))

In [None]:
#
# Check for NaNs
#

# Report the total number of missing cells left in each table.
for nan_label, checked_frame in (("NaNs in train_df:", train_df), ("NaNs in test_df:", test_df)):
    print(nan_label, checked_frame.isna().sum().sum())

In [None]:
# Separate the regression target from the model inputs.
X, y = train_df.drop(columns = ['Sold Price']), train_df['Sold Price']

# Show which feature columns are still non-numeric.
X.select_dtypes(exclude = 'number').columns

In [None]:
# Preview the first five feature rows (head() defaults to five).
X.head()

In [None]:
# Preview the first five target values (head() defaults to five).
y.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Split row positions first, so that every statistic below (clip bounds and
# scaler ranges) is learned from the training rows only and then merely applied
# to the hold-out rows. Fitting them on all rows would leak hold-out
# information into the preprocessing and make the test loss optimistic.
# Splitting positions with the same test_size / random_state selects the same
# rows as splitting the data arrays themselves.
train_idx, test_idx = train_test_split(np.arange(len(X)), test_size = 0.2, random_state = 42)

# Clip every feature to the 1st-99th percentile range of the training rows.
# NOTE(review): for a 0/1 column where one value covers < 1 % of the rows this
# clipping collapses the column to a constant — confirm that is acceptable.
X_lower_bound = np.percentile(X.iloc[train_idx], 1, axis = 0)
X_upper_bound = np.percentile(X.iloc[train_idx], 99, axis = 0)
X_clipped = X.clip(lower = X_lower_bound, upper = X_upper_bound, axis = 1)
X_scaler = MinMaxScaler()
X_scaler.fit(X_clipped.iloc[train_idx])
X_scaled = X_scaler.transform(X_clipped)

# Same treatment for the target: clip to the training percentiles, then scale.
y_lower_bound = np.percentile(y.iloc[train_idx], 1)
y_upper_bound = np.percentile(y.iloc[train_idx], 99)
y_clipped = y.clip(lower = y_lower_bound, upper = y_upper_bound).values.reshape(-1, 1)
y_scaler = MinMaxScaler()
y_scaler.fit(y_clipped[train_idx])
y_scaled = y_scaler.transform(y_clipped)

X_tensor = torch.tensor(X_scaled, dtype = torch.float32)
y_tensor = torch.tensor(y_scaled, dtype = torch.float32)

# Materialise the split chosen above as tensors.
train_rows = torch.as_tensor(train_idx, dtype = torch.long)
test_rows = torch.as_tensor(test_idx, dtype = torch.long)
X_train, X_test = X_tensor[train_rows], X_tensor[test_rows]
y_train, y_test = y_tensor[train_rows], y_tensor[test_rows]

train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size = 128, shuffle = True)

In [None]:
# Fail fast if scaling produced values the network cannot train on.
sanity_checks = (
    (X_tensor, "NaNs found in X_tensor", "Infs found in X_tensor"),
    (y_tensor, "NaNs found in y_tensor", "Infs found in y_tensor"),
)
for checked_tensor, nan_message, inf_message in sanity_checks:
    assert not torch.isnan(checked_tensor).any(), nan_message
    assert not torch.isinf(checked_tensor).any(), inf_message

In [None]:
class HousePriceModel(nn.Module):
    """Fully connected regressor: in_features_size -> 128 -> 64 -> 32 -> 1.

    ReLU follows each of the first three linear layers; the last layer's
    output is returned without an activation.
    """

    def __init__(self, in_features_size):
        super().__init__()
        self.fc1 = nn.Linear(in_features_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 1)

    def forward(self, x):
        """Map a batch of feature rows to one output value per row."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = torch.relu(layer(hidden))
        return self.fc4(hidden)

In [None]:
# Seed PyTorch so that weight initialisation and the DataLoader's shuffling are
# repeatable from one run of the notebook to the next.
torch.manual_seed(42)

model = HousePriceModel(X_tensor.shape[1])
criterion = nn.MSELoss()
# NOTE(review): lr = 1e-6 is a very small step size for Adam; if the reported
# loss barely moves over the epochs, a larger value is worth trying.
optimizer = optim.Adam(model.parameters(), lr = 0.000001)

num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    epoch_loss_sum = 0.0
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        # squeeze(-1) drops only the trailing feature axis, so predictions and
        # targets keep the same shape even for a batch that holds one row.
        loss = criterion(outputs.squeeze(-1), batch_y.squeeze(-1))
        loss.backward()
        optimizer.step()
        # Weight by batch size so a shorter final batch is averaged correctly.
        epoch_loss_sum += loss.item() * batch_X.size(0)

    if (epoch + 1) % 10 == 0:
        # Report the mean loss over the whole epoch rather than the loss of
        # whichever mini-batch happened to come last.
        epoch_loss = epoch_loss_sum / len(train_loader.dataset)
        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss:.4f}')

In [None]:
# Score the trained network on the hold-out rows (loss is in scaled-target units).
model.eval()
with torch.no_grad():
    test_outputs = model(X_test)
    test_loss = criterion(test_outputs, y_test)

print(f'Test Loss: {test_loss.item():.4f}')

In [None]:
# Put the test features into exactly the column order the scaler and network
# were fitted on. Only the one-hot 'Type_*' columns depend on which values
# occur in a table, so a column that is absent here is filled with 0.
predict_X = test_df.reindex(columns = X.columns, fill_value = 0)

# Apply the same percentile clipping that the training features received
# before scaling; otherwise extreme test values fall outside the scaler's range.
predict_X = predict_X.clip(lower = X_lower_bound, upper = X_upper_bound, axis = 1)
predict_X_scaled = X_scaler.transform(predict_X)
predict_X_tensor = torch.tensor(predict_X_scaled, dtype = torch.float32)

# Inference only: evaluation mode, no autograd bookkeeping.
model.eval()
with torch.no_grad():
    predict_y = model(predict_X_tensor)
# Map the network output back from the scaled range to price units.
predict_y = y_scaler.inverse_transform(predict_y.detach().numpy())

# assign builds a new frame, so nothing is written into a slice of test_df.
test_df_result = test_df_result.assign(**{'Sold Price': predict_y.ravel()})
test_df_result.to_csv('submission.csv', index = False)