# House Prices - Advanced Regression Techniques
This notebook is a solution to the [House Prices - Advanced Regression Techniques](https://www.kaggle.com/c/house-prices-advanced-regression-techniques) competition. The goal of the competition is to predict the final price of each home given a set of features. The metric used to evaluate the model is the Root Mean Squared Error (RMSE) between the logarithm of the predicted value and the logarithm of the observed sales price.

## Data


In [None]:
%pip install kaggle
%pip install numpy
%pip install pandas
%pip install matplotlib

### Kaggle Data

In [None]:
import os
# Set KAGGLE_CONFIG_DIR to the notebook's working directory, which is also
# where the chmod below expects kaggle.json to be.
# NOTE(review): presumably the Kaggle CLI looks for its API token in this
# directory — confirm against the CLI documentation.
os.environ['KAGGLE_CONFIG_DIR'] = '.'
# Directory that receives the downloaded archive and its extracted files.
data_dir = './data'
# Restrict the token file to owner read/write.
!chmod 600 kaggle.json
# Download the competition archive into data_dir.
!kaggle competitions download -c house-prices-advanced-regression-techniques -p {data_dir}
# Extract the archive into data_dir, overwriting existing files (-o).
!unzip -o {data_dir}/house-prices-advanced-regression-techniques.zip -d {data_dir}

### Data Description

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Read the competition's training table from the download directory and
# preview its first rows.
train_csv_path = f'{data_dir}/train.csv'
data = pd.read_csv(train_csv_path)
data.head()

### Cleaning and Preprocessing

In [None]:
# Remove the 'Id' and 'Utilities' columns so they are not part of the features.
# Rebinding `data` (instead of dropping in place) together with
# errors='ignore' keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError for the already-removed columns.
data = data.drop(columns=['Id', 'Utilities'], errors='ignore')

In [None]:
def scale_features(col):
    """Standardize a numeric column to zero mean and unit standard deviation.

    Args:
        col: pandas Series to scale.

    Returns:
        A new Series holding ``(col - mean) / std`` when ``col`` has a numeric
        dtype of any width (not only int64/float64). Boolean and non-numeric
        columns are returned unchanged. When the standard deviation is zero or
        undefined (constant or single-value column) the centered values are
        returned as-is, so the result contains zeros instead of NaN/inf.
    """
    is_numeric = pd.api.types.is_numeric_dtype(col)
    if not is_numeric or pd.api.types.is_bool_dtype(col):
        return col

    centered = col - col.mean()
    std = col.std()
    # `not std > 0` is true for both 0 and NaN, the two cases where dividing
    # would produce NaN/inf instead of a usable feature.
    if not std > 0:
        return centered
    return centered / std

In [None]:
def clean_features(df):
    """Encode text columns as integer codes, standardize, and fill gaps with 0.

    For every column, text values (object or string dtype) are replaced by the
    integer codes from ``pd.factorize`` (missing values receive factorize's
    sentinel code -1). The column is then passed through ``scale_features``,
    and any NaN left in the frame is replaced with 0.

    The category codes and the scaling statistics are derived from ``df``
    alone, so two different frames cleaned separately are not guaranteed to
    share the same encoding.

    Args:
        df: pandas DataFrame of raw features. It is not modified.

    Returns:
        A new DataFrame with the same columns and index as ``df``.
    """
    # Work on a copy so the caller's frame keeps its raw values.
    cleaned = df.copy()
    for col in cleaned.columns:
        column = cleaned[col]
        # Checking only `dtype == 'object'` misses pandas' dedicated string
        # dtype, so test for both kinds of text column.
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            # factorize returns (codes, uniques); only the codes are needed.
            codes, _ = pd.factorize(column)
            column = pd.Series(codes, index=cleaned.index)
        cleaned[col] = scale_features(column)
    return cleaned.fillna(0)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state makes the
# split reproducible across runs.
train, valid = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Separate the training frame into the target column and the model inputs
# (every column except the target).
labels = train['SalePrice']
feattures = train.drop(columns='SalePrice')

In [None]:
# Run the training inputs through clean_features, then preview the result.
feattures = feattures.pipe(clean_features)
feattures.head()

### Visualization

In [None]:
# Collapse each row's cleaned features into a single number (the row mean) so
# its relationship with the sale price can be inspected in a 2-D scatter plot.
# 'Id' was already dropped during preprocessing, so no column filter is needed,
# and mean(axis=1) already returns a Series.
avg_feattures = feattures.mean(axis=1)

fig, ax = plt.subplots()
ax.scatter(avg_feattures, labels)
ax.set_xlabel('Mean standardized feature value')
ax.set_ylabel('Sale Price')
ax.set_title('Sale Price vs Features')
plt.show()

## Model

In [None]:
from estimator import scratch_model

# Start from an all-zero weight vector (one entry per feature column) and a
# zero bias, and hand the estimator the training inputs and targets as arrays.
n_features = feattures.shape[1]
params = scratch_model.ModelParams(
    weights=np.zeros(n_features),
    bias=0.0,
    features=feattures.values,
    labels=labels.values,
)
model = scratch_model.PriceEstimator(params)

# NOTE(review): the positional arguments are presumably (epochs, learning rate)
# — confirm against PriceEstimator.train.
history = model.train(70, 0.206)

In [None]:
# Plot the sequence returned by model.train() against its position.
fig, ax = plt.subplots()
ax.plot(history)
ax.set(xlabel='Epochs', ylabel='Loss', title='Loss vs Epochs')
plt.show()

## Validation


In [None]:
# Build the validation targets and inputs the same way as the training ones.
# NOTE(review): clean_features derives category codes and scaling statistics
# from the frame it receives, so these are computed from the validation rows
# rather than reused from the training rows — confirm this is intended.
valid_labels = valid['SalePrice']
valid_features = valid.drop(columns='SalePrice').pipe(clean_features)

In [None]:
# Print the model's prediction next to the true price for every validation row.
n_valid = valid.shape[0]

for i in range(n_valid):
    predicted = model.predict(valid_features.iloc[i].values)
    actual = valid_labels.iloc[i]
    print(f"Predicted: {predicted} Actual: {actual}")

## Submission

In [None]:
# Load the competition test set and prepare it like the training features.
# 'Id' and 'Utilities' are removed *before* cleaning so no work is spent
# encoding/scaling columns that are thrown away, and because drop() returns a
# new frame the raw `test` frame keeps its original values.
test = pd.read_csv(f'{data_dir}/test.csv')
clean_test = clean_features(test.drop(columns=['Id', 'Utilities']))
clean_test.head()

In [None]:
# Fill the sample submission's SalePrice column with one prediction per test row.
# Writing through .loc sets the cell in a single indexing step. The chained
# form `submission['SalePrice'][i] = ...` assigns into an intermediate Series,
# which raises chained-assignment warnings and leaves `submission` unchanged
# when pandas copy-on-write is active.
submission = pd.read_csv(f'{data_dir}/sample_submission.csv')
for i in range(test.shape[0]):
    submission.loc[i, 'SalePrice'] = model.predict(clean_test.iloc[i].values)
submission.head()

In [None]:
# Write the predictions into the data directory, omitting the row index.
submission_path = f'{data_dir}/submission.csv'
submission.to_csv(submission_path, index=False)

In [None]:
# Upload the generated submission file to the competition with a short message.
!kaggle competitions submit -c house-prices-advanced-regression-techniques -f {data_dir}/submission.csv -m "First submission using scratch model."