# Section 1. Introduction to the problem/task and dataset

The group has selected the house dataset based on the list of datasets.

The target task of this notebook is to predict house prices in King County from selected house features. The resulting models may be of interest to buyers or sellers who are not familiar with the housing market in King County.

Interested buyers for houses in King County may want to estimate their budget given their desired house features.

On the other hand, sellers who may not be familiar with the market may not want to underestimate or overestimate their house's worth. These models may be of assistance in guiding sellers to price their houses.

# Section 2. Description of the dataset

This dataset consists of the sale prices and features of houses sold in King County between May 2014 and May 2015.

Each row represents a house sold and each column represents a feature of a house.
This dataset contains 21613 instances and 21 features overall.

Features:
- `id` – A notation for a house x
- `date` – Date sold x
- `price` – Sale price
- `bedrooms` – Number of bedrooms
- `bathrooms` – Number of bathrooms
- `sqft_living` – Size of living area in square feet
- `sqft_lot` – Size of the lot in square feet
- `floors` – Total floors in the house
- `waterfront` – ‘1’ if the property has a waterfront, ‘0’ if not.
- `view` – An index from 0 to 4 of how good the view of the property was.
- `condition` – Condition of the house, ranked from 1 to 5
- `grade` – Classification by construction quality which refers to the types of materials used
and the quality of workmanship. Buildings of better quality (higher grade) cost more to
build per unit of measure and command higher value.
- `sqft_above` –  Square feet above ground (find just to remove)
- `sqft_basement` – Square feet below ground (find just to bin)
- `yr_built` – Year built
- `yr_renovated` – Year renovated. ‘0’ if never renovated
- `zipcode` – 5-digit zip code (transform)
- `lat` – Latitude coordinate (transform)
- `long` – Longitude coordinate (transform)
- `sqft_living15` – Average size of interior housing living space for the closest 15 houses, in
square feet
- `sqft_lot15` – Average size of land lots for the closest 15 houses, in square feet

# Section 3. List of requirements

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib.ticker import ScalarFormatter

import statsmodels.api as sm
from statsmodels.formula.api import ols

# Geographical analysis
import folium
from folium.plugins import MarkerCluster
from folium.plugins import HeatMap
from sklearn.cluster import KMeans

# extracting lat, long from zipcode
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

# for google Colab
# from google.colab import drive


# Section 4. Data preprocessing and cleaning

In [None]:
# drive.mount('/content/drive')

In [None]:
house_df = pd.read_csv("house_prices.csv")

In [None]:
house_df.head()

In [None]:
house_df.info()

## Duplicate data


In [None]:
# Count how many rows share each house id and show only the ids that occur
# more than once. Using groupby/transform avoids depending on the name that
# Series.value_counts() gives its result (it differs between pandas versions).
id_counts = house_df.groupby('id')['id'].transform('size')
dupe_id_df = house_df.assign(count=id_counts)
dupe_id_df[dupe_id_df['count'] > 1]

Duplicate IDs come from houses that appear in more than one sale, so their recorded price changes over time.

In [None]:
house_df[house_df.id == 795000620]

In [None]:
# Keep only the most recent sale for each duplicated id.
# `keep='last'` is only "most recent" if rows are in chronological order, so
# sort by the parsed sale date first (stable, to preserve file order on ties)
# and restore the original row order afterwards.
house_df = (
    house_df.sort_values('date', key=pd.to_datetime, kind='stable')
    .drop_duplicates(subset=['id'], keep='last')
    .sort_index()
)
# Sanity check: every id should now be reported as not duplicated (False only).
house_df.id.duplicated().value_counts()

The following cell should return only the row that has a sale year of 2015.

In [None]:
house_df[house_df.id == 795000620]

In [None]:
house_df.loc[house_df["id"] == 7129300520]

## Converting date into DateTime object
Done for easier processing.


In [None]:
# Convert date for easier processing
house_df['date'] = pd.to_datetime(house_df['date'])
house_df['date'].info()

## Check for the minimum and maximum values

In [None]:
# Print the value range of every column except the identifier, the sale date
# and the raw coordinates.
col_minmax = house_df.columns.drop(['id', 'date','lat','long'])
print("Minimum and Maximum of each column (min,max)")
for column_name in col_minmax:
    lowest = house_df[column_name].min()
    highest = house_df[column_name].max()
    print("%s \t\t(%.2f, %.2f)" % (column_name, lowest, highest))

In [None]:
# weird to use float in here
bathroom_counts = house_df["bathrooms"].value_counts()
bathroom_counts

In [None]:
# Bathroom counts are fractional, so the default bar width (0.8) makes
# neighbouring bars overlap. Size each bar from the smallest gap between the
# distinct bathroom values instead.
bath_values = np.sort(bathroom_counts.index.to_numpy(dtype=float))
bath_gaps = np.diff(bath_values)
bar_width = 0.8 * bath_gaps.min() if bath_gaps.size else 0.8
plt.title("Number of Bathrooms of Houses Sold")
plt.xlabel("Number of bathrooms")
plt.ylabel("Frequency")
plt.bar(bathroom_counts.index, bathroom_counts.to_numpy(), width=bar_width)

In [None]:
# I don't know what's this 33 bedrooms thing
house_df["bedrooms"].value_counts()

In [None]:
# This entry may need to be examined more, this looks too suspicious
house_df.loc[house_df["bedrooms"] == 33]

In [None]:
house_df.loc[house_df["price"] == 7700000.0]

In [None]:
house_df["sqft_living"].value_counts()

In [None]:
house_df["sqft_living"].min()

In [None]:
house_df["sqft_living"].max()

In [None]:
house_df.loc[house_df["sqft_living"] == 13540]

In [None]:
house_df["sqft_lot"].min()

In [None]:
house_df.loc[house_df["sqft_lot"] == 520]

In [None]:
house_df["sqft_lot"].max()

In [None]:
house_df.loc[house_df["sqft_lot"] == 1651359]

In [None]:
house_df["floors"].min()

In [None]:
house_df["floors"].max()

In [None]:
house_df["waterfront"].value_counts()

In [None]:
house_df["view"].value_counts()

In [None]:
house_df["condition"].value_counts()

In [None]:
house_df["grade"].value_counts()

In [None]:
house_df["sqft_above"].min()

In [None]:
house_df["sqft_above"].max()

In [None]:
house_df["sqft_basement"].min()

In [None]:
house_df["sqft_basement"].max()

In [None]:
house_df["yr_built"].min()

In [None]:
house_df["yr_built"].max()

In [None]:
house_df["zipcode"].value_counts()

In [None]:
house_df["yr_renovated"].value_counts()

In [None]:
house_df["zipcode"].min()

In [None]:
house_df["zipcode"].max()

In [None]:
house_df["lat"].min()

In [None]:
house_df["lat"].max()

In [None]:
house_df["long"].min()

In [None]:
house_df["long"].max()

In [None]:
house_df["sqft_living15"].min()

In [None]:
house_df["sqft_living15"].max()

In [None]:
house_df["sqft_lot15"].max()

# Section 5. Exploratory data analysis

### When were the most houses sold?

In [None]:
# Helper column of ones so that sales can be summed per period.
# (It is removed again from the feature set before modelling.)
house_df['count'] = 1

# Number of sales per calendar day.
daily_counts = house_df.groupby('date')['count'].sum().reset_index()

# Roll the daily totals up to year-month periods.
daily_counts['year_month'] = daily_counts['date'].dt.to_period('M')
monthly_counts = daily_counts.groupby('year_month')['count'].sum().reset_index()

# Convert back to timestamps for plotting. groupby already returns the months
# in chronological order; sorting the column before assigning it back would be
# undone by index alignment, so no extra sort is applied here.
monthly_counts['year_month'] = monthly_counts['year_month'].dt.to_timestamp()

Based on the

In [None]:
# Line chart of the number of houses sold per month.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(monthly_counts['year_month'], monthly_counts['count'], marker='o')
ax.set_ylim(bottom=0)
ax.set_title("Houses Sold between May 2014 and May 2015")
ax.set_xlabel("Time")
ax.set_ylabel("Houses sold")

# One tick per month, labelled as YYYY-MM.
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
ax.xaxis.set_major_locator(mdates.MonthLocator())

# Slant the date labels so they do not overlap.
fig.autofmt_xdate()

plt.show()

In [None]:
monthly_counts['year_month'] = monthly_counts['year_month'].dt.to_period('M')
monthly_counts.sort_values(by=['count'], ascending=False)

In [None]:
monthly_counts['count'].describe()

### What is the distribution of house prices over time?

In [None]:
house_df['year_month'] = house_df['date'].dt.to_period('M')  # Convert to year-month period

In [None]:
house_df['price'].describe().apply(lambda x: format(x, 'f'))

In [None]:
# Scatter of every sale price against its sale date. The very wide figure
# keeps individual days distinguishable along the x axis.
plt.figure(figsize=(45, 6))
plt.title("Price of Houses Sold over Time")
plt.xlabel("Date sold")
plt.ylabel("Price")
plt.scatter(house_df['date'], house_df['price'])

### What is the distribution of the condition of houses sold?

In [None]:
house_df['condition'].value_counts()

In [None]:
# One bin per condition rank (1-5); the extra edge at 6 closes the last bin.
bins = list(range(1, 7))
plt.title("Condition of Houses Sold")
plt.xlabel("Condition")
plt.ylabel("Frequency")
plt.xticks(bins)
plt.hist(
    house_df['condition'],
    bins=bins,
    align='left',
    edgecolor="black",
    rwidth=0.5,
)


### What is the distribution of the grade of houses sold?

In [None]:
house_df['grade'].value_counts()

In [None]:
# One bin per grade value; the edges run from 1 to 14 inclusive.
bins = list(range(1, 15))
plt.title("Grade of Houses Sold")
plt.xlabel("Grade")
plt.ylabel("Frequency")
plt.xticks(bins)
plt.hist(
    house_df['grade'],
    bins=bins,
    align='left',
    edgecolor="black",
    rwidth=0.5,
)

### Is sqft_living just sqft_above + sqft_basement?

In [None]:
# Rows where the living area is not the sum of the above-ground and basement areas
above_plus_basement = house_df['sqft_above'] + house_df['sqft_basement']
mismatch = house_df[house_df['sqft_living'] != above_plus_basement]

# Collect the ids of the offending rows (empty when the identity always holds)
mismatch_ids = mismatch['id'].tolist()

print("IDs where sqft_living != sqft_above + sqft_basement:", mismatch_ids)

Yes, yes it is just sqft_above + sqft_basement

### Do condition and grade have a statistically significant effect on price?

In [None]:
# Two-way ANOVA of price on grade, condition and their interaction.
price_grade_condition = house_df[['price', 'grade', 'condition']]
model = ols('price ~ C(grade) + C(condition) + C(grade):C(condition)',
            data=price_grade_condition).fit()
# The keyword is `typ`, not `type`; an unrecognised `type=2` is silently
# ignored and a Type I table is produced instead of the intended Type II.
result = sm.stats.anova_lm(model, typ=2)
print(result)

### Do having a waterfront and a good view have a statistically significant effect on price?

In [None]:
# Two-way ANOVA of price on waterfront, view and their interaction.
price_waterfront_view = house_df[['price', 'waterfront', 'view']]
model = ols('price ~ C(waterfront) + C(view) + C(waterfront):C(view)',
            data=price_waterfront_view).fit()
# The keyword is `typ`, not `type`; an unrecognised `type=2` is silently
# ignored and a Type I table is produced instead of the intended Type II.
result = sm.stats.anova_lm(model, typ=2)
print(result)

### Is there a correlation between price and number of bedrooms?

In [None]:
house_df[['price', 'bedrooms']].corr()

In [None]:
plt.figure(figsize=(10, 6))
plt.title("Correlation between Bedrooms and Price of Sold Houses")
plt.xlabel("Number of bedrooms")
plt.ylabel("Price")
plt.scatter(house_df['bedrooms'], house_df['price'], alpha=0.5)

### Is there a correlation between price and number of bathrooms?

In [None]:
house_df[['price', 'bathrooms']].corr()

In [None]:
plt.figure(figsize=(10, 6))
plt.title("Correlation between Bathrooms and Price of Sold Houses")
plt.xlabel("Number of bathrooms")
plt.ylabel("Price")
plt.scatter(house_df['bathrooms'], house_df['price'], alpha=0.5)

### Is there a correlation between price and the size of a house's living area in square feet?

In [None]:
house_df[['price', 'sqft_living']].corr()

In [None]:
plt.figure(figsize=(10, 6))
plt.title("Correlation between Size of Living Area in Square Feet and Price of Sold Houses")
plt.xlabel("Size of Living Area in Square Feet")
plt.ylabel("Price")
plt.scatter(house_df['sqft_living'], house_df['price'], alpha=0.5)

### Is there a correlation between price and the size of a house's lot?

In [None]:
house_df[['price', 'sqft_lot']].corr()

In [None]:
plt.figure(figsize=(10, 6))
plt.title("Correlation between Size of Lot in Square Feet and Price of Sold Houses")
plt.xlabel("Size of Lot in Square Feet")
plt.ylabel("Price")
plt.scatter(house_df['sqft_lot'], house_df['price'], alpha=0.5)

### Is there a correlation between price and total number of floors?

In [None]:
house_df[['price', 'floors']].corr()

In [None]:
plt.figure(figsize=(10, 6))
plt.title("Correlation between Total Number of Floors and Price of Sold Houses")
plt.xlabel("Total Number of Floors")
plt.ylabel("Price")
plt.scatter(house_df['floors'], house_df['price'], alpha=0.5)

### Is there a trend between price and if the property has a waterfront?

In [None]:
house_df.boxplot("price", by="waterfront", figsize=(5, 5))

### Is there a trend between price and how good of a view the property has?

In [None]:
house_df.boxplot("price", by="view", figsize=(5, 5))

### Is there a correlation between price and the square footage of the house above ground?

In [None]:
house_df[['price', 'sqft_above']].corr()

In [None]:
plt.figure(figsize=(10, 6))
plt.title("Correlation between Square feet above ground and Price of Sold Houses")
plt.xlabel("Square feet above ground")
plt.ylabel("Price")
plt.scatter(house_df['sqft_above'], house_df['price'], alpha=0.5)

### Is there a correlation between price and the square footage of the house below ground?

In [None]:
house_df[['price', 'sqft_basement']].corr()

In [None]:
plt.figure(figsize=(10, 6))
plt.title("Correlation between Square Feet below Ground and Price of Sold Houses")
plt.xlabel("Square feet below ground")
plt.ylabel("Price")
plt.scatter(house_df['sqft_basement'], house_df['price'], alpha=0.5)

### Is there a correlation between price and average size of interior housing living space for the closest 15 houses?

In [None]:
house_df[['price', 'sqft_living15']].corr()

In [None]:
plt.figure(figsize=(10, 6))
plt.title("Correlation between Average Size of Interior Housing Living Space for Closest 15 Houses and Price of Sold Houses")
plt.xlabel("Average size of interior housing living space for the closest 15 houses")
plt.ylabel("Price")
plt.scatter(house_df['sqft_living15'], house_df['price'], alpha=0.5)

### Is there a correlation between price and average size of land lots for the closest 15 houses in square feet?

In [None]:
house_df[['price', 'sqft_lot15']].corr()

In [None]:
# Scatter of price against the average lot size of the 15 closest houses.
plt.figure(figsize=(10, 6))
plt.title("Correlation between Average Size of Land Lots for Closest 15 Houses and Price of Sold Houses")
# The x axis shows sqft_lot15 (land lots), not interior living space.
plt.xlabel("Average size of land lots for the closest 15 houses in square feet")
plt.ylabel("Price")
plt.scatter(house_df['sqft_lot15'], house_df['price'], alpha=0.5)

## Geographical Analysis


### Finding the best cluster amount

In [None]:
# Cluster the houses on their coordinates only.
X = house_df[['lat', 'long']]

# Within-cluster sum of squares (inertia) for 1 to 19 clusters
cluster_range = range(1, 20)
wcss = []
for i in cluster_range:
    kmeans = KMeans(n_clusters=i, random_state=0).fit(X)
    wcss.append(kmeans.inertia_)

# Elbow graph: look for the point where adding clusters stops paying off
plt.plot(cluster_range, wcss, marker='o')
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.grid(True)
plt.show()

### Mapping clusters to the dataframe

In [None]:
X = house_df[['lat', 'long']]
kmeans = KMeans(n_clusters=4, random_state=0).fit(X)
house_df['cluster'] = kmeans.labels_
cluster_total_house = house_df.groupby('cluster')['id'].count().reset_index()
cluster_total_house

In [None]:
house_coor_df = house_df[['id', 'price', 'lat', 'long', 'cluster']].copy()
house_coor_df

### Visualizing Clusters

In [None]:
# Setting up the map, centred on the mean coordinate of all houses
map_center = [house_coor_df['lat'].mean(), house_coor_df['long'].mean()]
m = folium.Map(location=map_center, zoom_start=9)

# Cluster up points that are close to each other; reduce lag
cluster = MarkerCluster().add_to(m)
color_map = {0: 'red', 1: 'darkgreen', 2: 'blue', 3: 'purple'}

# Plotting each house.
# itertuples keeps each column's own dtype; iterrows would upcast the whole
# row to float and show ids/cluster labels as e.g. "7129300520.0" / "0.0".
for house in house_coor_df.itertuples(index=False):
    cluster_id = int(house.cluster)
    color = color_map.get(cluster_id, 'gray')
    folium.Marker(
        location=[house.lat, house.long],
        popup=f"House ID: {house.id}<br>Price: {house.price:,.0f}<br>Cluster: {cluster_id}",
        icon=folium.Icon(color=color, icon='info-sign')
    ).add_to(cluster)

# Marks to show the center of the clusters within a 10km range
center_radius_m = 10000
for i, center in enumerate(kmeans.cluster_centers_):
    color = color_map.get(i, 'gray')
    folium.Circle(
        location=[center[0], center[1]],
        radius=center_radius_m,
        color="black",
        weight=1,
        fill_opacity=0.2,
        opacity=1,
        fill_color=color,
        fill=False,  # gets overridden by fill_color
        popup="{} meters".format(center_radius_m),
        tooltip=f"Cluster: {i}<br>Center: {center[0]}, {center[1]}",
    ).add_to(m)

m


### Generating a heatmap to show density of house prices


In [None]:
# Setting up the map, centred on the mean coordinate of all houses
map_center = [house_coor_df['lat'].mean(), house_coor_df['long'].mean()]
m = folium.Map(location=map_center, zoom_start=9)

# Cluster up points that are close to each other; reduce lag
cluster = MarkerCluster().add_to(m)
color_map = {0: 'red', 1: 'darkgreen', 2: 'blue', 3: 'purple'}

# Plotting each house.
# itertuples keeps each column's own dtype; iterrows would upcast the whole
# row to float and show ids/cluster labels as e.g. "7129300520.0" / "0.0".
for house in house_coor_df.itertuples(index=False):
    cluster_id = int(house.cluster)
    color = color_map.get(cluster_id, 'gray')
    folium.Marker(
        location=[house.lat, house.long],
        popup=f"House ID: {house.id}<br>Price: {house.price:,.0f}<br>Cluster: {cluster_id}",
        icon=folium.Icon(color=color, icon='info-sign')
    ).add_to(cluster)

# Heat map
# Weights are scaled to [0, 1]: the heat layer treats 1.0 as full intensity,
# so raw prices (hundreds of thousands and up) would saturate every point.
heat_points = house_df[['lat', 'long']].copy()
heat_points['weight'] = house_df['price'] / house_df['price'].max()
HeatMap(heat_points.values.tolist()).add_to(m)

# Display the map (add_to returns the layer, not the map)
m


In [None]:
# Year in which the house was sold.
house_df['year'] = house_df['date'].dt.year

# Vectorized replacement for a row-wise apply: the reference year is the
# renovation year when the house was renovated (yr_renovated != 0) and the
# sale year otherwise; age is that reference year minus the build year.
# NOTE(review): for renovated houses this measures build-to-renovation time,
# not the age at sale -- confirm this is the intended feature.
reference_year = house_df['yr_renovated'].where(house_df['yr_renovated'] != 0, house_df['year'])
house_df['age'] = reference_year - house_df['yr_built']
house_df.head()

# Section 6. Initial model training

In [None]:
# Year in which the house was sold.
house_df['year'] = house_df['date'].dt.year

# Vectorized replacement for a row-wise apply: the reference year is the
# renovation year when the house was renovated (yr_renovated != 0) and the
# sale year otherwise; age is that reference year minus the build year.
# NOTE(review): for renovated houses this measures build-to-renovation time,
# not the age at sale -- confirm this is the intended feature.
reference_year = house_df['yr_renovated'].where(house_df['yr_renovated'] != 0, house_df['year'])
house_df['age'] = reference_year - house_df['yr_built']

In [None]:
house_df.head()

### KNN

In [None]:
col_remove = ['id', 'date', 'yr_built', 'yr_renovated', 'lat', 'long', 'count', 'year_month', 'cluster', 'year']
knn_df = house_df.drop(columns=col_remove)
knn_df

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score

In [None]:
X = knn_df.drop('price', axis=1)
y = knn_df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)

In [None]:
# 5-fold cross-validation over even neighbour counts from 2 to 50.
# (The original hand-written list contained 43 where the pattern calls for 42.)
k_folds = 5
k_choices = list(range(2, 51, 2))
scores = np.zeros((len(k_choices), k_folds))

for i, k in enumerate(k_choices):
    print("k is : " + str(k))
    model = KNeighborsRegressor(n_neighbors=k)
    # One score per fold (the regressor's default scorer), stored in row i
    scores[i] = cross_val_score(model, X_train, y_train, cv=k_folds)

# Mean score across folds for every candidate k
avg_scores = np.mean(scores, axis=1)
avg_scores

In [None]:
avg_scores.max()

In [None]:
# Use the neighbour count with the best mean cross-validation score rather
# than a hard-coded value, so this cell stays in sync with the search above.
best_k = k_choices[int(np.argmax(avg_scores))]
print(f"best k: {best_k}")

model = KNeighborsRegressor(n_neighbors=best_k)
model.fit(X_train, y_train)
y_predicted = model.predict(X_test)

# Hold-out performance of the selected model
mae = mean_absolute_error(y_test, y_predicted)
mse = mean_squared_error(y_test, y_predicted)
r2 = r2_score(y_test, y_predicted)

print(f"mae: {mae}")
print(f"mse: {mse}")
print(f"r2: {r2}")

### Linear

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

In [None]:
linear_df = house_df.drop(columns=col_remove)
linear_df

In [None]:
X = linear_df.drop('price', axis=1)
y = linear_df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)

In [None]:
# Ordinary least squares baseline.
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score this model's own predictions (y_pred); `y_predicted` still holds the
# earlier KNN predictions and would report the KNN metrics again.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"mae: {mae}")
print(f"mse: {mse}")
print(f"r2: {r2}")

In [None]:
# Ridge (L2-regularised) linear regression.
ridge = Ridge(alpha=1, solver='auto')
ridge.fit(X_train, y_train)
y_pred = ridge.predict(X_test)

# Score this model's own predictions (y_pred); `y_predicted` still holds the
# earlier KNN predictions and would report the KNN metrics again.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"mae: {mae}")
print(f"mse: {mse}")
print(f"r2: {r2}")

In [None]:
# Lasso (L1-regularised) linear regression.
lasso = Lasso(alpha=0.0001, max_iter=100000)
lasso.fit(X_train, y_train)
y_pred = lasso.predict(X_test)

# Score this model's own predictions (y_pred); `y_predicted` still holds the
# earlier KNN predictions and would report the KNN metrics again.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"mae: {mae}")
print(f"mse: {mse}")
print(f"r2: {r2}")

### Neural Networks

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.init

In [None]:
class NeuralNetwork(nn.Module):
    """Fully connected feed-forward network with configurable hidden layers.

    The network is a stack of ``Linear -> activation`` pairs, one per entry in
    ``list_hidden``, followed by a final ``Linear`` layer with no activation.

    Args:
        input_size: Number of input features.
        num_classes: Number of output units (1 for single-target regression).
        list_hidden: Sizes of the hidden layers, in order. May be empty, in
            which case the network is a single linear layer.
        activation: ``'tanh'``, ``'relu'`` or ``'sigmoid'``; any other value
            falls back to sigmoid.
    """

    def __init__(self, input_size, num_classes, list_hidden, activation='sigmoid'):
        super().__init__()
        self.input_size = input_size
        self.num_classes = num_classes
        self.list_hidden = list_hidden
        self.activation = activation
        self.create_network()

    def create_network(self):
        """Build ``self.layers`` from the stored sizes and activation name."""
        # Consecutive pairs of sizes give the (in, out) shape of each hidden
        # Linear layer; with no hidden layers the loop body never runs.
        sizes = [self.input_size, *self.list_hidden]
        layers = []
        for in_features, out_features in zip(sizes, sizes[1:]):
            layers.append(nn.Linear(in_features, out_features))
            layers.append(self.get_activation(self.activation))
        # Output layer: no activation, so the raw values are returned.
        layers.append(nn.Linear(sizes[-1], self.num_classes))
        self.layers = nn.Sequential(*layers)

    def get_activation(self, mode='sigmoid'):
        """Return a new activation module for ``mode`` (sigmoid by default)."""
        if mode == 'tanh':
            return nn.Tanh()
        elif mode == 'relu':
            return nn.ReLU(inplace=True)
        return nn.Sigmoid()

    def forward(self, x):
        """Run ``x`` through every layer in order and return the output."""
        return self.layers(x)

In [None]:
nn_df = house_df.drop(columns=col_remove)
nn_df

In [None]:
X = nn_df.drop('price', axis=1)
y = nn_df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)

In [None]:
# One hidden layer of 64 ReLU units and a single regression output.
input_size = X_train.shape[1]
hidden_layers = [64]
output_size = 1
network = NeuralNetwork(input_size, output_size, hidden_layers, activation='relu')

criterion = nn.MSELoss()
optimizer = optim.Adam(network.parameters(), lr=0.001)

max_epochs = 5
losses = []

# Convert the training data to tensors once instead of once per sample per
# epoch. Targets get a trailing dimension so each one has shape (1,), matching
# the network output for a single sample.
train_features = torch.tensor(X_train.values, dtype=torch.float32)
train_targets = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)

network.train()
for epoch in range(max_epochs):
    current_epoch_loss = 0.0
    # One optimizer step per sample. Dedicated loop names are used so the
    # notebook-level X / y (features and target) are not overwritten.
    for sample_features, sample_target in zip(train_features, train_targets):
        optimizer.zero_grad()
        outputs = network(sample_features)
        loss = criterion(outputs, sample_target)
        loss.backward()
        optimizer.step()

        current_epoch_loss += loss.item()

    # Mean per-sample loss for this epoch
    average_loss = current_epoch_loss / len(X_train)
    losses.append(average_loss)
    print(f'Epoch: {epoch + 1}, Loss: {average_loss:.6f}')

# Section 7. Error analysis

## Getting the average longitude and latitude from zip codes
Zip codes by themselves do not express the location of a house well. Latitude and longitude would be better, but the differences between coordinates are so small that the models might not treat them as significant even when they are.

The model improved a lot in terms of performance when we removed 'zipcode', 'lat' and 'long' so we know those columns are the problem.

We opt to treat each zip code as a cluster and use an average latitude and longitude per zip code to give the models a numeric idea of the distance between houses.

### Extracting latitude and longitude out of the zip codes

In [None]:
# One-column frame holding every distinct zip code found in the data
unique_zipcodes = house_df['zipcode'].unique()
zipcode_df = pd.DataFrame({'zipcode': unique_zipcodes})
zipcode_df

In [None]:
# # TAKES A WHILE TO RUN!! (a minute on my local machine)
# geolocator = Nominatim(user_agent="geotest")
# geocode = RateLimiter(geolocator.geocode,
#                       min_delay_seconds=1)

# zipcode_df['location'] = zipcode_df['zipcode'].apply(geocode)
# zipcode_df['lat'] = zipcode_df['location'].apply(lambda loc: loc.point.latitude if loc else None)
# zipcode_df['long'] = zipcode_df['location'].apply(lambda loc: loc.point.longitude if loc else None)

# # saving it so that this doesn't have to be run again
# zipcode_df.to_csv("/content/drive/MyDrive/STINTSY_mco/zipcode_df.csv")
# zipcode_df

In [None]:
# Load the per-zipcode coordinates that the (commented-out) geocoding cell
# saved earlier, and index them by zip code for lookup.
# NOTE(review): this is an absolute Google Colab Drive path while the drive
# mount cell is commented out and house_prices.csv is read from the working
# directory -- presumably this only works on Colab with Drive mounted; confirm.
zipcode_df = pd.read_csv("/content/drive/MyDrive/STINTSY_mco/zipcode_df.csv")
zipcode_df = zipcode_df.set_index('zipcode')

In [None]:
# Build zipcode -> coordinate lookups (same nested-dict shape as before) and
# attach the zip-level latitude / longitude to every house.
zip_lat_dict = {'lat': zipcode_df['lat'].to_dict()}
zip_long_dict = {'long': zipcode_df['long'].to_dict()}

lat_by_zip = zip_lat_dict['lat']
long_by_zip = zip_long_dict['long']
house_df['zip_lat'] = house_df['zipcode'].map(lat_by_zip)
house_df['zip_long'] = house_df['zipcode'].map(long_by_zip)
house_df

# Section 8. Improving model performance

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
norm_col = house_df.columns.drop(['id', 'date', 'price', 'yr_built', 'yr_renovated', 'lat', 'long', 'year'])
house_df[norm_col] = scaler.fit_transform(house_df[norm_col])

In [None]:
house_df.head()

### KNN

In [None]:
col_remove = ['id', 'date', 'yr_built', 'yr_renovated', 'lat', 'long', 'year']
knn_df = house_df.drop(columns=col_remove)
knn_df

In [None]:
X = knn_df.drop('price', axis=1)
y = knn_df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)

In [None]:
# 5-fold cross-validation over even neighbour counts from 2 to 50.
# (The original hand-written list contained 43 where the pattern calls for 42.)
k_folds = 5
k_choices = list(range(2, 51, 2))
scores = np.zeros((len(k_choices), k_folds))

for i, k in enumerate(k_choices):
    print("k is : " + str(k))
    model = KNeighborsRegressor(n_neighbors=k)
    # One score per fold (the regressor's default scorer), stored in row i
    scores[i] = cross_val_score(model, X_train, y_train, cv=k_folds)

# Mean score across folds for every candidate k
avg_scores = np.mean(scores, axis=1)
avg_scores

In [None]:
avg_scores.max()

In [None]:
# Use the neighbour count with the best mean cross-validation score rather
# than a hard-coded value, so this cell stays in sync with the search above.
best_k = k_choices[int(np.argmax(avg_scores))]
print(f"best k: {best_k}")

model = KNeighborsRegressor(n_neighbors=best_k)
model.fit(X_train, y_train)
y_predicted = model.predict(X_test)

# Hold-out performance of the selected model
mae = mean_absolute_error(y_test, y_predicted)
mse = mean_squared_error(y_test, y_predicted)
r2 = r2_score(y_test, y_predicted)

print(f"mae: {mae}")
print(f"mse: {mse}")
print(f"r2: {r2}")

In [None]:
results_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_predicted})
print(results_df)

### Linear


In [None]:
linear_df = house_df.drop(columns=col_remove)
linear_df

In [None]:
X = linear_df.drop('price', axis=1)
y = linear_df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)

In [None]:
from sklearn.linear_model import SGDRegressor

# Linear model fitted with stochastic gradient descent.
# random_state is fixed (same seed as the train/test split) so that repeated
# runs of this cell report the same metrics.
model = SGDRegressor(max_iter=1000, tol=1e-3, random_state=5)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

In [None]:
# Ordinary least squares on the scaled features.
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score this model's own predictions (y_pred); `y_predicted` still holds the
# earlier KNN predictions and would report the KNN metrics again.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"mae: {mae}")
print(f"mse: {mse}")
print(f"r2: {r2}")

In [None]:
# Ridge (L2-regularised) linear regression on the scaled features.
ridge = Ridge(alpha=1, solver='auto')
ridge.fit(X_train, y_train)
y_pred = ridge.predict(X_test)

# Score this model's own predictions (y_pred); `y_predicted` still holds the
# earlier KNN predictions and would report the KNN metrics again.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"mae: {mae}")
print(f"mse: {mse}")
print(f"r2: {r2}")

In [None]:
# Lasso (L1-regularised) linear regression on the scaled features.
lasso = Lasso(alpha=0.0001, max_iter=100000)
lasso.fit(X_train, y_train)
y_pred = lasso.predict(X_test)

# Score this model's own predictions (y_pred); `y_predicted` still holds the
# earlier KNN predictions and would report the KNN metrics again.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"mae: {mae}")
print(f"mse: {mse}")
print(f"r2: {r2}")

### Neural Networks

In [None]:
nn_df = house_df.drop(columns=col_remove)
nn_df

In [None]:
X = nn_df.drop('price', axis=1)
y = nn_df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)

In [None]:
# One hidden layer of 64 ReLU units and a single regression output.
input_size = X_train.shape[1]
hidden_layers = [64]
output_size = 1
network = NeuralNetwork(input_size, output_size, hidden_layers, activation='relu')

criterion = nn.MSELoss()
optimizer = optim.Adam(network.parameters(), lr=0.001)

max_epochs = 300
losses = []

# Convert the training data to tensors once instead of once per sample per
# epoch. Targets get a trailing dimension so each one has shape (1,), matching
# the network output for a single sample.
train_features = torch.tensor(X_train.values, dtype=torch.float32)
train_targets = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)

network.train()
for epoch in range(max_epochs):
    current_epoch_loss = 0.0
    # One optimizer step per sample. Dedicated loop names are used so the
    # notebook-level X / y (features and target) are not overwritten.
    for sample_features, sample_target in zip(train_features, train_targets):
        optimizer.zero_grad()
        outputs = network(sample_features)
        loss = criterion(outputs, sample_target)
        loss.backward()
        optimizer.step()

        current_epoch_loss += loss.item()

    # Mean per-sample loss for this epoch
    average_loss = current_epoch_loss / len(X_train)
    losses.append(average_loss)
    print(f'Epoch: {epoch + 1}, Loss: {average_loss:.6f}')

In [None]:
# Evaluate the trained network on the held-out split without tracking gradients.
network.eval()
test_features = torch.Tensor(X_test.values).float()
with torch.no_grad():
    test_outputs = network(test_features)
y_pred = test_outputs.numpy()

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

for metric_line in (
    f"Mean Absolute Error: {mae}",
    f"Mean Squared Error: {mse}",
    f"R-squared: {r2}",
):
    print(metric_line)

# Section 9. Model performance summary

# Section 10. Insights and conclusions

# Section 11. References