In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, root_mean_squared_error

In [None]:
# Load the raw housing dataset; the path is relative to the notebook's working directory.
house = pd.read_csv("../data/raw/data_house.csv")

In [None]:
# Summarise the loaded frame: column names, dtypes and non-null counts.
house.info()

In [None]:
# Encode each ZIP code as a relative price level (approach suggested by Ignacio).
# NOTE(review): the averages below use the price of every row in `house`, i.e. they
# are computed before the train/test split further down, so this feature carries
# target information from rows that later end up in the test set.

# 1) Average sale price of every ZIP code.
mean_price_per_zip = house['price'].groupby(house['zipcode']).mean()

# 2) The cheapest ZIP code is the baseline: its ratio will be exactly 1.
min_price = mean_price_per_zip.min()

# 3) ZIP code -> (mean price / baseline mean price) lookup table.
zip_ratio_map = mean_price_per_zip.div(min_price).to_dict()

# 4) Overwrite the raw ZIP codes with their price ratio.
house['zipcode'] = house['zipcode'].map(zip_ratio_map)

# Show the encoded column in ascending order as a sanity check.
house['zipcode'].sort_values()

In [None]:
# Remove the columns that are not used from here on.
house = house.drop(
    columns=["Unnamed: 0", "id", "date", "yr_renovated", "long", "lat", "yr_built"]
)

In [None]:
# Preview the first rows of the remaining columns.
house.head()

In [None]:
# Absolute Pearson correlation between every pair of columns. The matrix is
# symmetric (corr(x, y) == corr(y, x)) with ones on the diagonal, so only the
# lower triangle carries information.
corr = house.corr(method='pearson').abs()

# Boolean mask that hides the upper triangle and the diagonal.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Draw on an explicit Axes so the heatmap is tied to the figure sized here.
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(
    corr,
    mask=mask,
    vmax=1,
    square=True,
    linewidths=.5,
    cmap="copper",
    annot=True,  # write the correlation value inside each visible cell
    ax=ax,
)
ax.set_title(" Linear Correlation ")
plt.show()

In [None]:
# Absolute Spearman (rank) correlation between every pair of columns. The matrix
# is symmetric (corr(x, y) == corr(y, x)) with ones on the diagonal, so only the
# lower triangle carries information.
corr = house.corr(method="spearman").abs()

# Boolean mask that hides the upper triangle and the diagonal.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Draw on an explicit Axes so the heatmap is tied to the figure sized here.
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(
    corr,
    mask=mask,
    vmax=1,
    square=True,
    linewidths=.5,
    cmap="copper",
    annot=True,  # write the correlation value inside each visible cell
    ax=ax,
)
ax.set_title(" Spearman Correlation (Non-linear)")
plt.show()

In [None]:
# Drop the predictors that are not kept for modelling
# (presumably chosen from the correlation heatmaps above; confirm).
house = house.drop(
    columns=[
        "sqft_lot",
        "waterfront",
        "view",
        "condition",
        "sqft_above",
        "sqft_basement",
        "sqft_living15",
        "sqft_lot15",
    ]
)

In [None]:
# Tukey fences on the target: only rows whose price lies inside
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] are kept.
Q1, Q3 = house["price"].quantile(0.25), house["price"].quantile(0.75)
IQR = Q3 - Q1

# between() includes both bounds, matching a >= / <= pair.
mask = house["price"].between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)
df = house[mask]

In [None]:
# The target is the sale price; the predictors are every other column.
target = df["price"]
features = df.drop("price", axis=1)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.20,
    random_state=0,
)

In [None]:
# Fit the min-max scaler on the training rows only.
normalizer = MinMaxScaler().fit(X_train)
normalizer

In [None]:
# Rescale both splits with the scaler that was fitted on the training set.
X_train_norm, X_test_norm = (
    normalizer.transform(split) for split in (X_train, X_test)
)

In [None]:
# Rebuild a DataFrame from the scaled values, restoring X_train's column labels and row index.
X_train_norm = pd.DataFrame(
    data=X_train_norm,
    index=X_train.index,
    columns=X_train.columns,
)

In [None]:
# Rebuild a DataFrame from the scaled values, restoring X_test's column labels and row index.
X_test_norm = pd.DataFrame(
    data=X_test_norm,
    index=X_test.index,
    columns=X_test.columns,
)

In [None]:
# Decision tree regressor limited to 10 levels. The seed pins the tie-breaking
# between equally good splits, so the scores below are the same on every run.
tree = DecisionTreeRegressor(max_depth=10, random_state=0)
tree.fit(X_train_norm, y_train)
y_pred_test_dt = tree.predict(X_test_norm)

# scikit-learn metrics take (y_true, y_pred), in that order.
print(f"MAE, {mean_absolute_error(y_test, y_pred_test_dt): .2f}")
print(f"MSE, {mean_squared_error(y_test, y_pred_test_dt): .2f}")
print(f"RMSE, {root_mean_squared_error(y_test, y_pred_test_dt): .2f}")
print(f"R2 score, {tree.score(X_test_norm, y_test): .2f}")

In [None]:
# Pair each feature name with the importance the fitted tree assigned to it.
tree_importance = dict(zip(X_train_norm.columns, tree.feature_importances_))
tree_importance

In [None]:
from sklearn.tree import export_text

# Plain-text rendering of the fitted tree's decision rules, labelled with the feature names.
tree_viz = export_text(
    tree,
    feature_names=X_train_norm.columns.tolist(),
)
print(tree_viz)

In [None]:
from sklearn.tree import DecisionTreeRegressor, export_graphviz
import graphviz

# A deliberately shallow tree, fitted only so the diagram stays readable. It has
# its own name so `tree` keeps pointing at the depth-10 model evaluated earlier.
shallow_tree = DecisionTreeRegressor(max_depth=2, random_state=0)
shallow_tree.fit(X_train_norm, y_train)

# With out_file=None, export_graphviz returns the DOT source as a string, so no
# intermediate file has to be written and read back.
dot_graph = export_graphviz(
    shallow_tree,
    out_file=None,
    filled=True,
    rounded=True,
    feature_names=list(X_train_norm.columns),
)

# Last expression of the cell: rendered as the tree diagram.
graphviz.Source(dot_graph)

In [None]:
#Linear regression
import pickle  # only needed by the optional persistence snippet below

lin_reg = LinearRegression()
lin_reg.fit(X_train_norm, y_train)  # estimates the intercept (b0) and one coefficient per feature (b1, b2, ...)

# Optional: persist the fitted model and its scaler for reuse outside the notebook.
#with open("linear_model.pkl", "wb") as file:
#    pickle.dump(lin_reg, file)

#with open("min_max_scaler.pkl", "wb") as file:
#    pickle.dump(normalizer, file)

y_pred_test = lin_reg.predict(X_test_norm)

# scikit-learn metrics take (y_true, y_pred), in that order.
print(f"MAE {mean_absolute_error(y_test, y_pred_test): .2f}")  # mean(abs(y_test - y_pred_test))
print(f"MSE {mean_squared_error(y_test, y_pred_test): .2f}")  # mean((y_test - y_pred_test)**2)
print(f"RMSE, {root_mean_squared_error(y_test, y_pred_test): .2f}")  # sqrt(mean((y_test - y_pred_test)**2))
print(f"R2 score, {lin_reg.score(X_test_norm, y_test): .2f}")  # same value as r2_score(y_test, y_pred_test)

In [None]:
# Pair each feature name with the coefficient the linear model learned for it.
lin_reg_coef = dict(zip(X_train_norm.columns, lin_reg.coef_))
lin_reg_coef

In [None]:
# Intercept (b0) of the fitted linear model.
lin_reg.intercept_

In [None]:
#KNN
from sklearn.neighbors import KNeighborsRegressor

# k-nearest-neighbours regressor using the 10 closest training rows in the
# scaled feature space.
knn = KNeighborsRegressor(n_neighbors=10)
knn.fit(X_train_norm, y_train)
print(f"The R2 of the model is {knn.score(X_test_norm, y_test): .2f}")

# Keep the predictions and show them next to the true prices so the two can be
# compared row by row.
y_pred_test_knn = knn.predict(X_test_norm)
pd.DataFrame(
    {"actual": y_test.values, "predicted": y_pred_test_knn},
    index=y_test.index,
).head(10)