In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats

from utils.data_cleaning import load_and_clean
from utils import save_figure
from utils._config import COLORS

## Load in the Data

In [None]:
# Baseline frame: loaded with both column dropping and row dropping switched off
# (presumably the unfiltered data -- confirm against utils.data_cleaning).
df_og = load_and_clean(
    drop_rows=False,
    drop_columns=False,
)

# Frame loaded with the loader's default cleaning settings and verbose output enabled.
df_clean = load_and_clean(
    verbose=True,
)

---

## Comparison Between Original and Cleaned Data

Comparison between data types of the two datasets

In [None]:
# Count how many columns of each dtype the original and the cleaned frame contain
o_counts = df_og.dtypes.value_counts()
c_counts = df_clean.dtypes.value_counts()

# Side-by-side table over the union of dtypes; a dtype missing from one frame gets a count of 0
all_dtypes = pd.concat([o_counts, c_counts], axis=1).fillna(0).set_axis(['Original', 'Cleaned'], axis=1)

# Create color mappings for the dtypes so the same dtype has the same color in both pie charts.
# The palette is indexed modulo its length: zipping against COLORS[:n] silently drops dtypes
# when there are more dtypes than colors, which later fails with a KeyError on color_map[dtype].
color_map = {dtype: COLORS[i % len(COLORS)] for i, dtype in enumerate(all_dtypes.index)}

def count_and_pct(pct, data: pd.DataFrame):
    """Build a pie-wedge label showing both the column count and the percentage.

    Args:
        pct: Share of the wedge in percent (0-100), as passed by matplotlib's ``autopct``.
        data: Frame whose number of columns is the total that ``pct`` refers to.

    Returns:
        A string of the form ``"<count> (<pct>%)"`` where the count is the rounded
        number of columns corresponding to ``pct`` and the percentage has one decimal.
    """
    n_columns = data.shape[1]
    wedge_count = round(n_columns * pct / 100)
    return "{} ({:.1f}%)".format(wedge_count, pct)

def draw_dtype_pie(ax, counts, frame, title):
    """Draw one dtype pie chart on `ax`, labelling each wedge with count and percentage."""
    ax.pie(
        counts,
        labels=counts.index.astype(str),
        autopct=lambda pct: count_and_pct(pct, frame),
        colors=[color_map[dtype] for dtype in counts.index],
        startangle=90,
    )
    ax.set_title(title)


# Plot the pie charts next to each other: original dataset on the left, cleaned on the right
fig, (ax_og, ax_clean) = plt.subplots(1, 2, figsize=(10, 6))
draw_dtype_pie(ax_og, o_counts, df_og, f'Original Data-types ({len(df_og.columns)})')
draw_dtype_pie(ax_clean, c_counts, df_clean, f'Cleaned Data-types ({len(df_clean.columns)})')

fig.tight_layout()
save_figure(plt, 'data_types_pie_chart', subfolder='eda')
plt.show();

Comparison between the distributions of the prices of the listings in the cleaned dataset and the original dataset.

In [None]:
fig, (ax_og, ax_clean) = plt.subplots(1, 2, figsize=(12, 5))

# One entry per panel: (axes, prices, binning rule, color, title)
hist_panels = [
    (ax_og, df_og['price'], 'auto', COLORS[0], "Original Price Distribution"),
    (ax_clean, df_clean['price'], 100, COLORS[1], "Cleaned Price Distribution"),
]

for ax, prices, bins, color, title in hist_panels:
    sns.histplot(prices, bins=bins, kde=True, color=color, alpha=0.6, ax=ax)
    ax.set_title(title)
    ax.set_xlabel("Price ($)")
    # Both panels are clipped to the same price window so they can be compared directly
    ax.set_xlim(0, 1000)

fig.tight_layout()
save_figure(plt, 'price_distribution_histogram', subfolder='eda')
plt.show();

Outlier Detection before and after cleaning, through boxplot visualization

In [None]:
fig, (ax_og, ax_clean) = plt.subplots(1, 2, figsize=(10, 5))

# One entry per panel: (axes, prices, color, title)
box_panels = [
    (ax_og, df_og['price'], COLORS[0], 'Original Prices'),
    (ax_clean, df_clean['price'], COLORS[1], 'Cleaned Prices'),
]

for ax, prices, color, title in box_panels:
    sns.boxplot(y=prices, color=color, ax=ax)
    ax.set_title(title)
    ax.set_ylabel('Price ($)')

fig.tight_layout()
save_figure(plt, 'boxplot_price', subfolder='eda')
plt.show();

---

## Visualizations of the Cleaned Data

Create a feature correlation heatmap to visualize the relationships between the features.

In [None]:
# Pairwise correlations between all numerical columns, computed once and reused below
num_feats = df_clean.select_dtypes(include=['number'])
corr_matrix = num_feats.corr()

# Extract numerical features with the highest correlations to price.
# Undefined (NaN) correlations, e.g. for constant columns, are dropped so that they can
# never be selected as a "top" feature when fewer than n_feats valid ones exist.
corr = corr_matrix['price'].drop('price').dropna()
abs_corr = corr.abs().sort_values(ascending=False)
n_feats = 10 # Number of features to plot
top_corr_features = abs_corr.head(n_feats).index.tolist()

# Slice the precomputed matrix instead of recomputing correlations on a temporary frame;
# each entry depends only on its own pair of columns, so the values are the same.
heatmap_cols = top_corr_features + ['price']
top_corr_matrix = corr_matrix.loc[heatmap_cols, heatmap_cols]

# Plot the heatmap
plt.figure(figsize=(12,8))
sns.heatmap(top_corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
# Report the number of features actually shown (smaller than n_feats if fewer are available)
plt.title(f"Top {len(top_corr_features)} Features with Highest Absolute Correlation to Price")

plt.tight_layout()
save_figure(plt, 'top_correlation_heatmap', subfolder='eda')
plt.show();

Plot a pair plot (scatter-plot matrix with KDE diagonals) of the price and the numerical features with the highest correlation to the price

In [None]:
# Price together with the selected numerical features to compare pairwise
data = df_clean[['price', 'accommodates', 'bedrooms', 'beds', 'bathrooms']]

# sns.pairplot is a figure-level function: it always creates its own figure (a PairGrid).
# A preceding plt.figure(figsize=...) call therefore only left a stray empty figure in the
# output and did not size the plot; use pairplot's `height`/`aspect` arguments to resize.
sns.pairplot(data, diag_kind='kde', corner=True)

# The pair plot's figure is the current figure here, so the pyplot calls below act on it
plt.tight_layout()
save_figure(plt, 'pairplot', subfolder='eda')
plt.show();

---

## Check for Normality

Use multiple tests to numerically test if the price could be normally distributed.

In [None]:
ALPHA = 0.05  # Significance level shared by all tests below

# Test only observed prices: missing values can make the SciPy tests return NaN, and a NaN
# p-value or statistic fails every comparison, which would print "Fail to reject Normality".
price = df_clean['price'].dropna()


def normality_verdict(reject: bool) -> str:
    """Return the verdict text for a test outcome; `reject` is True when normality is rejected."""
    return "Reject Normality" if reject else "Fail to reject Normality"


# Shapiro-Wilk Test
# NOTE: SciPy documents that for N > 5000 the W statistic is accurate but the p-value may not be.
stat, p_value = stats.shapiro(price)
print("Shapiro-Wilk Test:")
print("Statistic:", stat)
print("p-value:", p_value)
print("Verdict: ", normality_verdict(p_value < ALPHA))

# Anderson-Darling Test
result = stats.anderson(price, dist='norm')
# Look up the critical value that belongs to ALPHA (significance levels are given in percent)
# instead of relying on its fixed position in the result arrays.
sig_levels = list(result.significance_level)
alpha_idx = min(range(len(sig_levels)), key=lambda i: abs(sig_levels[i] - 100 * ALPHA))
print("\nAnderson-Darling Test:")
print("Statistic:", result.statistic)
print("Critical Values:", result.critical_values)
print("Significance Levels:", result.significance_level)
print("Verdict: ", normality_verdict(result.statistic > result.critical_values[alpha_idx]))

# D'Agostino and Pearson's Test
stat, p_value = stats.normaltest(price)
print("\nD'Agostino and Pearson's Test:")
print("Statistic:", stat)
print("p-value:", p_value)
print("Verdict: ", normality_verdict(p_value < ALPHA))

# Kolmogorov-Smirnov Test
# NOTE(review): mean and std are estimated from the same sample that is tested, which makes
# the standard KS p-value conservative; a Lilliefors-corrected test would be the strict choice.
stat, p_value = stats.kstest(price, 'norm', args=(price.mean(), price.std()))
print("\nKolmogorov-Smirnov Test:")
print("Statistic:", stat)
print("p-value:", p_value)
print("Verdict: ", normality_verdict(p_value < ALPHA))

Plot the Q-Q plot to visually check for normality.

In [None]:
# Ordered price values against theoretical normal quantiles, drawn on a square canvas
fig, ax = plt.subplots(figsize=(6, 6))
stats.probplot(df_clean['price'], dist='norm', plot=ax)
ax.set_title('Normal Q-Q Plot for Price')

fig.tight_layout()
save_figure(plt, 'normal_qq_plot_price', subfolder='eda')
plt.show();