# 🧠 Exploratory Data Analysis – House Prices Dataset

In [None]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Read the train and test CSV files into DataFrames, then show both shapes
train, test = pd.read_csv('../data/train.csv'), pd.read_csv('../data/test.csv')
(train.shape, test.shape)

In [None]:
# Show the first five rows of the training set
train.head(n=5)

In [None]:
# Count NaN entries per column of the training set and list the 20 columns
# with the highest counts, largest first
(
    train
    .isna()
    .sum()
    .sort_values(ascending=False)
    .head(20)
)

In [None]:
# Histogram with a KDE overlay for the target variable 'SalePrice'
sns.histplot(data=train, x='SalePrice', kde=True)
# Title and axis labels are set in a single call on the current axes
plt.gca().set(
    title='Distribution of Sale Price',
    xlabel='Sale Price',
    ylabel='Count',
)
plt.show()

## 📉 Outlier Detection

Let's plot `GrLivArea` — a numerical feature expected to correlate strongly with `SalePrice` (the correlations themselves are computed further below) — against the target to detect possible outliers.


In [None]:
# Plot above-ground living area against sale price
plt.figure(figsize=(8, 6))
sns.scatterplot(x=train['GrLivArea'], y=train['SalePrice'])
plt.gca().set(
    title='GrLivArea vs SalePrice',
    xlabel='Above Ground Living Area (sq ft)',
    ylabel='Sale Price',
)
plt.show()

# List the rows with more than 4000 sq ft of living area that sold for
# less than 300000
train.loc[train['GrLivArea'].gt(4000) & train['SalePrice'].lt(300000)]

In [None]:
# Compute pairwise correlations between the numeric columns and visualize
# each column's correlation with SalePrice, sorted from highest to lowest.
corr = train.corr(numeric_only=True)

# The heatmap has a single column and one row per numeric feature, so a tall,
# narrow figure is used to keep the annotated rows readable.
plt.figure(figsize=(4, 12))
sns.heatmap(
    corr[['SalePrice']].sort_values('SalePrice', ascending=False),
    annot=True,
    cmap='coolwarm',
    # Pin the color scale to the full correlation range so the neutral
    # midpoint of the diverging colormap corresponds to zero correlation.
    vmin=-1,
    vmax=1,
)
plt.title('Feature Correlation with SalePrice')
# Finish with plt.show() so the cell renders the figure instead of echoing
# the Text object returned by plt.title.
plt.show()

## 🧩 Interpreting Missing Values

In this dataset, many `NaN` values **are not truly missing data**; they instead encode **the absence of a feature**.

For example:
- `PoolQC` is NaN → there is no pool.
- `GarageType` is NaN → no garage.
- `Fence` is NaN → no fence.

⚠️ Therefore, **dropping or blindly imputing missing values is not appropriate**.
We will instead treat them as meaningful categorical values in further processing.


In [None]:
# Display the 10 features most strongly correlated with SalePrice, ranked by
# absolute correlation (so strong negative correlations are included too).
# SalePrice itself is dropped first: its self-correlation of 1.0 would
# otherwise take one of the ten slots and leave only nine real features.
corr['SalePrice'].drop('SalePrice').abs().sort_values(ascending=False).head(10)