## 1. Import Libraries

Import all necessary libraries for data analysis, visualization, and modeling.

In [None]:
# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Seed NumPy's global random number generator so that anything drawing from
# np.random produces the same values on every run of the notebook.
np.random.seed(42)

# Apply seaborn's 'whitegrid' style to plots created after this call.
sns.set_style('whitegrid')
# IPython magic (not Python syntax): render matplotlib figures inline in the
# notebook output. This line only works inside an IPython/Jupyter session.
%matplotlib inline

## 2. Load Data

Load the housing price dataset from the raw data folder.

In [None]:
# Read the raw housing CSV into a DataFrame (path is relative to the notebook)
df = pd.read_csv(
    filepath_or_buffer='../data/raw/dataset3.csv',
)

# Report the DataFrame's shape, then preview its first five rows
print(f"Dataset shape: {df.shape}")
df.head(5)

## 3. Exploratory Data Analysis (EDA)

### 3.1 Basic Information

Examine the structure and basic statistics of the dataset.

In [None]:
# Print a structural summary of the DataFrame.
# df.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
print("Dataset Info:")
df.info()

# Print summary statistics from describe() with its default settings.
print("\nBasic Statistics:")
print(df.describe())

### 3.2 Missing Values

Check for missing values in the dataset.

In [None]:
# TODO: Check for missing values and visualize them
# Hint: Use df.isnull().sum() and create a bar plot for columns with missing values

### 3.3 Target Variable Distribution

Analyze the distribution of the target variable (sale price).

In [None]:
# TODO: Analyze the sale_price distribution
# - Plot histogram
# - Check for skewness
# - Consider log transformation if needed
# - Display basic statistics

### 3.4 Feature Exploration

Create visualizations to explore relationships between features and sale price.

In [None]:
# TODO: Create exploratory visualizations
# Ideas:
# - Correlation heatmap for numerical features
# - Scatter plots: living_area_sqft vs price, overall_quality vs price
# - Box plots: price by neighborhood, property_type
# - Price distribution by categorical features
# - Feature relationships (e.g., bedrooms vs bathrooms)

# Example: Price by neighborhood
# plt.figure(figsize=(14, 6))
# df.groupby('neighborhood')['sale_price'].median().sort_values().plot(kind='bar')
# plt.title('Median Sale Price by Neighborhood')
# plt.ylabel('Sale Price ($)')
# plt.xticks(rotation=45)
# plt.show()

## 4. Data Cleaning and Preprocessing

### 4.1 Handle Missing Values

Decide how to handle missing values for this housing dataset.

In [None]:
# TODO: Handle missing values
# Consider:
# - garage_size_cars: missing might mean no garage (fill with 0)
# - basement_sqft: missing might mean no basement (fill with 0)
# - year_built: impute with median or mode
# - Think about what makes sense for each feature

### 4.2 Handle Duplicates

Check for and remove duplicate property entries.

In [None]:
# TODO: Check for duplicates and remove if necessary

### 4.3 Handle Categorical Variables

Standardize and encode categorical variables. Watch out for inconsistencies!

In [None]:
# TODO: Standardize categorical values (e.g., 'Yes' vs 'yes')
# TODO: Encode categorical variables
# Consider:
# - One-hot encoding for nominal features (neighborhood, property_type, etc.)
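# - Ordinal encoding for ordered features (overall_quality, kitchen_quality):
#   use an explicit ordered mapping or OrdinalEncoder(categories=[...]);
#   LabelEncoder is meant for target labels and assigns codes in sorted order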
# - Binary encoding for yes/no features

### 4.4 Handle Outliers

Identify and handle outliers in price and other numerical features.

In [None]:
# TODO: Identify outliers
# - Use box plots for visualization
# - Consider IQR method or z-scores
# - Decide whether to remove extreme outliers or cap them
# - Be careful: high prices might be legitimate luxury homes

### 4.5 Feature Engineering (Optional)

Consider creating new features that might improve predictions.

In [None]:
# TODO: Create new features if helpful
# Ideas:
# - Total bathrooms (including half baths)
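# - Price per square foot (for EDA only: it is computed from sale_price,
#   so using it as a model input would leak the target)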
# - Age bins (new, modern, old, vintage)
# - Total amenities count
# - Quality × Condition interaction

### 4.6 Handle Skewness

Consider transforming skewed features (especially the target variable if needed).

In [None]:
# TODO: Check skewness of numerical features
# Consider log transformation for highly skewed features
# Note: if you transform the target, remember to inverse transform predictions

### 4.7 Feature Scaling

Scale numerical features for better model performance.

In [None]:
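# TODO: Scale numerical features
# StandardScaler or RobustScaler (better for outliers)
# Note: fit the scaler on the training split only (see Section 5), then use it
# to transform both the training and test features, to avoid data leakage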

## 5. Train/Validation Split

Split the data into a training set and a held-out set for evaluation (called the test set, `X_test`/`y_test`, in the code below).

In [None]:
# TODO: Prepare features (X) and target (y)
# Remove non-predictive columns (e.g., property_id)

# TODO: Split into train and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## 6. Model Training

Train regression model(s). Consider:
- Linear Regression (baseline)
- Ridge/Lasso Regression (with regularization)
- Random Forest Regressor
- Gradient Boosting Regressor (XGBoost, LightGBM)

In [None]:
# TODO: Import and train your chosen model(s)

# Example:
# from sklearn.ensemble import RandomForestRegressor
# model = RandomForestRegressor(n_estimators=100, random_state=42)
# model.fit(X_train, y_train)

## 7. Model Evaluation

Evaluate your regression model using appropriate metrics:
- RMSE (Root Mean Squared Error)
- MAE (Mean Absolute Error)
- R² Score
- MAPE (Mean Absolute Percentage Error) - optional

In [None]:
# TODO: Make predictions on the test set
# y_pred = model.predict(X_test)
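# If you transformed y (e.g., with a log), inverse-transform y_pred here (and
# y_test, if it holds transformed values) so the metrics below are in dollars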

In [None]:
# TODO: Calculate and display evaluation metrics
# rmse = np.sqrt(mean_squared_error(y_test, y_pred))
# mae = mean_absolute_error(y_test, y_pred)
# r2 = r2_score(y_test, y_pred)

# print(f"RMSE: ${rmse:,.2f}")
# print(f"MAE: ${mae:,.2f}")
# print(f"R² Score: {r2:.4f}")

## 8. Visualizations

Create visualizations to communicate your findings and model performance.

### 8.1 EDA Visualization

Create a visualization that highlights important patterns in the housing data.

In [None]:
# TODO: Create an EDA visualization
# Save it to ../figures/dataset3/eda_plot.png

# Ideas:
# - Correlation heatmap of top features
# - Price by neighborhood and property type
# - Scatter plot matrix of key features
# - Price trends by property age or quality

# Example template:
# fig, ax = plt.subplots(figsize=(14, 8))
# # Your visualization code here
# plt.tight_layout()
# plt.savefig('../figures/dataset3/eda_plot.png', dpi=300, bbox_inches='tight')
# plt.show()

### 8.2 Model Performance Visualization

Create a visualization showing regression model performance.

In [None]:
# TODO: Create a model performance visualization
# Save it to ../figures/dataset3/model_performance.png

# Ideas:
# - Actual vs Predicted scatter plot with diagonal line
# - Residual plot (residuals vs predicted values)
# - Feature importance (for tree-based models)
# - Distribution of prediction errors

# Example template:
# fig, ax = plt.subplots(figsize=(10, 8))
# # Your visualization code here
# plt.tight_layout()
# plt.savefig('../figures/dataset3/model_performance.png', dpi=300, bbox_inches='tight')
# plt.show()

## 9. Summary and Next Steps

Write a brief summary of:
- Key price drivers identified
- Model performance and prediction accuracy
- Limitations of the data and the model
- Recommendations and next steps (potential improvements)

**Your Summary Here:**

- Key price drivers: ...
- Model performance: ...
- Limitations: ...
- Next steps: ...