# House Price Prediction - EDA and Data Cleaning
## Exploratory Data Analysis and Data Preprocessing

### 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress all warnings for the remainder of the session
warnings.filterwarnings(action='ignore')

# Plot defaults: seaborn grid theme, 12x6 inch figures unless a cell overrides it
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

### 2. Load Dataset

In [None]:
# Read the raw CSV into `df` and report its (rows, columns) shape
df = pd.read_csv('house_data.csv')
print("Dataset loaded successfully!", f"Shape: {df.shape}", sep="\n")

### 3. Initial Data Exploration

In [None]:
# Display first few rows
# head() is called without an argument (pandas' default is 5 rows, matching the
# message) and is left as the cell's last expression so the notebook renders it.
print("First 5 rows of the dataset:")
df.head()

In [None]:
# Dataset information
# DataFrame.info() writes its own summary to stdout, so no print() wrapper is needed.
print("Dataset Information:")
df.info()

In [None]:
# Statistical summary
# describe() is the cell's last expression so the notebook renders the resulting
# table; called without arguments it uses pandas' default column selection.
print("Statistical Summary:")
df.describe()

In [None]:
# Check data types
# Prints one dtype per column of df.
print("Data Types:")
print(df.dtypes)

### 4. Check for Missing Values

In [None]:
# Tabulate, per column, how many values are missing and what share of rows that is;
# only columns with at least one missing value are printed.
print("Missing Values:")
missing_values = df.isnull().sum()
missing_percentage = (missing_values / len(df)) * 100
missing_df = pd.DataFrame(
    {
        'Missing Values': missing_values,
        'Percentage': missing_percentage,
    }
)
print(missing_df.loc[missing_df['Missing Values'] > 0])

In [None]:
# Heatmap of the null mask: one cell per (row, column) entry of df
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(df.isnull(), ax=ax, cmap='viridis', cbar=False, yticklabels=False)
ax.set_title('Missing Values Heatmap')
plt.show()

### 5. Data Distribution Analysis

In [None]:
# Target variable (Price): histogram with KDE on the left, box plot on the right
fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(14, 5))

sns.histplot(df['Price'], kde=True, bins=30, color='blue', ax=hist_ax)
hist_ax.set_title('Distribution of House Prices')
hist_ax.set_xlabel('Price')
hist_ax.set_ylabel('Frequency')

sns.boxplot(y=df['Price'], color='lightblue', ax=box_ax)
box_ax.set_title('Box Plot of House Prices')
box_ax.set_ylabel('Price')

plt.tight_layout()
plt.show()

In [None]:
# Distribution of numerical features: one histogram (with KDE) per column
numerical_cols = ['Area', 'Bedrooms', 'Bathrooms', 'Floors', 'YearBuilt']

# 2x3 grid, flattened so each column maps to a panel by position
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

for idx, col in enumerate(numerical_cols):
    sns.histplot(df[col], kde=True, ax=axes[idx], bins=20)
    axes[idx].set_title(f'Distribution of {col}')
    axes[idx].set_xlabel(col)
    axes[idx].set_ylabel('Frequency')

# The grid has more panels than there are columns to plot; hide the leftovers
# so the figure does not show an empty framed panel.
for unused_ax in axes[len(numerical_cols):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

### 6. Categorical Variables Analysis

In [None]:
# Bar chart of category frequencies, one panel per categorical column
categorical_cols = ['Location', 'Condition', 'Garage']

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for panel, col in zip(axes, categorical_cols):
    category_counts = df[col].value_counts()
    category_counts.plot(kind='bar', ax=panel, color='skyblue')
    panel.set_title(f'Distribution of {col}')
    panel.set_xlabel(col)
    panel.set_ylabel('Count')
    panel.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

### 7. Correlation Analysis

In [None]:
# Pairwise correlations between every numeric column, drawn as an annotated heatmap
numerical_features = list(df.select_dtypes(include=[np.number]).columns)
correlation_matrix = df[numerical_features].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    ax=ax,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
)
ax.set_title('Correlation Matrix of Numerical Features')
plt.tight_layout()
plt.show()

In [None]:
# Correlation of each numeric column with Price, highest value first
price_correlation = (
    df[numerical_features]
    .corr()['Price']
    .sort_values(ascending=False)
)
print("\nCorrelation with Price:", price_correlation, sep="\n")

### 8. Feature Relationships with Price

In [None]:
# Scatter plots of features vs Price: one panel per entry of numerical_cols
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.ravel()

for idx, col in enumerate(numerical_cols):
    axes[idx].scatter(df[col], df['Price'], alpha=0.5)
    axes[idx].set_xlabel(col)
    axes[idx].set_ylabel('Price')
    axes[idx].set_title(f'{col} vs Price')

# Hide any grid panels that received no plot, so the figure does not end with
# an empty framed panel when there are fewer columns than panels.
for unused_ax in axes[len(numerical_cols):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Price spread within each category, one box-plot panel per categorical column
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

for idx, col in enumerate(categorical_cols):
    panel = axes[idx]
    sns.boxplot(data=df, x=col, y='Price', ax=panel)
    panel.set_title(f'{col} vs Price')
    panel.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

### 9. Outlier Detection

In [None]:
# Detect outliers using IQR method
def detect_outliers_iqr(df, column):
    """Count the rows of ``df`` whose ``column`` value falls outside the IQR fences.

    The fences are Q1 - 1.5 * IQR and Q3 + 1.5 * IQR, where Q1 and Q3 are the
    25th and 75th percentiles of the column. Values exactly on a fence are not
    counted, and neither are missing values (comparisons with NaN are False).

    Args:
        df: DataFrame containing the column to inspect.
        column: Label of a numeric column in ``df``.

    Returns:
        The number of outlying rows as a plain ``int``.
    """
    values = df[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile

    too_low = values < first_quartile - 1.5 * spread
    too_high = values > third_quartile + 1.5 * spread
    return int((too_low | too_high).sum())

print("Number of outliers in each numerical column:")
for col in numerical_features:
    if col == 'Id':
        # Skip the 'Id' column (presumably a row identifier, not a measurement)
        continue
    outlier_count = detect_outliers_iqr(df, col)
    print(f"{col}: {outlier_count} outliers")

### 10. Data Cleaning

In [None]:
# Work on a copy so the raw `df` stays untouched by the cleaning steps
df_clean = df.copy()

# The Id column is not useful for prediction, so remove it when present
if 'Id' in df_clean.columns:
    df_clean = df_clean.drop(columns='Id')
    print("Dropped 'Id' column")

# Total count of missing cells across the whole frame
print(f"\nMissing values after initial cleaning: {df_clean.isnull().sum().sum()}")

In [None]:
# Handle missing values (if any)
# For numerical columns: fill with median
# For categorical columns: fill with mode
#
# Each column is re-assigned with the result of fillna() instead of calling
# df_clean[col].fillna(..., inplace=True). The in-place form mutates an
# intermediate Series (chained assignment); pandas warns about it and, with
# Copy-on-Write enabled, it leaves df_clean unchanged.

for col in df_clean.columns:
    column = df_clean[col]
    if not column.isnull().any():
        continue  # nothing to impute in this column

    if column.isnull().all():
        # No observed values: the median would be NaN and mode() would be empty,
        # so there is nothing meaningful to fill with.
        print(f"Skipped {col}: every value is missing")
        continue

    if column.dtype in ['int64', 'float64']:
        df_clean[col] = column.fillna(column.median())
        print(f"Filled missing values in {col} with median")
    else:
        # mode() returns the most frequent value(s); take the first one
        df_clean[col] = column.fillna(column.mode().iloc[0])
        print(f"Filled missing values in {col} with mode")

In [None]:
# Check final dataset
# Prints the cleaned frame's shape, then lets DataFrame.info() write its own
# per-column summary to stdout.
print("\nCleaned Dataset Shape:", df_clean.shape)
print("\nCleaned Dataset Info:")
df_clean.info()

### 11. Save Cleaned Data

In [None]:
# Save cleaned dataset
# index=False keeps the DataFrame's row index out of the written file.
# The path is relative, so the file lands in the current working directory.
df_clean.to_csv('house_data_cleaned.csv', index=False)
print("Cleaned data saved as 'house_data_cleaned.csv'")

### 12. Key Insights Summary

In [None]:
# Plain-text recap of the cleaned dataset: size, target statistics, strongest
# correlations with Price, and the cardinality of each categorical column.
print("=" * 60)
print("KEY INSIGHTS FROM EDA")
print("=" * 60)
print(f"\n1. Dataset Shape: {df_clean.shape[0]} rows, {df_clean.shape[1]} columns")
print("\n2. Target Variable (Price):")
print(f"   - Mean: ${df_clean['Price'].mean():,.2f}")
print(f"   - Median: ${df_clean['Price'].median():,.2f}")
print(f"   - Min: ${df_clean['Price'].min():,.2f}")
print(f"   - Max: ${df_clean['Price'].max():,.2f}")
print("\n3. Most Correlated Features with Price:")
# price_correlation is indexed by column name and includes 'Price' itself
# (self-correlation of 1.0) and, when present, the 'Id' column. Neither is a
# feature, so both are removed before listing the top five.
top_features = price_correlation.drop(labels=['Price', 'Id'], errors='ignore')
print(top_features.head(5))
print("\n4. Categorical Variables:")
for col in categorical_cols:
    print(f"   - {col}: {df_clean[col].nunique()} unique values")
print("\n" + "=" * 60)