In [None]:
# Importing essential libraries
import pandas as pd  # Data manipulation
import numpy as np  # Numerical calculations
import matplotlib.pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport  # Automated EDA reports
import os  # File operations

# Render Matplotlib figures inline in the notebook output (IPython line magic;
# harmless, and usually already the default on recent Jupyter versions)
%matplotlib inline

# Apply seaborn's "whitegrid" style to the plots drawn in the cells below
sns.set_style("whitegrid")




In [None]:
# Read the training data from disk (edit the relative path if the notebook
# is run from a different directory)
train_csv = "../data/train.csv"
df = pd.read_csv(train_csv)

# Report the (rows, columns) that were loaded
print("Dataset Shape:", df.shape)

# Preview the first five records as the cell's rich output
df.head(5)


In [None]:
# Count the null entries in every column
null_counts = df.isna().sum()
print("\nMissing Values per Column:")
print(null_counts)

# Column dtypes and non-null counts (info() prints its report directly)
print("\nDataset Information:")
df.info()

# Descriptive statistics, shown as the cell's rich output
print("\nSummary Statistics:")
df.describe()


In [None]:
# Show where the kernel is running, so the relative paths used in this
# notebook can be checked against it
print("\nCurrent Working Directory:")
working_dir = os.getcwd()
print(working_dir)

# Show what is inside the sibling 'data' folder
print("\nContents of 'data' Directory:")
data_entries = os.listdir("../data")
print(data_entries)



In [None]:
# Build the automated profiling report for the loaded frame
# (explorative=True requests the library's explorative configuration)
profile = ProfileReport(df, explorative=True)

# Write the report as an HTML file in the current working directory
report_file = "eda_report.html"
profile.to_file(report_file)

print("EDA report generated successfully!")



In [None]:
# Impute missing values.
# The filled column is assigned back to df instead of calling
# df["col"].fillna(..., inplace=True): that form is chained assignment, which
# raises a FutureWarning on pandas 2.x and leaves df unchanged once
# Copy-on-Write is enabled.

# Numerical column: fill with the median
df["LotFrontage"] = df["LotFrontage"].fillna(df["LotFrontage"].median())

# Categorical column: fill with the most frequent value (mode)
mode_value = df["MasVnrType"].mode()
# mode() returns an empty Series when the column has no non-null values,
# so fall back to a placeholder label in that case
masvnr_fill = mode_value.iloc[0] if not mode_value.empty else "Unknown"
df["MasVnrType"] = df["MasVnrType"].fillna(masvnr_fill)

# Verify missing values are handled (other columns may still report NaNs here)
print("\nMissing Values After Handling:")
print(df.isnull().sum())


In [None]:
# Persist the current frame; index=False keeps the row index out of the file
cleaned_csv = "../data/cleaned_train.csv"
df.to_csv(cleaned_csv, index=False)
print("Cleaned dataset saved successfully!")


In [None]:
# Log-transform the target with log1p to reduce the skew of SalePrice.
# NOTE: this overwrites SalePrice in place, so running this cell twice applies
# the logarithm twice -- re-run the load cell before re-running this one.
df['SalePrice'] = np.log1p(df['SalePrice'])

# Fill any remaining NaNs in numeric columns with that column's median
# (rows are kept, not dropped)
if df.isnull().values.any():
    df = df.fillna(df.median(numeric_only=True))

# Fill missing values for categorical columns with the most frequent value (mode).
# Assign the result back rather than using fillna(inplace=True) on df[col]:
# the chained in-place form does not update df under pandas Copy-on-Write.
for col in df.select_dtypes(include=['object']).columns:
    col_modes = df[col].mode()
    if col_modes.empty:
        # Column has no non-null values, so there is no mode to fill with
        continue
    df[col] = df[col].fillna(col_modes.iloc[0])

# Create a new feature: price per square foot.
# SalePrice now holds log1p(price), so convert it back to dollars with expm1
# before dividing; dividing the log value by the area is not a price per sq ft.
df['PricePerSqFt'] = np.expm1(df['SalePrice']) / df['GrLivArea']
# Division by zero produces +/-inf (NaN only for 0/0); map inf to NaN so the
# median fill below covers both cases
df['PricePerSqFt'] = df['PricePerSqFt'].replace([np.inf, -np.inf], np.nan)
df['PricePerSqFt'] = df['PricePerSqFt'].fillna(df['PricePerSqFt'].median())

# Save the transformed dataset

# Debugging: Check DataFrame before saving
print("Before saving, DataFrame shape:", df.shape)

# Save only if the DataFrame is NOT empty; fail loudly otherwise so an empty
# file is never written
if df.shape[0] > 0:
    df.to_csv("../data/final_cleaned_train.csv", index=False)
    print(f"Final cleaned dataset saved successfully! Shape: {df.shape}")
else:
    raise ValueError("ERROR: DataFrame is empty before saving! Check preprocessing steps.")



In [None]:
# Sanity-check the exported file: it must exist and be non-empty before it is
# read back in

file_path = "../data/final_cleaned_train.csv"

csv_is_usable = os.path.exists(file_path) and os.path.getsize(file_path) > 0

if not csv_is_usable:
    print("WARNING: CSV file is empty or missing!")
else:
    # Reload into a separate frame so the working df is left untouched
    df_check = pd.read_csv(file_path)
    print("Reloaded CSV successfully! Shape:", df_check.shape)
    print(df_check.head())



In [None]:
# Scatter plot: how SalePrice varies with above-ground living area
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='GrLivArea', y='SalePrice', ax=ax)
ax.set_title('SalePrice vs GrLivArea')
ax.set_xlabel('Above Ground Living Area (sq ft)')
ax.set_ylabel('SalePrice')
plt.show()


In [None]:
# Heatmap of the pairwise correlations between the numeric columns
plt.figure(figsize=(12, 8))

# Discard columns that hold no values at all, then keep only the numeric ones
df_corr = df.dropna(axis=1, how="all")
df_corr_numeric = df_corr.select_dtypes(include=['number'])

# A correlation matrix needs at least one row and at least two numeric columns
has_rows = df_corr_numeric.shape[0] > 0
has_column_pairs = df_corr_numeric.shape[1] > 1

if not (has_rows and has_column_pairs):
    print("WARNING: Not enough valid numeric data to compute correlations!")
else:
    corr_matrix = df_corr_numeric.corr()
    sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
    plt.title("Correlation Matrix")
    plt.show()





In [None]:
# Box plot: one SalePrice box per OverallQual level
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='OverallQual', y='SalePrice', ax=ax)
ax.set_title('SalePrice by Overall Quality')
plt.show()


In [None]:
# Single horizontal box plot of SalePrice; points beyond the whiskers are the
# candidate outliers
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='SalePrice', ax=ax)
ax.set_title('SalePrice Distribution with Outliers')
plt.show()


## Summary of Insights

1. **High Correlation with `OverallQual`**:
   - There is a strong positive correlation between `SalePrice` and `OverallQual`. Higher-quality houses tend to sell for much higher prices, indicating that the overall quality rating of a house significantly impacts its value.

2. **Log Transformation of `SalePrice`**:
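   - The distribution of `SalePrice` is highly skewed, which could negatively affect certain machine learning models. To address this, a log transformation (`np.log1p`) was applied to `SalePrice`, which helped normalize the distribution and made the data more suitable for modeling. Note that the plots in this notebook are drawn after the transformation, so their `SalePrice` axes are on the log scale.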

3. **Outliers in `SalePrice`**:
   - Several outliers were detected in the `SalePrice` feature. These extreme values can distort the model’s accuracy. As a result, outliers above a specified threshold (e.g., 700,000 USD) were removed to ensure the model performs well without being skewed by these extreme values.

## Business Relevance

- **Impact on Property Investment**: These insights can guide property investors in focusing on homes with higher `OverallQual` ratings, which are likely to provide better returns. Additionally, adjusting for outliers ensures that predictions are more accurate and not overly influenced by atypical data points.

- **Market Strategy**: The transformation of `SalePrice` helps in identifying more consistent price trends, which can aid real estate businesses in setting better price expectations and making more data-driven decisions for listing properties.


In [None]:
# Remove extreme sale prices, making sure the filter never wipes out the data.
# The threshold is expressed in dollars, but an earlier cell replaced
# df['SalePrice'] with log1p(price). Comparing the log values (roughly 10-14)
# against 700000 directly would keep every row, so the threshold is moved onto
# the same log1p scale before comparing.
OUTLIER_PRICE_USD = 700000
outlier_threshold_log = np.log1p(OUTLIER_PRICE_USD)

if df.shape[0] > 0:
    filtered_df = df[df['SalePrice'] < outlier_threshold_log]

    if filtered_df.shape[0] > 0:
        df = filtered_df
        print("Outliers removed. New shape:", df.shape)
    else:
        # Filtering removed every row; fall back to the unfiltered frame
        print("WARNING: No data left after filtering! Keeping original dataset.")
else:
    print("ERROR: DataFrame is already empty before filtering.")



