In [None]:
import pandas as pd  # Data manipulation
import numpy as np  # Numerical calculations
import matplotlib.pyplot as plt  # Basic data visualization
import seaborn as sns  # Advanced data visualization
from ydata_profiling import ProfileReport # Automated EDA reports
import os  # Importing OS module for file operations

# Load the raw training data; the path is relative to the notebook's working directory
df = pd.read_csv("../data/train.csv")

# Count the missing values in each column
print("\nMissing Values per Column:")
print(df.isnull().sum())

# Display the first few rows of the dataset
print("\nFirst 5 Rows of the Dataset:")
print(df.head())

# Display dataset information.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
print("\nDataset Information:")
df.info()

# Display summary statistics
print("\nSummary Statistics:")
print(df.describe())

# Show the working directory that the relative "../data" paths are resolved against
print("\nCurrent Working Directory:")
print(os.getcwd())

# List the contents of the 'data' folder
print("\nContents of 'data' Directory:")
print(os.listdir("../data"))




In [None]:
# Reload the saved CSV file and check its contents.
# This file is only written by later cells in this notebook, so on a clean
# Restart & Run All it may not exist yet; check for it first instead of
# stopping the whole run with a FileNotFoundError.
cleaned_csv_path = "../data/final_cleaned_train.csv"

if os.path.exists(cleaned_csv_path):
    df_check = pd.read_csv(cleaned_csv_path)

    # Print the shape to verify it has rows and columns
    print("Reloaded CSV shape:", df_check.shape)  # Should NOT be (0, 82)

    # Show first few rows to confirm the data is correctly loaded
    print(df_check.head())
else:
    print("File not found:", cleaned_csv_path, "- run the cell that saves it first.")


In [None]:
# Sanity check before writing to disk: the frame should have rows, i.e. NOT be (0, 82)
shape_before_save = df.shape
print("Before saving, DataFrame shape:", shape_before_save)

# Preview of the first few rows
rows_preview = df.head()
print(rows_preview)


In [None]:
# Write the current frame to the data folder, leaving the index out of the file
final_csv_path = "../data/final_cleaned_train.csv"
df.to_csv(final_csv_path, index=False)
print("Dataset saved successfully!")


In [None]:
# os provides the directory listing used below
import os

# Collect the entries of the data directory, then report them
data_folder_entries = os.listdir("../data")
print("Files in data folder:", data_folder_entries)


In [None]:
import pandas as pd

# Read the saved CSV back in
df_check = pd.read_csv("../data/final_cleaned_train.csv")

# Report its dimensions as (number_of_rows, number_of_columns)
csv_dimensions = df_check.shape
print("Shape of CSV file:", csv_dimensions)

In [None]:
# Build an EDA report with ydata-profiling's ProfileReport.
# The "auto" correlation calculation is switched off through the correlations setting.
correlation_settings = {"auto": {"calculate": False}}
profile = ProfileReport(df, explorative=True, correlations=correlation_settings)

# Write the report out as an HTML file
report_file = "eda_report.html"
profile.to_file(report_file)

print("EDA report generated successfully!")


In [None]:
# Persist the current frame so it can be reused later; the index is not written
cleaned_csv_file = "../data/cleaned_train.csv"
df.to_csv(cleaned_csv_file, index=False)
print("Cleaned dataset saved successfully!")


In [None]:

# Log-transform SalePrice to reduce skew.
# NOTE: this cell is not idempotent - re-running it applies log1p to SalePrice again.
df['SalePrice'] = np.log1p(df['SalePrice'])

# Turn any infinities produced by the transformation into NaN so that one
# dropna call removes both kinds of invalid value.
df['SalePrice'] = df['SalePrice'].replace([np.inf, -np.inf], np.nan)

# Drop only the rows that are unusable for this cell (missing SalePrice or GrLivArea).
# A blanket df.dropna() would also discard every row with a gap in any other
# column, including columns such as LotFrontage and MasVnrType that another
# cell of this notebook imputes.
df = df.dropna(subset=['SalePrice', 'GrLivArea']).copy()

# Create a new feature: price per square foot.
# NOTE(review): SalePrice is already log1p-transformed at this point, so this is
# log-price divided by area rather than a dollar amount per square foot -
# confirm that this is the intended feature.
price_per_sqft = df['SalePrice'] / df['GrLivArea']

# Division by a zero area yields +/-inf (not NaN), which fillna alone would miss;
# normalise it to NaN first, then fill with the median of the valid ratios.
# The result is assigned back rather than filled in place on a column selection.
price_per_sqft = price_per_sqft.replace([np.inf, -np.inf], np.nan)
df['PricePerSqFt'] = price_per_sqft.fillna(price_per_sqft.median())

# Save the transformed dataset
df.to_csv("../data/final_cleaned_train.csv", index=False)
print("Final cleaned dataset saved successfully with feature transformations!")


In [None]:
# Count the missing values left in each column, then report them
remaining_missing = df.isna().sum()
print("\nMissing Values After Handling:")
print(remaining_missing)


In [None]:
# Fill missing LotFrontage values with the column median.
# The filled Series is assigned back to the column: calling
# fillna(..., inplace=True) on df["col"] acts on an intermediate object, which
# pandas 2.x warns about and which does not update df under Copy-on-Write.
df["LotFrontage"] = df["LotFrontage"].fillna(df["LotFrontage"].median())

# Fill missing MasVnrType values with the most frequent value (mode).
# mode() returns an empty Series when the column has no non-null values, so
# fall back to a fixed placeholder in that case.
mode_value = df["MasVnrType"].mode()
masvnr_fill = mode_value.iloc[0] if not mode_value.empty else "Unknown"
df["MasVnrType"] = df["MasVnrType"].fillna(masvnr_fill)



In [None]:
# Verify that the profiling dependency is installed and report its version.
# importlib.metadata (standard library) is used for the version lookup instead of
# pkg_resources, which setuptools has deprecated and which was imported but unused.
from importlib.metadata import version

import ydata_profiling  # import check only: raises ImportError if the package is missing

# version() raises PackageNotFoundError if the distribution is not installed
print("ydata-profiling version:", version("ydata-profiling"))
print("All dependencies are installed correctly!")


In [None]:
# Confirm the frame actually holds rows and columns before profiling it
frame_shape = df.shape
print("DataFrame Shape:", frame_shape)
head_preview = df.head()
print(head_preview)  # first few rows

# Build the EDA profile and render it inline in the notebook
profile = ProfileReport(df, explorative=True)
profile.to_notebook_iframe()

# Print the shape and preview once more after the report has been rendered
print("DataFrame shape before ProfileReport:", df.shape)
print(df.head())


In [None]:
# SalePrice against above-ground living area, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='GrLivArea', y='SalePrice', ax=ax)
ax.set_title('SalePrice vs GrLivArea')
ax.set_xlabel('Above Ground Living Area (sq ft)')
ax.set_ylabel('SalePrice')
plt.show()


In [None]:
# Correlation matrix heatmap showing the relationships between the numeric features
plt.figure(figsize=(12, 8))

# Correlation is only defined for numeric data. Restrict the frame to numeric
# columns explicitly: pandas >= 2.0 no longer drops non-numeric columns
# silently in DataFrame.corr() and raises on string columns instead.
corr_matrix = df.select_dtypes(include="number").corr()

sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix')  # Title of the heatmap
plt.show()


In [None]:
# Distribution of SalePrice within each OverallQual level, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='OverallQual', y='SalePrice', ax=ax)
ax.set_title('SalePrice by Overall Quality')
plt.show()


In [None]:
# Box plot to check for outliers in SalePrice
plt.figure(figsize=(10, 6))
sns.boxplot(x=df['SalePrice'])
plt.title('SalePrice Distribution with Outliers')  # Title of the plot
plt.show()

# Remove extreme SalePrice outliers (above 700,000 on the original price scale).
# In notebook order SalePrice has already been log1p-transformed by an earlier
# cell, so the cap is mapped onto that same scale; comparing the transformed
# values with the raw number 700000 would never remove anything.
# NOTE(review): this assumes the log-transform cell has run exactly once before
# this cell - confirm the intended execution order.
SALE_PRICE_CAP = 700000
df = df[df['SalePrice'] < np.log1p(SALE_PRICE_CAP)]

# Save the cleaned dataset after removing outliers
df.to_csv("../data/cleaned_train.csv", index=False)
print("Cleaned dataset saved successfully after outlier removal!")



## Summary of Insights

1. **High Correlation with `OverallQual`**:
   - There is a strong positive correlation between `SalePrice` and `OverallQual`. Higher-quality houses tend to sell for much higher prices, suggesting that the overall condition and features of a house have a significant bearing on its value.

2. **Log Transformation of `SalePrice`**:
   - The distribution of `SalePrice` is highly skewed, which could negatively affect certain machine learning models. To address this, a log transformation was applied to `SalePrice`, which helped normalize the distribution and made the data more suitable for modeling.

3. **Outliers in `SalePrice`**:
   - Several outliers were detected in the `SalePrice` feature. Because such extreme values can distort a model's fit and reduce its accuracy, sales above a specified threshold (e.g., 700,000 USD on the original, untransformed price scale) were removed so that the model is not skewed by them.

## Business Relevance

- **Impact on Property Investment**: These insights can guide property investors in focusing on homes with higher `OverallQual` ratings, which are likely to provide better returns. Additionally, adjusting for outliers ensures that predictions are more accurate and not overly influenced by atypical data points.

- **Market Strategy**: The transformation of `SalePrice` helps in identifying more consistent price trends, which can aid real estate businesses in setting better price expectations and making more data-driven decisions for listing properties.
