In [None]:
import pandas as pd  # Data manipulation
import numpy as np  # Numerical calculations
import matplotlib.pyplot as plt  # Basic data visualization
import seaborn as sns  # Advanced data visualization
from ydata_profiling import ProfileReport # Automated EDA reports
import os  # Importing OS module for file operations

# Load the dataset using a relative path
df = pd.read_csv("../data/train.csv")

# Check for missing values in each column
print("\nMissing Values per Column:")
print(df.isnull().sum())

# Display the first few rows of the dataset
print("\nFirst 5 Rows of the Dataset:")
print(df.head())

# Display dataset information.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
print("\nDataset Information:")
df.info()

# Display summary statistics
print("\nSummary Statistics:")
print(df.describe())

# Check current working directory (useful when the relative paths above fail)
print("\nCurrent Working Directory:")
print(os.getcwd())

# List the contents of the 'data' folder
print("\nContents of 'data' Directory:")
print(os.listdir("../data"))



In [None]:
# Build an automated, explorative EDA report for the loaded DataFrame
profile = ProfileReport(
    df,
    title="Pandas Profiling Report",
    explorative=True,
)

# Write the report to a standalone HTML file (path is relative to the
# current working directory)
profile.to_file(output_file="eda_report.html")

print("EDA report generated successfully!")


In [None]:
# Save the cleaned dataset for future use
df.to_csv("../data/cleaned_train.csv", index=False)
print("Cleaned dataset saved successfully!")


In [None]:
# Verify that missing values are handled
print("\nMissing Values After Handling:")
print(df.isnull().sum())


In [None]:
# Fill missing values for numerical columns with the median
df["LotFrontage"].fillna(df["LotFrontage"].median(), inplace=True)

# Fill missing values for categorical columns with the most frequent value (mode)
df["MasVnrType"].fillna(df["MasVnrType"].mode()[0], inplace=True)


In [None]:
# Verify that all necessary dependencies are installed correctly.
# Each third-party package this notebook imports is checked by importing it;
# a missing one raises ModuleNotFoundError naming the package.
# pkg_resources is no longer imported here: setuptools has deprecated it and
# nothing in the visible cells used it.
import importlib

import ydata_profiling  # kept so the name stays bound, as in the original cell

for module_name in ("pandas", "numpy", "matplotlib", "seaborn", "ydata_profiling"):
    importlib.import_module(module_name)

print("All dependencies are installed correctly!")


In [None]:
# Profile the DataFrame again and embed the resulting report inline
# in the notebook output
profile = ProfileReport(df, explorative=True)
profile.to_notebook_iframe()

## Step 6: Visualizing Relationships Between Features and Target

In [None]:
# Scatter plot of SalePrice against GrLivArea
plt.figure(figsize=(10, 6))
ax = sns.scatterplot(data=df, x='GrLivArea', y='SalePrice')
ax.set(
    title='SalePrice vs GrLivArea',
    xlabel='Above Ground Living Area (sq ft)',
    ylabel='SalePrice',
)
plt.show()

In [None]:
# Correlation heatmap over the numeric columns only.
# The frame also holds non-numeric columns (e.g. MasVnrType is imputed with
# its mode as a categorical), and DataFrame.corr() raises on those in
# pandas >= 2.0, so the numeric columns are selected explicitly first.
plt.figure(figsize=(12, 8))
corr_matrix = df.select_dtypes(include="number").corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()


In [None]:
# Box plot of SalePrice grouped by OverallQual
plt.figure(figsize=(10, 6))
ax = sns.boxplot(data=df, x='OverallQual', y='SalePrice')
ax.set_title('SalePrice by Overall Quality')
plt.show()
