In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the cleaned dataset from the CSV file into a DataFrame
combined_data = pd.read_csv('combined_data_cleaned.csv')

# Display basic information to verify the dataset.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called bare; wrapping it in print() would append a stray "None" line.
print("Dataset Info:")
combined_data.info()

In [None]:
# Encode on a separate copy so combined_data keeps its original values
combined_encoded = combined_data.copy()

# One dedicated LabelEncoder per categorical column, kept in a dict keyed by
# column name so the fitted encoders remain available for later use
label_encoders = {name: LabelEncoder() for name in ('City', 'Province')}

# Replace each categorical column with its integer codes
for name, encoder in label_encoders.items():
    combined_encoded[name] = encoder.fit_transform(combined_encoded[name])

In [None]:
# Keep only the columns stored as float64 or int64
numeric_and_encoded_data = combined_encoded.select_dtypes(include=['float64', 'int64'])

# Assign the encoded 'City' and 'Province' columns explicitly so they are
# present whatever the dtype filter above selected
for encoded_col in ('City', 'Province'):
    numeric_and_encoded_data[encoded_col] = combined_encoded[encoded_col]

# Show the resulting column set for verification
print("Numeric and Encoded Data Columns:")
print(numeric_and_encoded_data.columns)

In [None]:
# Pairwise correlations between all columns of the numeric/encoded frame
correlation_matrix = numeric_and_encoded_data.corr()

# Draw the matrix as an annotated heatmap (values shown with two decimals)
plt.figure(figsize=(12, 8))
sns.heatmap(
    data=correlation_matrix,
    cmap='coolwarm',
    annot=True,
    fmt='.2f',
)
plt.title("Correlation Matrix Including Encoded Categorical Features")
plt.show()

In [None]:
# Population vs. Price: scatter plot of the two columns
plt.figure(figsize=(8, 6))
sns.scatterplot(data=combined_data, x='Population', y='Price')
plt.title("Population vs. Price")
plt.xlabel("Population")
plt.ylabel("Price")
plt.show()

# Province vs. Price: one box per province
plt.figure(figsize=(12, 6))
sns.boxplot(data=combined_data, x='Province', y='Price')
plt.title("Province vs. Price")
plt.xlabel("Province")
plt.ylabel("Price")
plt.xticks(rotation=45)  # tilt the province names so they do not overlap
plt.show()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [None]:
# Load the cleaned dataset from the CSV file into a fresh DataFrame
combined_data = pd.read_csv('combined_data_cleaned.csv')

# Display basic information.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called bare; wrapping it in print() would append a stray "None" line.
print("Dataset Info:")
combined_data.info()

# Check for missing values: per-column count of null entries
print("Missing Values:")
print(combined_data.isnull().sum())

In [None]:
# Build the composite key "<Province>_<City>"; City is cast to str first so it
# can be concatenated with the separator
city_as_text = combined_data['City'].astype(str)
combined_data['Province_City'] = combined_data['Province'] + "_" + city_as_text

In [None]:
# Median Price per Province, kept as a plain {province: median} lookup
province_medians = combined_data['Price'].groupby(combined_data['Province']).median()
province_median_price = province_medians.to_dict()

# Attach each row's province-level median as a new numeric feature
combined_data['Province_Median_Price'] = combined_data['Province'].map(province_median_price)

# Remove the original Province column after deriving the feature above
combined_data = combined_data.drop(columns=['Province'])

In [None]:
# Columns treated as weak features and removed from the frame
weak_features = ['Garage', 'Lot_Area', 'Year_Built']
combined_data = combined_data.drop(columns=weak_features)

In [None]:
# Fit a label encoder on the composite Province_City strings, then replace
# the column with the resulting integer codes
le = LabelEncoder().fit(combined_data['Province_City'])
combined_data['Province_City'] = le.transform(combined_data['Province_City'])

In [None]:
# Numeric features to standardize
numerical_cols = [
    'Bedrooms',
    'Bathrooms',
    'Population',
    'Median_Family_Income',
    'Province_Median_Price',
]

# Fit the scaler on those columns, then overwrite them with the scaled values
scaler = StandardScaler().fit(combined_data[numerical_cols])
combined_data[numerical_cols] = scaler.transform(combined_data[numerical_cols])

In [None]:
# Box plot of the Price distribution for each Province_City group.
#
# By this point Province_City holds the integer codes written by the label
# encoder `le`, so plotting the column directly labels the x-axis with opaque
# numbers. Decode the codes into a temporary frame used only for plotting, so
# the ticks show the original "<Province>_<City>" names; combined_data itself
# is left unchanged.
price_plot_data = combined_data.assign(
    Province_City=le.inverse_transform(combined_data['Province_City'])
)

plt.figure(figsize=(12, 8))
# Code i corresponds to le.classes_[i], so passing classes_ as `order` keeps
# the boxes in the same left-to-right order as the integer codes.
sns.boxplot(
    x='Province_City',
    y='Price',
    data=price_plot_data,
    order=list(le.classes_),
)
plt.title("Price Distribution by Province_City")
plt.xlabel("Province_City")
plt.ylabel("Price")
plt.xticks(rotation=90)  # Rotate labels for better readability
plt.show()