# Step 1: Load and Understand the Dataset


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Load the raw housing dataset; 'housing.csv' is resolved relative to the
# current working directory.
housing_or = pd.read_csv('housing.csv')

# Summarize the columns, dtypes, and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
housing_or.info()

# Display the first 5 rows to get a feel for the data
print(housing_or.head())


# Step 2: Handle Missing Values

In [None]:
# Count the missing entries in each column of the raw data
missing_before = housing_or.isnull().sum()
print(missing_before)

# Drop every row that contains a missing value; the raw frame is left
# untouched and the cleaned result is bound to `housing`
housing = housing_or.dropna()
print()

# Re-count on the cleaned frame to verify that no missing values remain
missing_after = housing.isnull().sum()
print(missing_after)


# Step 3: Descriptive Statistics

In [None]:
# Summary statistics of the cleaned data, as produced by DataFrame.describe()
summary_stats = housing.describe()
print(summary_stats)


# Step 4: Visualizations

## Histogram of median_house_value



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of median_house_value (50 bins) with a KDE curve overlaid
plt.figure(figsize=(10, 6))
sns.histplot(housing['median_house_value'], kde=True, bins=50)
plt.title('Distribution of Median House Value')
# The column is plotted unscaled, so the axis must carry the column's own
# unit. NOTE(review): assumes housing.csv is the standard California housing
# dataset, where median_house_value is in plain US dollars (not $100,000
# units) - confirm against the data.
plt.xlabel('Median House Value (USD)')
plt.ylabel('Frequency')
plt.show()


## Scatter Plot Between median_house_value and median_income

In [None]:
# Scatter plot of median_house_value against median_income; alpha < 1 keeps
# overlapping points distinguishable
plt.figure(figsize=(10, 6))
sns.scatterplot(x='median_income', y='median_house_value', data=housing, alpha=0.6)
plt.title('Relationship Between Median Income and House Value')
plt.xlabel('Median Income')
# The column is plotted unscaled, so the axis must carry the column's own
# unit. NOTE(review): assumes housing.csv is the standard California housing
# dataset, where median_house_value is in plain US dollars (not $100,000
# units) - confirm against the data.
plt.ylabel('Median House Value (USD)')
plt.show()


## Box Plot for ocean_proximity and median_house_value

In [None]:
# Box plot of median_house_value for each ocean_proximity category
plt.figure(figsize=(12, 6))
sns.boxplot(x='ocean_proximity', y='median_house_value', data=housing)
plt.title('House Values by Ocean Proximity')
plt.xlabel('Ocean Proximity')
# The column is plotted unscaled, so the axis must carry the column's own
# unit. NOTE(review): assumes housing.csv is the standard California housing
# dataset, where median_house_value is in plain US dollars (not $100,000
# units) - confirm against the data.
plt.ylabel('Median House Value (USD)')
# Rotate the category labels so they do not overlap
plt.xticks(rotation=45)
plt.show()


### We can also explore how house age differs by proximity to the ocean

In [None]:
# Horizontal box plots: the spread of housing_median_age within each
# ocean_proximity category
plt.figure(figsize=(20, 10))
age_ax = sns.boxplot(data=housing, x='housing_median_age', y='ocean_proximity')
age_ax.set_title('House age by Ocean Proximity')
age_ax.set_xlabel('housing median age')
age_ax.set_ylabel('Ocean Proximity')
plt.xticks(rotation=45)
plt.show()


# Step 5: Geographical Distribution

In [None]:
# Longitude/latitude scatter of every row: colour encodes median_house_value
# and marker size encodes population
plt.figure(figsize=(10, 6))
geo_ax = sns.scatterplot(
    data=housing,
    x='longitude',
    y='latitude',
    hue='median_house_value',
    size='population',
    sizes=(10, 100),
    palette='coolwarm',
    alpha=0.6,
)
geo_ax.set_title('Geographical Distribution of House Values')
geo_ax.set_xlabel('Longitude')
geo_ax.set_ylabel('Latitude')
# Anchor the legend just outside the right-hand edge of the axes
geo_ax.legend(title='Median House Value', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()


# Step 6: Correlation Between Numeric Variables

In [None]:
# Names of the float64/int64 columns; other dtypes are left out of the
# correlation calculation
numeric_columns = housing.select_dtypes(include=['float64', 'int64']).columns

# Pairwise correlation between those columns
numeric_frame = housing.loc[:, numeric_columns]
correlation_matrix = numeric_frame.corr()

# Annotated heatmap of the matrix, coefficients shown to two decimals
plt.figure(figsize=(12, 8))
heatmap_ax = sns.heatmap(
    correlation_matrix,
    cmap='coolwarm',
    annot=True,
    fmt='.2f',
    linewidths=0.5,
)
heatmap_ax.set_title('Correlation Between Numeric Variables')
plt.show()
