In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
# Silences every warning for the rest of the session: no category, message or
# module is given, so the 'ignore' filter matches all of them.
# NOTE(review): this also hides pandas deprecation and chained-assignment
# warnings; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [56]:
class DataProcessor:
	"""Holds the two raw datasets used by the notebook.

	Both attributes are ``None`` until :meth:`load_data` has been called.
	"""

	def __init__(self):
		# Frame read from the Excel workbook (labelled "ZIP" in the printed output).
		self.df_zip = None
		# Frame read from the salary CSV file.
		self.df_salary = None

	def load_data(
		self,
		zip_path="data/US_housing_prices.xlsx",
		salary_path="data/DC_Public_Employee_Salary.csv",
	):
		"""Read both raw datasets and print a short profile of each.

		Args:
			zip_path: Location of the Excel workbook passed to ``pd.read_excel``.
				Defaults to the path the notebook originally hard-coded.
			salary_path: Location of the CSV file passed to ``pd.read_csv``.
				Defaults to the path the notebook originally hard-coded.

		Returns:
			This instance, so the call can be chained.
		"""
		banner = "=" * 80
		print(banner)
		print("PHASE 1: DATA LOADING & INITIAL PROFILING")
		print(banner)

		self.df_zip = pd.read_excel(zip_path)
		self.df_salary = pd.read_csv(salary_path)

		zip_rows, zip_cols = self.df_zip.shape
		salary_rows, salary_cols = self.df_salary.shape
		print(f"\n✓ ZIP CODES: {zip_rows} rows × {zip_cols} columns")
		print(f"✓ SALARY DATA: {salary_rows} rows × {salary_cols} columns")

		# Initial quality check: total number of null cells across each whole frame.
		print(f"\n[ZIP DATA] Missing values: {self.df_zip.isnull().sum().sum()}")
		print(f"[SALARY DATA] Missing values: {self.df_salary.isnull().sum().sum()}")

		return self

# Create the processor and load both datasets in one step. load_data() returns
# the instance itself, so binding the chained result keeps `processor` pointing
# at the loaded object without echoing its bare repr as the cell output.
processor = DataProcessor().load_data()

PHASE 1: DATA LOADING & INITIAL PROFILING

✓ ZIP CODES: 26313 rows × 317 columns
✓ SALARY DATA: 84268 rows × 9 columns

[ZIP DATA] Missing values: 1939163
[SALARY DATA] Missing values: 109


<__main__.DataProcessor at 0x118cd5e80>

In [57]:
# Preview the first five rows of the ZIP-level frame.
processor.df_zip.head(n=5)

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,State,City,Metro,CountyName,2000-01-31 00:00:00,...,2024-11-30 00:00:00,2024-12-31 00:00:00,2025-01-31 00:00:00,2025-02-28 00:00:00,2025-03-31 00:00:00,2025-04-30 00:00:00,2025-05-31 00:00:00,2025-06-30 00:00:00,2025-07-31 00:00:00,2025-08-31 00:00:00
0,91982,1,77494,zip,TX,TX,Katy,"Houston-The Woodlands-Sugar Land, TX",Fort Bend County,212188.8134,...,505406.4338,505829.3944,506257.7759,505932.8949,505018.5373,503212.1814,501001.4043,498385.552,496296.6346,495168.747
1,61148,2,8701,zip,NJ,NJ,Lakewood,"New York-Newark-Jersey City, NY-NJ-PA",Ocean County,115971.6517,...,539607.01,541369.3428,541929.401,543877.7401,546201.0215,549536.6608,552397.1327,555002.3343,556841.1514,558123.1322
2,91940,3,77449,zip,TX,TX,Katy,"Houston-The Woodlands-Sugar Land, TX",Harris County,104601.5242,...,282857.0061,282121.6755,281505.9731,280877.0404,280038.2102,279071.7232,278261.0681,277252.9979,276304.4661,275361.6749
3,62080,4,11368,zip,NY,NY,New York,"New York-Newark-Jersey City, NY-NJ-PA",Queens County,174036.4464,...,537444.62,534784.8078,532292.757,532161.334,531815.8046,531702.6999,532095.5557,534167.1058,536925.3144,538597.5972
4,91733,5,77084,zip,TX,TX,Houston,"Houston-The Woodlands-Sugar Land, TX",Harris County,103494.3356,...,276232.059,275889.5732,275644.6636,275238.7841,274372.8367,273324.0265,272422.993,271412.7257,270492.4832,269609.4093


In [58]:
# Keep only the rows of the ZIP dataset whose State is 'DC'.
# The explicit .copy() makes the result an independent frame, so later column
# assignments modify it directly instead of going through pandas'
# chained-assignment (SettingWithCopy) path, whose warning would be hidden by
# the blanket warnings filter set in the import cell.
dc_rows = processor.df_zip['State'] == 'DC'
processor.df_zip = processor.df_zip.loc[dc_rows].copy()


In [59]:
# Re-count null cells now that the ZIP frame has been narrowed to DC.
# Only the ZIP frame was filtered; the salary frame is untouched at this point,
# so its figure is expected to equal the load-time count.
zip_missing_total = processor.df_zip.isna().sum().sum()
salary_missing_total = processor.df_salary.isna().sum().sum()
print(f"\n[ZIP DATA after filtering] Missing values: {zip_missing_total}")
print(f"[SALARY DATA after filtering] Missing values: {salary_missing_total}")


[ZIP DATA after filtering] Missing values: 148
[SALARY DATA after filtering] Missing values: 109


In [60]:
# Per-column null counts for the ZIP frame, reduced to the columns that
# actually contain at least one missing value.
zip_null_counts = processor.df_zip.isna().sum()
missing_zip = zip_null_counts.loc[zip_null_counts.gt(0)]
print("\nColumns with missing values in ZIP data:")
print(missing_zip)



Columns with missing values in ZIP data:
2000-01-31    1
2000-02-29    1
2000-03-31    1
2000-04-30    1
2000-05-31    1
             ..
2011-12-31    1
2012-01-31    1
2012-02-29    1
2012-03-31    1
2012-04-30    1
Length: 148, dtype: int64


In [None]:
# Identify columns with missing values in SALARY data
salary_null_counts = processor.df_salary.isnull().sum()
missing_salary = salary_null_counts[salary_null_counts > 0]
print("\nColumns with missing values in SALARY data:")
print(missing_salary)
# GRADE is not critical to this analysis, so the column is dropped rather than
# imputed. (In the recorded run it was the only column with gaps: 109 missing
# values out of 84,268 rows.)
# errors='ignore' keeps this cell re-runnable: a second execution, when GRADE
# is already gone, is a no-op instead of raising KeyError.
processor.df_salary = processor.df_salary.drop(columns='GRADE', errors='ignore')



Columns with missing values in SALARY data:
GRADE    109
dtype: int64


In [63]:
# Confirm that no salary column still has missing values after the GRADE drop.
salary_null_counts = processor.df_salary.isnull().sum()
missing_salary = salary_null_counts[salary_null_counts > 0]
print("\nColumns with missing values in SALARY data:")
print(missing_salary)
# Preview the cleaned salary frame. Jupyter only renders the value of a cell's
# last expression, so the preview has to come last; as the first statement its
# result was silently discarded.
processor.df_salary.head()


Columns with missing values in SALARY data:
Series([], dtype: int64)
