# HOUSING MARKET INDEX

Combining the tables all together

In [4]:
import pandas as pd
import sqlite3

# Step 1: Load CSVs
file_paths = [
    "house/B19013.csv", 
    "house/B25001.csv",
    "house/B25002.csv",
    "house/B25003.csv",
    "house/B25036.csv",
    "house/B25070.csv",
    "house/B25077.csv",
    "house/B25091.csv",
    "house/PEPANNRES.csv",
    ""

]



Median_Household_Income = pd.read_csv(file_paths[0])
Total_Housing_Units = pd.read_csv(file_paths[1])
Vacancy_Status = pd.read_csv(file_paths[2])
Homeownership_Rent = pd.read_csv(file_paths[3])
Houses_Built = pd.read_csv(file_paths[4])
Gross_Rent_Burden = pd.read_csv(file_paths[5])
Median_value_dollars = pd.read_csv(file_paths[6])
Owner_Cost_Burden = pd.read_csv(file_paths[7])
Population = pd.read_csv(file_paths[8])



# Step 4: Load into SQLite
conn = sqlite3.connect(":memory:")
Median_Household_Income.to_sql("income", conn, index=False, if_exists="replace")
Total_Housing_Units.to_sql("units", conn, index=False, if_exists="replace")
Vacancy_Status.to_sql("vacancy", conn, index=False, if_exists="replace")
Homeownership_Rent.to_sql("homeownership", conn, index=False, if_exists="replace")
Houses_Built.to_sql("built", conn, index=False, if_exists="replace")
Gross_Rent_Burden.to_sql("gross_rent", conn, index=False, if_exists="replace")
Median_value_dollars.to_sql("median_value", conn, index=False, if_exists="replace")
Owner_Cost_Burden.to_sql("owner_cost", conn, index=False, if_exists="replace")
Population.to_sql("population", conn, index=False, if_exists="replace")


# Step 5: SQL Join using simplified column names
query = """
SELECT 
    population.Region AS Region,
    income.[Median household income],
    units.[Total Housing Units],
    vacancy.Vacant,
    homeownership.[Renter occupied],
    homeownership.[Owner occupied],
    built.[Houses Built],
    gross_rent.[Gross Rent Burden],
    median_value.[Median value (dollars)],
    owner_cost.[Owner Cost Burden],
    population.Population
FROM population
LEFT JOIN income ON population.Region = income.Region
LEFT JOIN units ON population.Region = units.Region
LEFT JOIN vacancy ON population.Region = vacancy.Region
LEFT JOIN homeownership ON population.Region = homeownership.Region
LEFT JOIN built ON population.Region = built.Region
LEFT JOIN gross_rent ON population.Region = gross_rent.Region
LEFT JOIN median_value ON population.Region = median_value.Region
LEFT JOIN owner_cost ON population.Region = owner_cost.Region
WHERE
    population.Population IS NOT NULL
    AND income.[Median household income] IS NOT NULL
    AND median_value.[Median value (dollars)] IS NOT NULL
    AND homeownership.[Renter occupied] IS NOT NULL
    AND homeownership.[Owner occupied] IS NOT NULL
    AND built.[Houses Built] IS NOT NULL
    AND vacancy.Vacant IS NOT NULL
    AND owner_cost.[Owner Cost Burden] IS NOT NULL
    AND gross_rent.[Gross Rent Burden] IS NOT NULL
"""




# Step 6: Run query and load results
merged_df = pd.read_sql_query(query, conn)
conn.close()

# Step 7: Identify columns to convert (exclude Region)
cols_to_convert = merged_df.columns.drop('Region')


merged_df = merged_df[(merged_df[cols_to_convert] != 0).all(axis=1)]

# View result
merged_df


Unnamed: 0,Region,Median household income,Total Housing Units,Vacant,Renter occupied,Owner occupied,Houses Built,Gross Rent Burden,Median value (dollars),Owner Cost Burden,Population
0,"Abilene, TX Metro Area",61262,76652,10192,25067,41393,11160,10757,172800,5499,165252
1,"Akron, OH Metro Area",70125,321142,22949,94990,203203,19216,42534,199000,25077,703200
2,"Albany, GA Metro Area",53056,67256,8028,27734,31494,5965,11674,148800,4525,153857
3,"Albany-Schenectady-Troy, NY Metro Area",82410,424620,37930,139464,247226,42828,58572,268400,32910,870716
4,"Albuquerque, NM Metro Area",67640,405504,24178,121091,260235,40943,55669,263500,43559,887077
...,...,...,...,...,...,...,...,...,...,...,...
351,"Yakima, WA Metro Area",69525,93422,4008,34133,55281,9749,13406,281100,8581,243231
352,"York-Hanover, PA Metro Area",81362,191099,7235,45816,138048,12884,19579,235000,20546,434972
353,"Youngstown-Warren, OH Metro Area",51350,202443,15616,56362,130465,6305,22922,134600,13861,565773
354,"Yuba City, CA Metro Area",77335,66213,3635,22346,40232,6529,11603,390800,8905,166892


| **Feature**              | **Formula**                                                 | **Table Code(s)**                                      |
| ------------------------ | ----------------------------------------------------------- | ------------------------------------------------------ |
| **Affordability\_Ratio** | `Median_Home_Value / MedianHouseholdIncome`                 | `B25077` (Median Home Value), `B19013` (Median Income) |
| **Vacancy\_Rate**        | `Vacant / Total_Housing_Units`                              | `B25002` (Vacancy Breakdown), `B25001` (Total Units)   |
| **Rent\_Burden\_Rate**   | `High_Rent_Burden / Renter occupied` *(if not already a %)* | `B25070` (Rent Burden), `B25003` (Tenure – Renter %)   |
| **Owner\_Burden\_Rate**  | `Owner_Cost_Burden / Owner occupied` *(if not already a %)* | `B25091` (Owner Burden), `B25003` (Tenure – Owner %)   |
| **New\_Units\_Rate**     | `(Built 2010–2019 + Built 2020+) / Total_Housing_Units`     | `B25036` (Year Built), `B25001` (Total Units)          |
| **Pop\_to\_Unit\_Ratio** | `Population / Total_Housing_Units`                          | `PEPANNRES` (Population), `B25001` (Total Units)       |
| **Recent\_Units\_Share** | Already calculated or use `New_Units_Rate`                  | `B25036` only                                          |


In [5]:
# Step 1: Define columns to convert
cols_to_convert = [
    'Median household income',
    'Total Housing Units',
    'Vacant',
    'Renter occupied',
    'Houses Built',
    'Owner occupied',
    'Gross Rent Burden',
    'Median value (dollars)',
    'Owner Cost Burden',
    'Population'
]

# Step 2: Safely convert all columns to numeric (remove commas first if strings)
for col in cols_to_convert:
    merged_df[col] = pd.to_numeric(
        merged_df[col].astype(str).str.replace(",", ""), errors="coerce"
    )

# Step 3: Drop rows with any missing (NaN) values after conversion
housing_index_df = merged_df

# 1. Affordability Ratio
housing_index_df['Affordability_Ratio'] = (
    housing_index_df['Median value (dollars)'] / housing_index_df['Median household income']
)

# 2. Vacancy Rate
housing_index_df['Vacancy_Rate'] = (
    housing_index_df['Vacant'] / housing_index_df['Total Housing Units']
)

# 3. Rent Burden Rate
housing_index_df['Rent_Burden_Rate'] = (
    housing_index_df['Gross Rent Burden'] / housing_index_df['Renter occupied']
)

# 4. Owner Burden Rate
housing_index_df['Owner_Burden_Rate'] = (
    housing_index_df['Owner Cost Burden'] / housing_index_df['Owner occupied']
)

# 5. Owner Burden Rate
housing_index_df['New_Units_Rate'] = (
    housing_index_df['Houses Built'] / housing_index_df['Total Housing Units']
)

# 6. Population-to-Unit Ratio
housing_index_df['Pop_to_Unit_Ratio'] = (
    housing_index_df['Population'] / housing_index_df['Total Housing Units']
)

# 7. Recent Units Share (same as New Units Rate if 2010+ is the definition)
housing_index_df['Recent_Units_Share'] = housing_index_df['New_Units_Rate']

# Preview results
housing_index_df

Unnamed: 0,Region,Median household income,Total Housing Units,Vacant,Renter occupied,Owner occupied,Houses Built,Gross Rent Burden,Median value (dollars),Owner Cost Burden,Population,Affordability_Ratio,Vacancy_Rate,Rent_Burden_Rate,Owner_Burden_Rate,New_Units_Rate,Pop_to_Unit_Ratio,Recent_Units_Share
0,"Abilene, TX Metro Area",61262,76652,10192,25067,41393,11160,10757,172800,5499,165252,2.820672,0.132965,0.429130,0.132849,0.145593,2.155873,0.145593
1,"Akron, OH Metro Area",70125,321142,22949,94990,203203,19216,42534,199000,25077,703200,2.837790,0.071461,0.447773,0.123409,0.059836,2.189686,0.059836
2,"Albany, GA Metro Area",53056,67256,8028,27734,31494,5965,11674,148800,4525,153857,2.804584,0.119365,0.420927,0.143678,0.088691,2.287632,0.088691
3,"Albany-Schenectady-Troy, NY Metro Area",82410,424620,37930,139464,247226,42828,58572,268400,32910,870716,3.256886,0.089327,0.419979,0.133117,0.100862,2.050577,0.100862
4,"Albuquerque, NM Metro Area",67640,405504,24178,121091,260235,40943,55669,263500,43559,887077,3.895624,0.059625,0.459729,0.167383,0.100968,2.187591,0.100968
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351,"Yakima, WA Metro Area",69525,93422,4008,34133,55281,9749,13406,281100,8581,243231,4.043150,0.042902,0.392758,0.155225,0.104354,2.603573,0.104354
352,"York-Hanover, PA Metro Area",81362,191099,7235,45816,138048,12884,19579,235000,20546,434972,2.888326,0.037860,0.427340,0.148832,0.067421,2.276161,0.067421
353,"Youngstown-Warren, OH Metro Area",51350,202443,15616,56362,130465,6305,22922,134600,13861,565773,2.621227,0.077138,0.406692,0.106243,0.031145,2.794727,0.031145
354,"Yuba City, CA Metro Area",77335,66213,3635,22346,40232,6529,11603,390800,8905,166892,5.053339,0.054899,0.519243,0.221341,0.098606,2.520532,0.098606


### Calculate Composite Index

```python
Housing_Market_Index = (
    0.25 * Recent_Units_Share +
    0.20 * (New_Housing_Units / Total_Housing_Units) +
    0.20 * (MedianHouseholdIncome / Median_Home_Value) +
    0.15 * (1 - High_Rent_Burden) +
    0.10 * (1 - Owner_Cost_Burden) +
    0.10 * (1 - Vacancy_Rate)
)
```

In [6]:
from sklearn.preprocessing import MinMaxScaler

# Define features used in the index
features = {
    'Recent_Units_Share': housing_index_df['Recent_Units_Share'],
    'New_Units_Rate': housing_index_df['Houses Built'] / housing_index_df['Total Housing Units'],
    'Affordability_Ratio': housing_index_df['Median value (dollars)'] / housing_index_df['Median household income'],
    'Rent_Burden_Inverse': 1 - housing_index_df['Rent_Burden_Rate'],
    'Owner_Burden_Inverse': 1 - housing_index_df['Owner_Burden_Rate'],
    'Vacancy_Inverse': 1 - housing_index_df['Vacancy_Rate']
}

# Create DataFrame of these features
feature_df = pd.DataFrame(features)

# Apply Min-Max Normalization to scale everything between 0 and 1
scaler = MinMaxScaler()
normalized_features = pd.DataFrame(
    scaler.fit_transform(feature_df),
    columns=feature_df.columns,
    index=feature_df.index
)

# Weighted Composite Index Calculation
housing_index_df['Housing_Market_Index'] = (
    0.25 * normalized_features['Recent_Units_Share'] +
    0.20 * normalized_features['New_Units_Rate'] +
    0.20 * normalized_features['Affordability_Ratio'] +
    0.15 * normalized_features['Rent_Burden_Inverse'] +
    0.10 * normalized_features['Owner_Burden_Inverse'] +
    0.10 * normalized_features['Vacancy_Inverse']
)

# Create final summary table
housing_summary_df = housing_index_df[['Region', 'Housing_Market_Index']].copy()

# Optional: Sort by healthiest markets (higher is better)
housing_summary_df = housing_summary_df.sort_values(by='Housing_Market_Index', ascending=False)

# Show result

housing_summary_df

Unnamed: 0,Region,Housing_Market_Index
21,"Austin-Round Rock-San Marcos, TX Metro Area",0.732440
260,"Provo-Orem-Lehi, UT Metro Area",0.728224
128,"Greeley, CO Metro Area",0.659738
192,"Logan, UT-ID Metro Area",0.655868
263,"Raleigh-Cary, NC Metro Area",0.648927
...,...,...
224,"Muncie, IN Metro Area",0.220584
277,"Saginaw, MI Metro Area",0.218145
169,"Kingston, NY Metro Area",0.214629
34,"Binghamton, NY Metro Area",0.214274


# Job Market Index

### 🛠️ Features to Engineer

| **Feature**                  | **Formula** or Source                                | **Description**                                     |
| ---------------------------- | ---------------------------------------------------- | --------------------------------------------------- |
| `Unemployment_Rate`          | `Unemployed / Labor Force` (from B23025)             | Lower = tighter market                              |
| `Labor_Force_Participation`  | `Labor Force / Working-Age Population` (from B23025) | Higher = more active labor pool                     |
| `Private_Sector_Share`       | % in private wage/salary jobs (from C24010)          | Market-driven labor strength                        |
| `College_Degree_Rate`        | % with BA or higher (from B15003)                    | Skilled labor supply                                |
| `High_Growth_Industry_Share` | % in info, finance, STEM, healthcare (from C24030)   | Healthy, expanding job sectors                      |
| `Transit_Use_Share`          | % using public transportation (from B08101)          | Accessibility to job-rich zones                     |
| `Commute_Efficiency`         | % with commute < 30 mins (from B08303)               | Reflects job proximity and infrastructure alignment |

---

### 🧮 Sample Composite Formula

```python
Labor_Tightness_Index = (
    0.25 * (1 - Unemployment_Rate) +
    0.20 * Labor_Force_Participation +
    0.15 * Private_Sector_Share +
    0.15 * College_Degree_Rate +
    0.10 * High_Growth_Industry_Share +
    0.10 * Transit_Use_Share +
    0.05 * Commute_Efficiency
)
```


Creating Datasets through Python

In [12]:
import pandas as pd

# COMMUTE EFFICIENCY

# Step 0: Load CSVs
Time_30_34 = pd.read_csv("job/30_to_34.csv")
Time_35_39 = pd.read_csv("job/35_to_39.csv")
Time_40_44 = pd.read_csv("job/40_to_44.csv")
Time_45_59 = pd.read_csv("job/45_to_59.csv")
Time_60_89 = pd.read_csv("job/60_to_89.csv")
Time_90_More = pd.read_csv("job/90_to_More.csv")
Total_Transportation = pd.read_csv("job/Total_Transporation.csv")

# Step 1: Merge DataFrames on Region
commute_df = (
    Time_30_34[["Region", "30 to 34 minutes"]]
    .merge(Time_35_39[["Region", "35 to 39 minutes"]], on="Region")
    .merge(Time_40_44[["Region", "40 to 44 minutes"]], on="Region")
    .merge(Time_45_59[["Region", "45 to 59 minutes"]], on="Region")
    .merge(Time_60_89[["Region", "60 to 89 minutes"]], on="Region")
    .merge(Time_90_More[["Region", "90 or more minutes"]], on="Region")
    .merge(Total_Transportation[["Region", "Total:"]], on="Region")
)

# Step 2: Remove commas and convert to numeric
cols = [
    "30 to 34 minutes", "35 to 39 minutes", "40 to 44 minutes",
    "45 to 59 minutes", "60 to 89 minutes", "90 or more minutes", "Total:"
]

for col in cols:
    commute_df[col] = commute_df[col].replace(",", "", regex=True)
    commute_df[col] = pd.to_numeric(commute_df[col], errors="coerce")

# Step 3: Calculate total under 30 minutes (Total commuters - sum of 30+ bins)
commute_df["Commutes_30min_Or_More"] = (
    commute_df["30 to 34 minutes"] +
    commute_df["35 to 39 minutes"] +
    commute_df["40 to 44 minutes"] +
    commute_df["45 to 59 minutes"] +
    commute_df["60 to 89 minutes"] +
    commute_df["90 or more minutes"]
)

commute_df["Commutes_Under_30_Min"] = commute_df["Total:"] - commute_df["Commutes_30min_Or_More"]
commute_df["Commute_Efficiency"] = commute_df["Commutes_Under_30_Min"] / commute_df["Total:"]

# Step 4: Clean rows with missing or zero totals
commute_df = commute_df[commute_df["Total:"].notna() & (commute_df["Total:"] > 0)]

# Step 5: Final result
Commute_Efficiency = commute_df[["Region", "Commute_Efficiency"]]

Commute_Efficiency



Unnamed: 0,Region,Commute_Efficiency
0,"Abilene, TX Metro Area",0.876952
1,"Aguadilla, PR Metro Area",0.733537
2,"Akron, OH Metro Area",0.732741
3,"Albany, GA Metro Area",0.754485
4,"Albany, OR Metro Area",0.678981
...,...,...
388,"Yakima, WA Metro Area",0.804489
389,"York-Hanover, PA Metro Area",0.676376
390,"Youngstown-Warren, OH Metro Area",0.763326
391,"Yuba City, CA Metro Area",0.613165


In [None]:
import pandas as pd
import sqlite3

# Step 1: Define file paths
file_paths = [
    "job/30_to_34.csv",
    "job/35_to_39.csv",
    "job/40_to_44.csv",
    "job/45_to_59.csv",
    "job/60_to_89.csv",
    "job/90_to_More.csv",
    "job/Bachelors.csv",
    "job/Doctorate.csv",
    "job/Female_Finance_Jobs.csv",
    "job/Female_Information_Jobs.csv",
    "job/Female_Managment_Jobs.csv",
    "job/Female_Public_Administration_Jobs.csv",
    "job/Female_Scientific_Jobs.csv",
    "job/Female_SocialServices_Jobs.csv",
    "job/Labor_Force.csv",
    "job/Male_Finance_Jobs.csv",
    "job/Male_Information_Jobs.csv",
    "job/Male_Managment_Jobs.csv",
    "job/Male_Public_Administration_Jobs.csv",
    "job/Male_Scientific_Jobs.csv",
    "job/Male_SocialServices_Jobs.csv",
    "job/Masters.csv",
    "job/Percent_of_Private_Workers.csv",
    "job/Professional_School_Degree.csv",
    "job/Public_Transportation.csv",
    "job/Total_in_School.csv",
    "job/Total_Jobs.csv",
    "job/Total_Time.csv",
    "job/Total_Transportation.csv",
    "job/Total_Travel_Time.csv",
    "job/Unemployed.csv",
    "job/Working_Age_Population.csv",
    "job/PEPANNRES.csv"

]

# Step 2: Load into pandas DataFrames
Time_30_34 = pd.read_csv(file_paths[0])
Time_35_39 = pd.read_csv(file_paths[1])
Time_40_44 = pd.read_csv(file_paths[2])
Time_45_59 = pd.read_csv(file_paths[3])
Time_60_89 = pd.read_csv(file_paths[4])
Time_90_More = pd.read_csv(file_paths[5])
Bachelors = pd.read_csv(file_paths[6])
Doctorate = pd.read_csv(file_paths[7])
Female_Finance_Jobs = pd.read_csv(file_paths[8])
Female_Information_Jobs = pd.read_csv(file_paths[9])
Female_Managment_Jobs = pd.read_csv(file_paths[10])
Female_Public_Administration_Jobs = pd.read_csv(file_paths[11])
Female_Scientific_Jobs = pd.read_csv(file_paths[12])
Female_SocialServices_Jobs = pd.read_csv(file_paths[13])
Labor_Force = pd.read_csv(file_paths[14])
Male_Finance_Jobs = pd.read_csv(file_paths[15])
Male_Information_Jobs = pd.read_csv(file_paths[16])
Male_Managment_Jobs = pd.read_csv(file_paths[17])
Male_Public_Administration_Jobs = pd.read_csv(file_paths[18])
Male_Scientific_Jobs = pd.read_csv(file_paths[19])
Male_SocialServices_Jobs = pd.read_csv(file_paths[20])
Masters = pd.read_csv(file_paths[21])
Percent_of_Private_Workers = pd.read_csv(file_paths[22])
Professional_School_Degree = pd.read_csv(file_paths[23])
Public_Transportation = pd.read_csv(file_paths[24])
Total_in_School = pd.read_csv(file_paths[25])
Total_Jobs = pd.read_csv(file_paths[26])
Total_Time = pd.read_csv(file_paths[27])
Total_Transportation = pd.read_csv(file_paths[28])
Total_Travel_Time = pd.read_csv(file_paths[29])
Unemployed = pd.read_csv(file_paths[30])
Working_Age_Population = pd.read_csv(file_paths[31])
Population = pd.read_csv(file_paths[32])





# Step 4: Load into SQLite
conn = sqlite3.connect(":memory:")

Time_30_34.to_sql("Time_30_34", conn, index=False, if_exists="replace")
Time_35_39.to_sql("Time_35_39", conn, index=False, if_exists="replace")
Time_40_44.to_sql("Time_40_44", conn, index=False, if_exists="replace")
Time_45_59.to_sql("Time_45_59", conn, index=False, if_exists="replace")
Time_60_89.to_sql("Time_60_89", conn, index=False, if_exists="replace")
Time_90_More.to_sql("Time_90_more", conn, index=False, if_exists="replace")

Bachelors.to_sql("bachelors", conn, index=False, if_exists="replace")
Doctorate.to_sql("doctorate", conn, index=False, if_exists="replace")
Masters.to_sql("masters", conn, index=False, if_exists="replace")
Professional_School_Degree.to_sql("professional_school", conn, index=False, if_exists="replace")

Female_Finance_Jobs.to_sql("female_finance_jobs", conn, index=False, if_exists="replace")
Female_Information_Jobs.to_sql("female_information_jobs", conn, index=False, if_exists="replace")
Female_Managment_Jobs.to_sql("female_management_jobs", conn, index=False, if_exists="replace")
Female_Public_Administration_Jobs.to_sql("female_public_admin_jobs", conn, index=False, if_exists="replace")
Female_Scientific_Jobs.to_sql("female_scientific_jobs", conn, index=False, if_exists="replace")
Female_SocialServices_Jobs.to_sql("female_social_services_jobs", conn, index=False, if_exists="replace")

Male_Finance_Jobs.to_sql("male_finance_jobs", conn, index=False, if_exists="replace")
Male_Information_Jobs.to_sql("male_information_jobs", conn, index=False, if_exists="replace")
Male_Managment_Jobs.to_sql("male_management_jobs", conn, index=False, if_exists="replace")
Male_Public_Administration_Jobs.to_sql("male_public_admin_jobs", conn, index=False, if_exists="replace")
Male_Scientific_Jobs.to_sql("male_scientific_jobs", conn, index=False, if_exists="replace")
Male_SocialServices_Jobs.to_sql("male_social_services_jobs", conn, index=False, if_exists="replace")

Labor_Force.to_sql("labor_force", conn, index=False, if_exists="replace")
Unemployed.to_sql("unemployed", conn, index=False, if_exists="replace")
Working_Age_Population.to_sql("working_age_population", conn, index=False, if_exists="replace")

Percent_of_Private_Workers.to_sql("percent_private_workers", conn, index=False, if_exists="replace")

Public_Transportation.to_sql("public_transportation", conn, index=False, if_exists="replace")
Total_Transportation.to_sql("total_transportation", conn, index=False, if_exists="replace")
Total_Travel_Time.to_sql("total_travel_time", conn, index=False, if_exists="replace")
Total_Time.to_sql("total_time", conn, index=False, if_exists="replace")

Total_in_School.to_sql("total_in_school", conn, index=False, if_exists="replace")
Total_Jobs.to_sql("total_jobs", conn, index=False, if_exists="replace")

Population.to_sql("population", conn, index=False, if_exists="replace")



# Step 5: SQL Join using simplified column names
query = """
SELECT 
    population.Region AS Region,
    income.[Median household income],
    units.[Total Housing Units],
    vacancy.Vacant,
    homeownership.[Renter occupied],
    homeownership.[Owner occupied],
    built.[Houses Built],
    gross_rent.[Gross Rent Burden],
    median_value.[Median value (dollars)],
    owner_cost.[Owner Cost Burden],
    population.Population
FROM population
LEFT JOIN income ON population.Region = income.Region
LEFT JOIN units ON population.Region = units.Region
LEFT JOIN vacancy ON population.Region = vacancy.Region
LEFT JOIN homeownership ON population.Region = homeownership.Region
LEFT JOIN built ON population.Region = built.Region
LEFT JOIN gross_rent ON population.Region = gross_rent.Region
LEFT JOIN median_value ON population.Region = median_value.Region
LEFT JOIN owner_cost ON population.Region = owner_cost.Region
WHERE
    population.Population IS NOT NULL
    AND income.[Median household income] IS NOT NULL
    AND median_value.[Median value (dollars)] IS NOT NULL
    AND homeownership.[Renter occupied] IS NOT NULL
    AND homeownership.[Owner occupied] IS NOT NULL
    AND built.[Houses Built] IS NOT NULL
    AND vacancy.Vacant IS NOT NULL
    AND owner_cost.[Owner Cost Burden] IS NOT NULL
    AND gross_rent.[Gross Rent Burden] IS NOT NULL
"""




# Step 6: Run query and load results
job_df = pd.read_sql_query(query, conn)
conn.close()

# Step 7: Identify columns to convert (exclude Region)
cols_to_convert = merged_df.columns.drop('Region')


job_df = merged_df[(merged_df[cols_to_convert] != 0).all(axis=1)]

# View result
job_df
