Visualization:

In [2]:
import pandas as pd
import json

# Read the CSV file
df = pd.read_csv(r'C:\Users\ykks\Desktop\zuoye\spark\yks_code\final_dataset.csv') 

# Group the data by CoC Name, Year, and Homelessness Type
grouped = df.groupby(['CoC Name', 'Year', 'Homelessness.Type'])['Count'].sum().unstack(level='Homelessness.Type').reset_index()

# Rename columns to match the expected format
grouped = grouped.rename(columns={
    'Overall.Homeless': 'Overall_Homeless',
    'Sheltered.Total.Homeless': 'Sheltered',
    'Unsheltered.Homeless': 'Unsheltered'
})

# Fill NaN values with 0
grouped = grouped.fillna(0)

# Convert data to the required format
data = grouped.to_dict('records')

# Create JavaScript code string
js_data = f"var data = {json.dumps(data)};"

# Read HTML template
with open('templates/index.html', 'r', encoding='utf-8') as file:
    html_template = file.read()

# Insert data into HTML template
html_with_data = html_template.replace('// INSERT_DATA_HERE', js_data)

# Write the result to a new HTML file
output_path = r'C:\Users\ykks\Desktop\zuoye\spark\yks_code\homeless_data_visualization.html'
with open(output_path, 'w', encoding='utf-8') as file:
    file.write(html_with_data)

print(f"HTML file has been generated: {output_path}")

# Verify the data was inserted into the HTML
print("Data insertion verification:")
print("Data variable definition found:" if "var data = " in html_with_data else "Data variable definition NOT found")
print("First data item found:" if str(data[0]) in html_with_data else "First data item NOT found")

# Print the first few items of the processed data
print("\nProcessed data sample:")
print(json.dumps(data[:5], indent=2))

HTML file has been generated: C:\Users\ykks\Desktop\zuoye\spark\yks_code\homeless_data_visualization.html
Data insertion verification:
Data variable definition found:
First data item NOT found

Processed data sample:
[
  {
    "CoC Name": "Akron, Barberton/Summit County CoC",
    "Year": 2007,
    "Overall_Homeless": 824,
    "Overall.Homeless.Individuals": 575,
    "Overall.Homeless.People.in.Families": 249,
    "Sheltered": 632,
    "Unsheltered": 192
  },
  {
    "CoC Name": "Akron, Barberton/Summit County CoC",
    "Year": 2008,
    "Overall_Homeless": 740,
    "Overall.Homeless.Individuals": 490,
    "Overall.Homeless.People.in.Families": 250,
    "Sheltered": 636,
    "Unsheltered": 104
  },
  {
    "CoC Name": "Akron, Barberton/Summit County CoC",
    "Year": 2009,
    "Overall_Homeless": 820,
    "Overall.Homeless.Individuals": 549,
    "Overall.Homeless.People.in.Families": 271,
    "Sheltered": 658,
    "Unsheltered": 162
  },
  {
    "CoC Name": "Akron, Barberton/Summit Coun

Calculate per capita data:

In [None]:
import pandas as pd


df = pd.read_csv(r"C:\Users\ykks\Desktop\zuoye\spark\model\Final_estimate.csv")

# "Total Population", "Overall Homeless", "Overall Homeless Individuals", 
# "Overall Homeless People in Families", "Unsheltered Homeless", "Sheltered Total Homeless"

# Calculate per capita homelessness related indicators
df["Overall_Homeless_Per_Capita"] = df["Overall Homeless"] / df["Total Population"]
df["Overall_Homeless_Individuals_Per_Capita"] = df["Overall Homeless Individuals"] / df["Total Population"]
df["Overall_Homeless_People_in_Families_Per_Capita"] = df["Overall Homeless People in Families"] / df["Total Population"]
df["Unsheltered_Homeless_Per_Capita"] = df["Unsheltered Homeless"] / df["Total Population"]
df["Sheltered_Homeless_Per_Capita"] = df["Sheltered Total Homeless"] / df["Total Population"]

# Save the results as a new CSV file if necessary
df.to_csv(r"C:\Users\ykks\Desktop\zuoye\spark\model\Final_estimate_per_capita.csv", index=False)

print("Homelessness data per capita has been successfully calculated and saved!")


人均无家可归数据已成功计算并保存！


Random_forest_model:


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder


data = pd.read_csv(r'C:\Users\ykks\Desktop\zuoye\spark\model\Merged_Data.csv')

# Filter only includes data from 2010-2023
data = data[(data['Year'] >= 2010) & (data['Year'] <= 2023)]

# Check the data and remove missing values
print(data.info())
data = data.dropna()

# Encode non-numerical features
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Define a list of different target variables
targets = ['Overall Homeless', 'Overall Homeless Individuals', 'Overall Homeless People in Families', 'Unsheltered Homeless', 'Sheltered Total Homeless']

# Create an empty result list to store the evaluation results for each target variable
results = []

# Prediction and evaluation of each target variable
for target in targets:
    # Feature and target variable selection
    X = data.drop(columns=targets)
    y = data[target]
    
   # Divide the training set and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
   # Create a random forest regression model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    
   # Training model
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    results.append((target, mse, r2))

for target, mse, r2 in results:
    print(f"{target} - Mean Squared Error: {mse:.2e}, R-squared: {r2:.4f}")


<class 'pandas.core.frame.DataFrame'>
Index: 5372 entries, 3 to 6523
Data columns (total 43 columns):
 #   Column                                                   Non-Null Count  Dtype  
---  ------                                                   --------------  -----  
 0   CoC_Number                                               5344 non-null   object 
 1   Year                                                     5372 non-null   int64  
 2   B01003_001E                                              5372 non-null   int64  
 3   B17001_002E                                              5372 non-null   int64  
 4   B25002_001E                                              5372 non-null   int64  
 5   B25002_003E                                              5372 non-null   int64  
 6   B25003_003E                                              5372 non-null   int64  
 7   B25003_001E                                              5372 non-null   int64  
 8   B25106_001E                      