Visualization:

In [None]:
import pandas as pd
import json

# Build the homelessness visualization page: aggregate the counts from the
# CSV, embed them as a JavaScript `data` variable in the HTML template, and
# write the resulting page to disk.

# Read the CSV file
df = pd.read_csv(r'C:\Users\ykks\Desktop\zuoye\spark\final_dataset.csv')

# Sum the counts per (CoC Name, Year, Homelessness.Type), then pivot the
# homelessness types into columns so each record describes one CoC/year pair
grouped = (
    df.groupby(['CoC Name', 'Year', 'Homelessness.Type'])['Count']
    .sum()
    .unstack(level='Homelessness.Type')
    .reset_index()
)

# Rename columns to match the expected format
grouped = grouped.rename(columns={
    'Overall.Homeless': 'Overall_Homeless',
    'Sheltered.Total.Homeless': 'Sheltered',
    'Unsheltered.Homeless': 'Unsheltered'
})

# Fill NaN values with 0 (type combinations absent for a CoC/year pair)
grouped = grouped.fillna(0)

# Convert data to the required format: a list of one dict per row
data = grouped.to_dict('records')

# Create JavaScript code string
js_data = f"var data = {json.dumps(data)};"

# Read HTML template
with open('templates/index.html', 'r', encoding='utf-8') as file:
    html_template = file.read()

# Insert data into HTML template
html_with_data = html_template.replace('// INSERT_DATA_HERE', js_data)

# Write the result to a new HTML file
output_path = r'C:\Users\ykks\Desktop\zuoye\spark\project\ds-ciss-predictive-homlessness\Demo1\homeless_data_visualization.html'
with open(output_path, 'w', encoding='utf-8') as file:
    file.write(html_with_data)

print(f"HTML file has been generated: {output_path}")

# Verify the data was inserted into the HTML
print("Data insertion verification:")
print("Data variable definition found:" if "var data = " in html_with_data else "Data variable definition NOT found")
# The page contains the JSON serialization of each record, so compare against
# json.dumps(...): str(dict) is the Python repr (single quotes) and can never
# match. `bool(data)` guards the lookup when the grouped frame is empty.
first_item_found = bool(data) and json.dumps(data[0]) in html_with_data
print("First data item found:" if first_item_found else "First data item NOT found")

# Print the first few items of the processed data
print("\nProcessed data sample:")
print(json.dumps(data[:5], indent=2))

Merge data with calculation:

In [None]:
import pandas as pd
import numpy as np

# Attach the poverty and cost-burden rates (table 4) to the cleaned ACS data
# (table 3), place the new columns right after `rental_vacancy_rate`, and save
# the combined table.

# Load both input files
df_table3 = pd.read_csv(r'C:\Users\ykks\Desktop\zuoye\spark\10.20\ACS_Data\merged_final_data_cleaned.csv')
df_table4 = pd.read_csv(r'C:\Users\ykks\Desktop\zuoye\spark\10.20\data2\poverty_cost_burden_rates_cleaned.csv')

# Cast `year` to float in both frames so the merge key has the same dtype
for frame in (df_table3, df_table4):
    frame['year'] = frame['year'].astype(float)

# Left-join the table 4 rates onto table 3 (every table 3 row is kept)
merge_keys = ['geo_id', 'year', 'CoC_Number']
rate_columns = ['poverty_rate', 'cost_burden_rate']
df_merged = df_table3.merge(
    df_table4[merge_keys + rate_columns],
    on=merge_keys,
    how='left',
)

# Rebuild the column order: the original table 3 order, with the two new rate
# columns inserted immediately after `rental_vacancy_rate`
cols = list(df_table3.columns)
rate_pos = cols.index('rental_vacancy_rate')
split_at = rate_pos + 1
new_cols = cols[:split_at] + rate_columns + cols[split_at:]

# Apply the new column order
df_final = df_merged[new_cols]

# Save the result
output_file = r'C:\Users\ykks\Desktop\zuoye\spark\10.20\ACS_Data\final_merged_data.csv'
df_final.to_csv(output_file, index=False)

# Report summary statistics: row count and non-null count per rate column
print("合并结果统计：")
print(f"总行数: {len(df_final)}")
print("\n各比率的数据完整性：")
for rate_col in ('unemployment_rate', 'rental_vacancy_rate', 'poverty_rate', 'cost_burden_rate'):
    print(f"{rate_col} 非空值数量: {df_final[rate_col].notna().sum()}")

# Show the first rows of the key columns as a sanity check
print("\n合并后的数据示例（前几列）：")
preview_columns = ['geo_id', 'year', 'unemployment_rate', 'rental_vacancy_rate', 'poverty_rate', 'cost_burden_rate']
print(df_final[preview_columns].head())

Calculate per capita data:

In [None]:
import pandas as pd

# Derive per-capita homelessness metrics from the estimate file and save the
# augmented table as a new CSV.

# Load the source data
df = pd.read_csv(r"C:\Users\ykks\Desktop\zuoye\spark\model\Final_estimate.csv")

# NOTE: the input is assumed to contain the columns referenced below
# ("Total Population" plus the five homelessness count columns); adjust the
# names here if the actual data uses different ones.

# New per-capita column -> count column it is derived from
per_capita_sources = {
    "Overall_Homeless_Per_Capita": "Overall Homeless",
    "Overall_Homeless_Individuals_Per_Capita": "Overall Homeless Individuals",
    "Overall_Homeless_People_in_Families_Per_Capita": "Overall Homeless People in Families",
    "Unsheltered_Homeless_Per_Capita": "Unsheltered Homeless",
    "Sheltered_Homeless_Per_Capita": "Sheltered Total Homeless",
}

# Divide each count by the total population to get the per-capita value
for per_capita_col, count_col in per_capita_sources.items():
    df[per_capita_col] = df[count_col] / df["Total Population"]

# Save the result as a new CSV file
df.to_csv(r"C:\Users\ykks\Desktop\zuoye\spark\model\Final_estimate_per_capita.csv", index=False)

print("人均无家可归数据已成功计算并保存！")


Random forest model:


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

# Train and evaluate one random-forest regressor per homelessness target,
# using every non-target column as a feature, and report the MSE and R-squared
# of each model on a held-out 20% test split.

# Load the data
data = pd.read_csv(r'C:\Users\ykks\Desktop\zuoye\spark\model\Merged_Data.csv')

# Keep only the rows for the years 2010-2023
data = data[(data['Year'] >= 2010) & (data['Year'] <= 2023)]

# Inspect the data. DataFrame.info() prints its summary itself and returns
# None, so it must not be wrapped in print() (that would emit a stray "None").
data.info()

# Drop rows with missing values. The explicit copy makes `data` an independent
# frame, so the column assignments below do not write into a slice of the
# filtered frame (guards against pandas' SettingWithCopyWarning).
data = data.dropna().copy()

# Encode the non-numeric (object dtype) columns as integer labels; the fitted
# encoders are kept so the codes can be mapped back to the original values
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Target variables to model
targets = ['Overall Homeless', 'Overall Homeless Individuals', 'Overall Homeless People in Families', 'Unsheltered Homeless', 'Sheltered Total Homeless']

# Collected (target, mse, r2) tuples, one per target variable
results = []

# The feature matrix is the same for every target (all target columns are
# excluded so no target is used to predict another), so build it once
X = data.drop(columns=targets)

# Fit and evaluate a model for each target variable
for target in targets:
    y = data[target]

    # Split into training and test sets; the fixed random_state keeps the
    # split reproducible
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Create and train the random-forest regression model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Predict on the held-out test set
    y_pred = model.predict(X_test)

    # Evaluate model performance
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results.append((target, mse, r2))

# Print the evaluation results for all target variables
for target, mse, r2 in results:
    print(f"{target} - Mean Squared Error: {mse:.2e}, R-squared: {r2:.4f}")
