In [1]:
import pandas as pd
import numpy as np
import re

# This script creates census-tract based socio-economic variables over the Fema v2 database. The variables that we have here are:
# 1. Population density
# 2. Housing density
# 3. Percentage of White Population   
# 4. Percentage of Black Population    
# 5. Percentage of Indian Population  
# 6. Percentage of Asian Population  
# 7. Percentage of Dual Race Population  (available only 1990 onwards)
# 8. Median Income of the census tract
# 9. Median Housing value of the census tract

# Our main source of data are:
# for 1990, 2000, 2010, 2011 - 2021 data: IPUMS crosswalk + IPUMS dataset
# for 1980 data: Logan et al (2014)

## Processing 1990's dataset

In [2]:
cw90 = pd.read_csv("C:/Users/Asus/Box/Flood Damage PredictionProject/Dataset/IPUMS Crosswalk/nhgis_blk1990_blk2010_gj/nhgis_blk1990_blk2010_gj.csv")

In [5]:
cw90.head()

Unnamed: 0,GJOIN1990,GJOIN2010,WEIGHT,PAREA_VIA_BLK00,len90,len10
0,G01000100201101A,G01000100201002004,0.000753,0.014284,16,18
1,G01000100201101A,G01000100201002005,0.04202,0.109618,16,18
2,G01000100201101A,G01000100201002006,0.262146,0.498133,16,18
3,G01000100201101A,G01000100201002016,0.237187,0.218109,16,18
4,G01000100201101A,G01000100201002023,0.099097,0.012864,16,18


In [4]:
# Getting string length
cw90['len90'] = cw90['GJOIN1990'].astype(str).apply(len)
cw90['len10'] = cw90['GJOIN2010'].astype(str).apply(len)

In [6]:
# Instead of recreating 2010's CBG, we instead try to identify which 1990's census tract does the 2010's CBG belong to
# Crosswalk: cw90 results
# If census tract code = 6 digits -> 18 or 17 -> then first 14 digits
# If census tract code = 4 digits -> 16 or 15 -> then first 12 digits

cw90['tract90'] = cw90.apply(lambda row: row['GJOIN1990'][:14] if row['len90'] >= 17 else row['GJOIN1990'][:12], axis=1)
cw90['blockgroup10'] = cw90['GJOIN2010'].str[:15]
cw90 = cw90[cw90['tract90'] != ""]

TypeError: 'float' object is not subscriptable

In [None]:
# Checking if blockgroup10 uniquely identify tract90 (it does not)
bg10 = cw90.groupby('blockgroup10').size().reset_index(name='n')
bg10_90 = cw90.groupby(['blockgroup10', 'tract90']).size().reset_index(name='n_2')

In [None]:
bg10_nomatchtract90 = pd.merge(bg10_90, bg10, on='blockgroup10', how='left')
bg10_nomatchtract90['n_diff'] = bg10_nomatchtract90['n'] - bg10_nomatchtract90['n_2']
bg10_nomatchtract90 = bg10_nomatchtract90[bg10_nomatchtract90['n_diff'] != 0]
bg10_nomatchtract90 = bg10_nomatchtract90.groupby('blockgroup10').size().reset_index(name='n_match')

In [None]:
bg10_nomatchtract90

In [None]:
# save the part that is matching multiple tract
bg10_nomatchtract90.to_csv("C:/Users/Asus/Box/Flood Damage PredictionProject/Dataset/IPUMS Crosswalk/nhgis_blk1990_blk2010_gj/multi_match90.csv", index=False)

In [None]:
# Exact tract matching
exact_match90 = pd.merge(bg10_90, bg10, on='blockgroup10', how='left')
exact_match90['n_diff'] = exact_match90['n'] - exact_match90['n_2']
exact_match90 = exact_match90[exact_match90['n_diff'] == 0]
exact_match90 = exact_match90.groupby(['blockgroup10', 'tract90']).size().reset_index(name='n')

# Creating weighting factor in bcg + diff census tract level
bg10_nomatchtract90w = pd.merge(cw90, exact_match90, on=['blockgroup10', 'tract90'], how='left')
bg10_nomatchtract90w = bg10_nomatchtract90w.groupby(['blockgroup10', 'tract90']).agg({'WEIGHT': 'sum', 'n': 'size'}).reset_index()

# Creating total weight in bcg
bg10_totweight = bg10_nomatchtract90w.groupby('blockgroup10').agg({'WEIGHT': 'sum'}).reset_index().rename(columns={'WEIGHT': 'WEIGHT_TOTAL'})

# Writing as CBG and combining match and nonmatch values
bg10_nomatchtract90weight = pd.merge(bg10_nomatchtract90w, bg10_totweight, on='blockgroup10', how='left')
bg10_nomatchtract90weight['weight'] = bg10_nomatchtract90weight['WEIGHT'] / bg10_nomatchtract90weight['WEIGHT_TOTAL']
bg10_nomatchtract90weight = bg10_nomatchtract90weight[['blockgroup10', 'tract90', 'weight']].rename(columns={'tract90': 'GJOIN1990'})

bg10_nomatchtract90weight.to_csv("C:/Users/Asus/Box/Flood Damage PredictionProject/Dataset/IPUMS Crosswalk/nhgis_blk1990_blk2010_gj/bcg_tract90_weighted.csv", index=False)

In [None]:
#### 2000 dataset -- similar methods

cw00 = pd.read_csv("C:/Users/Asus/Box/Flood Damage PredictionProject/Dataset/IPUMS Crosswalk/nhgis_blk2000_blk2010_gj/nhgis_blk2000_blk2010_gj.csv")

# Getting string length
cw00['len00'] = cw00['GJOIN2000'].astype(str).apply(len)
cw00['len10'] = cw00['GJOIN2010'].astype(str).apply(len)

# Identifying census tracts and block groups
cw00['tract00'] = cw00['GJOIN2000'].str[:14]
cw00['blockgroup10'] = cw00['GJOIN2010'].str[:15]
cw00 = cw00[cw00['tract00'] != ""]

# Total weight in cbg
bg10 = cw00.groupby('blockgroup10')['WEIGHT'].sum().reset_index(name='weight_total')

# Total weight in cbg + census tract
bg10_00 = cw00.groupby(['blockgroup10', 'tract00'])['WEIGHT'].sum().reset_index(name='weight_b')

# Calculating weight portion
bg10_nomatchtract00weight = pd.merge(bg10_00, bg10, on='blockgroup10')
bg10_nomatchtract00weight['weight'] = bg10_nomatchtract00weight['weight_b'] / bg10_nomatchtract00weight['weight_total']
bg10_nomatchtract00weight = bg10_nomatchtract00weight[['blockgroup10', 'tract00', 'weight']].rename(columns={'tract00': 'GJOIN2000'})

bg10_nomatchtract00weight.to_csv("C:/Users/Asus/Box/Flood Damage PredictionProject/Dataset/IPUMS Crosswalk/nhgis_blk2000_blk2010_gj/bcg_tract00_weighted.csv", index=False)

In [None]:
#### 2020 dataset
cw20 = pd.read_csv("C:/Users/Asus/Box/Flood Damage PredictionProject/Dataset/IPUMS Crosswalk/nhgis_blk2020_blk2010_gj/nhgis_blk2020_blk2010_gj.csv")

# Identifying census tracts and block groups
cw20['tract20'] = cw20['GJOIN2020'].str[:14]
cw20['blockgroup10'] = cw20['GJOIN2010'].str[:15]
cw20 = cw20[cw20['tract20'] != ""]
cw20 = cw20.drop(columns=['GJOIN2020']).rename(columns={'tract20': 'GJOIN2020'})

# Total weight in cbg
bg10 = cw20.groupby('blockgroup10')['WEIGHT'].sum().reset_index(name='weight_total')

# Total weight in cbg + census tract
bg10_20 = cw20.groupby(['blockgroup10', 'GJOIN2020'])['WEIGHT'].sum().reset_index(name='weight_b')

# Calculating weight portion
bg20_nomatchtract00weight = pd.merge(bg10_20, bg10, on='blockgroup10')
bg20_nomatchtract00weight['weight'] = bg20_nomatchtract00weight['weight_b'] / bg20_nomatchtract00weight['weight_total']
bg20_nomatchtract00weight = bg20_nomatchtract00weight[['blockgroup10', 'GJOIN2020', 'weight']]

bg20_nomatchtract00weight.to_csv("C:/Users/Asus/Box/Flood Damage PredictionProject/Dataset/IPUMS Crosswalk/bcg_tract20_weighted.csv", index=False)

In [None]:
# 1980 dataset -> Using tract TS from Trent
# Lowest unit in GSJOIN2010 -> Census tract instead of census block

# Reading datasets and initial filtering commented out in the original script
# cwts = pd.read_csv("C:/Users/Asus/Box/Flood Damage PredictionProject/Dataset/nghis census tract/nhgisCensusTract_csv/nhgis0009_csv/nhgis0009_ts_nominal_tract.csv")

# Trying to combine the data with the original dataset
data2 = pd.read_parquet("C:/Users/Asus/Box/Flood Damage PredictionProject/Dataset/july_24_flood_data.parquet.gzip")
data2 = pd.read_csv("C:/Users/Asus/Box/Flood Damage PredictionProject/Dataset/FimaNfipClaims.csv", dtype={'censusBlockGroupFips': str})

# All CBG data: 1980, 1990, 2000, 2020

def gjoin_splitter(var_name):
    a = var_name[1:3]
    b = var_name[4:7]
    c = var_name[8:15]
    return "".join([a, b, c])

# Example call to the function
print(gjoin_splitter("ancadasdadasdaadasds"))

# Creating census block group data and calculating match percentages is outlined but not fully implementable without additional context or data.

# Reading additional datasets for comparison
all90 = pd.read_csv("C:/Users/Asus/Box/Flood Damage PredictionProject/Dataset/bcg_tract90_weighted.csv")
all00 = pd.read_csv("C:/Users/Asus/Box/Flood Damage PredictionProject/Dataset/bcg_tract00_weighted.csv")
all20 = pd.read_csv("C:/Users/Asus/Box/Flood Damage PredictionProject/Dataset/bcg_tract20_weighted.csv")

In [None]:
# Reading CSV and initial data processing for 1990 dataset
cwts = pd.read_csv("C:/Users/Asus/Box/Flood Damage PredictionProject/Dataset/nghis census tract/nhgisCensusTract_csv/nhgis0009_csv/nhgis0009_ts_nominal_tract.csv")

# Calculating empty counts
empty_counts = (cwts == "").sum()

# Processing data for 1980, noting the lack of a direct crosswalk
all80 = pd.read_csv("C:/Users/Asus/Box/Flood Damage PredictionProject/Dataset/nghis census tract/nhgisCensusTract_csv/crosswalk_1980_2010.csv", 
                    dtype={'trtid80': str, 'trtid10': str})
all80['nchar80'] = all80['trtid80'].apply(len)
all80['nchar10'] = all80['trtid10'].apply(len)

# Adding FIPS codes and merging for 1980 data
all80 = all80.assign(
    state_fips=all80['trtid10'].str[:2],
    county_fips=all80['trtid10'].str[2:5],
    census_block_group=all80['trtid10'].str[5:11]
).assign(
    censusTract10=lambda x: "G" + x['state_fips'] + "0" + x['county_fips'] + "0" + x['census_block_group'],
    GJOIN1980=lambda x: "G" + x['state_fips'] + "0" + x['county_fips'] + "0" + x['census_block_group']
).loc[:, ['GJOIN1980', 'censusTract10', 'weight']]

# Summarizing weights for 1980 data
weight_summary_80 = all80.groupby('censusTract10')['weight'].sum().reset_index(name='tot_weight')

# Processing and transforming data for 1990
cwts_90 = cwts.query("GJOIN1990 != ''").assign(
    population=lambda x: x['AV0AA1990'],
    populationWhite=lambda x: x['B18AA1990'],
    populationBlack=lambda x: x['B18AB1990'],
    populationIndian=lambda x: x['B18AC1990'],
    populationAsian=lambda x: x['B18AD1990'],
    housingUrban=lambda x: x['AZ7AA1990'],
    housingRural=lambda x: x['AZ7AD1990'],
    medianIncome=lambda x: x['B79AA1990'],
    housingTotal=lambda x: x['A41AA1990']
).assign(
    exist=1
)

# Converting population figures into percentages for 1990
cwts_90 = cwts_90.assign(
    population_tot=lambda x: x[['populationWhite', 'populationBlack', 'populationIndian', 'populationAsian']].sum(axis=1)
).assign(
    percpopulationWhite=lambda x: x['populationWhite'] / x['population_tot'],
    percpopulationBlack=lambda x: x['populationBlack'] / x['population_tot'],
    percpopulationIndian=lambda x: x['populationIndian'] / x['population_tot'],
    percpopulationAsian=lambda x: x['populationAsian'] / x['population_tot']
)

In [None]:
# TODO!
# values in 2010 onwards (need some flip-join maybe)

# Also to do: Obtain values of LAND_AREA and median house value in these areas:

# Conversion factor
sqmt_to_sqmiles = 3.86102e-7

# Processing data for 1990
df90 = pd.read_csv("C:/Users/Asus/Box/Flood Damage PredictionProject/Dataset/nghis census tract/nhgisCensusTract_csv/nhgis0009_csv/nhgis0009_ds120_1990_tract.csv")
df90 = df90[['GISJOIN', 'AREALAND', 'EST001']].rename(columns={'AREALAND': 'LAND_AREA', 'EST001': 'Value'})
df90['LAND_AREA'] = df90['LAND_AREA'] * sqmt_to_sqmiles

# Processing data for 2000
df00 = pd.read_csv("C:/Users/Asus/Box/Flood Damage PredictionProject/Dataset/nghis census tract/nhgisCensusTract_csv/nhgis0009_csv/nhgis0009_ds151_2000_tract.csv")
df00 = df00[['GISJOIN', 'AREALAND', 'GB7001']].rename(columns={'AREALAND': 'LAND_AREA', 'GB7001': 'Value'})
df00['LAND_AREA'] = df00['LAND_AREA'] * sqmt_to_sqmiles

# Processing data for 2010
df10 = pd.read_csv("C:/Users/Asus/Box/Flood Damage PredictionProject/Dataset/nghis census tract/nhgisCensusTract_csv/pdb2012bgv9_us.csv", dtype={'GIDBG': str})
df10 = df10[['GIDBG', 'LAND_AREA', 'Med_house_val_tr_ACS_06_10']].rename(columns={'Med_house_val_tr_ACS_06_10': 'Value'})
df10['Value'] = pd.to_numeric(df10['Value'], errors='coerce')  # Assuming parse_number equivalent
df10['gidbg'] = df10['GIDBG'].str[:11]
df10 = df10.assign(
    state_fips=df10['gidbg'].str[:2],
    county_fips=df10['gidbg'].str[2:5],
    census_tract=df10['gidbg'].str[5:11]
).assign(GISJOIN=lambda x: "G" + x['state_fips'] + "0" + x['county_fips'] + "0" + x['census_tract'])
df10 = df10.groupby('GISJOIN').agg({'LAND_AREA': 'sum', 'Value': 'mean'}).reset_index()

# Processing data for 2020
df20 = pd.read_csv("C:/Users/Asus/Box/Flood Damage PredictionProject/Dataset/nghis census tract/nhgisCensusTract_csv/pdb2022tr.csv", dtype={'GIDTR': str})
df20 = df20[['GIDTR', 'LAND_AREA', 'Med_House_Value_ACS_16_20']].rename(columns={'Med_House_Value_ACS_16_20': 'Value'})
df20['Value'] = pd.to_numeric(df20['Value'], errors='coerce')  # Assuming parse_number equivalent
df20 = df20.assign(
    state_fips=df20['GIDTR'].str[:2],
    county_fips=df20['GIDTR'].str[2:5],
    census_tract=df20['GIDTR'].str[5:11]
).assign(GISJOIN=lambda x: "G" + x['state_fips'] + "0" + x['county_fips'] + "0" + x['census_tract'])
df20 = df20[['GISJOIN', 'LAND_AREA', 'Value']]

In [None]:
# Assuming all90, df90, all00, df00, and cwts dataframes are already loaded as per previous instructions.

# Conversion for square meters to square miles
sqmt_to_sqmiles = 3.86102e-7

# Correcting the name convention for all
all90.rename(columns={'tract90': 'GJOIN1990'}, inplace=True)
all90 = all90[['blockgroup10', 'GJOIN1990', 'weight']]

df90.rename(columns={'GISJOIN': 'GJOIN1990'}, inplace=True)

all00.rename(columns={'tract00': 'GJOIN2000'}, inplace=True)
all00 = all00[['blockgroup10', 'GJOIN2000', 'weight']]

df00.rename(columns={'GISJOIN': 'GJOIN2000'}, inplace=True)

# Writing 90's data
full_90_data = pd.merge(all90, df90, on='GJOIN1990', how='left')
full_90_data = pd.merge(full_90_data, cwts_90, on='GJOIN1990', how='left')
full_90_data['Year'] = 1990
full_90_data.to_csv("C:/Users/Asus/Box/Flood Damage PredictionProject/Dataset/nghis census tract/nhgisCensusTract_csv/all90_data.csv", index=False)

# Writing 00's data
full_00_data = pd.merge(all00, df00, on='GJOIN2000', how='left')
full_00_data = pd.merge(full_00_data, cwts_00, on='GJOIN2000', how='left')
full_00_data['Year'] = 2000
full_00_data.to_csv("C:/Users/Asus/Box/Flood Damage PredictionProject/Dataset/nghis census tract/nhgisCensusTract_csv/all00_data.csv", index=False)

# For 2010's data, assuming cwts is already loaded and prepared
# The R script uses tidyverse functions to rename columns based on patterns and pivot_longer which are not directly available in pandas. We will approximate these steps.

# Renaming columns based on patterns (assuming cwts is already prepared similarly to cwts_90 and cwts_00)
rename_columns = {col: col.split('_')[0] + col[-4:] for col in cwts.columns if '2010' in col}
cwts.rename(columns=rename_columns, inplace=True)

# Pivoting longer - converting wide format to long format (an approximate approach using pd.melt, since pivot_longer is not a direct pandas function)
cwts_melted = cwts.melt(id_vars=['GJOIN2010'], var_name='variable', value_name='value')
cwts_melted['Year'] = cwts_melted['variable'].apply(lambda x: int(x[-4:]))

# Assuming df10 is already loaded and prepared, and joining data for 2010's
full_10_data = pd.merge(cwts_melted, df10, left_on='GJOIN2010', right_on='GISJOIN', how='left')
full_10_data = full_10_data[full_10_data['Year'] < 2020]
full_10_data.to_csv("C:/Users/Asus/Box/Flood Damage PredictionProject/Dataset/nhgisCensusTract_csv/all10_data.csv", index=False)


In [None]:
# Calculating missing values
missing_values = cwts.isna().sum()

# Filtering and selecting for 2020
cwts20 = cwts[cwts['GJOIN2020'] != ""].filter(regex='(AV|B18|AZ7|B79|A41).*5|GJOIN2020')

# Dynamically renaming columns based on patterns
pattern_subs = {
    'AV0AA': 'population_',
    'B18AA': 'populationWhite_',
    'B18AB': 'populationBlack_',
    'B18AC': 'populationIndian_',
    'B18AD': 'populationAsian_',
    'B18AE': 'populationDualRace_',
    'AZ7AA': 'housingUrban_',
    'AZ7AD': 'housingRural_',
    'B79AA': 'medianIncome_',
    'A41AA': 'housingTotal_'
}

for pattern, replacement in pattern_subs.items():
    cwts20.columns = [re.sub(pattern + '(\d+)', replacement + '\\1', col) for col in cwts20.columns]

# Pivoting longer - converting wide format to long format. We use pandas.melt() as an approximate solution.
cwts20_melted = cwts20.melt(id_vars=['GJOIN2020'], var_name='variable', value_name='value')
cwts20_melted['Group'] = cwts20_melted['variable'].str.extract('(\d+)$').astype(int)
cwts20_melted['Year'] = (cwts20_melted['Group'] - 5) / 10 + 2000
cwts20_melted.rename(columns={'GJOIN2020': 'GISJOIN'}, inplace=True)

# Assuming all20 dataframe is already prepared and df20 loaded from previous steps
all21 = all20.copy()

cwts_20_only = cwts20_melted[cwts20_melted['Year'] == 2020]
cwts_21_only = cwts20_melted[cwts20_melted['Year'] == 2021]

# Merging and preparing the final data for 2020
full_20_data = pd.merge(all20.rename(columns={'tract20': 'GISJOIN'}), cwts_20_only, on='GISJOIN', how='left')
full_20_data = pd.merge(full_20_data, df20, on='GISJOIN', how='left').drop(columns=['Group'])

full_20_data.to_csv("C:/Users/Asus/Box/Flood Damage PredictionProject/Dataset/nhgisCensusTract_csv/all20_data.csv", index=False)

# Preparing the final data for 2021
full_21_data = pd.merge(all21.rename(columns={'tract20': 'GISJOIN'}), cwts_21_only, on='GISJOIN', how='left')
full_21_data = pd.merge(full_21_data, df20, on='GISJOIN', how='left')

full_21_data.to_csv("C:/Users/Asus/Box/Flood Damage PredictionProject/Dataset/nhgisCensusTract_csv/all21_data.csv", index=False)

In [None]:
# FEMA data processing for different decades
fema90 = data2[(data2['yearOfLoss'] >= 1990) & (data2['yearOfLoss'] < 2000)]
fema00 = data2[(data2['yearOfLoss'] >= 2000) & (data2['yearOfLoss'] < 2010)]
fema20 = data2[(data2['yearOfLoss'] >= 2020) & (data2['yearOfLoss'] < 2029)]

# Assuming all90, all00, and all20 dataframes are already loaded as per previous instructions

# Identifying non-matching weights for different datasets
weight_nonmatch90 = all90[all90['weight'] != 1]
non_match90 = weight_nonmatch90.groupby('blockgroup10').agg(n=('weight', 'size'), tot_weight=('weight', 'sum')).reset_index()

weight_nonmatch00 = all00[all00['weight'] != 1]
non_match00 = weight_nonmatch00.groupby('blockgroup10').agg(n=('weight', 'size'), tot_weight=('weight', 'sum')).reset_index()

weight_nonmatch20 = all20[all20['weight'] != 1]
non_match20 = weight_nonmatch20.groupby('blockgroup10').agg(n=('weight', 'size'), tot_weight=('weight', 'sum')).reset_index()

# Adding GIS mode to FEMA datasets and joining with non-match data
def add_gis_mode(df):
    df['nchar'] = df['censusBlockGroupFips'].apply(len)
    df['state_fips'] = df['censusBlockGroupFips'].str[:2]
    df['county_fips'] = df['censusBlockGroupFips'].str[2:5]
    df['census_block_group'] = df['censusBlockGroupFips'].str[5:12]
    df['blockgroup10'] = "G" + df['state_fips'] + "0" + df['county_fips'] + "0" + df['census_block_group']
    return df

fema90_GIS = add_gis_mode(fema90)
fema00_GIS = add_gis_mode(fema00)
fema20_GIS = add_gis_mode(fema20)

fema90_GIS = pd.merge(fema90_GIS, non_match90, on='blockgroup10', how='left').dropna(subset=['tot_weight'])
fema00_GIS = pd.merge(fema00_GIS, non_match00, on='blockgroup10', how='left').dropna(subset=['tot_weight'])
fema20_GIS = pd.merge(fema20_GIS, non_match20, on='blockgroup10', how='left').dropna(subset=['tot_weight'])

# Exporting to CSV
fema90_GIS.to_csv("C:/Users/Asus/Box/Flood Damage PredictionProject/Dataset/nhgisCensusTract_csv/multi_tract90_confirm.csv", index=False)
fema00_GIS.to_csv("C:/Users/Asus/Box/Flood Damage PredictionProject/Dataset/nhgisCensusTract_csv/multi_tract00_confirm.csv", index=False)
fema20_GIS.to_csv("C:/Users/Asus/Box/Flood Damage PredictionProject/Dataset/nhgisCensusTract_csv/multi_tract20_confirm.csv", index=False)

# The section for 1980's data and the crosswalk utilization are noted but not directly implemented due to the nature of the instructions and the need for specific datasets and additional context.

In [None]:
# Assuming all80, all80sam, and area dataframes are loaded as per previous instructions

# Renaming and calculating population percentages for the 1980 dataset
all80_new = all80.rename(columns={
    'NHWHT80': 'populationWhite',
    'NTV80': 'populationIndian',
    'NHBLK80': 'populationBlack',
    'ASIAN80': 'populationAsian',
    'POP80': 'population',
    'MHMVAL80': 'Value',
    'TRTID10': 'trtid10'
})
all80_new['population_tot'] = all80_new[['populationWhite', 'populationBlack', 'populationIndian', 'populationAsian']].sum(axis=1)
all80_new['population_diff'] = all80_new['population_tot'] - all80_new['population']  # Note on potential discrepancies
all80_new['percpopulationWhite'] = all80_new['populationWhite'] / all80_new['population_tot']
all80_new['percpopulationBlack'] = all80_new['populationBlack'] / all80_new['population_tot']
all80_new['percpopulationIndian'] = all80_new['populationIndian'] / all80_new['population_tot']
all80_new['percpopulationAsian'] = all80_new['populationAsian'] / all80_new['population_tot']
all80_new['trtid10'] = all80_new['trtid10'].str.zfill(11)  # Padding trtid10 to ensure 11 characters

# Processing all80sam data
all80sam_new = all80sam.rename(columns={'hinc80': 'medianIncome', 'hh80': 'house'})
all80sam_new['Year'] = 1980
all80sam_new['exist'] = 1
all80sam_new['trtid10'] = all80sam_new['trtid10'].str.zfill(11)  # Padding trtid10

# Assuming 'area' dataframe is loaded from the area data mentioned in the comment
area['GIDBG'] = area['GIDBG'].apply(lambda x: x.zfill(12))
area['trtid10'] = area['GIDBG'].str[:11]
area = area.groupby('trtid10')['LAND_AREA'].sum().reset_index()

# Joining datasets
all80_new = pd.merge(all80_new, all80sam_new, on='trtid10', how='left')
all80_new = pd.merge(all80_new, area, on='trtid10', how='left')

# Final adjustment and selection for each year
all80_final = all80_new.assign(
    populationDensity=all80_new['population'] / all80_new['LAND_AREA'],
    housingDensity=all80_new['house'] / all80_new['LAND_AREA']
).filter(items=[
    'trtid10', 'percpopulationWhite', 'percpopulationBlack', 'percpopulationIndian', 'percpopulationAsian',
    'populationDensity', 'housingDensity', 'medianIncome', 'Year'
])

all80_final.to_csv("C:/Users/Asus/Box/Flood Damage PredictionProject/Dataset/nhgisCensusTract_csv/all80_data.csv", index=False)

# Adjusting LAND_AREA for 1990 data in full_90_data
# Assuming full_90_data dataframe is loaded as per previous instructions
full_90_data['LAND_AREA'] *= 1000  # Conversion if necessary
full_90_data['populationDensity'] = full_90_data['population'] / full_90_data['LAND_AREA']
full_90_data['housingDensity'] = full_90_data['housingTotal'] / full_90_data['LAND_AREA']
# Selecting and renaming for final 1990 dataset preparation
all90_final = full_90_data[['blockgroup10', 'GJOIN1990', 'percpopulationWhite', 'percpopulationBlack', 
                            'percpopulationIndian', 'percpopulationAsian', 'populationDensity', 
                            'housingDensity', 'medianIncome', 'Year']]

# Assuming fema80 dataset is loaded as per previous instructions
fema80new = fema80.assign(cbgexist=fema80['censusBlockGroupFips'].apply(len) > 0)
fema80new['trtid10'] = fema80new['censusBlockGroupFips'].str[:11]

# Final join for FEMA data with all80 data
final_data = pd.merge(fema80new, all80_final, on='trtid10', how='left')