In [5]:
import pandas as pd

# Build `merged_data`: one row per environmental record (Area Code, Year) with
# one extra column per clinical indicator, left-joined so that environmental
# rows without clinical data are kept (their indicator columns are NaN).

# Load the environmental data
env_data = pd.read_csv(
    "C:/Users/emman/Downloads/PM2.5_Pollution_Data-Public/Scripts/Environmental_Data/Imputed_Clean_Environmental_Data/Finial_Prophet_Imputed_Cleaned_Environmental_Data.csv"
)

# Load the clinical data
clinical_data = pd.read_csv(
    "C:/Users/emman/Downloads/PM2.5_Pollution_Data-Public/Scripts/Clinical_Data/Phenotype_Disease_Conditions/Merged_Clinical_Filtered_Data/merged_clinical_data_V1.csv"
)

# Convert clinical 'Time period' to a numeric Year (first run of 4 digits).
# expand=False returns a Series; to_numeric(errors='coerce') maps periods that
# contain no 4-digit year to NaN instead of letting astype(int) raise on them.
year_text = clinical_data['Time period'].astype(str).str.extract(r'(\d{4})', expand=False)
clinical_data['Year'] = pd.to_numeric(year_text, errors='coerce')

# Rows without a usable year cannot be joined on Year, so drop them (and say so)
# before casting to int.
missing_year = clinical_data['Year'].isna()
if missing_year.any():
    print(f"Dropping {int(missing_year.sum())} clinical rows without a 4-digit year in 'Time period'")
    clinical_data = clinical_data.loc[~missing_year].copy()
clinical_data['Year'] = clinical_data['Year'].astype(int)

# Keep only the columns needed for the pivot: Area Code, Year, indicator, Value
clinical_subset = clinical_data[['Area Code', 'Year', 'Indicator Name', 'Value']].copy()

# Pivot clinical data so each indicator becomes a column.
# aggfunc='mean' is pivot_table's default, spelled out here because it matters:
# if several rows share the same (Area Code, Year, Indicator Name) they are
# averaged into a single cell.
# NOTE(review): the clinical file also has 'Sex' / 'Age' / 'Category' columns;
# confirm that averaging across them is intended.
clinical_pivot = clinical_subset.pivot_table(
    index=['Area Code', 'Year'],
    columns='Indicator Name',
    values='Value',
    aggfunc='mean',
).reset_index()

# Remove the leftover columns-axis label ('Indicator Name') created by the pivot
clinical_pivot.columns.name = None

# Merge with environmental data. The pivot has exactly one row per
# (Area Code, Year), so validate='many_to_one' documents and enforces that the
# left join can never duplicate environmental rows.
merged_data = pd.merge(
    env_data,
    clinical_pivot,
    on=['Area Code', 'Year'],
    how='left',
    validate='many_to_one',
)

# Check the merged dataset
print(merged_data.head())


   Numeric Area Code  Area Code Local Authority Name  Year  \
0                1.0  E07000223                 Adur  2012   
1                1.0  E07000223                 Adur  2013   
2                1.0  E07000223                 Adur  2014   
3                1.0  E07000223                 Adur  2015   
4                1.0  E07000223                 Adur  2016   

   PM2.5_Anthropogenic  PM2.5_Non_Anthropogenic  PM2.5_Total  \
0               8.3472                   2.2917      10.6389   
1               8.4336                   2.2585      10.6921   
2               7.6801                   2.2662       9.9463   
3               8.0684                   1.3994       9.4679   
4              10.5668                   0.5557      11.1224   

  PM2.5_Anthropogenic_ImputationMethod  \
0                             Original   
1                             Original   
2                             Original   
3                             Original   
4                             Or

In [4]:
# Write the pivot-merged frame to disk. The path is relative, so the file is
# created in the kernel's current working directory; index=False leaves the
# DataFrame's row index out of the CSV.
merged_csv_path = "merged_environmental_clinical_data.csv"
merged_data.to_csv(merged_csv_path, index=False)

print("Merged file saved successfully!")


Merged file saved successfully!


In [32]:
import pandas as pd

# Build `merged_data2`: the clinical data kept in long format (one row per
# clinical record) with the matching PM2.5 columns attached, inner-joined on
# (Area Code, Year) and trimmed to the columns of interest.

# Load the environmental data
env_data = pd.read_csv(
    "C:/Users/emman/Downloads/PM2.5_Pollution_Data-Public/Scripts/Environmental_Data/Imputed_Clean_Environmental_Data/Finial_Prophet_Imputed_Cleaned_Environmental_Data.csv"
)

# Load the clinical data
clinical_data = pd.read_csv(
    "C:/Users/emman/Downloads/PM2.5_Pollution_Data-Public/Scripts/Clinical_Data/Phenotype_Disease_Conditions/Merged_Clinical_Filtered_Data/merged_clinical_data_V1.csv"
)

# Extract a numeric Year from clinical 'Time period' (first run of 4 digits).
# expand=False returns a Series; to_numeric(errors='coerce') maps periods that
# contain no 4-digit year to NaN instead of letting astype(int) raise on them.
year_text = clinical_data['Time period'].astype(str).str.extract(r'(\d{4})', expand=False)
clinical_data['Year'] = pd.to_numeric(year_text, errors='coerce')

# Rows without a usable year could never match in the inner join below, so
# drop them explicitly (and report it) before casting to int.
missing_year = clinical_data['Year'].isna()
if missing_year.any():
    print(f"Dropping {int(missing_year.sum())} clinical rows without a 4-digit year in 'Time period'")
    clinical_data = clinical_data.loc[~missing_year].copy()
clinical_data['Year'] = clinical_data['Year'].astype(int)

# Check columns first (printed before the renames, so the original names show)
print(env_data.columns.tolist())
print(clinical_data.columns.tolist())

# Give the two area-name columns distinct, source-tagged names so it is clear
# which dataset each one came from after the merge. Reassigning (rather than
# inplace=True) leaves the same final state while keeping the step explicit.
env_data = env_data.rename(columns={'Local Authority Name': 'Env_Local_Authority_Name'})
clinical_data = clinical_data.rename(columns={'Area Name': 'Clinical_Area_Name'})

# Merge environmental and clinical data on Area Code and Year. The inner join
# keeps only (Area Code, Year) pairs present in both datasets.
# NOTE(review): this assumes env_data has one row per (Area Code, Year); if it
# does not, clinical rows are duplicated. Consider validate='one_to_many' once
# that has been confirmed.
merged_data2 = pd.merge(
    env_data,
    clinical_data,
    on=['Area Code', 'Year'],
    how='inner',
)

# Columns you want to keep
columns_of_interest = [
    'Area Code', 'Year',
    'Env_Local_Authority_Name',        # renamed environmental column
    'Clinical_Area_Name',              # renamed clinical column
    'Sex', 'Age',
    'PM2.5_Total',                     # from environmental data
    'PM2.5_Anthropogenic',
    'PM2.5_Non_Anthropogenic',
    'Condition', 'Value', 'Count', 'Denominator',     # from clinical data
    'Lower CI 95.0 limit', 'Upper CI 95.0 limit',
    'Lower CI 99.8 limit', 'Upper CI 99.8 limit',
    'Compared to England value or percentiles',
    'Time period'                      # keep original period if needed
]

# Select only the columns you want (raises KeyError if any listed column is missing)
merged_data2 = merged_data2[columns_of_interest]

# Display first few rows
print(merged_data2.head())


['Numeric Area Code', 'Area Code', 'Local Authority Name', 'Year', 'PM2.5_Anthropogenic', 'PM2.5_Non_Anthropogenic', 'PM2.5_Total', 'PM2.5_Anthropogenic_ImputationMethod', 'PM2.5_Non_Anthropogenic_ImputationMethod', 'PM2.5_Total_ImputationMethod']
['Indicator ID', 'Indicator Name', 'Parent Code', 'Parent Name', 'Area Code', 'Area Name', 'Area Type', 'Sex', 'Age', 'Category Type', 'Category', 'Time period', 'Value', 'Lower CI 95.0 limit', 'Upper CI 95.0 limit', 'Lower CI 99.8 limit', 'Upper CI 99.8 limit', 'Count', 'Denominator', 'Value note', 'Recent Trend', 'Compared to England value or percentiles', 'Compared to percentiles', 'Time period Sortable', 'New data', 'Compared to goal', 'Time period range', 'Condition', 'Year']
   Area Code  Year Env_Local_Authority_Name Clinical_Area_Name      Sex  \
0  E08000016  2012                 Barnsley           Barnsley  Persons   
1  E08000016  2014                 Barnsley           Barnsley  Persons   
2  E08000016  2015                 Barnsl

In [36]:
# Show the column names of the trimmed long-format merge as a plain list.
merged_column_names = list(merged_data2.columns)
print(merged_column_names)


['Area Code', 'Year', 'Env_Local_Authority_Name', 'Clinical_Area_Name', 'Sex', 'Age', 'PM2.5_Total', 'PM2.5_Anthropogenic', 'PM2.5_Non_Anthropogenic', 'Condition', 'Value', 'Count', 'Denominator', 'Lower CI 95.0 limit', 'Upper CI 95.0 limit', 'Lower CI 99.8 limit', 'Upper CI 99.8 limit', 'Compared to England value or percentiles', 'Time period']


In [37]:
# Write the long-format merge to disk. The path is relative, so the file is
# created in the kernel's current working directory; index=False leaves the
# DataFrame's row index out of the CSV.
merged_v2_csv_path = "merged_environmental_clinical_data-v2.csv"
merged_data2.to_csv(merged_v2_csv_path, index=False)

print("Merged file saved successfully!")


Merged file saved successfully!
