### Importing Libraries

In [15]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import os

In [16]:
# Path
path= r"C:\Users\nabar\OneDrive\Documents\A6 Project"
# Dataframe
df=pd.read_csv(os.path.join(path, '02 Data', 'Prepared Data', 'cleaned_dataset.csv'))
df1=pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'Population by Race.csv'))


### Mapping Ethnicity

In [55]:

print(df["Ethnicity"].unique())

['Others' 'White' 'Hispanic' 'Black' 'Asian' 'South Asian' 'Native Indian'
 'Mixed' 'Pacifica']


In [18]:
# Standardizing ethnicity labels for consistency
mapping_dict = {
    "middle eastern": "Middle Eastern",
    "White European": "White",
    "black": "Black",
    "asian": "Asian",
    "south asian": "South Asian",
    "mixed": "Mixed",
    "PaciFica": "Pacifica"
}

# Apply mapping to fix inconsistencies
df["Ethnicity"] = df["Ethnicity"].replace(mapping_dict)

In [19]:
print(df["Ethnicity"].unique())

['Middle Eastern' 'White' 'Hispanic' 'Black' 'Asian' 'South Asian'
 'Native Indian' 'Others' 'Latino' 'Mixed' 'Pacifica']


In [20]:
# Adjusting ethnicity categories
mapping_dict = {
    "middle eastern": "Others",  # Merge Middle Eastern into Others
    "Middle Eastern": "Others",
    "Latino": "Hispanic",  # Merge Latino into Hispanic
}

# Apply mapping
df["Ethnicity"] = df["Ethnicity"].replace(mapping_dict)

### Cleaning Data

In [21]:
print(df.duplicated().sum())  # Count duplicate entries

0


In [22]:
print(df.isnull().sum())

CASE_NO_PATIENT'S                                     0
A1                                                    0
A2                                                    0
A3                                                    0
A4                                                    0
A5                                                    0
A6                                                    0
A7                                                    0
A8                                                    0
A9                                                    0
A10_Autism_Spectrum_Quotient                          0
Social_Responsiveness_Scale                           9
Age_Years                                             0
Qchat_10_Score                                        0
Speech Delay/Language Disorder                        0
Learning disorder                                     0
Genetic_Disorders                                     0
Depression                                      

In [23]:
# Fill missing values with the median of the column
df["Social_Responsiveness_Scale"] = df["Social_Responsiveness_Scale"].fillna(df["Social_Responsiveness_Scale"].median())

In [24]:
print(df.isnull().sum())

CASE_NO_PATIENT'S                                     0
A1                                                    0
A2                                                    0
A3                                                    0
A4                                                    0
A5                                                    0
A6                                                    0
A7                                                    0
A8                                                    0
A9                                                    0
A10_Autism_Spectrum_Quotient                          0
Social_Responsiveness_Scale                           0
Age_Years                                             0
Qchat_10_Score                                        0
Speech Delay/Language Disorder                        0
Learning disorder                                     0
Genetic_Disorders                                     0
Depression                                      

In [26]:
print(df.columns)  # Check column names in your original dataset
print(df1.columns)  # Check column names in the new dataset

Index(['CASE_NO_PATIENT'S', 'A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8',
       'A9', 'A10_Autism_Spectrum_Quotient', 'Social_Responsiveness_Scale',
       'Age_Years', 'Qchat_10_Score', 'Speech Delay/Language Disorder',
       'Learning disorder', 'Genetic_Disorders', 'Depression',
       'Global developmental delay/intellectual disability',
       'Social/Behavioural Issues', 'Childhood Autism Rating Scale',
       'Anxiety_disorder', 'Sex', 'Ethnicity', 'Jaundice',
       'Family_mem_with_ASD', 'Who_completed_the_test', 'ASD_traits'],
      dtype='object')
Index(['IBRC_Geo_ID', 'State', 'District Name', 'Year', 'Total Population',
       'White Alone', 'Black Alone', 'American Indian or Alaskan Native',
       'Asian Alone', 'Hawaiian or Pacific Islander Alone',
       'Two or More Races', 'Not Hispanic', 'Hispanic'],
      dtype='object')


In [33]:
print(df1.duplicated().sum())

0


In [34]:
print(df1.isnull().sum())

IBRC_Geo_ID            0
State                  0
District Name          0
Year                   0
Total Population       0
White                  0
Black                  0
American Indian        0
Asian                  0
Pacifica            3980
Mixed               3980
Not Hispanic           0
Hispanic               0
dtype: int64


In [37]:
# Fill with median value of each column

df1["Pacifica"] = df1["Pacifica"].fillna(df1["Pacifica"].median())
df1["Mixed"] = df1["Mixed"].fillna(df1["Mixed"].median())

In [39]:
print(df1.isnull().sum())

IBRC_Geo_ID         0
State               0
District Name       0
Year                0
Total Population    0
White               0
Black               0
American Indian     0
Asian               0
Pacifica            0
Mixed               0
Not Hispanic        0
Hispanic            0
dtype: int64


In [38]:
non_numeric = df1[~df1["Mixed"].apply(lambda x: pd.api.types.is_number(x))]
print(non_numeric)

Empty DataFrame
Columns: [IBRC_Geo_ID, State, District Name, Year, Total Population, White, Black, American Indian, Asian, Pacifica, Mixed, Not Hispanic, Hispanic]
Index: []


In [27]:
# Rename ethnicity columns
df1.rename(columns={
    "White Alone": "White",
    "Black Alone": "Black",
    "American Indian or Alaskan Native": "American Indian",
    "Asian Alone": "Asian",
    "Hawaiian or Pacific Islander Alone": "Pacifica",
    "Two or More Races": "Mixed"
}, inplace=True)

In [28]:
print(df1.columns)

Index(['IBRC_Geo_ID', 'State', 'District Name', 'Year', 'Total Population',
       'White', 'Black', 'American Indian', 'Asian', 'Pacifica', 'Mixed',
       'Not Hispanic', 'Hispanic'],
      dtype='object')


In [29]:
ethnicity_columns = ["White", "Black", "American Indian", "Asian", "Pacifica", "Mixed", "Hispanic"]

df_ethnicity_long = df1.melt(
    id_vars=["State", "Total Population"],
    value_vars=ethnicity_columns,
    var_name="Ethnicity",
    value_name="Population"
)

In [31]:
df_ethnicity_long.head

<bound method NDFrame.head of       State  Total Population Ethnicity  Population
0        AK             41125     White     37508.0
1        AK             42649     White     38893.0
2        AK             43496     White     39670.0
3        AK             44201     White     40265.0
4        AK             45588     White     41435.0
...     ...               ...       ...         ...
83575    CT            944943  Hispanic    178438.0
83576    CT            944347  Hispanic    182054.0
83577    CT            943038  Hispanic    185617.0
83578    CT            943971  Hispanic    190098.0
83579    CT            943332  Hispanic    193772.0

[83580 rows x 4 columns]>

###  ASD Trait Rates Per Ethnicity

In [40]:
# Count occurrences of ASD traits (0 and 1) per ethnicity
asd_counts = df.groupby("Ethnicity")["ASD_traits"].value_counts().unstack().fillna(0)

# Rename columns for clarity 
asd_counts.columns = ["No_ASD_Traits", "Has_ASD_Traits"]

# Reset index to flatten the DataFrame
asd_counts.reset_index(inplace=True)

In [41]:
# Calculate ASD trait rate as a percentage
asd_counts["ASD Rate"] = asd_counts["Has_ASD_Traits"] / (asd_counts["Has_ASD_Traits"] + asd_counts["No_ASD_Traits"])

In [43]:
# percentage 
asd_counts["ASD Rate (%)"] = asd_counts["ASD Rate"] * 100

In [45]:
# Check the output
print(asd_counts)

       Ethnicity  No_ASD_Traits  Has_ASD_Traits  ASD Rate  ASD Rate (%)
0          Asian          262.0           343.0  0.566942     56.694215
1          Black           14.0            39.0  0.735849     73.584906
2       Hispanic           16.0            50.0  0.757576     75.757576
3          Mixed            3.0             5.0  0.625000     62.500000
4  Native Indian            0.0             3.0  1.000000    100.000000
5         Others          313.0           125.0  0.285388     28.538813
6       Pacifica            1.0             7.0  0.875000     87.500000
7    South Asian          218.0            37.0  0.145098     14.509804
8          White           84.0           465.0  0.846995     84.699454


In [46]:
print(asd_counts[asd_counts["Ethnicity"] == "Hispanic"])  # Check Hispanic ASD rate

  Ethnicity  No_ASD_Traits  Has_ASD_Traits  ASD Rate  ASD Rate (%)
2  Hispanic           16.0            50.0  0.757576     75.757576


In [47]:
print(asd_counts.isnull().sum())  # See if any ethnicity has missing ASD rate

Ethnicity         0
No_ASD_Traits     0
Has_ASD_Traits    0
ASD Rate          0
ASD Rate (%)      0
dtype: int64


### Merge ASD Trait Rates with Population Data

In [49]:
choropleth_df = df_ethnicity_long.merge(
    asd_counts[["Ethnicity", "ASD Rate"]],
    on="Ethnicity",
    how="left"
)

In [50]:
# Compute Estimated ASD Cases per State
choropleth_df["Estimated ASD Cases"] = (
    choropleth_df["Population"] * choropleth_df["ASD Rate"]
)

In [53]:
# save the dataframes
df1.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'cleaned_population_data.csv'), index=False)
asd_counts.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'asd_trait_rates.csv'), index=False)
choropleth_df.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'choropleth_ready_data.csv'), index=False)