## **Libraries**

In [1]:
import json
import pandas as pd
import pymongo
from pymongo import MongoClient
import numpy as np

 # JSON for read json file
 # Pandas for data manipulation

## **Importing JSON file**

In [2]:
filepath = "C:/Users/cmanu/OneDrive/Escritorio/NCIDA/ANALYTICS/PROJECT/indicatorsad.json"

def import_json(filepath):
    try:
        with open (filepath, "r") as file:
            data = json.load(file)
        return data
    except FileNotFoundError:
        print(f"The file {filepath} was not found")
    except PermissionError:
        print(f"Insufficient permission to read the file {filepath}")
    except json.JSONDecodeError as jde:
        print(f"JSONDecodeError: {jde}")
        print("Please ensure the file contains valid JSON.")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
    return {}

## **Loading data**

In [3]:
data = import_json(filepath) # Reading JSON file

indicators = data
indicators

{'meta': {'view': {'id': '8pt5-q6wp',
   'name': 'Indicators of Anxiety or Depression Based on Reported Frequency of Symptoms During Last 7 Days',
   'assetType': 'dataset',
   'attribution': 'NCHS/DHIS',
   'averageRating': 0,
   'category': 'NCHS',
   'createdAt': 1589990736,
   'description': 'The U.S. Census Bureau, in collaboration with five federal agencies, launched the Household Pulse Survey to produce data on the social and economic impacts of Covid-19 on American households.  The Household Pulse Survey was designed to gauge the impact of the pandemic on employment status, consumer spending, food security, housing, education disruptions, and dimensions of physical and mental wellness.\n\nThe survey was designed to meet the goal of accurate and timely weekly estimates. It was conducted by an internet questionnaire, with invitations to participate sent by email and text message. The sample frame is the Census Bureau Master Address File Data. Housing units linked to one or more e

In [4]:
pd.set_option('display.max_columns', None)

# Column names from the JSON file
columns = [col["name"] for col in data["meta"]["view"]["columns"]]
rows = data["data"]

# JSON file into data frame
indicators = pd.DataFrame(rows, columns=columns)
indicators.head()

Unnamed: 0,sid,id,position,created_at,created_meta,updated_at,updated_meta,meta,Indicator,Group,State,Subgroup,Phase,Time Period,Time Period Label,Time Period Start Date,Time Period End Date,Value,Low CI,High CI,Confidence Interval,Quartile Range
0,row-bg55_9ag3~k28u,00000000-0000-0000-3CE1-869377BF2C8A,0,1728050795,,1728050795,,{ },Symptoms of Depressive Disorder,National Estimate,United States,United States,1,1,"Apr 23 - May 5, 2020",2020-04-23T00:00:00,2020-05-05T00:00:00,23.5,22.7,24.3,22.7 - 24.3,
1,row-54up.86wf-96i2,00000000-0000-0000-BCEA-D97191E97FCC,0,1728050795,,1728050795,,{ },Symptoms of Depressive Disorder,By Age,United States,18 - 29 years,1,1,"Apr 23 - May 5, 2020",2020-04-23T00:00:00,2020-05-05T00:00:00,32.7,30.2,35.2,30.2 - 35.2,
2,row-n4zm.k6qt~nvs5,00000000-0000-0000-5A5F-DE9322C29F44,0,1728050795,,1728050795,,{ },Symptoms of Depressive Disorder,By Age,United States,30 - 39 years,1,1,"Apr 23 - May 5, 2020",2020-04-23T00:00:00,2020-05-05T00:00:00,25.7,24.1,27.3,24.1 - 27.3,
3,row-cjc8_5q9v-zj6z,00000000-0000-0000-976D-89DBD361B062,0,1728050795,,1728050795,,{ },Symptoms of Depressive Disorder,By Age,United States,40 - 49 years,1,1,"Apr 23 - May 5, 2020",2020-04-23T00:00:00,2020-05-05T00:00:00,24.8,23.3,26.2,23.3 - 26.2,
4,row-jsrf-vxiz~9i6r,00000000-0000-0000-BDF1-6960841430B0,0,1728050795,,1728050795,,{ },Symptoms of Depressive Disorder,By Age,United States,50 - 59 years,1,1,"Apr 23 - May 5, 2020",2020-04-23T00:00:00,2020-05-05T00:00:00,23.2,21.5,25.0,21.5 - 25.0,


In [5]:
indicators.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16794 entries, 0 to 16793
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   sid                     16794 non-null  object
 1   id                      16794 non-null  object
 2   position                16794 non-null  int64 
 3   created_at              16794 non-null  int64 
 4   created_meta            0 non-null      object
 5   updated_at              16794 non-null  int64 
 6   updated_meta            0 non-null      object
 7   meta                    16794 non-null  object
 8   Indicator               16794 non-null  object
 9   Group                   16794 non-null  object
 10  State                   16794 non-null  object
 11  Subgroup                16794 non-null  object
 12  Phase                   16794 non-null  object
 13  Time Period             16794 non-null  object
 14  Time Period Label       16794 non-null  object
 15  Ti

In [6]:
indicators.describe()

Unnamed: 0,position,created_at,updated_at
count,16794.0,16794.0,16794.0
mean,0.0,1728051000.0,1728051000.0
std,0.0,0.0,0.0
min,0.0,1728051000.0,1728051000.0
25%,0.0,1728051000.0,1728051000.0
50%,0.0,1728051000.0,1728051000.0
75%,0.0,1728051000.0,1728051000.0
max,0.0,1728051000.0,1728051000.0


In [7]:
# Dropping columns that are not necessary for analysis
drop_columns = indicators[["sid", "position", "created_at", "created_meta", "updated_at", "meta", "updated_meta", "Time Period Label"]]
indicators = indicators.drop(columns=drop_columns)
indicators.head()

Unnamed: 0,id,Indicator,Group,State,Subgroup,Phase,Time Period,Time Period Start Date,Time Period End Date,Value,Low CI,High CI,Confidence Interval,Quartile Range
0,00000000-0000-0000-3CE1-869377BF2C8A,Symptoms of Depressive Disorder,National Estimate,United States,United States,1,1,2020-04-23T00:00:00,2020-05-05T00:00:00,23.5,22.7,24.3,22.7 - 24.3,
1,00000000-0000-0000-BCEA-D97191E97FCC,Symptoms of Depressive Disorder,By Age,United States,18 - 29 years,1,1,2020-04-23T00:00:00,2020-05-05T00:00:00,32.7,30.2,35.2,30.2 - 35.2,
2,00000000-0000-0000-5A5F-DE9322C29F44,Symptoms of Depressive Disorder,By Age,United States,30 - 39 years,1,1,2020-04-23T00:00:00,2020-05-05T00:00:00,25.7,24.1,27.3,24.1 - 27.3,
3,00000000-0000-0000-976D-89DBD361B062,Symptoms of Depressive Disorder,By Age,United States,40 - 49 years,1,1,2020-04-23T00:00:00,2020-05-05T00:00:00,24.8,23.3,26.2,23.3 - 26.2,
4,00000000-0000-0000-BDF1-6960841430B0,Symptoms of Depressive Disorder,By Age,United States,50 - 59 years,1,1,2020-04-23T00:00:00,2020-05-05T00:00:00,23.2,21.5,25.0,21.5 - 25.0,


In [8]:
indicators["Time Period Start Date"] = pd.to_datetime(indicators["Time Period Start Date"])
indicators["Time Period End Date"] = pd.to_datetime(indicators["Time Period End Date"])
indicators.head()

Unnamed: 0,id,Indicator,Group,State,Subgroup,Phase,Time Period,Time Period Start Date,Time Period End Date,Value,Low CI,High CI,Confidence Interval,Quartile Range
0,00000000-0000-0000-3CE1-869377BF2C8A,Symptoms of Depressive Disorder,National Estimate,United States,United States,1,1,2020-04-23,2020-05-05,23.5,22.7,24.3,22.7 - 24.3,
1,00000000-0000-0000-BCEA-D97191E97FCC,Symptoms of Depressive Disorder,By Age,United States,18 - 29 years,1,1,2020-04-23,2020-05-05,32.7,30.2,35.2,30.2 - 35.2,
2,00000000-0000-0000-5A5F-DE9322C29F44,Symptoms of Depressive Disorder,By Age,United States,30 - 39 years,1,1,2020-04-23,2020-05-05,25.7,24.1,27.3,24.1 - 27.3,
3,00000000-0000-0000-976D-89DBD361B062,Symptoms of Depressive Disorder,By Age,United States,40 - 49 years,1,1,2020-04-23,2020-05-05,24.8,23.3,26.2,23.3 - 26.2,
4,00000000-0000-0000-BDF1-6960841430B0,Symptoms of Depressive Disorder,By Age,United States,50 - 59 years,1,1,2020-04-23,2020-05-05,23.2,21.5,25.0,21.5 - 25.0,


In [9]:
indicators["Start year"] = indicators["Time Period Start Date"].dt.year

In [10]:
indicators["Time Period Start Date"].value_counts()

2021-09-15    234
2023-02-01    234
2022-12-09    234
2022-11-02    234
2022-10-05    234
             ... 
2021-07-06     63
2021-10-12     63
2020-07-22     57
2020-12-22     57
2021-03-30     57
Name: Time Period Start Date, Length: 82, dtype: int64

In [11]:
indicators["End year"] = indicators["Time Period End Date"].dt.year

In [12]:
indicators["Days count of symptoms"] = (indicators["Time Period End Date"] - indicators["Time Period Start Date"]).dt.days
indicators["Days count of symptoms"].value_counts()

12    11442
5      2310
27     2163
10      234
14      177
6       162
20       81
8        81
42       81
49       63
Name: Days count of symptoms, dtype: int64

In [13]:
indicators["Symtoms during the same year"] = indicators["End year"]- indicators["Start year"]

indicators["Symtoms during the same year"] = np.where(
    indicators["End year"] - indicators["Start year"] == 0, "Yes", "No")

In [14]:
indicators["Subgroup"].value_counts()

United States                      246
Hispanic or Latino                 246
18 - 29 years                      246
Bachelor's degree or higher        246
Some college/Associate's degree    246
                                  ... 
Cis-gender female                  132
Transgender                        132
Gay or lesbian                     132
Straight                           132
Bisexual                           132
Name: Subgroup, Length: 78, dtype: int64

In [15]:
def checkage(row):
    subgroup = row["Subgroup"]
    return "Yes" if subgroup[0].isdigit() else "No"

indicators["Related to Age"] = indicators.apply(checkage, axis=1)
indicators["Related to Age"]

0         No
1        Yes
2        Yes
3        Yes
4        Yes
        ... 
16789     No
16790     No
16791     No
16792     No
16793     No
Name: Related to Age, Length: 16794, dtype: object

In [16]:
indicators["Group"].value_counts()

By State                      11016
By Age                         1722
By Race/Hispanic ethnicity     1230
By Education                    984
By Sex                          492
By Gender identity              396
By Sexual orientation           396
By Disability status            312
National Estimate               246
Name: Group, dtype: int64

In [17]:
indicators[["Start Age", "End Age"]] = indicators["Subgroup"].str.extract(r"(\d+)\s*-\s*(\d+)")
indicators[["Start Age", "End Age"]] = indicators[["Start Age", "End Age"]].replace({np.nan: 0})
indicators[["Start Age", "End Age"]]

Unnamed: 0,Start Age,End Age
0,0,0
1,18,29
2,30,39
3,40,49
4,50,59
...,...,...
16789,0,0
16790,0,0
16791,0,0
16792,0,0


In [18]:
indicators["Phase"].value_counts()

1                        2520
3.1                      1998
3.2                      1404
3.0 (Jan 6 - Mar 29)     1260
2                        1050
4.1                       936
3.0 (Oct 28 - Dec 21)     840
3.7                       702
4                         702
3.9                       702
3.8                       702
3.4                       702
3.6                       702
3.5                       702
-1                        702
3.3                       702
4.2                       468
Name: Phase, dtype: int64

In [19]:
indicators["Phase"] = pd.to_numeric(indicators["Phase"], errors='coerce')

In [20]:
indicators["Phase"] = indicators["Phase"].fillna(3.0)

In [21]:
indicators["Phase"].value_counts()

 1.0    2520
 3.0    2100
 3.1    1998
 3.2    1404
 2.0    1050
 4.1     936
-1.0     702
 3.3     702
 3.4     702
 3.5     702
 3.6     702
 3.7     702
 3.8     702
 3.9     702
 4.0     702
 4.2     468
Name: Phase, dtype: int64

In [22]:
indicators["Phase"].value_counts()

 1.0    2520
 3.0    2100
 3.1    1998
 3.2    1404
 2.0    1050
 4.1     936
-1.0     702
 3.3     702
 3.4     702
 3.5     702
 3.6     702
 3.7     702
 3.8     702
 3.9     702
 4.0     702
 4.2     468
Name: Phase, dtype: int64

In [23]:
indicators.tail()

Unnamed: 0,id,Indicator,Group,State,Subgroup,Phase,Time Period,Time Period Start Date,Time Period End Date,Value,Low CI,High CI,Confidence Interval,Quartile Range,Start year,End year,Days count of symptoms,Symtoms during the same year,Related to Age,Start Age,End Age
16789,00000000-0000-0000-E9BB-21CBDA3DE35D,Symptoms of Anxiety Disorder or Depressive Dis...,By State,Virginia,Virginia,4.2,72,2024-08-20,2024-09-16,20.8,17.4,24.5,17.4 - 24.5,19.5-21.5,2024,2024,27,Yes,No,0,0
16790,00000000-0000-0000-7667-7C217E2842BF,Symptoms of Anxiety Disorder or Depressive Dis...,By State,Washington,Washington,4.2,72,2024-08-20,2024-09-16,19.9,17.4,22.6,17.4 - 22.6,19.5-21.5,2024,2024,27,Yes,No,0,0
16791,00000000-0000-0000-7B0A-ED227968BED7,Symptoms of Anxiety Disorder or Depressive Dis...,By State,West Virginia,West Virginia,4.2,72,2024-08-20,2024-09-16,26.3,21.5,31.4,21.5 - 31.4,23.1-27.8,2024,2024,27,Yes,No,0,0
16792,00000000-0000-0000-DF47-DF7C1F6C02E8,Symptoms of Anxiety Disorder or Depressive Dis...,By State,Wisconsin,Wisconsin,4.2,72,2024-08-20,2024-09-16,17.0,13.4,21.2,13.4 - 21.2,16.6-19.4,2024,2024,27,Yes,No,0,0
16793,00000000-0000-0000-B7F5-FBCA895109C3,Symptoms of Anxiety Disorder or Depressive Dis...,By State,Wyoming,Wyoming,4.2,72,2024-08-20,2024-09-16,25.6,19.3,32.8,19.3 - 32.8,23.1-27.8,2024,2024,27,Yes,No,0,0


In [24]:
indicators["Quartile Range"].value_counts()

23.2-27.3                37
23.4-25.7                27
28.8-31.4                27
14.9-19.5                27
12.1-13.8                27
                         ..
29.4-34.2                10
38.0-43.5                10
23.9-25.7                10
32.4-33.7                 9
Estimate not reliable     1
Name: Quartile Range, Length: 841, dtype: int64

In [25]:
# Null values from CSV file
indicators.isna().sum()

id                                 0
Indicator                          0
Group                              0
State                              0
Subgroup                           0
Phase                              0
Time Period                        0
Time Period Start Date             0
Time Period End Date               0
Value                            707
Low CI                           707
High CI                          707
Confidence Interval              707
Quartile Range                  5777
Start year                         0
End year                           0
Days count of symptoms             0
Symtoms during the same year       0
Related to Age                     0
Start Age                          0
End Age                            0
dtype: int64

In [26]:
indicators["Value"] = indicators["Value"].fillna(0)
indicators["Low CI"] = indicators["Low CI"].fillna(0)
indicators["High CI"] = indicators["High CI"].fillna(0)
indicators["Confidence Interval"] = indicators["Confidence Interval"].fillna(0)
indicators["Quartile Range"] = indicators["Quartile Range"].fillna(0)
indicators["Start Age"].fillna(0)
indicators["End Age"].fillna("N/A")
indicators["End Age"]

0         0
1        29
2        39
3        49
4        59
         ..
16789     0
16790     0
16791     0
16792     0
16793     0
Name: End Age, Length: 16794, dtype: object

In [27]:
by_age = indicators[indicators["Group"].str.contains("By Age", case=False)].copy()
by_age["Group"] = by_age["Group"].str.strip()
by_age.head()

by_age["Subgroup"].shape
indicators.update(by_age)


# Null values from CSV file
indicators.isna().sum()

id                              0
Indicator                       0
Group                           0
State                           0
Subgroup                        0
Phase                           0
Time Period                     0
Time Period Start Date          0
Time Period End Date            0
Value                           0
Low CI                          0
High CI                         0
Confidence Interval             0
Quartile Range                  0
Start year                      0
End year                        0
Days count of symptoms          0
Symtoms during the same year    0
Related to Age                  0
Start Age                       0
End Age                         0
dtype: int64

In [28]:
by_age = indicators[indicators["Group"].str.contains("By Age", case=False)]

In [29]:
by_age.head()

Unnamed: 0,id,Indicator,Group,State,Subgroup,Phase,Time Period,Time Period Start Date,Time Period End Date,Value,Low CI,High CI,Confidence Interval,Quartile Range,Start year,End year,Days count of symptoms,Symtoms during the same year,Related to Age,Start Age,End Age
1,00000000-0000-0000-BCEA-D97191E97FCC,Symptoms of Depressive Disorder,By Age,United States,18 - 29 years,1.0,1,2020-04-23,2020-05-05,32.7,30.2,35.2,30.2 - 35.2,0,2020.0,2020.0,12.0,Yes,Yes,18,29
2,00000000-0000-0000-5A5F-DE9322C29F44,Symptoms of Depressive Disorder,By Age,United States,30 - 39 years,1.0,1,2020-04-23,2020-05-05,25.7,24.1,27.3,24.1 - 27.3,0,2020.0,2020.0,12.0,Yes,Yes,30,39
3,00000000-0000-0000-976D-89DBD361B062,Symptoms of Depressive Disorder,By Age,United States,40 - 49 years,1.0,1,2020-04-23,2020-05-05,24.8,23.3,26.2,23.3 - 26.2,0,2020.0,2020.0,12.0,Yes,Yes,40,49
4,00000000-0000-0000-BDF1-6960841430B0,Symptoms of Depressive Disorder,By Age,United States,50 - 59 years,1.0,1,2020-04-23,2020-05-05,23.2,21.5,25.0,21.5 - 25.0,0,2020.0,2020.0,12.0,Yes,Yes,50,59
5,00000000-0000-0000-D6A5-01AEBD5B505A,Symptoms of Depressive Disorder,By Age,United States,60 - 69 years,1.0,1,2020-04-23,2020-05-05,18.4,17.0,19.7,17.0 - 19.7,0,2020.0,2020.0,12.0,Yes,Yes,60,69


## **Exporting full data set into csv**

In [30]:
indicators.to_csv("C:/Users/cmanu/OneDrive/Escritorio/NCIDA/ANALYTICS/PROJECT/indicators.csv", ',')

In [31]:
client = MongoClient("localhost", 27017)
database = client["anxiety"]
database

filepath = "C:/Users/cmanu/OneDrive/Escritorio/NCIDA/ANALYTICS/PROJECT/indicators.csv"

def importcsv_tomongo(filepath, database_name, collection_name, host="localhost", port=27017):
    client = MongoClient("localhost", 27017)
    database_name = client["anxiety"]
    collection_name = database["indicators"]

    try:
        data = pd.read_csv(filepath)
        print(f"CSV File {filepath} has been successfully loaded.")
    except Exception as e:
        print(f"Error reading CSV file {e}")
        return 0
    datadict = data.to_dict(orient="records")

    try:
        result = collection_name.insert_many(datadict)
        print(f"Inserted {len(result.inserted_ids)} documents into the '{collection_name}' collection.")
        return len(result.inserted_ids)
    except Exception as e:
        print(f"Error inserting data into MongoDB: {e}")
        return 0

In [32]:
if __name__ == "__main__":
    filepath = filepath
    database_name = "anxiety"
    collection_name = "indicators"

    inserted_count = importcsv_tomongo(filepath, database_name, collection_name)
    print(f"Total documents inserted: {inserted_count}")

CSV File C:/Users/cmanu/OneDrive/Escritorio/NCIDA/ANALYTICS/PROJECT/indicators.csv has been successfully loaded.
Inserted 16794 documents into the 'Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'anxiety'), 'indicators')' collection.
Total documents inserted: 16794


## **Exporting only age related data into csv file**

In [33]:
by_age.to_csv("C:/Users/cmanu/OneDrive/Escritorio/NCIDA/ANALYTICS/PROJECT/indicators_byage.csv", ',')

In [34]:
client = MongoClient("localhost", 27017)
database = client["anxiety"]
database

filepath = "C:/Users/cmanu/OneDrive/Escritorio/NCIDA/ANALYTICS/PROJECT/indicators_byage.csv"

def importcsv_tomongo(filepath, database_name, collection_name, host="localhost", port=27017):
    client = MongoClient("localhost", 27017)
    database_name = client["anxiety"]
    collection_name = database["indicators_byage"]

    try:
        data = pd.read_csv(filepath)
        print(f"CSV File {filepath} has been successfully loaded.")
    except Exception as e:
        print(f"Error reading CSV file {e}")
        return 0
    datadict = data.to_dict(orient="records")

    try:
        result = collection_name.insert_many(datadict)
        print(f"Inserted {len(result.inserted_ids)} documents into the '{collection_name}' collection.")
        return len(result.inserted_ids)
    except Exception as e:
        print(f"Error inserting data into MongoDB: {e}")
        return 0

In [35]:
if __name__ == "__main__":
    filepath = filepath
    database_name = "anxiety"
    collection_name = "indicators_byage"

    inserted_count = importcsv_tomongo(filepath, database_name, collection_name)
    print(f"Total documents inserted: {inserted_count}")

CSV File C:/Users/cmanu/OneDrive/Escritorio/NCIDA/ANALYTICS/PROJECT/indicators_byage.csv has been successfully loaded.
Inserted 1722 documents into the 'Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'anxiety'), 'indicators_byage')' collection.
Total documents inserted: 1722


## **Extracting the categories of the data set**

In [36]:
categories = pd.read_csv("C:/Users/cmanu/OneDrive/Escritorio/NCIDA/ANALYTICS/PROJECT/indicators.csv")


In [37]:
categories.nunique().sort_values()

Related to Age                      2
Symtoms during the same year        2
Indicator                           3
End year                            5
Start year                          5
End Age                             7
Start Age                           7
Group                               9
Days count of symptoms             10
Phase                              16
State                              52
Time Period                        72
Subgroup                           78
Time Period Start Date             82
Time Period End Date               82
Low CI                            574
Value                             606
High CI                           642
Quartile Range                    842
Confidence Interval             12516
id                              16794
Unnamed: 0                      16794
dtype: int64

In [38]:
categories = categories[["Group", "Subgroup"]]
categories.head()

Unnamed: 0,Group,Subgroup
0,National Estimate,United States
1,By Age,18 - 29 years
2,By Age,30 - 39 years
3,By Age,40 - 49 years
4,By Age,50 - 59 years


In [39]:
categories["Group"].value_counts()

By State                      11016
By Age                         1722
By Race/Hispanic ethnicity     1230
By Education                    984
By Sex                          492
By Gender identity              396
By Sexual orientation           396
By Disability status            312
National Estimate               246
Name: Group, dtype: int64

In [40]:
categories["Subgroup"].value_counts()

United States                      246
Hispanic or Latino                 246
18 - 29 years                      246
Bachelor's degree or higher        246
Some college/Associate's degree    246
                                  ... 
Cis-gender female                  132
Transgender                        132
Gay or lesbian                     132
Straight                           132
Bisexual                           132
Name: Subgroup, Length: 78, dtype: int64

### **Tables by group**

In [41]:
grouped_tables = {
    group: pd.DataFrame({"Subgroup": group_data["Subgroup"].unique()})
    for group, group_data in categories.groupby("Group")
}

for group, table in grouped_tables.items():
    print(f"Group {group} Table:")
    print(table)
    print()

Group By Age Table:
             Subgroup
0       18 - 29 years
1       30 - 39 years
2       40 - 49 years
3       50 - 59 years
4       60 - 69 years
5       70 - 79 years
6  80 years and above

Group By Disability status Table:
             Subgroup
0     With disability
1  Without disability

Group By Education Table:
                          Subgroup
0  Less than a high school diploma
1       High school diploma or GED
2  Some college/Associate's degree
3      Bachelor's degree or higher

Group By Gender identity Table:
            Subgroup
0    Cis-gender male
1  Cis-gender female
2        Transgender

Group By Race/Hispanic ethnicity Table:
                                       Subgroup
0                            Hispanic or Latino
1               Non-Hispanic White, single race
2               Non-Hispanic Black, single race
3               Non-Hispanic Asian, single race
4  Non-Hispanic, other races and multiple races

Group By Sex Table:
  Subgroup
0     Male
1   Female



### **By Age**

In [42]:
table_byage = categories[categories["Group"] == "By Age"]
table_byage = table_byage.drop_duplicates()
table_byage


table_byage.to_csv("C:/Users/cmanu/OneDrive/Escritorio/NCIDA/ANALYTICS/PROJECT/categories_byage.csv", ',')

### **By State**

In [43]:
table_bystate = categories[categories["Group"] == "By State"]
table_bystate = table_bystate.drop_duplicates()
table_bystate

table_bystate.to_csv("C:/Users/cmanu/OneDrive/Escritorio/NCIDA/ANALYTICS/PROJECT/categories_bystate.csv", ',')

### **Other categories**

In [44]:
table_others = categories[categories["Group"] != "By State"]
table_others = categories[categories["Group"] != "By Age"]
table_others = table_others.drop_duplicates()
table_others

table_others.to_csv("C:/Users/cmanu/OneDrive/Escritorio/NCIDA/ANALYTICS/PROJECT/categories_byothers.csv", ',')