In [None]:
# Matt Garlock
# Milestone 4
# 7/20/2024

In [1]:
# Step 1: Import Libraries


In [3]:
import os
import re
from pathlib import Path

import requests
import pandas as pd
import numpy as np


In [4]:
# Step 2: Connect to the API and Pull in the Data


In [5]:
# Request NAME and B19013_001E for every county from the 2020 ACS 5-year endpoint.
api_url = 'https://api.census.gov/data/2020/acs/acs5?get=NAME,B19013_001E&for=county:*'

# Never embed the API key in the notebook: anyone who can read the file (or its
# git history) can reuse it. Read it from the environment instead.
# NOTE(review): the Census API documents keyless access for low-volume use, so
# the key is only attached when CENSUS_API_KEY is set -- confirm for your usage.
census_api_key = os.environ.get('CENSUS_API_KEY')
key_params = {'key': census_api_key} if census_api_key else None

# A timeout keeps the cell from hanging forever, and raise_for_status() turns an
# HTTP error into an exception here instead of a confusing JSON failure below.
response = requests.get(api_url, params=key_params, timeout=30)
response.raise_for_status()
data = response.json()

# Convert the data into a DataFrame: the first element holds the header row,
# every following element is one record.
columns = data[0]
rows = data[1:]
df = pd.DataFrame(rows, columns=columns)


In [6]:
## Step 3: Replace Headers


In [7]:
# Swap the raw API field names for descriptive labels. The assignment is
# positional, so the list order must match the order of the existing columns.
readable_headers = ['County_Name', 'Median_Income', 'State_Code', 'County_Code']
df.columns = readable_headers

header_preview = df.head()
print("Step #1: Replace Headers\n", header_preview)


Step #1: Replace Headers
                County_Name Median_Income State_Code County_Code
0  Autauga County, Alabama         57982         01         001
1  Baldwin County, Alabama         61756         01         003
2  Barbour County, Alabama         34990         01         005
3     Bibb County, Alabama         51721         01         007
4   Blount County, Alabama         48922         01         009


In [8]:
## Step 4: Format Data


In [9]:
# The API delivers every field as a string; convert income to a numeric dtype.
# errors='coerce' turns anything unparseable into NaN instead of raising.
df['Median_Income'] = pd.to_numeric(df['Median_Income'], errors='coerce')

# A median income can never be negative, so a negative value is a placeholder
# rather than a measurement. Mask such values as missing so they cannot skew
# the quartiles computed in the outlier step.
# NOTE(review): the Census Bureau documents large negative annotation codes
# (e.g. -666666666) for estimates it could not compute -- confirm against the
# ACS "estimate and annotation values" notes.
df['Median_Income'] = df['Median_Income'].where(df['Median_Income'] >= 0)

print("Step #2: Format Data\n", df.dtypes)


Step #2: Format Data
 County_Name      object
Median_Income     int64
State_Code       object
County_Code      object
dtype: object


In [10]:
## Step 5: Identify Outliers


In [11]:
# Flag counties whose median income falls outside the 1.5 * IQR fences.
income = df['Median_Income']

q1 = income.quantile(0.25)
q3 = income.quantile(0.75)
iqr = q3 - q1

lower_fence = q1 - 1.5 * iqr
upper_fence = q3 + 1.5 * iqr

# Outliers are only identified here; df itself is left unchanged.
is_outlier = (income < lower_fence) | (income > upper_fence)
outliers = df[is_outlier]
print("Step #3: Identify Outliers\n", outliers)


Step #3: Identify Outliers
                        County_Name  Median_Income State_Code County_Code
26         Oldham County, Kentucky         103761         21         185
63        Carver County, Minnesota         104011         27         019
66       Chisago County, Minnesota          86900         27         025
89    Williamson County, Tennessee         111196         47         187
94              Davis County, Utah          87570         49         011
...                            ...            ...        ...         ...
2825       Howard County, Maryland         124042         24         027
2826   Montgomery County, Maryland         111812         24         031
2827   St. Mary's County, Maryland          95864         24         037
2893        Johnson County, Kansas          91650         20         091
3218   Sherburne County, Minnesota          88671         27         141

[182 rows x 4 columns]


In [12]:
## Step 6: Fix Casing/Inconsistent Values


In [13]:
def _normalize_county_name(name):
    """Tidy whitespace and repair casing without damaging valid names.

    str.title() is not safe for place names: it upper-cases the letter after an
    apostrophe ("St. Mary's" -> "St. Mary'S") and lower-cases interior capitals
    ("DeKalb" -> "Dekalb"). This helper therefore:

    * collapses runs of whitespace and strips the ends, and
    * re-cases a name only when it is entirely upper- or lower-case, i.e. when
      its casing is actually inconsistent with the rest of the column.

    Non-string values (e.g. NaN) are returned unchanged.
    """
    if not isinstance(name, str):
        return name

    cleaned = ' '.join(name.split())
    if cleaned.isupper() or cleaned.islower():
        # Capitalize each run of letters, treating a trailing apostrophe
        # suffix ("mary's") as part of the same word.
        cleaned = re.sub(
            r"[^\W\d_]+(?:'[^\W\d_]+)?",
            lambda match: match.group(0).capitalize(),
            cleaned,
        )
    return cleaned


df['County_Name'] = df['County_Name'].map(_normalize_county_name)
print("Step #4: Fix Casing/Inconsistent Values\n", df.head())


Step #4: Fix Casing/Inconsistent Values
                County_Name  Median_Income State_Code County_Code
0  Autauga County, Alabama          57982         01         001
1  Baldwin County, Alabama          61756         01         003
2  Barbour County, Alabama          34990         01         005
3     Bibb County, Alabama          51721         01         007
4   Blount County, Alabama          48922         01         009


In [14]:
## Step 7: Remove Duplicates


In [15]:
# Keep only the first occurrence of each fully identical row
# (all columns are compared; the original index labels are retained).
is_repeat = df.duplicated(keep='first')
df = df[~is_repeat]

dedup_preview = df.head()
print("Step #5: Remove Duplicates\n", dedup_preview)


Step #5: Remove Duplicates
                County_Name  Median_Income State_Code County_Code
0  Autauga County, Alabama          57982         01         001
1  Baldwin County, Alabama          61756         01         003
2  Barbour County, Alabama          34990         01         005
3     Bibb County, Alabama          51721         01         007
4   Blount County, Alabama          48922         01         009


In [16]:
## Cleaned Dataset


In [17]:
# Preview the first five rows of the frame after all cleaning steps.
print("Cleaned Dataset:\n", df.head(5))


Cleaned Dataset:
                County_Name  Median_Income State_Code County_Code
0  Autauga County, Alabama          57982         01         001
1  Baldwin County, Alabama          61756         01         003
2  Barbour County, Alabama          34990         01         005
3     Bibb County, Alabama          51721         01         007
4   Blount County, Alabama          48922         01         009


In [18]:
## Ethical Implications


In [19]:
# Written reflection on the ethics of the wrangling steps, kept as one
# multi-line string so it can be printed verbatim in the cell output.
ethical_implications = """
Ethical Implications of Data Wrangling:

1. What changes were made to the data?
   Headers were renamed, data types were formatted, outliers were identified, casing was standardized, and duplicates were removed.

2. Are there any legal or regulatory guidelines for your data or project topic?
   The data is sourced from a publicly available API (US Census Bureau), and thus complies with public data usage policies.

3. What risks could be created based on the transformations done?
   Removing outliers might exclude relevant data points, leading to potential bias. Ensuring that the transformations do not distort the data's original meaning is crucial.

4. Did you make any assumptions in cleaning/transforming the data?
   Assumed that all numeric entries should be converted to numeric types and that standardizing county names to title case would not affect analysis.

5. How was your data sourced / verified for credibility?
   Data was sourced directly from the US Census Bureau API, a reputable source for demographic data.

6. Was your data acquired in an ethical way?
   Yes, data was obtained from a publicly accessible API.

7. How would you mitigate any of the ethical implications you have identified?
   Carefully document all steps and decisions made during data cleaning and ensure that outlier handling and transformations are justified based on the analysis context.
"""

# print() joins its arguments with a single space; sep is spelled out so the
# space between the heading and the body is visibly intentional.
print("\nEthical Implications\n", ethical_implications, sep=" ")



Ethical Implications
 
Ethical Implications of Data Wrangling:

1. What changes were made to the data?
   Headers were renamed, data types were formatted, outliers were identified, casing was standardized, and duplicates were removed.

2. Are there any legal or regulatory guidelines for your data or project topic?
   The data is sourced from a publicly available API (US Census Bureau), and thus complies with public data usage policies.

3. What risks could be created based on the transformations done?
   Removing outliers might exclude relevant data points, leading to potential bias. Ensuring that the transformations do not distort the data's original meaning is crucial.

4. Did you make any assumptions in cleaning/transforming the data?
   Assumed that all numeric entries should be converted to numeric types and that standardizing county names to title case would not affect analysis.

5. How was your data sourced / verified for credibility?
   Data was sourced directly from the US Ce

In [20]:
# Save the cleaned dataset as a JSON file
# Build the destination from the current user's home directory instead of a
# hardcoded absolute path, so the notebook runs on any machine/account.
output_path = Path.home() / 'Downloads' / 'cleaned_output_milestone4.json'

# Create the target folder if it is missing so the export cannot fail on a
# system without a Downloads directory.
output_path.parent.mkdir(parents=True, exist_ok=True)

# orient='records' with lines=True writes one JSON object per row (JSON Lines).
df.to_json(output_path, orient='records', lines=True)
