In [None]:
# Milestone 3: Cleaning and Formatting Website Data
# Matt Garlock
# 7/8/24

In [None]:
# Import necessary libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Step 1: Load the data from the website
url = "https://meric.mo.gov/data/cost-living-data-series"
# A timeout keeps the cell from hanging forever if the server does not answer.
response = requests.get(url, timeout=30)
# Stop here on HTTP errors (4xx/5xx) instead of parsing an error page as data.
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Locate the table in the HTML content (the first <table> element on the page)
table = soup.find('table')
if table is None:
    # Fail at the source of the problem rather than with an AttributeError
    # in a later cell that tries to read rows from `table`.
    raise ValueError(f"No <table> element found at {url}")


In [None]:
# Step 2: Replace Headers


In [None]:
# Step 2: Replace Headers
# Column names come from the first table row only. Collecting every <th> in the
# table would also pick up row-header cells in the body and produce more names
# than there are data columns.
table_rows = table.find_all('tr')
header_cells = table_rows[0].find_all(['th', 'td']) if table_rows else []
# split()/join collapses any run of whitespace (spaces, newlines, non-breaking
# spaces) into a single underscore, then the name is lower-cased.
headers = ['_'.join(cell.text.split()).lower() for cell in header_cells]

# Extract rows and cells
rows = []
for row in table_rows[1:]:  # Skip the header row
    # Accept <th> as well as <td> so a row-header cell stays in its column.
    cells = [cell.text.strip() for cell in row.find_all(['th', 'td'])]
    if cells:  # rows without any cells carry no data
        rows.append(cells)

# Create a DataFrame
df = pd.DataFrame(rows, columns=headers)
print("Step 2: Replace Headers\n", df.head())


In [None]:
# Step 3: Format Data


In [None]:
# Step 3: Format data into a more readable format
# Empty strings are turned into missing values, rows in which every cell is
# missing are discarded, and the index is renumbered from zero.
df = (
    df.replace("", pd.NA)
    .dropna(how='all')
    .reset_index(drop=True)
)
print("\nStep 3: Format data into a more readable format\n", df.head())


In [None]:
# Step 4: Identify and Handle Outliers


In [None]:
# Step 4: Identify and Handle Outliers
# Every scraped cell is text, so numeric-looking columns have to be parsed
# first; otherwise no column has a numeric dtype and the IQR filter below
# silently keeps every row.
def _strip_number_formatting(value):
    """Remove thousands separators and dollar signs from a text cell; pass other values through."""
    if isinstance(value, str):
        return value.replace(',', '').replace('$', '').strip()
    return value


typed_columns = []
for position in range(df.shape[1]):
    original = df.iloc[:, position]
    parsed = pd.to_numeric(original.map(_strip_number_formatting), errors='coerce')
    # Convert a column only when every non-missing cell parsed as a number,
    # so text columns (and mixed columns) are left exactly as they were.
    fully_numeric = parsed.notna().any() and (parsed.notna() == original.notna()).all()
    typed_columns.append(parsed if fully_numeric else original)
df_typed = pd.concat(typed_columns, axis=1) if typed_columns else df.copy()

numeric_columns = df_typed.select_dtypes(include=['float64', 'int64']).columns
Q1 = df_typed[numeric_columns].quantile(0.25)
Q3 = df_typed[numeric_columns].quantile(0.75)
IQR = Q3 - Q1

# A row is an outlier when any numeric value lies more than 1.5 * IQR outside
# the interquartile range of its column.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
is_outlier = (
    (df_typed[numeric_columns] < lower_bound) | (df_typed[numeric_columns] > upper_bound)
).any(axis=1)

# .copy() makes the result an independent frame, so later cells can modify it
# without pandas warning about writes to a slice.
df_outliers_removed = df_typed[~is_outlier].copy()
print("\nStep 4: Identify and Handle Outliers\n", df_outliers_removed.describe())


In [None]:
# Step 5: Find and Remove Duplicates


In [None]:
# Step 5: Find and Remove Duplicates
# Count the duplicates before dropping them; counting afterwards always gives 0.
duplicate_row_count = int(df_outliers_removed.duplicated().sum())
print("Duplicate rows found:", duplicate_row_count)

# Rebind instead of using inplace=True: the frame comes from a boolean filter,
# and in-place edits on such a result raise SettingWithCopyWarning.
df_outliers_removed = df_outliers_removed.drop_duplicates()
print("\nStep 5: Find and Remove Duplicates\n", df_outliers_removed.duplicated().sum())


In [None]:
# Step 6: Fix Casing or Inconsistent Values


In [None]:
# Step 6: Fix Casing or Inconsistent Values
def _lower_if_text(value):
    """Return the lower-cased string for text values; return anything else unchanged."""
    return value.lower() if isinstance(value, str) else value


# Walk the frame column by column and lower-case every string cell.
df_outliers_removed = df_outliers_removed.apply(
    lambda column: column.map(_lower_if_text)
)
print("\nStep 6: Fix Casing or Inconsistent Values\n", df_outliers_removed.head())



In [None]:
# Final Cleaned Data


In [None]:
# Final Human Readable Data
# Print the cleaned frame as text.
print("\nFinal Cleaned Data\n", df_outliers_removed)

# Write the same frame to an HTML file in the working directory so the tabular
# layout is kept; the index column is left out.
df_outliers_removed.to_html(
    buf='cleaned_data.html',
    index=False,
)


In [None]:
# Ethical Implications


In [None]:
# Ethical Implications
# Written answers to the milestone's ethics questions. Items 3 and 4 describe
# the transformations this notebook actually performs: fully empty rows are
# dropped and outliers are removed; no missing values are imputed.
ethical_implications = """
Ethical Implications of Data Wrangling:

1. What changes were made to the data?
   Headers were replaced, data was formatted for readability, outliers were handled, duplicates were removed, and inconsistencies in casing were fixed.

2. Are there any legal or regulatory guidelines for your data or project topic?
   The data is sourced from a public website, ensuring no legal restrictions. However, ensuring accuracy and ethical use of the data is important.

3. What risks could be created based on the transformations done?
   Removing outliers and dropping empty rows can introduce bias. Removing duplicates might inadvertently remove legitimate repeated entries.

4. Did you make any assumptions in cleaning/transforming the data?
   Assumed that rows with no values carry no information and that outliers do not represent typical data points.

5. How was your data sourced / verified for credibility?
   Data was sourced from a credible public domain (MERIC website).

6. Was your data acquired in an ethical way?
   Yes, the data was acquired from a publicly available source.

7. How would you mitigate any of the ethical implications you have identified?
   Documenting assumptions and methods used in data cleaning. Conducting sensitivity analysis to understand the impact of transformations on results.
"""
print("\nEthical Implications\n", ethical_implications)



In [None]:
# Export the cleaned Milestone 3 data as CSV.
# `df_milestone3_cleaned` was never assigned anywhere in this notebook, so the
# cell failed with NameError on a fresh kernel; bind it to the cleaned frame
# produced by the steps above.
df_milestone3_cleaned = df_outliers_removed
# Write next to the notebook (like cleaned_data.html) rather than to a
# user-specific absolute path that exists on only one machine.
df_milestone3_cleaned.to_csv('cleaned_output_milestone3.csv', index=False)
