In [None]:
# Milestone 3: Cleaning and Formatting Website Data
# Matt Garlock
# 7/8/24
# DSC 540

In [1]:
# Import necessary libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Step 1: Load the data from the website
url = "https://meric.mo.gov/data/cost-living-data-series"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

# Locate the table in the HTML content
table = soup.find('table')


In [2]:
# Step 2: Replace Headers


In [3]:
# Step 2: Replace Headers
headers = [header.text.strip().replace(' ', '_').lower() for header in table.find_all('th')]

# Extract rows and cells
rows = []
for row in table.find_all('tr')[1:]:  # Skip the header row
    cells = row.find_all('td')
    cells = [cell.text.strip() for cell in cells]
    rows.append(cells)

# Create a DataFrame
df = pd.DataFrame(rows, columns=headers)
print("Step 2: Replace Headers\n", df.head())


Step 2: Replace Headers
   rank          state index grocery housing utilities transportation health  \
0    1  West Virginia  83.8    99.6    57.4      94.5           92.8  101.3   
1    2       Oklahoma  86.4    95.4    68.9      97.3           89.3   95.2   
2    3         Kansas  87.3    95.6    74.5      96.2           87.2   98.6   
3    4        Alabama  88.1    97.1    69.4     100.7           91.2   87.2   
4    5    Mississippi  88.3    96.1    75.2      90.7           89.7  100.7   

  misc.  
0  90.9  
1  92.0  
2  90.5  
3  95.6  
4  93.2  


In [4]:
# Step 3: Format Data


In [5]:
# Step 3: Format data into a more readable format
df.replace("", pd.NA, inplace=True)
df.dropna(how='all', inplace=True)
df.reset_index(drop=True, inplace=True)
print("\nStep 3: Format data into a more readable format\n", df.head())



Step 3: Format data into a more readable format
   rank          state index grocery housing utilities transportation health  \
0    1  West Virginia  83.8    99.6    57.4      94.5           92.8  101.3   
1    2       Oklahoma  86.4    95.4    68.9      97.3           89.3   95.2   
2    3         Kansas  87.3    95.6    74.5      96.2           87.2   98.6   
3    4        Alabama  88.1    97.1    69.4     100.7           91.2   87.2   
4    5    Mississippi  88.3    96.1    75.2      90.7           89.7  100.7   

  misc.  
0  90.9  
1  92.0  
2  90.5  
3  95.6  
4  93.2  


In [6]:
# Step 4: Identify and Handle Outliers


In [7]:
# Step 4: Identify and Handle Outliers
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
Q1 = df[numeric_columns].quantile(0.25)
Q3 = df[numeric_columns].quantile(0.75)
IQR = Q3 - Q1

df_outliers_removed = df[~((df[numeric_columns] < (Q1 - 1.5 * IQR)) | (df[numeric_columns] > (Q3 + 1.5 * IQR))).any(axis=1)]
print("\nStep 4: Identify and Handle Outliers\n", df_outliers_removed.describe())



Step 4: Identify and Handle Outliers
        rank          state index grocery housing utilities transportation  \
count    53             53    53      53      53        53             53   
unique   53             53    50      40      51        51             50   
top       1  West Virginia  88.5    99.6    88.1      97.9          107.5   
freq      1              1     2       3       2         2              2   

       health misc.  
count      53    53  
unique     48    49  
top     100.7  92.0  
freq        2     2  


In [8]:
# Step 5: Find and Remove Duplicates


In [9]:
# Step 5: Find and Remove Duplicates
df_outliers_removed.drop_duplicates(inplace=True)
print("\nStep 5: Find and Remove Duplicates\n", df_outliers_removed.duplicated().sum())



Step 5: Find and Remove Duplicates
 0


In [10]:
# Step 6: Fix Casing or Inconsistent Values


In [17]:
# Step 6: Fix Casing or Inconsistent Values
df_outliers_removed = df_outliers_removed.apply(lambda col: col.map(lambda x: x.lower() if isinstance(x, str) else x))
print("\nStep 6: Fix Casing or Inconsistent Values\n", df_outliers_removed.head())




Step 6: Fix Casing or Inconsistent Values
   rank          state index grocery housing utilities transportation health  \
0    1  west virginia  83.8    99.6    57.4      94.5           92.8  101.3   
1    2       oklahoma  86.4    95.4    68.9      97.3           89.3   95.2   
2    3         kansas  87.3    95.6    74.5      96.2           87.2   98.6   
3    4        alabama  88.1    97.1    69.4     100.7           91.2   87.2   
4    5    mississippi  88.3    96.1    75.2      90.7           89.7  100.7   

  misc.  
0  90.9  
1  92.0  
2  90.5  
3  95.6  
4  93.2  


In [18]:
# Final Cleaned Data


In [13]:
# Final Human Readable Data
print("\nFinal Cleaned Data\n", df_outliers_removed)

# Save to a new HTML file to maintain format
df_outliers_removed.to_html('cleaned_data.html', index=False)



Final Cleaned Data
    rank                 state  index grocery housing utilities transportation  \
0     1         west virginia   83.8    99.6    57.4      94.5           92.8   
1     2              oklahoma   86.4    95.4    68.9      97.3           89.3   
2     3                kansas   87.3    95.6    74.5      96.2           87.2   
3     4               alabama   88.1    97.1    69.4     100.7           91.2   
4     5           mississippi   88.3    96.1    75.2      90.7           89.7   
5     6              missouri   88.5    95.5    77.7      98.5           84.0   
6     7              arkansas   88.5    95.0    74.9      90.7           88.2   
7     8                  iowa   90.1    96.7    75.1      93.5           96.9   
8     9               indiana   90.3    97.6    77.8      90.4           94.8   
9    10             tennessee   90.3    97.1    82.8      88.6           89.1   
10   11               georgia   91.3    97.7    79.7      94.0           97.7   
11   12

In [14]:
# Ethical Implications


In [19]:
# Ethical Implications
ethical_implications = """
Ethical Implications of Data Wrangling:

1. What changes were made to the data?
   Headers were replaced, data was formatted for readability, outliers were handled, duplicates were removed, and inconsistencies in casing were fixed.

2. Are there any legal or regulatory guidelines for your data or project topic?
   The data is sourced from a public website, ensuring no legal restrictions. However, ensuring accuracy and ethical use of the data is important.

3. What risks could be created based on the transformations done?
   Handling outliers and filling missing values can introduce bias. Removing duplicates might inadvertently remove legitimate repeated entries.

4. Did you make any assumptions in cleaning/transforming the data?
   Assumed that median is a suitable replacement for missing values and that outliers do not represent typical data points.

5. How was your data sourced / verified for credibility?
   Data was sourced from a credible public domain (MERIC website).

6. Was your data acquired in an ethical way?
   Yes, the data was acquired from a publicly available source.

7. How would you mitigate any of the ethical implications you have identified?
   Documenting assumptions and methods used in data cleaning. Conducting sensitivity analysis to understand the impact of transformations on results.
"""
print("\nEthical Implications\n", ethical_implications)




Ethical Implications
 
Ethical Implications of Data Wrangling:

1. What changes were made to the data?
   Headers were replaced, data was formatted for readability, outliers were handled, duplicates were removed, and inconsistencies in casing were fixed.

2. Are there any legal or regulatory guidelines for your data or project topic?
   The data is sourced from a public website, ensuring no legal restrictions. However, ensuring accuracy and ethical use of the data is important.

3. What risks could be created based on the transformations done?
   Handling outliers and filling missing values can introduce bias. Removing duplicates might inadvertently remove legitimate repeated entries.

4. Did you make any assumptions in cleaning/transforming the data?
   Assumed that median is a suitable replacement for missing values and that outliers do not represent typical data points.

5. How was your data sourced / verified for credibility?
   Data was sourced from a credible public domain (MERI