In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np

# Load, clean and standardize the COVID-19 hospitalization trends dataset.
# Pipeline: read CSV -> fill missing values -> drop duplicate rows ->
# drop IQR outliers -> z-score the numeric columns -> preview the result.

# Load the new dataset
# NOTE(review): absolute, machine-specific path; consider a configurable
# data directory so the notebook can run on another machine.
hosp = '/Users/sathwik/Downloads/COVID-19 Hospitalization Trends Report Data File - Claims thru 6.2.2023.csv'
data = pd.read_csv(hosp)

# Handling Missing Values
# Fill numerical columns with their median and categorical with the mode
numerical_cols = data.select_dtypes(include=['number']).columns
categorical_cols = data.select_dtypes(exclude=['number', 'datetime']).columns

for col in numerical_cols:
    # Assign the filled column back: calling fillna(inplace=True) on the
    # selection `data[col]` is chained assignment, which warns on pandas 2.x
    # and does not update `data` at all under Copy-on-Write.
    data[col] = data[col].fillna(data[col].median())

for col in categorical_cols:
    col_modes = data[col].mode()
    # mode() returns an empty Series for an all-missing column; there is
    # nothing sensible to fill with in that case, so leave it untouched.
    if not col_modes.empty:
        data[col] = data[col].fillna(col_modes.iloc[0])

# Correct Data Types (if needed)
# Convert date columns to datetime format if applicable
# Example:
# data['date_column'] = pd.to_datetime(data['date_column'], errors='coerce')

# Removing Duplicates
# The original row labels are kept on purpose (no index reset).
data = data.drop_duplicates()

# Handling Outliers
# Using the IQR method to identify outliers
# NOTE(review): numerical_cols covers every numeric column; the recorded
# output shows `Year` being standardized as well. Confirm that such
# identifier-like columns are meant to be outlier-filtered and scaled.
Q1 = data[numerical_cols].quantile(0.25)
Q3 = data[numerical_cols].quantile(0.75)
IQR = Q3 - Q1

# Removing outliers - rows that have values outside 1.5 * IQR from Q1 and Q3
below_lower_fence = data[numerical_cols] < (Q1 - 1.5 * IQR)
above_upper_fence = data[numerical_cols] > (Q3 + 1.5 * IQR)
is_outlier_row = (below_lower_fence | above_upper_fence).any(axis=1)
# .copy() makes the filtered frame independent of the original, so the
# column assignment below cannot raise SettingWithCopyWarning.
data = data[~is_outlier_row].copy()

# Data Standardization
# Standardizing numerical columns
scaler = StandardScaler()
# fit_transform raises on zero rows or zero features; skip scaling when the
# outlier filter removed everything or there are no numeric columns.
if len(data) > 0 and len(numerical_cols) > 0:
    data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

# Displaying the first few rows after data cleaning and standardization
print(data.head())

        Year    Month Bene_Geo_Desc Bene_Mdcd_Mdcr_Enrl_Stus  \
3  -1.264831  Overall      National                      All   
16 -1.264831  Overall      National                      All   
29 -1.264831  Overall      National                      All   
42 -1.264831  Overall      National                      All   
55 -1.264831  Overall      National                      All   

                   Bene_Race_Desc Bene_Sex_Desc Bene_Mdcr_Entlmt_Stus  \
3                             All           All                   All   
16  American Indian/Alaska Native           All                   All   
29         Asian/Pacific Islander           All                   All   
42         Black/African American           All                   All   
55                       Hispanic           All                   All   

   Bene_Age_Desc Bene_RUCA_Desc  Total_Hosp  Total_Enrl  Total_Hosp_Per100K  \
3            All        Unknown   -0.209006   -0.509521            0.130717   
16           All  