In [None]:
import os
from getpass import getpass
from urllib.parse import quote_plus

import great_expectations as ge
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sqlalchemy import create_engine

context = ge.data_context.DataContext()

In [None]:
# Define the paths to your CSV files.
# The folder holding the CSVs can be overridden with the PROJECT_3_DATA_DIR
# environment variable so the notebook is not tied to a single machine; when
# the variable is unset, the original location is used.
DATA_DIR = os.environ.get('PROJECT_3_DATA_DIR', r'C:\Users\gezeu\Project_3')
hospital_csv_path = os.path.join(DATA_DIR, 'Hospital.csv')
physician_csv_path = os.path.join(DATA_DIR, 'Physician and Clinics.csv')

In [None]:
# Load both CSV files into DataFrames (hospital first, then physician)
hospital_df, physician_df = (
    pd.read_csv(csv_path)
    for csv_path in (hospital_csv_path, physician_csv_path)
)

In [None]:
# Remove the United States row and keep only states.
# `.copy()` gives each filtered frame its own data, so the column assignments
# made in later cells do not raise pandas' SettingWithCopyWarning.
hospital_df = hospital_df[
    hospital_df['Region/state of residence'] != 'United States'
].copy()

# For the physician_df dataframe: drop every row whose label contains
# "United States" or "New England". `na=False` makes a missing label count as
# "no match", so the mask is purely boolean and can be negated with `~`.
# NOTE(review): the hospital filter is an exact match on 'United States' only,
# while this one is a substring match that also removes "New England" --
# confirm the two files really need different rules.
physician_df = physician_df[
    ~physician_df['Region/state of residence'].str.contains(
        "United States|New England", na=False
    )
].copy()

In [None]:
# Rename columns for consistency.
# Both frames are filtered on 'Region/state of residence' earlier in the
# notebook, so that exact spelling is the rename key for each of them; a key
# with different capitalisation matches nothing and `rename` ignores it
# silently. `rename` returns a new frame here, so re-running the cell is safe.
STATE_COLUMN_RENAME = {'Region/state of residence': 'state'}
hospital_df = hospital_df.rename(columns=STATE_COLUMN_RENAME)
physician_df = physician_df.rename(columns=STATE_COLUMN_RENAME)

# Later cells melt on "state"; fail here with a clear message if it is missing.
for frame_name, frame in (('hospital_df', hospital_df), ('physician_df', physician_df)):
    assert 'state' in frame.columns, f"{frame_name} has no 'state' column after renaming"

In [None]:
# Cast every yearly column ('2008' through '2020') to int in both frames
years = list(map(str, range(2008, 2021)))
year_dtypes = dict.fromkeys(years, int)
hospital_df = hospital_df.astype(year_dtypes)
physician_df = physician_df.astype(year_dtypes)


In [None]:
# Build the Great Expectations data context (no root directory is passed)
context = ge.data_context.DataContext()

# Stores to register, as store name -> store class name, in registration order.
# NOTE(review): each store is configured with a class name only and the last
# entry reuses ExpectationsStore under the generic name "store" -- confirm
# that this is intended.
store_class_names = {
    "expectations_store": "ExpectationsStore",
    "validations_store": "ValidationsStore",
    "checkpoint_store": "CheckpointStore",
    "store": "ExpectationsStore",
}
for store_name, store_class_name in store_class_names.items():
    context.add_store(store_name, {"class_name": store_class_name})

# Persist the updated configuration
context.save_config()

In [None]:
# Validate data using Great Expectations
# Build a new DataContext and rebind `context` to it, replacing the object
# created by the earlier cells. The validation itself happens in later cells.
context = ge.data_context.DataContext()


In [None]:
# Create the expectation suite for this notebook, replacing any existing
# suite of the same name
suite_name = "hospital_and_physician_services_suite"
context.create_expectation_suite(
    expectation_suite_name=suite_name,
    overwrite_existing=True,
)

In [None]:
# Wrap the hospital frame with ge.from_pandas and register a not-null
# expectation for each yearly column
hospital_ge_df = ge.from_pandas(hospital_df)
for year_column in years:
    hospital_ge_df.expect_column_values_to_not_be_null(column=year_column)

In [None]:
# Wrap the physician frame with ge.from_pandas and register a not-null
# expectation for each yearly column
physician_ge_df = ge.from_pandas(physician_df)
for year_column in years:
    physician_ge_df.expect_column_values_to_not_be_null(column=year_column)

In [None]:
# Evaluate the registered expectations on each dataset (hospital first)
hospital_results, physician_results = (
    ge_dataset.validate()
    for ge_dataset in (hospital_ge_df, physician_ge_df)
)


In [None]:
# Stop the notebook at the first dataset whose validation did not succeed
validation_checks = (
    (hospital_results, "Hospital data validation failed!"),
    (physician_results, "Physician data validation failed!"),
)
for validation_result, failure_message in validation_checks:
    assert validation_result["success"], failure_message

In [None]:
# Report success. On a top-to-bottom run this line is reached only when the
# assertions in the previous cell have passed.
print("Data validation successful!")

In [None]:
# Connect to PostgreSQL database.
# Connection settings are read from the environment so that no credentials are
# stored in the notebook. The non-secret parts fall back to the local
# development values; the password has no default and is prompted for when
# PGPASSWORD is unset (set it for non-interactive runs).
db_user = os.environ.get('PGUSER', 'gl_moni')
db_password = os.environ.get('PGPASSWORD') or getpass('PostgreSQL password: ')
db_host = os.environ.get('PGHOST', 'localhost')
db_port = os.environ.get('PGPORT', '5432')
db_name = os.environ.get('PGDATABASE', 'healthcare_db')

# quote_plus escapes characters such as '@', ':' or '/' in the credentials so
# they are not parsed as URL delimiters.
engine = create_engine(
    f'postgresql://{quote_plus(db_user)}:{quote_plus(db_password)}'
    f'@{db_host}:{db_port}/{db_name}'
)

In [None]:
# Write each frame to its PostgreSQL table, replacing the table if it already
# exists and leaving the DataFrame index out
hospital_df.to_sql(
    name='hospital_services', con=engine, if_exists='replace', index=False
)
physician_df.to_sql(
    name='physician_clinics', con=engine, if_exists='replace', index=False
)

In [None]:
# Load both tables back from PostgreSQL into DataFrames
hospital_query = 'SELECT * FROM hospital_services'
physician_query = 'SELECT * FROM physician_clinics'
hospital_df = pd.read_sql(hospital_query, con=engine)
physician_df = pd.read_sql(physician_query, con=engine)


In [None]:
# Reshape both frames to long format for plotting: "state" is kept as the
# identifier, every other column becomes a (year, value) pair
melt_options = dict(id_vars=["state"], var_name="year", value_name="value")
hospital_df_melted = hospital_df.melt(**melt_options)
physician_df_melted = physician_df.melt(**melt_options)


In [None]:
# Create a combined dataframe for comparison.
# Tag each long-format frame with its service type. `assign` returns a new
# frame instead of writing into the existing one, so re-running the cell is
# safe and the melted frames still end up carrying the "type" column.
hospital_df_melted = hospital_df_melted.assign(type='Hospital Services')
physician_df_melted = physician_df_melted.assign(type='Physician and Clinical Services')

# ignore_index=True gives the stacked frame a fresh 0..n-1 index rather than
# repeating the row labels of each input frame.
combined_df = pd.concat(
    [hospital_df_melted, physician_df_melted],
    ignore_index=True,
)

In [None]:
# Plot data: one line per state (colour) and service type (marker style).
# The explicit figure/axes interface keeps every call bound to this figure.
fig, ax = plt.subplots(figsize=(14, 8))
sns.lineplot(
    data=combined_df,
    x='year',
    y='value',
    hue='state',
    style='type',
    markers=True,
    dashes=False,
    ax=ax,
)
ax.set_title('Comparison of Hospital Services and Physician and Clinical Services by State')
ax.set_xlabel('Year')
ax.set_ylabel('Value')

# The legend entries come from both `hue='state'` and `style='type'`, so the
# title names both. It is anchored outside the axes so it cannot cover the
# lines; two columns keep a long list of states shorter.
ax.legend(
    title='State / Service Type',
    loc='upper left',
    bbox_to_anchor=(1.02, 1),
    borderaxespad=0,
    ncol=2,
)
ax.grid(True)
fig.tight_layout()
plt.show()