# Test

In [90]:
import pandas as pd
import os
import re

class Table:
    def __init__(self, df, views):
        self._df = df
        self._views = views

    def __getattr__(self, view_name):
        clean_view_name = view_name[1:] if view_name.startswith('_') else view_name
        if clean_view_name in self._views:
            return self._df[self._views[clean_view_name]]
        raise AttributeError(f"View '{view_name}' not found")

    def __call__(self):
        return self._df

    def get_view(self, view_name):
        if view_name in self._views:
            return self._df[self._views[view_name]]
        raise ValueError(f"View '{view_name}' not found")

class Database:
    def __init__(self):
        self.dataframes = self.read_dataframes()
        self.views = self.create_views()
        self.merged = self.merged_all()

        self.tables = {}
        for table_name, df in self.dataframes.items():
            self.tables[table_name] = Table(df, self.views.get(table_name, {}))
            setattr(self, table_name, self.tables[table_name])

    @staticmethod
    def standardize_column_name(name):
        name = name.lower()
        name = re.sub(r'[^\w\s]', '_', name)
        name = re.sub(r'\s+', '_', name)
        name = name.strip('_')
        return name

    def read_dataframes(self):
        base_path = r'003_data\002_clean-data'
        file_names = {
            'ballot': 'BALLOT_CLEANED.csv',
            'demo': 'DEMO_CLEANED.csv',
            'demo_add': 'DEMO_ADD_CLEANED.csv',
            'facility': 'FASCILITY_CLEANED.csv',
            'medicare': 'MEDICARE_CLEAN.csv',
            'voter': 'VOTER_REG_CEANED.csv'
        }
        dataframes = {}
        
        for table_name, file_name in file_names.items():
            file_path = os.path.join(base_path, file_name)
            df = pd.read_csv(file_path)
            
            # Standardize column names
            df.columns = [self.standardize_column_name(col) for col in df.columns]
            
            # Rename county column to 'county_name' if it exists
            county_columns = ['county_name', 'county', 'county']
            for col in county_columns:
                if col in df.columns:
                    df.rename(columns={col: 'county_name'}, inplace=True)
                    break
            
            dataframes[table_name] = df
        
        # Merge demo and demo_add dataframes
        dataframes['demo'] = pd.merge(dataframes['demo'], dataframes['demo_add'], on='county_name', how='outer')
        del dataframes['demo_add']  # Remove the separate demo_add dataframe
        
        return dataframes

    def create_views(self):
        views = {
            'ballot': {
                '2018': ['county_name', 'yes_count_2018', 'no_count_2018', 'total_count_2018', 'yes_perc_2018', 'no_perc_2018'],
                '2020': ['county_name', 'yes_count_2020', 'no_count_2020', 'total_count_2020', 'yes_perc_2020', 'no_perc_2020'],
                '2022': ['county_name', 'yes_count_2022', 'no_count_2022', 'total_count_2022', 'yes_perc_2022', 'no_perc_2022'],
                'all_counts': ['county_name', 'yes_count_2020', 'no_count_2020', 'total_count_2020', 'yes_count_2018', 'no_count_2018', 'total_count_2018', 'yes_count_2022', 'no_count_2022', 'total_count_2022'],
                'all_percentages': ['county_name', 'yes_perc_2020', 'no_perc_2020', 'yes_perc_2018', 'no_perc_2018', 'yes_perc_2022', 'no_perc_2022'],
            
            },
            'demo': {
                'population': ['county_name', 'population_january_2023', 'median_household_income_2021'],
                'race_ethnicity': ['county_name', 'one_race', 'some_other_race', 'two_or_more_races', 'total_races_tallied', 
                                'white_alone_or_in_combination_with_one_or_more_other_races', 
                                'black_or_african_american_alone_or_in_combination_with_one_or_more_other_races', 
                                'american_indian_and_alaska_native_alone_or_in_combination_with_one_or_more_other_races', 
                                'asian_alone_or_in_combination_with_one_or_more_other_races', 
                                'native_hawaiian_and_other_pacific_islander_alone_or_in_combination_with_one_or_more_other_races', 
                                'hispanic_or_latino__of_any_race', 'race_ethnicity__american_indian__2023', 
                                'race_ethnicity__asian__2023', 'race_ethnicity__black__2023', 'race_ethnicity__hispanic__2023', 
                                'race_ethnicity__multi_racial_ethnic__2023', 'race_ethnicity__hawaiian__pacific_island__2023', 
                                'race_ethnicity__white__2023'],
                'age_groups': ['county_name', 'age__0_5__2023', 'age__6_17__2023', 'age__18_64__2023', 'age__65___2023'],
                'age_distribution': ['county_name', 'under_5_years', '5_to_9_years', '10_to_14_years', '15_to_19_years', 
                                    '20_to_24_years', '25_to_29_years', '30_to_34_years', '35_to_39_years', '40_to_44_years', 
                                    '45_to_49_years', '50_to_54_years', '55_to_59_years', '60_to_64_years', '65_to_69_years', 
                                    '70_to_74_years', '75_to_79_years', '80_to_84_years', '85_years_and_over', '16_years_and_over', 
                                    '18_years_and_over', '21_years_and_over', '62_years_and_over', '65_years_and_over'],
                'gender_distribution': ['county_name', 'male_population', 'female_population'],
                'household_composition': ['county_name', 'in_households', 'householder', 'total_households', 'married_couple_household', 
                                        'cohabiting_couple_household', 'male_householder__no_spouse_or_partner_present', 
                                        'female_householder__no_spouse_or_partner_present', 'households_with_individuals_under_18_years', 
                                        'households_with_individuals_65_years_and_over', 'median_household_income__2021']
            },
            'facility': {
                'summary': ['county_name', 'fac_count', 'stations', 'prof_np', 'chain_own'],
                'ratings': ['county_name', 'fac_star', 'xp_star', 'comm_scr', 'quality_scr', 'info_scr', 'phys_scr', 'staff_scr', 'fac_scr']
            },
            'medicare': {
                'payments': ['county_name', 'pymt_amt', 'pymt_pct', 'pymt_pc', 'pymt_per_user', 'stdz_pymt_amt', 'stdz_pymt_pct', 'stdz_pymt_pc', 'stdz_pymt_per_user'],
                'standardized_payments': ['county_name', 'stdz_pymt_amt', 'stdz_pymt_pct', 'stdz_pymt_pc', 'stdz_pymt_per_user'],
                'usage': ['county_name', 'user_cnt', 'user_pct', 'visits_per_1000']
            },
            'voter': {
                '2018': ['county_name', 'eligible_2018', 'total_registered_2018', 'democratic_2018', 'republican_2018', 'american_independent_2018', 'green_2018', 
                        'libertarian_2018', 'peace_and_freedom_2018', 'unknown_2018', 'other_2018', 'no_party_preference_2018'],
                '2020': ['county_name', 'eligible_2020', 'total_registered_2020', 'democratic_2020', 'republican_2020', 'american_independent_2020', 'green_2020', 
                        'libertarian_2020', 'peace_and_freedom_2020', 'unknown_2020', 'other_2020', 'no_party_preference_2020'],
                '2022': ['county_name', 'eligible_2022', 'total_registered_2022', 'democratic_2022', 'republican_2022', 'american_independent_2022', 'green_2022', 
                        'libertarian_2022', 'peace_and_freedom_2022', 'unknown_2022', 'other_2022', 'no_party_preference_2022']
            }
        }
        return views

    
    def add_view(self, table_name, view_name, columns):
        if table_name in self.tables:
            table = self.tables[table_name]
            current_views = self.tables[table_name]._views
            table._views[view_name] = columns
        else:
            raise ValueError(f"Table '{table_name}' not found")
    
    def get_view(self, table_name, view_name):
        if table_name in self.tables:
            table = self.tables[table_name]
            return table.get_view(view_name)
        else:
            raise ValueError(f"Table '{table_name}' not found")

    def merge_views(self, views_list):
        merged_df = None
        for table_name, view_name in views_list:
            view_df = self.get_view(table_name, view_name)
            if merged_df is None:
                merged_df = view_df
            else:
                merged_df = pd.merge(merged_df, view_df, on='county_name', how='outer')
        return merged_df
    
    def merged_all(self):
        merged_df = None
        for df_name, df in self.dataframes.items():
            # Drop unnamed columns (usually from the index or if they were introduced by error during data loading)
            df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

            # Ensure the 'county_name' column exists in the dataframe
            if 'county_name' not in df.columns:
                raise KeyError(f"'county_name' not found in dataframe '{df_name}'")

            if merged_df is None:
                merged_df = df  # Start with the first dataframe
            else:
                # Merge the next dataframe on 'county_name' with custom suffixes for duplicate column names
                merged_df = pd.merge(merged_df, df, on='county_name', how='outer', suffixes=('_left', '_right'))
                
                # Optionally, handle duplicate columns after the merge if needed
                merged_df = self.handle_duplicates(merged_df)

        return merged_df

    def handle_duplicates(self, merged_df):
        """
        Optionally handle duplicated columns by removing or renaming them based on the logic you need.
        For example, you could drop columns with a certain suffix.
        """
        # Example: Drop columns with the '_right' suffix
        cols_to_drop = [col for col in merged_df.columns if col.endswith('_right')]
        merged_df = merged_df.drop(columns=cols_to_drop)

        # Alternatively, you can rename them or handle duplicates differently.
        # merged_df = merged_df.rename(columns={...})

        return merged_df

        

# # Usage example:
# db = Database()

# # Access entire table
# ballot_full = db.ballot()

# # Access a specific view
# ballot_2020 = db.ballot._2020

# # Merge views
# merged_data = db.merge_views([('ballot', '2020'), ('demo', 'race_ethnicity')])

# # Print the columns in the merged data
# print("\nColumns in merged data:")
# print(merged_data.columns.tolist())

In [91]:
db = Database()
db.merged

MergeError: Passing 'suffixes' which cause duplicate columns {'unnamed__0_left'} is not allowed.

In [73]:
for table_name, table in db.tables.items():
    print(f"\nTable: {table_name}")
    for view_name in table._views:
        print(f"View: {view_name}")
        print(table._views[view_name])


Table: ballot
View: 2018
['county_name', 'yes_count_2018', 'no_count_2018', 'total_count_2018', 'yes_perc_2018', 'no_perc_2018']
View: 2020
['county_name', 'yes_count_2020', 'no_count_2020', 'total_count_2020', 'yes_perc_2020', 'no_perc_2020']
View: 2022
['county_name', 'yes_count_2022', 'no_count_2022', 'total_count_2022', 'yes_perc_2022', 'no_perc_2022']
View: all_counts
['county_name', 'county_name', 'yes_count_2020', 'no_count_2020', 'total_count_2020', 'yes_count_2018', 'no_count_2018', 'total_count_2018', 'yes_count_2022', 'no_count_2022', 'total_count_2022']
View: all_percentages
['county_name', 'yes_perc_2020', 'no_perc_2020', 'yes_perc_2018', 'no_perc_2018', 'yes_perc_2022', 'no_perc_2022']
View: test
['county_name', 'yes_count_2020']

Table: demo
View: population
['county_name', 'population_january_2023', 'median_household_income_2021']
View: race_ethnicity
['county_name', 'one_race', 'some_other_race', 'two_or_more_races', 'total_races_tallied', 'white_alone_or_in_combinati