In [1]:
import pandas as pd
from var_names import *
# import numpy as np
import yaml

In [2]:
# read variables' categories file
def read_categories_file(file_path):
    """Read variables and their categories from a YAML file.

    Parameters
    ----------
    file_path : str or path-like
        Location of the YAML file to load.

    Returns
    -------
    dict
        The parsed YAML content (a mapping for the layout this notebook
        expects). An empty dict is returned, after printing a message, when
        the file cannot be opened or is not valid YAML, and also when the
        file is empty.
    """
    try:
        # YAML is UTF-8 by convention; do not depend on the platform's locale default.
        with open(file_path, 'r', encoding='utf-8') as file:
            categories_dictionary = yaml.safe_load(file)
    except IOError as e:
        print(f"An error occurred: {e.strerror}")
        return {}
    except yaml.YAMLError as exc:
        print(f"YAML error: {exc}")
        return {}

    # safe_load yields None for an empty document; fold that into the same
    # empty-dict fallback as the error paths so callers never receive None.
    if categories_dictionary is None:
        return {}
    return categories_dictionary

In [3]:
# Map each variable to its respective category set
def apply_categories_to_variables(categories_dictionary):
    """Link every variable (question) to the category set it refers to.

    Parameters
    ----------
    categories_dictionary : dict
        Must contain ``'categories'`` (category-set name -> possible values)
        and ``'variables'`` (an iterable of mappings, each with a ``'name'``
        and a ``'category_set'`` key naming one of the category sets).

    Returns
    -------
    dict
        ``{variable name: category set}``. The values are the very objects
        stored under ``'categories'``, not copies.

    Raises
    ------
    KeyError
        If a required key is missing or a variable names an unknown
        category set.
    """
    available_sets = categories_dictionary['categories']
    declared_variables = categories_dictionary['variables']

    # One entry per declared variable; a repeated name keeps its last declaration.
    return {
        entry['name']: available_sets[entry['category_set']]
        for entry in declared_variables
    }

In [4]:
def frequency_distribution(series, variable_categories):
    """Tabulate counts and percentages of a Series over a fixed list of categories.

    Parameters
    ----------
    series : pandas.Series
        Observed values to tabulate.
    variable_categories : list
        Every category that should get a row, in display order. Categories
        that never occur are reported with a count of 0.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``variable_categories`` and two columns:
        ``'N'`` (count) and ``'%'`` (share of all observations, rounded to
        two decimals and rendered as text such as ``'18.87%'``; a zero share
        is rendered as ``'0%'``).

    Notes
    -----
    Observations that are missing or not listed in ``variable_categories``
    get no row of their own but still count towards the percentage
    denominator, so the ``'%'`` column can sum to less than 100. An empty
    ``series`` yields ``'0%'`` everywhere rather than ``'nan%'``.
    """
    # Values outside the category list become NaN in the categorical.
    categorical = pd.Categorical(series, categories=variable_categories)

    # Count once and derive the percentages from the same counts.
    # dropna=False keeps the NaN bucket so the total covers every observation.
    counts = pd.Series(categorical).value_counts(dropna=False).sort_index()

    total = counts.sum()
    if total:
        percentages = counts.div(total).mul(100).round(2)
    else:
        # No observations at all: 0/0 would produce NaN and print as 'nan%'.
        percentages = counts.astype(float)
    percentage_labels = percentages.astype(str) + '%'

    result = pd.DataFrame({
        'N': counts,
        '%': percentage_labels
    })

    # Keep exactly the requested categories, in the requested order; this
    # also discards the NaN bucket used for the denominator above.
    result = result.reindex(variable_categories, fill_value=0)
    result['%'] = result['%'].replace({'0.0%': '0%'})

    return result

In [5]:
def tabulation(dataframe, variable, categories_sets: dict):
    """Tabulate value counts for one column of a DataFrame.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data containing the column to tabulate.
    variable : str
        Name of the column; also the key looked up in ``categories_sets``.
    categories_sets : dict
        Maps variable names to the categories each variable may take.

    Returns
    -------
    pandas.DataFrame
        The table produced by ``frequency_distribution`` for the column.

    Raises
    ------
    KeyError
        If ``variable`` is not a column of ``dataframe``.

    Notes
    -----
    When ``variable`` has no entry in ``categories_sets`` the categories
    fall back to the distinct non-null values observed in the column
    (sorted when they are mutually comparable, otherwise in order of first
    appearance), so unconfigured variables no longer yield an empty table.
    """
    series = dataframe[variable]

    if variable in categories_sets:
        variable_categories = categories_sets[variable]
    else:
        observed = series.dropna().unique().tolist()
        try:
            variable_categories = sorted(observed)
        except TypeError:
            # Mixed types (e.g. str and int) cannot be ordered; keep appearance order.
            variable_categories = observed

    return frequency_distribution(series=series, variable_categories=variable_categories)

In [6]:
# Load the category definitions from the YAML file. read_categories_file
# prints a message and returns {} if the file cannot be read or parsed.
categories_dict = read_categories_file('variables_categories.yaml')
# Build the {variable name: category set} lookup that tabulation() consumes.
categories = apply_categories_to_variables(categories_dict)

In [7]:
# Load the dataset to tabulate from CSV into a DataFrame
# (the path is relative to the kernel's working directory).
baseline = pd.read_csv('tba_data.csv')

In [70]:
# Column of `baseline` to tabulate in the following cells.
variable_name = 'concept2'

In [71]:
# Tabulate the selected variable against its configured categories.
frequency = tabulation(dataframe=baseline, variable=variable_name, categories_sets=categories)
# Copy the table to the system clipboard (e.g. for pasting into a spreadsheet).
# NOTE(review): to_clipboard needs a clipboard backend, so this line may fail
# on a headless kernel and break Restart & Run All — confirm.
frequency.to_clipboard()

In [72]:
# Display the frequency table as this cell's output.
frequency

Unnamed: 0,N,%
not_at_all,2,1.89%
a_little,3,2.83%
moderate_amount,20,18.87%
very_much,41,38.68%
extreme_amount,40,37.74%
888,0,0%
999,0,0%


In [None]:
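# Draft (disabled): export one result table per variable to an Excel workbook.
# NOTE(review): `output_file`, `question` and `normalized_value_counts` are not
# defined in the visible cells — confirm where they come from before re-enabling.
#
# Create a Pandas Excel writer using XlsxWriter as the engine.
# with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
#     for var in question:
#         # Check if the variable exists in the dataframe
#         if var in baseline.columns:
#             # Apply your function to the variable, and get the results
#             result_df = normalized_value_counts(baseline[var])
#             # Write each DataFrame to a specific sheet
#             result_df.to_excel(writer, sheet_name=var)
#         else:
#             print(f"Variable {var} not found in DataFrame.")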

In [None]:
# Raw counts of each observed value of 'network_size'; by default pandas
# orders them by descending count and leaves missing values out.
baseline['network_size'].value_counts()

In [None]:
# baseline['relative_support2'].value_counts(normalize=True) * 100