In [6]:
import pandas as pd
import numpy as np

def load_data(filepath):
    """
    Read the survey responses from a CSV file into a DataFrame.

    Parameters:
    - filepath (str): Location of the CSV file; passed straight to
      ``pd.read_csv`` with default parsing options.

    Returns:
    - pd.DataFrame: The parsed contents of the file.
    """
    survey_frame = pd.read_csv(filepath)
    return survey_frame

def calculate_weights(data, population_distribution):
    """
    Calculate post-stratification weights based on population proportions.

    Every row starts with a weight of 1. For each column listed in
    ``population_distribution`` and each category of that column, the rows
    belonging to the category are multiplied by
    ``population proportion / sample proportion``. The factors of the
    different columns are multiplied together, and the result is rescaled so
    that the weights sum to the number of rows.

    Rows whose value is missing or is not listed for a column keep a factor
    of 1 for that column. A category that is listed but does not occur in the
    sample cannot be weighted (there are no rows to adjust) and is skipped.

    Parameters:
    - data (pd.DataFrame): Survey sample data.
    - population_distribution (dict): Maps a column name to a dict of
      ``{category: target population proportion}``.

    Returns:
    - pd.Series: Calculated weights for each row, indexed like ``data``.

    Raises:
    - KeyError: If a column named in ``population_distribution`` is not in
      ``data``.
    """
    weights = pd.Series(np.ones(len(data)), index=data.index)

    for column, targets in population_distribution.items():
        # Proportions are computed over non-missing values only
        # (value_counts drops NaN by default).
        sample_distribution = data[column].value_counts(normalize=True)

        for category, pop_prop in targets.items():
            if category not in sample_distribution:
                # No sampled rows carry this category, so nothing to adjust.
                continue

            sample_prop = sample_distribution[category]
            if sample_prop == 0:
                # Categorical columns report unused categories with a zero
                # share; skip them to avoid dividing by zero.
                continue

            weights[data[column] == category] *= pop_prop / sample_prop

    # Normalize weights to keep the total sum aligned with the sample size.
    # A zero total (empty data or all-zero targets) is left as is, because
    # dividing by it would turn every weight into NaN.
    total = weights.sum()
    if total != 0:
        weights /= total / len(data)

    return weights

def apply_weights(data, weights, cap=2.5):
    """
    Attach weights to a copy of the data and cap extreme values.

    The input frame is not modified. Weights above ``cap`` are replaced by
    ``cap``; missing weights stay missing. The capped weights are not
    rescaled afterwards, so their sum can be lower than before capping.

    Parameters:
    - data (pd.DataFrame): Survey data.
    - weights (pd.Series): Calculated weights; a Series is aligned to
      ``data`` on its index when assigned.
    - cap (float or None): Maximum allowed weight value. ``None`` disables
      capping.

    Returns:
    - pd.DataFrame: Copy of ``data`` with a ``'weights'`` column added
      (or overwritten if it already exists).
    """
    weighted = data.copy()
    weighted['weights'] = weights

    if cap is not None:
        # clip only lowers values above the cap and leaves NaN untouched.
        weighted['weights'] = weighted['weights'].clip(upper=cap)

    return weighted

def export_weights(data, id_column, output_path):
    """
    Write the identifier column and the 'weights' column to a CSV file.

    Parameters:
    - data (pd.DataFrame): Survey data that already contains a 'weights' column.
    - id_column (str): Name of the column holding the unique identifiers.
    - output_path (str): Destination of the CSV file.

    The row index is not written to the file.
    """
    selected_columns = [id_column, 'weights']
    export_frame = data[selected_columns]
    export_frame.to_csv(output_path, index=False)


In [9]:
# --- Example usage ---

# Input survey file and destination of the exported weights.
# NOTE(review): '**ID**' looks like a placeholder for a user name — replace
# it with a real directory before running.
input_path = '/Users/**ID**/Downloads/survey_data.csv'
output_path = '/Users/**ID**/Downloads/survey_data_weight.csv'

# Expected population proportions.
# Outer keys are column names in the survey data; inner keys are the category
# values of that column and inner values are the target proportions.
# NOTE(review): 'non-STEMB' may be a typo for 'non-STEM' — confirm against the
# values actually present in the 'major2' column.
population_distribution = {
    'schooltype3': {
        '2-year college': 0.2,
        '4-year university': 0.58,
        'graduate school': 0.22,
    },
    'age3': {
        '15-17': 0.01,
        '18-19': 0.2262,
        '20-24': 0.4360,
        '25-34': 0.1938,
        '35+': 0.1340,
    },
    'gender2': {
        'male': 0.41,
        'female': 0.57,
        'Non-Binary': 0.01,
        'Prefer not to say': 0.01,
    },
    'major2': {
        'STEM': 0.5,
        'non-STEMB': 0.5,
    },
}



In [10]:
# Run the pipeline: load -> compute weights -> cap -> export.
data = load_data(filepath=input_path)
weights = calculate_weights(
    data=data,
    population_distribution=population_distribution,
)

# `cap` is the maximum weight kept after capping; adjust it as needed.
data_with_weights = apply_weights(data=data, weights=weights, cap=2.5)

# Set `id_column` to the name of the identifier column in the survey file.
export_weights(
    data=data_with_weights,
    id_column='uuid',
    output_path=output_path,
)

print(f"Output file generated at: {output_path}")