# Foundations of CSS Final Group Project: *Question Goes Here*

TODO: Add link to research poster and report

## Prerequisites

### Install Dependencies

In [87]:
# !pip install pyreadstat pandas numpy matplotlib wbgapi pycaret

### Import Libraries

In [88]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import zipfile
import shutil
import re
import wbgapi as wb

### Decompress Data

In [89]:
# Unpack the bundled archive into the local "data" directory.
with zipfile.ZipFile("data.zip", mode="r") as archive:
    archive.extractall(path="data")

# After changing files in "data", rebuild the archive with Python:
# shutil.make_archive("data", 'zip', "data")
# or from a Linux shell with `zip data.zip data/ -9 -r`.
# NOTE(review): the shell variant presumably stores paths with a leading "data/"
# prefix while the Python variant does not - verify that both extract to the same layout.


### Load and Prepare Data

#### Attitudes Data


International Social Survey Programme: Environment I-IV Cumulation. \
ISSP Research Group (2024) \
GESIS, Cologne. \
ZA8793 Data file Version 1.0.0 \
https://doi.org/10.4232/1.14332

See [terms of use](https://www.gesis.org/fileadmin/upload/dienstleistung/daten/umfragedaten/_bgordnung_bestellen/2023-06-30_Nutzungsbedingungen.pdf) (category A) for more information.

The survey data is also used in [Long-run trends in partisan polarization of climate policy-relevant attitudes across countries](https://doi.org/10.1080/09644016.2024.2403957), which we partly reproduce here.

In [90]:
# Load the cumulated ISSP Environment survey (Stata format) extracted in the previous step.
# The mappers defined below expect labelled answers such as "2. agree somewhat".
survey_data_raw_df = pd.read_stata(
    "data/ZA8793_v1-0-0_survey.dta",
)

There are 145 columns in the dataset.
In the following, we make a selection of relevant ones
and additionally group certain questions into
"support for public" vs "support for individual" action.

In [91]:
# Define mapping functions to support getting survey data in usable format

def extract_integer(x: str | int | float):
    if (type(x) == int): return x
    if (type(x) == float): return int(x)

    return int(x.split('.')[0]) # assuming format like 2. agree somewhat

# The functions below are helpers for mapping Likert-scale responses (codes 1-5 for agree/disagree items, or 1-4) to floats.
# The responses are stretched to values between -2 and 2.
# See the next cell for how they are used.
def likert_scale_to_float(input: str | int | float, invert: bool, shift: float):
    """
    Convert one Likert response into a float centred by ``shift``.

    Parameters:
        input: raw response; its integer code is obtained via ``extract_integer``.
        invert: when equal to ``False`` the shifted code is returned as is,
            otherwise its sign is flipped.
        shift: offset added to the code (e.g. ``-3`` centres codes 1..5 on 0).

    Returns:
        ``np.nan`` for negative codes, otherwise the shifted (and possibly
        sign-flipped) code as ``float``.
    """
    code = extract_integer(input)

    # Negative codes are treated as "no usable answer".
    if code < 0:
        return np.nan

    centred = float(code) + shift
    # The equality check (rather than `not invert`) is kept on purpose so that
    # non-bool arguments behave exactly as before.
    if invert == False:
        return centred
    return centred * -1

def likert_scale_to_float_5(input):
    """Map a 5-point response (codes 1..5) to -2..2, keeping the direction of the code."""
    centre_offset = -3  # code 3 becomes 0
    return likert_scale_to_float(input, False, centre_offset)

def likert_scale_to_float_invert_5(input):
    """Map a 5-point response (codes 1..5) to 2..-2, i.e. code 1 yields the highest value."""
    centre_offset = -3  # code 3 becomes 0
    return likert_scale_to_float(input, True, centre_offset)

def likert_scale_to_float_invert_4(input, shift=-2.5):
    """
    Map a 4-point response (codes 1..4) to 2..-2, i.e. code 1 yields the highest value.

    With the default ``shift`` the codes are centred to -1.5..1.5 and then
    stretched by 4/3 so that the extremes land on +/-2, the same range the
    5-point helpers produce. (The previous factor 2/3 only reached +/-1, which
    gave 4-point items half the spread of 5-point items in the weighted scores.)

    Returns ``np.nan`` for negative (missing) codes.
    """
    return likert_scale_to_float(input, invert=True, shift=shift) * 4 / 3

def likert_scale_to_float_4(input, invert=False):
    """
    Map a 4-point response (codes 1..4) to -2..2 (or 2..-2 when ``invert`` is set).

    Codes are centred with a shift of -2.5 and stretched by 4/3, so the values
    are -2, -2/3, 2/3 and 2. Returns ``np.nan`` for negative (missing) codes.
    """
    return likert_scale_to_float(input, invert, shift=-2.5) * 4 / 3


In [92]:
# Comments beginning with letters (abcd) indicate which studies contain the question, if not all (1993, 2000, 2010, 2020).
# Some mapping comments contain the exact question asked in the survey.
# Detailed explanation of variables can be found here: https://search.gesis.org/research_data/ZA8793#variables|exploredata-ZA8793_VarWRKHRS|0|variable_order|asc

# The mapping below is used to rename the original column names
# and to define functions that convert the columns, if necessary.
# Structure: raw column name -> { "name": new column name, "mapper": per-value conversion function or None }.
# A mapper of None means the column is only renamed and its values are kept unchanged.
# Not all of the dataset's questions are mapped (and kept).
# The ones listed here are all that seemed vaguely interesting,
# but for the main analysis we only consider a subset.
mapping = {
  "cumu_id": {  "name": 'id',
                "mapper": int },
  "year": {     "name": 'year',
                "mapper": extract_integer },
  # The regex captures the upper-case code between "<number>. " and the first "-" of the label.
  "country": {  "name": 'country_iso2',
                "mapper": lambda country: re.search(r'^\d+\. ([A-Z]+)-', country).group(1) },
  "AGE": {      "name": 'age',
                "mapper": extract_integer },
  "SEX": {      "name": 'sex',
                "mapper": None },
  # Codes of 30 or more are replaced by -2 (presumably non-substantive codes - TODO confirm against the codebook).
  "EDUCYRS": {  "name": 'education_years',
                "mapper": lambda year: extract_integer(year) if extract_integer(year) < 30 else -2 },
  'MARITAL': {  "name": 'marital_status',
                "mapper": None },
  "COHAB": {    "name": 'steady_life_partner',
                "mapper": None },
  "DEGREE": {   "name": 'highest_degree',
                "mapper": None },
  "WORK": {     "name": 'work_currently_former_never',
                "mapper": None },
  "WRKHRS": {   "name": 'weekly_work_hours',
                "mapper": extract_integer },
  "EMPREL": {   "name": 'self_employed',
                "mapper": None },
  "WRKSUP": {   "name": 'work_supervises',
                "mapper": None },
  "ISCO08": {   "name": 'work_type_isco',
                "mapper": None },
  "MAINSTAT": { "name": 'employed_self_not',
                "mapper": None },
  "UNION": {    "name": 'work_member_of_union',
                "mapper": None },
  "HOMPOP": {   "name": 'household_size',
                "mapper": None },
  "CHILDHH": {  "name": 'children_in_household',
                "mapper": None },
  "PARTY_LR1": {"name": 'party_lef_right_derived',
                "mapper": None },
  "PARTY_LR2": {"name": 'party_lef_right_asked',
                "mapper": None },
  "VOTE_LE": {  "name": 'party_did_vote',
                "mapper": None },
  "URBRURAL": { "name": 'urban_or_rural',
                "mapper": None }, # Would you describe the place where you live as…
  "INCOME": {   "name": 'income_relative',
                "mapper": None },
  # The label "1. No weighting" becomes the neutral weight 1; every other value is passed through unchanged.
  "WEIGHT": {   "name": 'regional_weight_factor',
                "mapper": lambda w: 1 if w == "1. No weighting" else w },

  # Attitude items. The likert_* mappers turn response codes into floats centred on 0 and
  # return NaN for negative codes; the "invert" variants flip the sign so that code 1 is the highest value.
  "v5": {       "name": 'most_important_env_problem',
                "mapper": None }, # cd
  "v8": {       "name": 'trust_most',
                "mapper": None },  # cd
  "v9": {       "name": 'believe_too_often_in_science',
                "mapper": None }, # abc
  "v10": {      "name": 'science_more_harm_than_good',
                "mapper": None }, # abc
  "v11": {      "name": 'science_solves_env_problems',
                "mapper": likert_scale_to_float_invert_5 },
  "v12": {      "name": 'worry_too_much_about_env_not_prices_jobs',
                "mapper": None },
  "v13": {      "name": 'modern_life_harms_env',
                "mapper": likert_scale_to_float_5 },
  "v14": {      "name": 'ppl_worry_too_much_progress_harms_env', 
                "mapper": likert_scale_to_float_5 }, # People worry too much about human progress harming the environment.
  "v15": {      "name": 'env_needs_econ_growth',
                "mapper": likert_scale_to_float_5 }, # In order to protect the environment [COUNTRY] needs economic growth.
  "v17": {      "name": 'econ_growth_harms_env',
                "mapper": likert_scale_to_float_5 },
  "v18": {      "name": 'population_growth_unsustainable',
                "mapper": likert_scale_to_float_5 },
  "v20": {      "name": 'support_indiv_pay_more',
                "mapper":  likert_scale_to_float_invert_5},
  "v21": {      "name": 'support_pay_higher_taxes',
                "mapper": likert_scale_to_float_invert_5 },
  "v22": {      "name": 'support_cut_living_standards',
                "mapper": likert_scale_to_float_invert_5 },
  "v23": {      "name": 'indiv_action_too_difficult',
                "mapper": likert_scale_to_float_invert_5 }, # It is just too difficult for someone like me to do much about the environment.
  "v24": {      "name": 'indiv_support_sacrifice',
                "mapper": likert_scale_to_float_invert_5 }, # I do what is right for the environment, even when it costs more money or takes more time.
  "v25": {      "name": 'indiv_less_important',
                "mapper": None }, # There are more important things to do in life than protect the environment.
  "v26": {      "name": 'support_only_with_others',
                "mapper": likert_scale_to_float_5 }, # bcd; There is no point in doing what I can for the environment unless others do the same.
  "v27": {      "name": 'threats_exaggerated',
                "mapper": likert_scale_to_float_invert_5 }, # bcd
  "v28": {      "name": 'hard_to_know_right_wrong',
                "mapper": None }, # cd; I find it hard to know whether the way I live is helpful or harmful to the environment.
  "v33": {      "name": 'climate_change_no_hole_atmosphere',
                "mapper": likert_scale_to_float_4 }, # abc, Climate change [/ greenhouse effect] is caused by a hole in the earth's atmosphere. (afterwards inverted)
  "v34": {      "name": 'climate_change_oil_gas',
                "mapper": likert_scale_to_float_invert_4 }, # abc, Every time we use coal or oil or gas, we contribute to climate change [/ greenhouse effect].
  "v35": {      "name": 'env_problems_everyday_life',
                "mapper": likert_scale_to_float_invert_5 }, # cd
  "v36": {      "name": 'car_air_poll_danger_env',
                "mapper": likert_scale_to_float_invert_5 }, # Air pollution caused by cars is (dangerous/ not dangerous) for environment
  "v37": {      "name": 'car_air_poll_danger_indiv',
                "mapper": likert_scale_to_float_invert_5 }, # Air pollution caused by cars is (dangerous/ not dangerous) for you and your family
  "v39": {      "name": 'industry_air_poll_danger_env',
                "mapper": likert_scale_to_float_invert_5 }, # In general, do you think that air pollution caused by industry is (dangerous/ not dangerous)
  "v42": {      "name": 'temp_rise_danger_env',
                "mapper": likert_scale_to_float_invert_5 }, # bcd In general, do you think that a rise in the world's temperature caused by climate change is (dangerous/ not dangerous)
  # Binary indicators: 1 only for response code 2, 0 for every other value (including negative codes).
  # NOTE(review): non-responses therefore count as 0 rather than NaN and are invisible to the
  # missing-value filter applied later - confirm this is intended.
  "v44": {      "name": 'gov_laws_not_indiv',
                "mapper": lambda val: 1 if extract_integer(val) == 2 else 0 }, # abc Government or ordinary people: decide themselves how to protect environment (2 = government)
  "v45": {      "name": 'gov_laws_not_business',
                "mapper": lambda val: 1 if extract_integer(val) == 2 else 0 }, # abc Government or business: decide themselves how to protect environment
  "v47": {      "name": 'internation_agreement_support',
                "mapper": likert_scale_to_float_invert_5 }, # bc
  # v50/v51 are each used twice: once as "forced action" indicator (answer 1 or 2) and once,
  # under the duplicated raw names v50_2/v51_2, as "voluntary action" indicator (answer 3).
  # Every label that does not match becomes 0.
  # NOTE(review): this also applies to non-response labels - confirm they should not be NaN.
  "v50": {      "name": 'gov_force_business',
                "mapper": lambda val: 1 if val.startswith('1. Heavy fines') or val.startswith('2. Use the tax system') else 0 }, # cd Which of these approaches do you think would be the best way of getting business and industry in [COUNTRY] to protect the environment?
  "v50_2": {      "name": 'gov_voluntary_business',
                "mapper": lambda val: 1 if val.startswith('3. More information and education') else 0 }, # cd Which of these approaches do you think would be the best way of getting business and industry in [COUNTRY] to protect the environment?
  "v51": {      "name": 'gov_force_indiv',
                "mapper": lambda val: 1 if val.startswith('1. Heavy fines') or val.startswith('2. Use the tax system') else 0 }, # cd Which of these approaches do you think would be the best way of getting people and their families in [COUNTRY] to protect the environment?
  "v51_2": {      "name": 'gov_voluntary_indiv',
                "mapper": lambda val: 1 if val.startswith('3. More information and education') else 0 }, # cd Which of these approaches do you think would be the best way of getting people and their families in [COUNTRY] to protect the environment?
  # Frequency items: the inverted 5-point value is shifted by +2, so code 1 maps to 4 and code 5 maps to 0
  # (negative codes stay NaN).
  "v52": {      "name": 'indiv_avoid_buying',
                "mapper": lambda val: likert_scale_to_float_invert_5(val) + 2 }, # cd And how often do you avoid buying certain products for environmental reasons?
  "v53": {      "name": 'indiv_sort_waste',
                "mapper": lambda val: likert_scale_to_float_invert_5(val) + 2 }, # ac How often do you make a special effort to sort glass or tins or plastic or newspapers and so on for recycling?
  "v54": {      "name": 'indiv_buy_organic',
                "mapper": lambda val: likert_scale_to_float_invert_5(val) + 2 }, # ac How often do you make a special effort to buy fruit and vegetables grown without pesticides or chemicals?
  "v55": {      "name": 'indiv_avoid_car',
                "mapper": lambda val: likert_scale_to_float_invert_5(val) + 2 }, # abc And how often do you cut back on driving a car for environmental reasons?
  # Yes/no items: 1 for a "1. Yes" answer, 0 for everything else.
  # Note that v56 requires the label to equal '1. Yes' exactly, while v57-v59 only check the prefix.
  "v56": {      "name": 'member_group_preserve_env',
                "mapper": lambda val: 1 if val == '1. Yes' else 0 }, # Are you a member of any group whose main aim is to preserve or protect the environment?
  "v57": {      "name": 'signed_petition',
                "mapper": lambda val: 1 if val.startswith('1. Yes') else 0 }, # In the last five years, have you signed a petition about an environmental issue?
  "v58": {      "name": 'donated_money',
                "mapper": lambda val: 1 if val.startswith('1. Yes') else 0 }, # In the last five years, Given money to an environmental group?
  "v59": {      "name": 'protest',
                "mapper": lambda val: 1 if val.startswith('1. Yes') else 0 }, # In the last five years, Taken part in a protest or demonstration about an environmental issue?
}

# v50 and v51 are each mapped twice (supports forced action vs. supports voluntary action),
# so the raw columns are duplicated under the second raw name used in `mapping`.
for duplicate_column, source_column in (('v50_2', 'v50'), ('v51_2', 'v51')):
  survey_data_raw_df[duplicate_column] = survey_data_raw_df[source_column]


# raw column name -> new column name
column_naming_map = { raw_name: spec["name"] for raw_name, spec in mapping.items() }

# Keep only the mapped columns and give them their new names.
survey_df = survey_data_raw_df[list(column_naming_map)].rename(columns=column_naming_map)

# Some columns need their responses converted, e.g. '1. Strongly disagree' might become -2.
# The conversion functions come from the mapping dict; entries without a mapper are left untouched.
for mapping_key, spec in mapping.items():
  mapping_fn = spec["mapper"]
  if not mapping_fn:
    continue
  new_key = spec["name"]
  survey_df[new_key] = survey_df[new_key].map(mapping_fn)

# reduced

##### Categorization & Deriving Scores

The following is a categorization of the questions into one of the following classes:
1. Awareness to assess general awareness of environmental / climate issues
2. Support for Individual action and talking points often brought up by the fossil industry
3. Support for collective action

###### Helper Functions


In [93]:


def normalize(values: pd.Series):
  """
  Min-max scale a series to the range [0, 1].

  Parameters:
    values: numeric series; it is not modified. NaN entries stay NaN.

  Returns:
    A new float series where the minimum maps to 0 and the maximum to 1.
    If all non-missing values are identical (or there are none), the spread is
    zero/undefined and the shifted values (all 0 or NaN) are returned instead
    of dividing by zero.
  """
  lowest = values.min()
  spread = values.max() - lowest
  shifted = values - lowest

  # `not spread > 0` is also true for NaN (empty or all-NaN input).
  if not spread > 0:
    return shifted.astype(float)

  return shifted / spread

def calc_weighted_sum(df: pd.DataFrame, column_weights: dict):
  """
  Calculates, per row, the weighted average of the given columns.

  Despite the name, the result is an average: the weighted sum of the answered
  columns divided by the sum of their weights. Missing values (NaN) are left out
  of both, so a row is scored on the questions it actually answered.

  Parameters:
    df: data frame containing (at least) the columns named in ``column_weights``.
    column_weights: dict of column name -> weight.

  Returns:
    1-d float numpy array with one value per row of ``df``. Rows without any
    answered column (or with a zero weight total) are NaN.
  """
  if not column_weights:
    return np.full(len(df), np.nan)

  columns = list(column_weights)
  weights = np.asarray([column_weights[column] for column in columns], dtype=float)

  values = df[columns].to_numpy(dtype=float)
  answered = ~np.isnan(values)

  # Missing answers contribute neither to the numerator nor to the weight total.
  weighted_total = np.where(answered, values, 0.0) @ weights
  weight_total = answered.astype(float) @ weights

  averages = np.full(len(df), np.nan)
  np.divide(weighted_total, weight_total, out=averages, where=weight_total != 0)

  return averages


###### Classification and Weighting of Responses in Three Categories

The weights are not chosen based on previous research but are rather a good guess.

In [94]:
# Column name -> weight for the "awareness" score.
awareness_questions = dict(
  climate_change_oil_gas=5,
  modern_life_harms_env=2,
  car_air_poll_danger_env=1,
  car_air_poll_danger_indiv=1,
  industry_air_poll_danger_env=1.5,
  temp_rise_danger_env=3,  # weight still uncertain
  # needs to be reverse-interpreted
  climate_change_no_hole_atmosphere=2,
)

# Column name -> weight for the "individual action" score.
propaganda_questions = dict(
  science_solves_env_problems=3,
  support_indiv_pay_more=1,
  support_cut_living_standards=1,
  indiv_support_sacrifice=1,
  indiv_avoid_buying=1,
  indiv_sort_waste=1,
  indiv_buy_organic=1,
  indiv_avoid_car=1,
  gov_voluntary_business=2,
  env_needs_econ_growth=2,
  ppl_worry_too_much_progress_harms_env=1,
  threats_exaggerated=1,
)

# Column name -> weight for the "collective action" score.
collective_action_questions = dict(
  support_pay_higher_taxes=1,
  indiv_action_too_difficult=1,
  support_only_with_others=2,
  internation_agreement_support=2,
  protest=3,
  member_group_preserve_env=1,
  gov_laws_not_indiv=1,
  gov_laws_not_business=1,
  gov_force_indiv=1,
)


###### Calculate Scores for Categories

In [95]:
# Remove respondents with more than 3 missing responses in any of the three categories.
# `.copy()` makes the filtered frame independent of the unfiltered one, so the
# score columns assigned below do not trigger pandas' SettingWithCopyWarning.
survey_df = survey_df[
    (survey_df[list(awareness_questions)].isnull().sum(axis=1) <= 3)
  & (survey_df[list(propaganda_questions)].isnull().sum(axis=1) <= 3)
  & (survey_df[list(collective_action_questions)].isnull().sum(axis=1) <= 3)
].copy()


# Calculate the weighted-average score per category (not normalized, see TODO below).
survey_df['score_awareness'] = calc_weighted_sum(survey_df, awareness_questions)
survey_df['score_individual_action'] = calc_weighted_sum(survey_df, propaganda_questions)
survey_df['score_collective_action'] = calc_weighted_sum(survey_df, collective_action_questions)

# TODO: We either need to normalize or adjust the questions with 4 response options to have the same range.
# survey_df['score_awareness'] = normalize(survey_df['score_awareness'])
# survey_df['score_individual_action'] = normalize(survey_df['score_individual_action'])
# survey_df['score_collective_action'] = normalize(survey_df['score_collective_action'])

In [96]:
# Filter respondents with very low awareness since they are not interesting to look at.
# TODO: This seems to be a sensible default and is done in our main reference paper. But does it make sense to keep it here too?
# survey_df = survey_df[survey_df['score_awareness'] > 0.5]

###### Show Histograms of Normalized Scores

In [None]:
# First figure: individual vs. collective action scores overlaid on the same axes.
overlaid_histograms = (
  ('score_individual_action', "Focus: Individual Action", "red"),
  ('score_collective_action', "Focus: Collective Action", "blue"),
)
for score_column, legend_label, bar_color in overlaid_histograms:
  survey_df[score_column].hist(bins=30, label=legend_label, alpha=0.4, color=bar_color)
plt.legend()
plt.show()

# Second figure: awareness score on its own.
survey_df['score_awareness'].hist(bins=30, label="Focus: Awareness", alpha=0.5, color="green")
plt.legend()

#### Fossil dependence data (Fossil Rent as Fraction of GDP)

In [None]:
# ISO2 codes of the countries that are still present after filtering the survey.
survey_iso2_codes = pd.Series(survey_df['country_iso2'].unique())

# Raw labels are expected to look like "<number>. <ISO2>-<name>" (same assumption as the
# `country` mapper). Both the ISO2 code and the name are derived from the SAME raw label, so the
# pairs cannot get out of step when filtering drops a country from `survey_df`.
raw_country_labels = pd.Series(survey_data_raw_df['country'].unique())
survey_country_names = raw_country_labels.apply(lambda st: st.split("-")[1])
survey_wb_economy_codes = wb.economy.coder(survey_country_names)

# For an iterable of names the coder result is expected to be dict-like (name -> World Bank code).
# Look the codes up by name instead of relying on positional/index alignment;
# fall back to positional order if a plain sequence is returned.
if isinstance(survey_wb_economy_codes, dict):
  country_iso3_codes = [survey_wb_economy_codes.get(name) for name in survey_country_names]
else:
  country_iso3_codes = list(survey_wb_economy_codes)

country_code_df = pd.DataFrame({
  'country_iso2': [re.search(r'^\d+\. ([A-Z]+)-', label).group(1) for label in raw_country_labels],
  'country_iso3': country_iso3_codes,
})

# Keep one row per code pair, restricted to countries that remain in the filtered survey.
country_code_df = (
  country_code_df[country_code_df['country_iso2'].isin(survey_iso2_codes)]
  .drop_duplicates()
  .reset_index(drop=True)
)

In [100]:
# Download fossil rents (% of GDP) from the World Bank for the surveyed
# countries and the four survey years; one column per indicator.
fossil_dependence_df = wb.data.DataFrame(
  [
    'NY.GDP.NGAS.RT.ZS',  # natural gas rents
    'NY.GDP.COAL.RT.ZS',  # coal rents
    'NY.GDP.PETR.RT.ZS',  # oil rents
  ],
  economy=survey_wb_economy_codes,
  time=[1993, 2000, 2010, 2020],
  columns='series',
  skipBlanks=True,
  numericTimeKeys=True,
).reset_index()

In [None]:
# Total fossil rent = natural gas + coal + oil rents.
# min_count=1 keeps the total NaN when all three indicators are missing for a
# country/year, instead of silently reporting a fossil rent of 0.
fossil_dependence_df['fossil_rent'] = fossil_dependence_df[
  ['NY.GDP.NGAS.RT.ZS', 'NY.GDP.COAL.RT.ZS', 'NY.GDP.PETR.RT.ZS']
].sum(axis=1, min_count=1)
fossil_dependence_df

### Merge Datasets

In [None]:
# Add iso2 country codes to wb data.
# Code columns left over from a previous run of this cell are dropped first, so that
# re-running the cell does not produce suffixed duplicates (country_iso2_x / country_iso2_y).
fossil_dependence_df = (
  fossil_dependence_df
  .drop(columns=['country_iso2', 'country_iso3'], errors='ignore')
  .merge(country_code_df, left_on='economy', right_on='country_iso3')
)

# merge survey and wb data by time and country (for a year and country, each respondent gets the same fossil dependence scores)
merged_df = fossil_dependence_df.merge(
  survey_df,
  left_on=['time', "country_iso2"],
  right_on=['year', 'country_iso2'],
)

## Analysis

### Variables

- **Independent Variable**: Fossil fuel dependence
- **Dependent Variable**: Support for individual vs collective action?
- **Mediator**: Climate change awareness

### Identify Awareness, Public, and Collective Action variables

In [7]:
# Model


