Code to read ES Survey Data

Number of participants by region and language used.
The type of sample method used in the survey (simple, complex, etc.).
What is the survey representative of when you use the sample weights and when you don’t use the sample weights. This could be country, household, or individual.
The questions you would like to use in the thesis together with some descriptive statistics (for this you can omit considering the sample design and weights):
type of data: dichotomous; categorical (which categories); continuous (which range); open question (which language).
Correlation analysis

Data Reading and Preparation:

In [7]:
import pandas as pd
import numpy as np
import openpyxl
from bs4 import BeautifulSoup

In [8]:
Survey_Responses  = pd.read_csv('ESS_files\ESS10.csv', low_memory=False)
# Clean the survey questions dataframe to make it more usable for mapping
Survey_Responses

Unnamed: 0,name,essround,edition,proddate,idno,cntry,dweight,pspwght,pweight,anweight,...,vinwe,inwde,jinws,jinwe,inwtm,mode,domain,prob,stratum,psu
0,ESS10e03_2,10,3.2,02.11.2023,10038,BE,0.882220,0.972276,0.718075,0.698167,...,2022-09-01 17:47:00,2022-09-01 17:47:00,2022-09-01 17:47:00,2022-09-01 17:47:00,36.0,1,1.0,0.000397,188,2596
1,ESS10e03_2,10,3.2,02.11.2023,10053,BE,1.047643,0.888635,0.718075,0.638107,...,2022-04-08 11:07:00,2022-04-08 11:10:00,2022-04-08 11:07:00,2022-04-08 11:10:00,54.0,2,2.0,0.000334,194,2206
2,ESS10e03_2,10,3.2,02.11.2023,10055,BE,1.087741,0.722811,0.718075,0.519033,...,2022-05-20 11:08:00,2022-05-20 11:10:00,2022-05-20 11:08:00,2022-05-20 11:10:00,77.0,1,2.0,0.000322,198,2114
3,ESS10e03_2,10,3.2,02.11.2023,10062,BE,0.909910,1.005565,0.718075,0.722072,...,2022-05-22 13:58:00,2022-05-22 13:59:00,2022-05-22 13:58:00,2022-05-22 13:59:00,55.0,1,1.0,0.000385,150,2645
4,ESS10e03_2,10,3.2,02.11.2023,10064,BE,0.918949,0.638705,0.718075,0.458639,...,2022-05-18 11:44:00,2022-05-18 11:45:00,2022-05-18 11:44:00,2022-05-18 11:45:00,55.0,1,1.0,0.000381,149,2313
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37606,ESS10e03_2,10,3.2,02.11.2023,27808,SK,0.515714,0.339385,0.323800,0.109893,...,2021-06-08 14:28:34,2021-06-08 14:30:41,2021-06-08 14:29:01,2021-06-08 14:31:44,70.0,1,1.0,0.001522,2610,27206
37607,ESS10e03_2,10,3.2,02.11.2023,27826,SK,0.297974,0.196093,0.323800,0.063495,...,2021-08-02 10:33:27,2021-08-02 10:36:27,2021-08-02 10:35:22,2021-08-02 10:37:34,45.0,1,2.0,0.002635,2610,27217
37608,ESS10e03_2,10,3.2,02.11.2023,27834,SK,0.965931,0.857000,0.323800,0.277497,...,2021-06-26 20:52:15,2021-06-26 20:53:05,2021-06-26 20:52:27,2021-06-26 20:54:32,33.0,1,1.0,0.000813,2631,27134
37609,ESS10e03_2,10,3.2,02.11.2023,27846,SK,0.854279,0.624287,0.323800,0.202144,...,2021-07-21 14:14:41,2021-07-21 14:17:31,2021-07-21 14:16:38,2021-07-21 14:18:38,43.0,1,1.0,0.000919,2638,27183


In [9]:
def improveResponses(df):
    # Iterate through the unique ids
    for question_id in df['id'].unique():
        # Select all rows with the same id
        question_rows = df[df['id'] == question_id]

        #Create a filtered list of question responses that are uninformative 
        filtered_min_responses = question_rows[question_rows['value'] == question_rows['response']]

        # Check if there are multiple response options
        if len(filtered_min_responses) >= 1:
           
            # Extract the unique values
            unique_values = filtered_min_responses['value'].unique()
            
            min_value = str(int(unique_values.min()) - 1)
            max_value = str(int(unique_values.max()) + 1)
            
            min_response = question_rows[question_rows['value'] == min_value]['response'].iloc[0]
            max_response = question_rows[question_rows['value'] == max_value]['response'].iloc[0]
            
            # Iterate over each row and update the response for intermediate values
            for idx, row in filtered_min_responses.iterrows():
                if row['value'] not in [min_value, max_value]:# and row['value'] == row['response']:
                    df.at[idx, 'response'] = f"{row['value']}, where {min_value}: {min_response} and {max_value}: {max_response}"
    
    return df['response']

In [10]:
import pandas as pd
from bs4 import BeautifulSoup

# Define the path to the HTML file
html_file_path = 'ESS_files/ESS10 codebook.html'

# Initialize a list to hold the rows of the DataFrame
rows = []

# Read the HTML file
with open(html_file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()

# Parse the HTML content
soup = BeautifulSoup(html_content, 'lxml')

# Find all <h3> tags with an 'id' attribute (these contain the questions)
question_tags = soup.find_all('h3', id=True)

# Iterate over the question tags to extract the details
for tag in question_tags:
    idnumber = tag.get('id')  # Get the ID of the question
    question = tag.find_next_sibling('div').text.strip()  # Get the question text
    
    # Find the next div that possibly contains the table
    table_container = tag.find_next_sibling('div').find_next_sibling('div')
    if table_container:
        table = table_container.find('table')
    else:
        table = None

    # If a table is found, extract options
    if table:
        for row in table.find_all('tr'):
            cells = row.find_all('td')
            if len(cells) == 2:  # Ensure exactly 2 cells are found (value and response)
                value = cells[0].text.strip()
                response = cells[1].text.strip()
                # Append the extracted information as a row to the rows list
                rows.append({
                    'respid': idnumber+str(value), 
                    'id': idnumber,
                    'question': question,
                    'value': value,
                    'response': response
                })
    else:
        # If no table is found, append the question without response options
        rows.append({
            'respid': idnumber, 
            'id': idnumber,
            'question': question,
            'value': None,
            'response': None
        })

# Create a DataFrame from the rows list
Survey_Overview = pd.DataFrame(rows)

# Apply the improveResponses function to the DataFrame
Survey_Overview['response'] = improveResponses(Survey_Overview)

Survey_Overview['question_answers_combined'] = Survey_Overview['question'] +" - With response: "+ Survey_Overview['response']

# Save the DataFrame to a CSV file
Survey_Overview.to_csv("uncategorizedESS_Overview.csv", index=False, sep=',')
Survey_Overview
Survey_Overview['ADICO_Category'] = ""
Survey_Overview = Survey_Overview[Survey_Overview['value'] != None]
Survey_Overview.set_index('respid', inplace=True)
Survey_Overview = Survey_Overview.drop_duplicates()

In [11]:
Survey_Overview.sample(10)

Unnamed: 0_level_0,id,question,value,response,question_answers_combined,ADICO_Category
respid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
accalaw10,accalaw,Acceptable for country to have a strong leader...,10,Completely acceptable,Acceptable for country to have a strong leader...,
prtclesk1,prtclesk,"Which party feel closer to, Slovakia",1,Obyčajní Ľudia a nezávislé osobnosti,"Which party feel closer to, Slovakia - With re...",
lnghom1ALG,lnghom1,Language most often spoken at home: first ment...,ALG,Algonquian languages,Language most often spoken at home: first ment...,
occm14b1,occm14b,Mother's occupation when respondent 14,1,Professional and technical occupations,Mother's occupation when respondent 14 - With ...,
edlvfdpt4,edlvfdpt,"Father's highest level of education, Portugal",4,Cursos de educação e formação de tipo 1. Atrib...,"Father's highest level of education, Portugal ...",
lnghom1GRB,lnghom1,Language most often spoken at home: first ment...,GRB,Grebo,Language most often spoken at home: first ment...,
lnghom2MAK,lnghom2,Language most often spoken at home: second men...,MAK,Makasar,Language most often spoken at home: second men...,
lnghom1XAL,lnghom1,Language most often spoken at home: first ment...,XAL,"Kalmyk, Oriat",Language most often spoken at home: first ment...,
edlvebg14,edlvebg,"Highest level of education, Bulgaria",14,Visshe - Doktor,"Highest level of education, Bulgaria - With re...",
colprop99,colprop,Proportion of colleagues based at the same loc...,99,No answer*,Proportion of colleagues based at the same loc...,


ADICO Categorization of Questions

In [12]:
from typing import List, Optional
import json

from pydantic import BaseModel
from groq import Groq
# Now you can import the config module
from config import groqkey, OPENAI_Key
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
groq = Groq(api_key=groqkey)

import openai

# Make sure to set your OpenAI API key
openai.api_key = OPENAI_Key

In [15]:
models = ["gemma-7b-it", "llama3-8b-8192", "mixtral-8x7b-32768", "llama3-70b-8192"]
llm = ChatGroq(temperature=0, model=models[1], api_key=groqkey)

"Attribute: Only questions that directly ask for the responder's age, gender, location, or education. If it is not one of those it is a Condition.\n None"

def categorize_ADICO(request):
    
    system = """You are a helpful assistant that categorizes survey questions and presents them in JSON format.
    The data you will receive is a json with the following structure:
    ['id of the question':['question':'description of the question','response':'description of a potential response']]
    
    Possible categories are: 
    Aim: Question to identify if responder has performed an action,
    Condition: Question on factors might impact the responder's behavior,
    None: none of the above
    
    Format of your output:
    ['id of the question':['category':'your assigned catgory'],]
    """

    human = "{text}"
    prompt  = ChatPromptTemplate.from_messages([("system", system), ("human", human)])

    chain = prompt | llm
    response = chain.invoke({"text": request})
    return response.content

filled_ids = []

Here is the categorized survey questions in JSON format:

[
  {"anctry182010": {"category": "Condition"}},
  {"edlvesi11": {"category": "Condition"}},
  {"cntbrthdNU": {"category": "Condition"}}
]

Let me know if you have any further questions or if there's anything else I can help you with!


In [50]:
while len(Survey_Overview[Survey_Overview['ADICO_Category'] == ""]) > 0:
    set_to_categorize = Survey_Overview[Survey_Overview['ADICO_Category'] == ""][['question', 'response']].sample(min(100, len(Survey_Overview[Survey_Overview['ADICO_Category'] == ""])))
    set_to_categorizejson = set_to_categorize.to_json(orient='index', index=True)

    response = categorize_ADICO(set_to_categorizejson)
    
    if "error" in response or "Error" in response: print(response) 

    try:
        # Parse the JSON string into a Python dictionary
        data_dict = json.loads(response)
    except:
        try:
            # Parse the JSON string into a Python dictionary
            data_dict = json.loads("[" + response.split('[')[1].rsplit(']', 1)[0] + '}]')   
        except:
            data_dict = json.loads('[{' + '}'.join('{'.join(response.split('{')[1:]).split('}')[:-1]) + '}]')
            
    # Flatten the list of dictionaries into a single dictionary
    try: flattened_data = {k: v['category'] for d in data_dict for k, v in d.items()}
    except: flattened_data = {k: v for d in data_dict for k, v in d.items()}
    # Convert the dictionary into a DataFrame
    IG_component_df = pd.DataFrame(list(flattened_data.items()), columns=['respid', 'ADICO_Category']).set_index('respid', drop=True)
    
    originalCats = len(Survey_Overview[Survey_Overview['ADICO_Category'] == ""])
   
    # Update Survey_Overview with values from IG_component_df
    Survey_Overview.update(IG_component_df)
    
    newCats = len(Survey_Overview[Survey_Overview['ADICO_Category'] == ""])


    # Fill any empty Survey_Overview['ADICO_Category'] values if a row with the same Survey_Overview['id'] has a value
    ids_with_category = Survey_Overview.loc[Survey_Overview['ADICO_Category'] != "", 'id'].unique()
    
    ids_with_category = [id for id in ids_with_category if id not in filled_ids]    
    
    for _id in ids_with_category:
        categorizations_in_id = Survey_Overview.loc[Survey_Overview['id'] == _id, 'ADICO_Category'].dropna().unique()
        if len(categorizations_in_id) > 2: print(Survey_Overview[Survey_Overview['id'] == _id])
        Survey_Overview.loc[(Survey_Overview['id'] == _id) & (Survey_Overview['ADICO_Category'] == ""), 'ADICO_Category'] = Survey_Overview.loc[Survey_Overview['id'] == _id, 'ADICO_Category'].dropna().iloc[0]
        filled_ids.append(_id)


99
17


In [51]:
# Save the updated DataFrame to a new CSV file
Survey_Overview_path = 'ESSQuestionData_withADICO.csv'  # Update this path accordingly
Survey_Overview.to_csv(Survey_Overview_path, index=True)

Survey_Overview.sample(10)

Unnamed: 0_level_0,id,question,value,response,question_answers_combined,ADICO_Category
respid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
mbrncntcEC,mbrncntc,"Country of birth, mother",EC,Ecuador,"Country of birth, mother - With response: Ecuador",Condition
fbrncntcCF,fbrncntc,"Country of birth, father",CF,Central African Republic,"Country of birth, father - With response: Cent...",
regionDE259,region,Region,DE259,Nürnberger Land,Region - With response: Nürnberger Land,Condition
edlvebe16,edlvebe,"Highest level of education, Belgium",16,Universitair diploma van licentiaat of master;...,"Highest level of education, Belgium - With res...",Condition
isco088150,isco08,"Occupation, ISCO08",8150,"Textile, fur and leather products machine oper...","Occupation, ISCO08 - With response: Textile, f...",Condition
regionDE128,region,Region,DE128,Rhein-Neckar-Kreis,Region - With response: Rhein-Neckar-Kreis,
lnghom1SCO,lnghom1,Language most often spoken at home: first ment...,SCO,Scots,Language most often spoken at home: first ment...,Condition
rshipa24,rshipa2,Second person in household: relationship to re...,4,Brother/sister/step/adopted/foster,Second person in household: relationship to re...,
c19spwrk3,c19spwrk,"Speak with people you work with in person, how...",3,About the same,"Speak with people you work with in person, how...",
regionES708,region,Region,ES708,Lanzarote,Region - With response: Lanzarote,Condition


Decision Tree Question Selection

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

In [75]:
Survey_Overview_path = 'ESSQuestionData_withADICO.csv'  # Update this path accordingly

Survey_Overview_ADICO = pd.read_csv(Survey_Overview_path)

Survey_Overview_ADICO.set_index('respid', inplace=True)
Survey_Overview_ADICO.dropna(inplace=True)
# Filter out questions that are categorized as Attributes, Conditions, or Aims for clarity in analysis
Attributes = Survey_Overview_ADICO.loc[Survey_Overview_ADICO['ADICO_Category'].str.contains('Attribute', na=False), ['id', 'question_answers_combined']]
Conditions = Survey_Overview_ADICO.loc[Survey_Overview_ADICO['ADICO_Category'].str.contains('Condition', na=False), ['id', 'question_answers_combined']]
Aims = Survey_Overview_ADICO.loc[Survey_Overview_ADICO['ADICO_Category'].str.contains('Aim', na=False), ['id', 'question_answers_combined']]

chosen_aims = Aims.sample(3) 
chosen_conditions = Conditions.sample(4) 

In [76]:
chosen_aims

Unnamed: 0_level_0,id,question_answers_combined
respid,Unnamed: 1_level_1,Unnamed: 2_level_1
atcherp4,atcherp,How emotionally attached to Europe - With resp...
stpldmi4,stpldmi,Important for democracy: government sticks to ...
jbprtfp3,jbprtfp,Job prevents you from giving time to partner/f...


In [77]:
# { 
#     "alg": " HS512"
#     "typ": "JWT"
# }
numerical_answers = Survey_Responses.iloc[:,10:-34]

# Try converting each column to numeric and coerce errors to NaN
for column in numerical_answers.columns:
    numerical_answers[column] = pd.to_numeric(numerical_answers[column], errors='coerce')

# Drop columns that contain any NaN values (i.e., non-convertible columns)
numerical_answers = numerical_answers.dropna(axis=1, how='any')

for aim in chosen_aims:
    if aim['id'] not in numerical_answers.columns: continue
    aimRow = str(Survey_Overview_ADICO.loc[aim['id']]['question_answers_combined'])
    print(aimRow)
    class_names = Survey_Overview_ADICO['responseoptions']
    feat_names = numerical_answers.loc[:, ~numerical_answers.columns.isin([aim])].columns
    feature_descs = [Survey_Overview_ADICO[Survey_Overview_ADICO.index == feature]['question_answers_combined'].values for feature in feat_names if feature in list(Survey_Overview_ADICO.index)]

    X = numerical_answers.loc[:, ~numerical_answers.columns.isin([aim])].values
    Y = numerical_answers.loc[:, aim].values

    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 100)
    clf_entropy = DecisionTreeClassifier(criterion = 'entropy', random_state=100, max_depth=1, min_samples_leaf=20)
    clf_entropy.fit(X_train,y_train)
    # Visualize the decision tree
    plt.figure(figsize=(25,5))
    plot_tree(clf_entropy, filled=True, feature_names=feature_descs, class_names = [str(item) for item in Survey_Responses[aim].unique()])
    plt.show()
    y_pred_en = clf_entropy.predict(X_test)
    print(("Accuracy is"),accuracy_score(y_test, y_pred_en))

TypeError: string indices must be integers, not 'str'

In [157]:
Survey_Responses.loc[:, 'prtclbhr'].values

array([nan, nan, nan, ..., nan, nan, nan])