In [None]:
import ast
import json
import os
import re
import time
from collections import deque

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI

In [None]:
# Cap how many rows / columns pandas renders when a DataFrame is displayed
for option_name, limit in (('display.max_rows', 20), ('display.max_columns', 25)):
    pd.set_option(option_name, limit)

In [None]:
# Load the dataset; every identifier column is read as text
# (presumably so that long numeric ids are not mangled by numeric parsing - TODO confirm)
file = './csv/PensionReform.csv'
id_columns = ["id", "conversation_id", "ref_id", "ref_author_id", "in_reply_to_user_id", "author_id"]
df = pd.read_csv(file, dtype={column: str for column in id_columns})


In [None]:
# Summarise the columns, dtypes and non-null counts of the freshly loaded dataset
df.info()

# Data preprocessing

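Skip this section if the loaded CSV has already been preprocessed, i.e. it no longer contains the `ref_note_tweet` and `public_metrics` columns. The cells below read and then drop those columns, so they fail when the columns are missing.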

In [None]:
def update_text_columns(row):
    """Pick the text to keep for one row.

    For a row whose 'ref_type' is "retweeted", the first non-null value among
    'ref_note_tweet' and 'ref_text' (in that order) is returned. In every other
    case the row's own 'text' is returned unchanged.
    """
    for source_column in ('ref_note_tweet', 'ref_text'):
        candidate = row[source_column]
        if pd.notna(candidate) and row['ref_type'] == "retweeted":
            return candidate
    return row['text']

# Retweet rows take the referenced tweet's text (see update_text_columns); other rows keep their own.
df['text'] = df.apply(update_text_columns, axis=1)

# 'ref_note_tweet' has been folded into 'text' for retweets, so the column is dropped.
# A rename of 'ref_note_tweet' -> 'ref_text' used to follow the drop; it targeted a column that no
# longer existed, which pandas silently ignores, so it never had any effect and has been removed.
# NOTE(review): for replies/quotes 'ref_text' is therefore never replaced by 'ref_note_tweet' -
# confirm that this is the intended behaviour.
df = df.drop(columns=['ref_note_tweet'])

df.info()
df.head()

In [None]:
# Create the engagement-metric columns up front, filled with zeros
metric_columns = [
    'retweet_count',
    'reply_count',
    'like_count',
    'quote_count',
    'bookmark_count',
    'impression_count',
]
for metric_column in metric_columns:
    df[metric_column] = np.int32(0)

df.info()

In [None]:
def extract_metrics(row):
    """Copy the counters stored in 'public_metrics' into their own fields.

    'public_metrics' is expected to hold the textual form of a dict; it is parsed
    with ast.literal_eval and each of the six counters is written onto ``row``
    under the same key. The (mutated) row is returned.
    """
    metrics = ast.literal_eval(row['public_metrics'])
    for metric_name in (
        'retweet_count',
        'reply_count',
        'like_count',
        'quote_count',
        'bookmark_count',
        'impression_count',
    ):
        row[metric_name] = metrics[metric_name]
    return row

# Expand the metrics of every row, then discard the raw 'public_metrics' column
df = df.apply(extract_metrics, axis=1).drop(columns=['public_metrics'])

df.info()
df.head()

# Classification model setup

In [None]:
# Empty result columns for the model output. They are created with object dtype because the
# labels written later are strings; assigning strings into a float64 column relies on pandas'
# silent upcasting, which is deprecated.
df['likert_scale_Q1'] = pd.Series(np.nan, index=df.index, dtype=object)
df['r_tag_Q1'] = pd.Series(np.nan, index=df.index, dtype=object)

In [None]:
# Load variables from a .env file into the environment (python-dotenv)
load_dotenv()

# The API key is read from the environment; fail early when it is missing
api_key = os.environ.get("OPENAI4_API_KEY")
if api_key is None:
    raise ValueError("OPENAI4_API_KEY not found in environment variables")

client = OpenAI(api_key=api_key)

# Classification Function
def openAI_classificator(prompt, message, model="gpt-4.1-mini-2025-04-14"):
    """Send one classification request to the chat completions endpoint.

    Args:
        prompt: System message with the classification instructions.
        message: User message, i.e. the text to classify.
        model: Name of the chat model to query.

    Returns:
        A ``(content, tokens)`` tuple. On success ``content`` is the message content
        of the first choice and ``tokens`` is ``response.usage.total_tokens``.
        If anything raises, the exception is printed and ``(np.nan, requested_tokens)``
        is returned, where ``requested_tokens`` is the integer that follows
        "Requested" in the error message, or 0 when the message has no such number.
    """
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {
                    "role": "system",
                    "content": prompt
                },
                {
                    "role": "user",
                    "content": message
                }
            ],
            # The expected answer is a short "<label>,<flag>" pair, so few tokens are needed
            max_tokens=10,
            temperature=0.0,
            top_p=0.9
        )

        return response.choices[0].message.content, response.usage.total_tokens

    except Exception as e:
        print(e)
        # Needs the `re` module: without it this handler itself raised NameError and
        # masked the original API error.
        match = re.search(r"Requested (\d+)\.", str(e))
        requested_tokens = int(match.group(1)) if match else 0

        return np.nan, requested_tokens

In [None]:
# NOTE: Example context and statement (subject) for the pension reform topic

context = (
    "La reforma pensional en Colombia es un tema recurrente en la agenda política y social"
    " del país debido a la necesidad de abordar la sostenibilidad y la equidad del sistema de"
    " pensiones. El sistema actual combina un régimen de prima media (RPM), administrado por el"
    " Estado a través de Colpensiones, y un régimen de ahorro individual (RAIS), manejado por"
    " fondos privados. Sin embargo, el acceso a una pensión digna es limitado para muchos trabajadores,"
    " especialmente aquellos en la informalidad o con bajos ingresos. La reforma busca ampliar la"
    " cobertura, mejorar la equidad entre los diferentes regímenes y asegurar la sostenibilidad"
    " financiera a largo plazo."
)

subject = "Apoya la reforma pensional en Colombia propuesta por el gobierno de Colombia."

In [None]:
def basic_prompt(context, subject):
    """Build the (Spanish) instruction prompt for classifying a single message.

    Args:
        context: Background text inserted after "Contexto:".
        subject: Statement the message is rated against; it is inserted in quotes.

    Returns:
        The prompt text. It asks for a 1-5 Likert label for the statement plus a
        1/0 flag telling whether the message is related to it, answered as a
        comma-separated pair such as 2,1 or 4,0 with no extra text.
    """
    return f"""
    Contexto: {context}

    Instrucción: Clasifica el siguiente mensaje en la escala de Likert en relación con la siguiente afirmación: \"{subject}\". Tambien indica si el mensaje tiene relacion con la afirmacion o no (1 o 0).
    Las opciones son: 1: 'Completamente en desacuerdo', 2: 'En desacuerdo', 3: 'Ni de acuerdo ni en desacuerdo', 4: 'De acuerdo', 5: 'Completamente de acuerdo'.
    Solo responde con una de las etiquetas mencionadas sin ningún texto adicional y si tiene relacion el texto no. Por ejemplo "2,1" para "En desacuerdo" y "si tiene relacion con el tema" o "4,0" (De acuerdo y no tiene relacion).
    """


In [None]:
def reply_prompt(context, subject, replied_message):
    """Build the (Spanish) instruction prompt for classifying a reply or quote.

    Same instructions as the prompt for standalone messages, with one extra line
    that tells the model which message is being answered.

    Args:
        context: Background text inserted after "Contexto:".
        subject: Statement the message is rated against; it is inserted in quotes.
        replied_message: Text of the message being replied to; it is inserted in quotes.

    Returns:
        The prompt text. It asks for a 1-5 Likert label for the statement plus a
        1/0 flag telling whether the message is related to it, answered as a
        comma-separated pair such as 2,1 or 4,0 with no extra text.
    """
    return f"""
    Contexto: {context}

    Instrucción: Clasifica el siguiente mensaje en la escala de Likert en relación con la siguiente afirmación: \"{subject}\". Tambien indica si el mensaje tiene relacion con la afirmacion o no (1 o 0).
    Las opciones son: 1: 'Completamente en desacuerdo', 2: 'En desacuerdo', 3: 'Ni de acuerdo ni en desacuerdo', 4: 'De acuerdo', 5: 'Completamente de acuerdo'.
    Ten en cuenta tambien que el mensaje al que responde es: \"{replied_message}\"
    Solo responde con una de las etiquetas mencionadas sin ningún texto adicional y si tiene relacion el texto no. Por ejemplo "2,1" para "En desacuerdo" y "si tiene relacion con el tema" o "4,0" (De acuerdo y no tiene relacion).
    """

In [None]:
# Token budget per rolling 60-second window
# NOTE(review): the figure below is documented for gpt-4.1-2025-04-14, while the classification
# calls in this notebook pass other model names - confirm the limit that actually applies.
MAX_TOKENS_PER_MINUTE = 450000 # Rate limit tier 2 -> 450k tokens/minute in model gpt-4.1-2025-04-14

# (timestamp, tokens) of each request, oldest first
token_window = deque()

def clean_token_window():
    """Drop the entries of ``token_window`` that are more than 60 seconds old."""
    current_time = time.time()
    while token_window and (current_time - token_window[0][0]) > 60:
        token_window.popleft()

def wait_if_needed(new_tokens):
    """Block until ``new_tokens`` more tokens fit into the per-minute budget.

    The window is only read and pruned here; a request is accounted for by
    appending ``(time.time(), tokens)`` to ``token_window`` once it has been made.

    Args:
        new_tokens: Number of tokens the upcoming request is expected to use.

    Returns:
        None. Returns immediately when the budget is not exceeded, or when the
        window is empty (waiting cannot free anything in that case, even if
        ``new_tokens`` alone is larger than the budget).
    """
    clean_token_window()
    current_tokens = sum(tokens for _, tokens in token_window)

    if current_tokens + new_tokens <= MAX_TOKENS_PER_MINUTE:
        return

    excess = (current_tokens + new_tokens) - MAX_TOKENS_PER_MINUTE
    print(f"[WAIT] amount of tokens exceded ({excess}). Waiting...")

    # Release the oldest entries one at a time until the request fits. A single wait is
    # not always enough: the oldest entry may hold fewer tokens than the excess.
    while token_window and current_tokens + new_tokens > MAX_TOKENS_PER_MINUTE:
        time_to_wait = 60 - (time.time() - token_window[0][0])
        if time_to_wait > 0:  # time.sleep() rejects negative values
            time.sleep(time_to_wait)
        token_window.popleft()  # the oldest entry has now left the 60-second window
        clean_token_window()
        current_tokens = sum(tokens for _, tokens in token_window)

# Messages classification

In [None]:
# Rows that are not retweets, and the list of their ids
not_retweet = df['ref_type'] != 'retweeted'
df_nrt = df[not_retweet]
nrt_ids = df_nrt['id'].tolist()

In [None]:
def _parse_likert_response(response):
    """Split a "<likert>,<related>" model answer into its two validated string labels."""
    if not isinstance(response, str):
        # openAI_classificator returns NaN instead of text when the API call failed
        raise ValueError(f"no usable model response: {response!r}")

    parts = [part.strip(" \t\r\n\"'") for part in response.split(",")]
    if len(parts) != 2:
        raise ValueError(f"unexpected response format: {response!r}")

    classification, r_tag = parts
    if classification not in {"1", "2", "3", "4", "5"} or r_tag not in {"0", "1"}:
        raise ValueError(f"unexpected labels in response: {response!r}")
    return classification, r_tag


def classify_message(context, subject, ids, class_type=2, model="gpt-4o-2024-08-06", estimated_tokens=850):
    """Classify the messages with the given ids and store the labels in the global ``df``.

    For each id the first matching row of ``df`` is sent to the model and the answer is
    written to 'likert_scale_Q1' and 'r_tag_Q1' for every row with that id. When anything
    goes wrong for an id (lookup, API call, malformed answer), the error is printed and
    both columns are set to "error" for that id; processing continues with the next id.

    Args:
        context: Background text for the prompt.
        subject: Statement the messages are rated against.
        ids: Values of the 'id' column to classify.
        class_type: 1 classifies every message with the basic prompt. 2 uses the reply
            prompt (which includes 'ref_text') for rows that have a 'ref_type' and a
            non-null 'ref_text', and the basic prompt otherwise.
        model: Chat model passed to ``openAI_classificator``.
        estimated_tokens: Tokens reserved per request when checking the rate limit.

    Raises:
        ValueError: If ``class_type`` is neither 1 nor 2.
    """
    if class_type not in (1, 2):
        raise ValueError(f"class_type must be 1 or 2, got {class_type!r}")

    # Labels are stored as strings, so the result columns must be able to hold objects.
    for column in ('likert_scale_Q1', 'r_tag_Q1'):
        if column in df.columns:
            df[column] = df[column].astype(object)
        else:
            df[column] = pd.Series(index=df.index, dtype=object)

    for i, message_id in enumerate(ids):
        if i % 30 == 0:
            print(f"Processing {i} of {len(ids)}")

        is_current = df["id"] == message_id

        try:
            row = df[is_current].iloc[0]
            text = row['text']

            # Replies and quotes are classified together with the message they refer to,
            # provided that message's text is available.
            if class_type == 2 and pd.notna(row['ref_type']) and pd.notna(row['ref_text']):
                prompt = reply_prompt(context, subject, row['ref_text'])
            else:
                prompt = basic_prompt(context, subject)

            wait_if_needed(estimated_tokens)
            response, used_tokens = openAI_classificator(prompt, text, model=model)

            # Feed the rate limiter; without this the token window stays empty forever
            if used_tokens:
                token_window.append((time.time(), used_tokens))

            classification, r_tag = _parse_likert_response(response)

        except Exception as e:
            print(f"Error processing id {message_id}: {e}")
            classification, r_tag = "error", "error"

        # Write through df.loc: `row` is only a copy, assigning to it would not reach df
        df.loc[is_current, 'likert_scale_Q1'] = classification
        df.loc[is_current, 'r_tag_Q1'] = r_tag

In [None]:
# Step 1: Classify every message that is not a retweet
nrt_ids = df_nrt['id'].tolist()

classify_message(context=context, subject=subject, ids=nrt_ids)

In [None]:
# Step 2: Copy the labels of the classified messages onto their retweets, so that retweets of
# messages present in the dataset do not have to be classified again.
is_retweet = df['ref_type'] == 'retweeted'

# One row of labels per classified id; on duplicate ids the first occurrence is used
source_labels = (
    df.loc[df['id'].isin(nrt_ids), ['id', 'likert_scale_Q1', 'r_tag_Q1']]
    .drop_duplicates(subset='id')
    .set_index('id')
)

# Retweets whose referenced message is one of the classified messages
has_source = is_retweet & df['ref_id'].isin(source_labels.index)
print(f"Number of retweets already classified: {has_source.sum()}")

# Look the labels up by referenced id in one pass instead of scanning the frame once per id
source_of_retweet = df.loc[has_source, 'ref_id']
for column in ('likert_scale_Q1', 'r_tag_Q1'):
    df.loc[has_source, column] = source_of_retweet.map(source_labels[column])

In [None]:
# Step 3: Collect the referenced ids whose retweets still have no classification
unique_ref_ids = df['ref_id'].dropna().unique()

retweet_rows = df[df['ref_type'] == 'retweeted']

ref_dif = []    # referenced ids that still need a classification
ref_count = []  # number of retweets behind each of those ids

for ref_id in unique_ref_ids:
    matches = retweet_rows[retweet_rows['ref_id'] == ref_id]
    if matches.empty:
        continue

    # The first retweet stands in for the whole group
    if pd.isna(matches.iloc[0].likert_scale_Q1):
        ref_count.append(len(matches))
        ref_dif.append(matches.iloc[0].ref_id)

print(f"Number of retweets without classification: {sum(ref_count)}")

In [None]:
# Step 4: Classify the unique retweets that have not been classified yet
# and assign the classification to the retweets
print(f"Tweets needed to classify: {len(ref_dif)}. Classifying...")

# The prompt does not depend on the message, so it is built once
retweet_prompt = basic_prompt(context, subject)

for i, ref_id in enumerate(ref_dif):
    if i % 30 == 0:
        print(f"Processing {i} of {len(ref_dif)}")

    is_pending = (df['ref_id'] == ref_id) & (df['ref_type'] == 'retweeted')
    text = df.loc[is_pending, 'text'].iloc[0]

    try:
        # Same rate limiting and token accounting as for the non-retweet messages
        wait_if_needed(850)
        response, used_tokens = openAI_classificator(retweet_prompt, text, model="gpt-4o-2024-08-06")
        if used_tokens:
            token_window.append((time.time(), used_tokens))

        # A failed API call yields NaN instead of text; that (or a malformed answer)
        # raises here and is recorded as an error instead of aborting the whole loop.
        classification, r_tag = (part.strip() for part in response.split(","))

    except Exception as e:
        print(f"Error processing ref_id {ref_id}: {e}")
        classification, r_tag = "error", "error"

    df.loc[is_pending, 'likert_scale_Q1'] = classification
    df.loc[is_pending, 'r_tag_Q1'] = r_tag

In [None]:
# Check the columns and non-null counts after classification
df.info()

In [None]:
# Write the classified dataset to disk, without the index column.
# NOTE(review): save_path is the same file that is loaded at the top of the notebook, so the
# input CSV is overwritten with the processed frame - confirm this is intended.
save_path = './csv/PensionReform.csv'
df.to_csv(save_path, index=False)
# NOTE: ORIGINAL MESSAGES HAVE BEEN DELETED IN THE SHARED DATASETS TO AVOID data-exposure risks, and all author IDs are anonymized.

In [None]:
# Number of rows per value of 'likert_scale_Q1' (missing values are not counted)
df['likert_scale_Q1'].value_counts()