# Access the MentalRiskEs data and interact with the server

This notebook has been developed by the [SINAI](https://sinai.ujaen.es/) research group for its usage in the [MentalRiskES](https://sites.google.com/view/mentalriskes2025/) evaluation campaign at IberLEF 2025.

**NOTE 1**: Please visit the [MentalRiskES competition website](https://sites.google.com/view/mentalriskes2025/evaluation) to read the instructions about how to download the data and interact with the server to send the predictions of your system.

**NOTE 2**: Along the code, please replace "URL" by the URL server and "TOKEN" by your personal token.

Remember this is a support to help you to develop your own system of communication with our server. We recommend you to download it as a Python script instead of working directly on colab and adapt the code to your needs.

# Install CodeCarbon package
Read the [documentation](https://mlco2.github.io/codecarbon/) about the library if necessary. Remember that we provide a [CodeCarbon notebook](https://colab.research.google.com/drive/1boavnGOir0urui8qktbZaOmOV2pS5cn6?usp=sharing) with the example in its specific use in our competition.


In [6]:
# !pip install codecarbon
# !pip install dotenv

Collecting dotenv
  Downloading dotenv-0.9.9-py2.py3-none-any.whl (1.9 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv, dotenv
Successfully installed dotenv-0.9.9 python-dotenv-1.1.0



[notice] A new release of pip is available: 23.0.1 -> 25.0.1
[notice] To update, run: C:\Users\maxi.rodriguez\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


# Import libraries

In [1]:
import requests, zipfile, io
from requests.adapters import HTTPAdapter, Retry
from typing import List, Dict
import random
import json
import os
from dotenv import load_dotenv
import pandas as pd
from codecarbon import EmissionsTracker

# Endpoints
These URL addresses are necessary for the connection to the server.

**IMPORTANT:** Replace "URL" by the URL server and "TOKEN" by your user token.

In [2]:
load_dotenv()
URL = os.getenv("SERVER_URL")
TOKEN = os.getenv("ACCESS_TOKEN")
print(URL, TOKEN)

# Download endpoints
ENDPOINT_DOWNLOAD_TRIAL = URL+"/{TASK}/download_trial/{TOKEN}"
ENDPOINT_DOWNLOAD_TRAIN = URL+"/{TASK}/download_train/{TOKEN}"

# Trial endpoints
ENDPOINT_GET_MESSAGES_TRIAL = URL+"/{TASK}/getmessages_trial/{TOKEN}"
ENDPOINT_SUBMIT_DECISIONS_TRIAL = URL+"/{TASK}/submit_trial/{TOKEN}/{RUN}"

# Test endpoints
ENDPOINT_GET_MESSAGES = URL+"/{TASK}/getmessages/{TOKEN}"
ENDPOINT_SUBMIT_DECISIONS = URL+"/{TASK}/submit/{TOKEN}/{RUN}"

http://s3-ceatic.ujaen.es:8036 c461869975ffb0a7ba8544ffdddf3b58


# Download Data
To download the data, you can make use of the **function defined in the following**.

The following function download the trial data. To adapt it to download the train and test data, follow the instructions given in the [website of the competition](https://sites.google.com/view/mentalriskes2024/evaluation).

In [3]:
def download_messages_trial(task: str, token: str):
    """ Allows you to download the trial data of the task.
        Args:
          task (str): task from which the data is to be retrieved
          token (str): authentication token
    """

    response = requests.get(ENDPOINT_DOWNLOAD_TRIAL.format(TASK=task, TOKEN=token))

    if response.status_code != 200:
        print("Trial - Status Code " + task + ": " + str(response.status_code) + " - Error: " + str(response.text))
    else:
      z = zipfile.ZipFile(io.BytesIO(response.content))
      os.makedirs("./data/{task}/trial/".format(task=task))
      z.extractall("./data/{task}/trial/".format(task=task))

In [4]:
def download_messages_train(task: str, token: str):
    """ Allows you to download the train data of the task.
        Args:
          task (str): task from which the data is to be retrieved
          token (str): authentication token
    """
    response = requests.get(ENDPOINT_DOWNLOAD_TRAIN.format(TASK=task, TOKEN=token))

    if response.status_code != 200:
        print("Train - Status Code " + task + ": " + str(response.status_code) + " - Error: " + str(response.text))
    else:
      z = zipfile.ZipFile(io.BytesIO(response.content))
      os.makedirs("./data/{task}/train/".format(task=task),exist_ok=True)
      z.extractall("./data/{task}/train/".format(task=task))

In [None]:
# Required basic imports
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json
import re
import os
import string
import joblib
from pathlib import Path

# Imports for tokenizing and vectorizing data
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Imports for vectorized data pre-processing and classification models
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
import unicodedata

def clear_tokens(tokens_list: list) -> None:
    """ Removes punctuation symbols from the given tokens list """
    # Traverse the list backwards to avoid logic problems with pop() and indexes order
    for i in range(len(tokens_list) - 1, -1, -1):
        if tokens_list[i] in string.punctuation:
            tokens_list.pop(i)
    return

def remove_emojis(keyword):
    return re.sub(r"[^\w\s,!?@#áéíóúÁÉÍÓÚñÑ]", "", keyword)


def user_document_to_input(messages_list, trained_vectorizer):
    tokenizer = TweetTokenizer()
    
    # Remove emojis and tokenize
    tokens = tokenizer.tokenize(remove_emojis(" ".join(messages_list)))
    clear_tokens(tokens)
    
    # Join tokens into a single document string
    user_document = " ".join(tokens)
    
    X = trained_vectorizer.transform([user_document])
    return pd.DataFrame(X.toarray(), columns=trained_vectorizer.get_feature_names_out())


# load the final models
# --- tfidf vectorizer:
vectorizer_tfidf_task1 = joblib.load("trained_models/tfidf_vectorizer_task1_emojis.joblib")
vectorizer_tfidf_task2 = joblib.load("trained_models/tfidf_vectorizer_task2_emojis.joblib")

# --- random forest baseline task 2
model_rf_task2 = joblib.load("trained_models/RF_task2_emojis.joblib")

# --- svm baseline task 1
model_svm_task1 = joblib.load("trained_models/SVM_task1_emojis.joblib")

# --- dedicate svms task 1
topics = ['betting', 'trading', 'onlinegaming', 'lootboxes']
svm_models_task1 = {
    k: joblib.load(f"trained_models/SVM_task1_{k}.joblib") for k in topics
}
class Pipeline:
    def predict(self, user_input):
        rf_pred = model_rf_task2.predict(user_input)
        return svm_models_task1[rf_pred[0]].predict(user_input)

# Client Server
This class simulates communication with our server. The following code established the conection with the server client and simulate the GET and POST requests.

**IMPORTANT NOTE:** Please pay attention to the basic functions and remember that it is only a base for your system.

In [None]:
class Client_task1_2:
    """ Client communicating with the official server.
        Attributes:
            token (str): authentication token
            number_of_runs (int): number of systems. Must be 3 in order to advance to the next round.
            tracker (EmissionsTracker): object to calculate the carbon footprint in prediction

    """
    def __init__(self, task:str, token: str, number_of_runs: int, tracker: EmissionsTracker):
        self.task = task
        self.token = token
        self.number_of_runs = number_of_runs
        self.tracker = tracker
        self.relevant_cols = ['duration', 'emissions', 'cpu_energy', 'gpu_energy',
                              'ram_energy','energy_consumed', 'cpu_count', 'gpu_count',
                              'cpu_model', 'gpu_model', 'ram_total_size','country_iso_code']


    def get_messages(self, retries: int, backoff: float) -> Dict:
        """ Allows you to download the test data of the task by rounds.
            Here a GET request is sent to the server to extract the data.
            Args:
              retries (int): number of calls on the server connection
              backoff (float): time between retries
        """
        session = requests.Session()
        retries = Retry(
                        total = retries,
                        backoff_factor = backoff,
                        status_forcelist = [500, 502, 503, 504]
                        )
        session.mount('https://', HTTPAdapter(max_retries=retries))

        response = session.get(ENDPOINT_GET_MESSAGES_TRIAL.format(TASK=self.task, TOKEN=self.token)) # ENDPOINT

        if response.status_code != 200:
          print("GET - Task {} - Status Code {} - Error: {}".format(self.task, str(response.status_code), str(response.text)))
          return []
        else:
          return json.loads(response.content)

    def submit_decission(self, messages: List[Dict], emissions: Dict, retries: int, backoff: float):
        """ Allows you to submit the decisions of the task by rounds.
            The POST requests are sent to the server to send predictions and carbon emission data
            Args:
              messages (List[Dict]): Message set of the current round
              emissions (Dict): carbon footprint generated in the prediction
              retries (int): number of calls on the server connection
              backoff (float): time between retries
        """
        decisions_run0 = {}
        decisions_run1 = {}
        decisions_run2 = {}
        type_addiction_list = ["betting", "onlinegaming", "betting", "trading"]
        type_addiction_decision = {}

        # You must create the appropriate structure to send the predictions according to each task
        for message in messages:
            decisions_run0[message["nick"]] = messages["run0_svm"] # random.choice([0,1])
            decisions_run1[message["nick"]] = messages["run1_bert"] # random.choice([0,1])
            decisions_run2[message["nick"]] = messages["run2_pipeline"] # random.choice([0,1])
            type_addiction_decision[message["nick"]] = messages["type_addiction"] # random.choice(type_addiction_list)

        data1_run0 = {
            "predictions": decisions_run0, 
            "emissions": emissions
        }
        data1_run1 = {
            "predictions": decisions_run1,
            "emissions": emissions
        }
        data1_run2 = {
            "predictions": decisions_run2,
            "emissions": emissions
        }
        data2_run0 = {
            "predictions": decisions_run0,
            "types":type_addiction_decision,
            "emissions": emissions
        }
        data2_run1 = {
            "predictions": decisions_run1,
            "types":type_addiction_decision,
            "emissions": emissions
        }
        data2_run2 = {
            "predictions": decisions_run2,
            "types":type_addiction_decision,
            "emissions": emissions
        }

        data1 = []
        data1.append(json.dumps(data1_run0))
        data1.append(json.dumps(data1_run1))
        data1.append(json.dumps(data1_run2))

        data2 = []
        data2.append(json.dumps(data2_run0))
        data2.append(json.dumps(data2_run1))
        data2.append(json.dumps(data2_run2))

        # Session to POST request
        session = requests.Session()
        retries = Retry(
                        total = retries,
                        backoff_factor = backoff,
                        status_forcelist = [500, 502, 503, 504]
                        )
        session.mount('https://', HTTPAdapter(max_retries=retries))

        for run in range(0, self.number_of_runs):
            # For each run, new decisions
            response1 = session.post(ENDPOINT_SUBMIT_DECISIONS_TRIAL.format(TASK='task1', TOKEN=self.token, RUN=run), json=[data1[run]]) # ENDPOINT
            if response1.status_code != 200:
                print("POST - Task1 - Status Code {} - Error: {}".format(str(response1.status_code), str(response1.text)))
                return
            else:
                print("POST - Task1 - run {} - Message: {}".format(run, str(response1.text)))

            response2 = session.post(ENDPOINT_SUBMIT_DECISIONS_TRIAL.format(TASK='task2', TOKEN=self.token, RUN=run), json=[data2[run]]) # ENDPOINT
            if response2.status_code != 200:
                print("POST - Task2 - Status Code {} - Error: {}".format(str(response2.status_code), str(response2.text)))
                return
            else:
                print("POST - Task2 - run {} - Message: {}".format(run, str(response2.text)))

            with open('./data/preds/task1/round{}_run{}.json'.format(messages[0]["round"], run), 'w+', encoding='utf8') as json_file:
                json.dump(data1[run], json_file, ensure_ascii=False)
            with open('./data/preds/task2/round{}_run{}.json'.format(messages[0]["round"], run), 'w+', encoding='utf8') as json_file:
                json.dump(data2[run], json_file, ensure_ascii=False)


    def run_task1_2(self, retries: int, backoff: float):
        """ Main thread
            Args:
              retries (int): number of calls on the server connection
              backoff (float): time between retries
        """
        # Get messages for task1_2
        messages = self.get_messages(retries, backoff)

        # If there are no messages
        if len(messages) == 0:
            print("All rounds processed")
            return

        # list of messages per user
        user_messages = {m["nick"]: [m["message"]] for m in messages}
        round_count = 1
        submitted = False

        while len(messages) > 0:
            print(messages)
            print("----------------------- Processing round {}".format(messages[0]["round"]))
            # Save subjects
            with open('./data/rounds/round{}.json'.format(messages[0]["round"]), 'w+', encoding='utf8') as json_file:
                json.dump(messages, json_file, ensure_ascii=False)


            if round_count > 49:
                # Calculate emissions for each prediction
                self.tracker.start()

                user_predictions = []
                for user_id, user_messages_list in user_messages.items():
                    user_input = user_document_to_input(user_messages_list, vectorizer_tfidf_task2)
                    run0_svm = int(model_svm_task1.predict(user_input)[0])
                    run1_bert = 
                    run2_pipeline = int(Pipeline().predict(user_input)[0])
                    type_addiction = str(model_rf_task2.predict(user_input)[0])

                    user_predictions.append(
                        {
                            "round": messages[0]["round"],
                            "nick": user_id,
                            "run0_svm": run0_svm,
                            "run1_bert": run1_bert,
                            "run2_pipeline": run2_pipeline,
                            "type_addiction": type_addiction,
                        }
                    )
                emissions = self.tracker.stop()

                df = pd.read_csv("emissions.csv")
                measurements = df.iloc[-1][self.relevant_cols].to_dict()

                # self.submit_decission(user_predictions, measurements, retries, backoff)
                submitted = True
                break

            # One GET request for each round
            messages = self.get_messages(retries, backoff)
            round_count += 1

            # accumulate next message
            for message in messages:
                user_messages[message["nick"]].append(message["message"])

        if not submitted:
            self.tracker.start()
           
            user_predictions = []
            for user_id, user_messages_list in user_messages.items():
                user_input = user_document_to_input(user_messages_list, vectorizer_tfidf_task2)
                run0_svm = int(model_svm_task1.predict(user_input)[0])
                run1_bert = 
                run2_pipeline = int(Pipeline().predict(user_input)[0])
                type_addiction = str(model_rf_task2.predict(user_input)[0])

                user_predictions.append(
                    {
                        "round": messages[0]["round"],
                        "nick": user_id,
                        "run0_svm": run0_svm,
                        "run1_bert": run1_bert,
                        "run2_pipeline": run2_pipeline,
                        "type_addiction": type_addiction,
                    }
                )

            emissions = self.tracker.stop()
            df = pd.read_csv("emissions.csv")
            measurements = df.iloc[-1][self.relevant_cols].to_dict()
            # self.submit_decission(user_predictions, measurements, retries, backoff)
            submitted = True

        print("All rounds processed")

# Main

In [22]:
def download_data(task: str, token: str):
    download_messages_trial(task, token)
    # download_messages_train(task, token)

def get_post_data(task: str, token: str):
    # Emissions Tracker Config
    config = {
        "save_to_file": True,
        "log_level": "WARNING",
        "tracking_mode": "process",
        "output_dir": ".",
        "allow_multiple_runs": True
    }
    tracker = EmissionsTracker(**config)

    number_runs = 3 # Max: 3

    # Prediction period
    client_task1_2 = Client_task1_2(task, token, number_runs, tracker)
    client_task1_2.run_task1_2(5, 0.1)

Be careful! In this specific example we use the name of the task1 to do the get, knowing that it is the same data for both task 1 and task 2. In addition, the data upload is performed for both tasks.

In [27]:
if __name__ == '__main__':
    # download_data("task2", TOKEN)
    get_post_data("task1",TOKEN)

 Windows OS detected: Please install Intel Power Gadget to measure CPU



[{'id_message': 123, 'round': 1, 'nick': 'subject1', 'message': '...', 'date': '...', 'platform': 'reddit'}, {'id_message': 134, 'round': 1, 'nick': 'subject10', 'message': '...', 'date': '...', 'platform': 'telegram'}, {'id_message': 134, 'round': 1, 'nick': 'subject103', 'message': '...', 'date': '...', 'platform': 'telegram'}, {'id_message': 134, 'round': 1, 'nick': 'subject101', 'message': '...', 'date': '...', 'platform': 'telegram'}]
----------------------- Processing round 1


FileNotFoundError: [Errno 2] No such file or directory: './data/rounds/round1.json'