In [20]:
from elasticsearch_dsl import connections, Index, Search
from elasticsearch import Elasticsearch
from elasticsearch_dsl.query import Range, Term
from pathlib import Path
import fasttext
from detoxify import Detoxify
import re
import asyncio
import csv

In [21]:
# Connection settings for the Elasticsearch server that holds the crawled statuses.
ELASTIC_HOST = "https://elasticsearch.srv.webis.de"
ELASTIC_PORT = 9200
ELASTIC_USER = "po87xox"
# To keep the password out of the notebook, point this at a file that contains only the Elasticsearch password.
ELASTIC_PASSWORD_FILE = Path("../passwords/webis-elasticsearch.txt").expanduser()
# Index pattern (note the trailing wildcard) that all searches run against.
INDEX = "corpus_mastodon_statuses*"

# Directory that result files are written to.
OUTPUT_PATH = Path("../data")

# Limit the Elastic searches to a specific date range. Crawling started on 2023-12-21.
DATE_AFTER = "2023-12-01T00:00:00"
# Around "2024-01-30T12:00:00" a new version of the fediverse data was gathered.
DATE_BEFORE = "2024-02-22T00:00:00"

In [22]:
# Read the Elasticsearch password (first line of the password file, newline removed)
# and create the client that every search in this notebook goes through.
with ELASTIC_PASSWORD_FILE.open("r") as password_file:
    password = password_file.readline().strip("\n")

es = Elasticsearch(
    [ELASTIC_HOST],
    port=ELASTIC_PORT,
    scheme="https",
    http_auth=(ELASTIC_USER, password),
    timeout=3000,
)

In [23]:
# Range filter on `crawled_at`, bounded (inclusively) by the configured date window.
crawl_window = {
    "gte": DATE_AFTER,
    "lte": DATE_BEFORE,
    "format": "date_hour_minute_second",
}
date_query = Range(crawled_at=crawl_window)
# Show the generated query body for inspection.
date_query.to_dict()

{'range': {'crawled_at': {'gte': '2023-12-01T00:00:00',
   'lte': '2024-02-22T00:00:00',
   'format': 'date_hour_minute_second'}}}

In [24]:
# Base search over every index matching INDEX, restricted to the crawl date window;
# the per-instance searches are derived from it by adding further filters.
base_search: Search = Search(using=es, index=INDEX).filter(date_query)

In [43]:
# Instances to compare, and the value the stored `language` field of a status must have.
instances = ["mastodon.social", "veganism.social", "pawoo.net", "shitposter.club", "social.coop", "scholar.social", "qoto.org"]
language_name = "en"

# One search per instance, in the same order as `instances`; each adds exact-match
# (term) filters on the `instance` and `language` fields to the base search.
filtered_searches: "list[Search]" = [base_search.filter(Term(instance=instance)).filter(Term(language=language_name)) for instance in instances]

In [44]:
# Execute every per-instance search, requesting up to 1000 hits each (`size=1000`).
# The resulting list is index-aligned with `instances`.
# NOTE(review): no explicit sort is set, so which hits come back is left to
# Elasticsearch's default ordering — confirm that this sample is what is intended.
responses = [filtered_search.extra(size=1000).execute() for filtered_search in filtered_searches]

In [27]:
def add_dot_if_no_punctuation(match):
    """Terminate the text of one ``<p>...</p>`` match with ". " when it has no ending.

    Used as the replacement callback for ``re.sub`` in ``clean_text``.

    Args:
        match: Regex match whose group 1 is the inner content of a ``<p>`` element.

    Returns:
        The paragraph re-wrapped in ``<p>...</p>``. The stripped content gets ". "
        appended unless it already ends in one of ``. ! ? ;`` or in a closing
        ``</span>`` / ``</a>`` tag.
    """
    content = match.group(1).strip()
    ends_with_punctuation = re.search(r'[.!?;]$', content)
    ends_with_closing_tag = re.search(r'(<\/span>|<\/a>)$', content)
    if not ends_with_punctuation and not ends_with_closing_tag:
        content += '. '
    return f'<p>{content}</p>'


def clean_text(html):
    """Turn the HTML content of a status into single-line plain text.

    Steps, in order:
      1. Newlines become spaces (the result must stay on one line).
      2. ``<br>`` tags become spaces so the words around a line break stay separate.
      3. Every ``<p>`` body without closing punctuation is terminated with ". ".
      4. A space is inserted between directly adjacent paragraphs if there is none.
      5. ``<a>`` and ``<span>`` elements are removed together with their content;
         all remaining tags are removed but their content is kept.
      6. HTML entities (``&amp;``, ``&#39;``, ...) are decoded.

    Args:
        html: HTML string to clean.

    Returns:
        The cleaned text, free of tags, entities and newline characters.
    """
    # Local import: only `unescape` is needed, and the `html` parameter shadows
    # the name of the stdlib module inside this function.
    from html import unescape

    html = html.replace('\n', ' ')

    # Line breaks carry no text; replacing them with nothing would glue words together.
    html = re.sub(r'<br\s*\/?>', ' ', html)

    # Add dot to the end of <p> content if needed
    html = re.sub(r'<p>(.*?)<\/p>', add_dot_if_no_punctuation, html)

    # Keep consecutive paragraphs apart ("Done.</p><p>Next" must not become "Done.Next").
    # Paragraphs that just received ". " already end in a space and are left alone.
    html = re.sub(r'(?<! )<\/p>\s*(?=<p\b)', '</p> ', html)

    # <a>/<span> elements are dropped including their content, any other tag only
    # loses its markup. The \b keeps tags such as <abbr> or <article> from being
    # mistaken for <a> (which would swallow text up to the next </a>).
    pattern = re.compile(r'<a\b[^>]*>.*?<\/a>|<span\b[^>]*>.*?<\/span>|<[^>]+>')
    cleaned_text = re.sub(pattern, '', html)

    # Decode entities only after tag removal, so escaped markup such as "&lt;b&gt;"
    # survives as literal text. A decoded "&#10;" must not reintroduce a newline.
    cleaned_text = unescape(cleaned_text).replace('\n', ' ')

    return cleaned_text

In [28]:
# Load the fastText language-identification model from a local file; the model
# file must already exist at this relative path.
lang_detector = fasttext.load_model('../models/lid.176.bin')

In [12]:
# Split the hits of the first instance by the language fastText detects in their
# cleaned text: `en` collects hits labelled English, `not_en` everything else.
en, not_en = [], []

for hit in responses[0]:
    cleaned_content = clean_text(hit.content)
    try:
        top_label = lang_detector.predict(cleaned_content)[0][0]
    except ValueError as err:
        # The detector rejected this text; report it and leave the hit out of both lists.
        print(f"Error predicting language for hit: {cleaned_content, hit.content}")
        print(err)
        continue
    target = en if top_label == '__label__en' else not_en
    target.append(hit)


NameError: name 'response' is not defined

In [None]:
# Cross-check: among hits detected as English, collect those whose stored
# `language` field is not 'en'. Hits without that field end up in `error`.
print(len(en))
invalid_en, error = [], []

for candidate in en:
    try:
        stored_language = candidate.language
    except AttributeError:
        error.append(candidate)
        continue
    if stored_language != 'en':
        invalid_en.append(candidate)


print(len(invalid_en))

867
0


In [None]:
# Cross-check: among hits NOT detected as English, collect those whose stored
# `language` field nevertheless says 'en'. Hits without that field end up in `error_2`.
print(len(not_en))
invalid_not_en = []
error_2 = []
for hit in not_en:
    try:
        if hit.language == 'en':
            invalid_not_en.append(hit)
    except AttributeError:
        # Record the hit from `not_en` itself (previously this indexed into `en`,
        # which stored the wrong hit or raised IndexError).
        error_2.append(hit)

# Show one mislabelled example. Index 100 is used when available; shorter lists
# fall back to their last element instead of raising IndexError.
if invalid_not_en:
    sample = invalid_not_en[min(100, len(invalid_not_en) - 1)]
    print(clean_text(sample.content))
    print(sample.language)

print(len(invalid_not_en))

133
Σταϊκούρας: Με τέσσερα νέα, μεγάλα έργα η Ελλάδα αναπτύσσεται, ισχυροποιείται και αναβαθμίζεται  «Τα έργα προχωρούν, η Ελλάδα αναπτύσσεται, ισχυροποιείται και αναβαθμίζεται», τονίζει ο κ. Σταϊκούρας. source. 
en
133


In [29]:
# Instantiate the Detoxify classifier with its 'original' model; its `predict`
# method is what scores the cleaned status texts later in this notebook.
toxic_bert = Detoxify('original')



In [30]:
class Toxicity:
    """Bundle of six toxicity scores that supports summing and scaling.

    Every score defaults to 0, so ``Toxicity()`` is the neutral element for ``+``.
    ``a + b`` adds two bundles score by score; ``a / n`` divides every score by ``n``.
    Both operators return a new ``Toxicity`` and leave their operands untouched.
    """

    # Attribute names, in constructor-argument order.
    _SCORE_NAMES = (
        "toxicity",
        "severe_toxicity",
        "obscene",
        "threat",
        "insult",
        "identity_attack",
    )

    def __init__(self, toxicity=0, severe_toxicity=0, obscene=0, threat=0, insult=0, identity_attack=0):
        values = (toxicity, severe_toxicity, obscene, threat, insult, identity_attack)
        for name, value in zip(self._SCORE_NAMES, values):
            setattr(self, name, value)

    def __add__(self, other):
        """Return the score-wise sum of ``self`` and ``other``."""
        return Toxicity(*(getattr(self, name) + getattr(other, name) for name in self._SCORE_NAMES))

    def __truediv__(self, other):
        """Return a copy with every score divided by the number ``other``."""
        return Toxicity(*(getattr(self, name) / other for name in self._SCORE_NAMES))

    def __str__(self):
        """Render all six scores on one line, each with its label."""
        return (
            f"Toxicity: {self.toxicity}, "
            f"Severe Toxicity: {self.severe_toxicity}, "
            f"Obscene: {self.obscene}, "
            f"Threat: {self.threat}, "
            f"Insult: {self.insult}, "
            f"Identity Attack: {self.identity_attack}"
        )

In [60]:
from concurrent.futures import ThreadPoolExecutor

# Thread pool that runs the blocking `toxic_bert.predict` calls off the event loop.
executor = ThreadPoolExecutor()


async def predict_toxicity(hit):
    """Score a single hit, but only if its cleaned text is detected as English.

    Args:
        hit: Search hit whose `content` attribute holds the status HTML.

    Returns:
        A `(Toxicity, invalids)` tuple. `invalids` is 0 when the hit was scored.
        It is 1, together with an all-zero `Toxicity`, when the hit was skipped:
        the language detector rejected the text, returned no label, or returned
        a label other than '__label__en'.
    """
    cleaned_content = clean_text(hit.content)

    try:
        labels = lang_detector.predict(cleaned_content)[0]
    except ValueError as e:
        # Same handling as in the exploratory language-check cell: report the hit and
        # skip it, so one bad text cannot abort the whole gather() of its instance.
        print(f"Error predicting language for hit: {cleaned_content, hit.content}")
        print(e)
        return Toxicity(), 1

    # An empty label tuple means "no prediction"; treat it like a non-English hit.
    if not labels or labels[0] != '__label__en':
        return Toxicity(), 1

    loop = asyncio.get_running_loop()
    toxic_values = await loop.run_in_executor(executor, toxic_bert.predict, cleaned_content)
    return Toxicity(**toxic_values), 0


async def measure_toxicity(response):
    """Average the toxicity scores over all valid (English) hits of one response.

    Args:
        response: Iterable of hits, e.g. one executed search response.

    Returns:
        A `Toxicity` holding the score-wise mean over the hits that were scored.
        If no hit was scored, the all-zero `Toxicity` is returned unchanged.
    """
    results = await asyncio.gather(*(predict_toxicity(hit) for hit in response))

    toxicity_sum = Toxicity()
    invalids = 0
    for result, inv in results:
        # Skipped hits contribute an all-zero Toxicity, so they do not affect the sum.
        toxicity_sum += result
        invalids += inv

    # One result per hit, so this equals the number of hits minus the skipped ones.
    valid_size = len(results) - invalids

    if valid_size > 0:
        toxicity_sum /= valid_size

    return toxicity_sum

In [61]:
# Measure each instance's response one after the other; the results are index-aligned
# with `responses` (and therefore with `instances`). Uses top-level `await`, which
# works in IPython/Jupyter cells but not in a plain Python script.
toxicity_sums = [await measure_toxicity(response) for response in responses]

In [63]:
# Write one row per instance: its name and the string form of its Toxicity result.
# NOTE(review): the header says 'Toxicity Sum', but `measure_toxicity` returns the
# per-hit average whenever at least one hit was scored. Header and row format are
# kept as-is so existing consumers of this file keep working.
csv_filename = 'toxicity_sums.csv'
# Make sure the output directory exists so the export does not fail on a fresh checkout.
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
with open(OUTPUT_PATH/csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(['Instance', 'Toxicity Sum'])
    for instance, toxicity_sum in zip(instances, toxicity_sums):
        csv_writer.writerow([instance, toxicity_sum])