In [64]:
from elasticsearch_dsl import Search
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
from elasticsearch_dsl.query import Range, Term
from pathlib import Path
import csv
import json

In [84]:
def filter_posts(input_json, high_threshold=0.7, mid_threshold=0.3, limit=50, attribute='toxicity', output_path=None):
    """Pick up to ``limit`` posts per score band and write them to a CSV file.

    Each post's score is read from ``post['toxicity'][attribute]`` and mapped
    to a level:

    * ``1``  -- score > ``high_threshold``
    * ``0``  -- ``mid_threshold`` < score <= ``high_threshold``
    * ``-1`` -- score <= ``mid_threshold``

    Posts are visited in input order. Once a band already holds ``limit``
    posts, further posts of that band are skipped; iteration stops as soon as
    all three bands are full. A score for which none of the comparisons holds
    (e.g. NaN) is skipped.

    Args:
        input_json: Iterable of dicts, each with an ``'id'`` entry and a
            ``'toxicity'`` mapping from attribute name to score.
        high_threshold: Scores strictly above this value get level ``1``.
        mid_threshold: Scores at or below this value get level ``-1``.
        limit: Maximum number of posts kept per level.
        attribute: Key looked up inside ``post['toxicity']``; also the name of
            the score column in the CSV.
        output_path: Destination of the CSV file. When ``None`` (the default)
            the file is written to
            ``../../data/output/filtered_{attribute}_posts.csv``.
            Missing parent directories are created.

    Returns:
        list: Ids of the selected posts -- level ``1`` posts first, then level
        ``0``, then level ``-1``, each group in input order.

    Raises:
        KeyError: If a visited post has no ``'toxicity'`` entry or no score for
            ``attribute``, or a selected post has no ``'id'``.
    """
    if output_path is None:
        output_path = f'../../data/output/filtered_{attribute}_posts.csv'
    output_path = Path(output_path)

    high_posts = []
    mid_posts = []
    low_posts = []

    for post in input_json:
        toxicity_score = post['toxicity'][attribute]

        # Decide which band the score belongs to (None: no comparison holds).
        if toxicity_score > high_threshold:
            bucket, level = high_posts, 1
        elif toxicity_score > mid_threshold:
            bucket, level = mid_posts, 0
        elif toxicity_score <= mid_threshold:
            bucket, level = low_posts, -1
        else:
            bucket, level = None, None

        # A band that is already full silently drops further posts.
        if bucket is not None and len(bucket) < limit:
            bucket.append({'id': post['id'], 'level': level, attribute: toxicity_score})

        # Stop early once every band has enough posts.
        if min(len(high_posts), len(mid_posts), len(low_posts)) >= limit:
            break

    combined_posts = high_posts + mid_posts + low_posts

    # Make sure the target directory exists so the write cannot fail on it.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with output_path.open(mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=['id', 'level', attribute])
        writer.writeheader()
        writer.writerows(combined_posts)

    return [post['id'] for post in combined_posts]


In [85]:
# Load the scored posts from the results file.
with open('/mnt/ceph/storage/data-tmp/2024/po87xox/thesis-klueber/data/output/toxicity_results.json', 'r') as results_file:
    data = json.load(results_file)

# One list of selected post ids per attribute, in the order listed below.
# Each call also writes its own filtered_<attribute>_posts.csv file.
ids = [
    filter_posts(data, attribute=attribute_name)
    for attribute_name in (
        "severe_toxicity",
        "obscene",
        "threat",
        "identity_attack",
        "toxicity",
        "insult",
    )
]

In [86]:
# Ids selected for the plain "toxicity" attribute; these drive the text lookup below.
ids_toxicity = filter_posts(input_json=data, attribute="toxicity")

In [67]:
# Connection settings for the Elasticsearch client created below.
ELASTIC_HOST = "https://elasticsearch.srv.webis.de"
ELASTIC_PORT = 9200
ELASTIC_USER = "po87xox"
# To keep the password out of the notebook, point this at a file that contains only the Elastic password.
ELASTIC_PASSWORD_FILE = Path("../../data/passwords/webis-elasticsearch.txt").expanduser()
# Index (pattern) that the searches are run against.
INDEX = "corpus_mastodon_statuses*"

# Output directory. NOTE(review): the cells shown here spell out "../../data/output/..." literally instead of using it.
OUTPUT_PATH = Path("../../data/output/")

# Limit the Elastic searches to a specific date range. Crawling started on 2023-12-21.
DATE_AFTER = "2023-12-01T00:00:00"
# Around "2024-01-30T12:00:00" a new version of the fediverse data was gathered.
DATE_BEFORE = "2024-02-22T00:00:00"

In [68]:
# Connect to Elastic. The password is the first line of the password file,
# with newline characters stripped from both ends.
with ELASTIC_PASSWORD_FILE.open("r") as password_file:
    password = password_file.readline().strip("\n")

connection_options = {
    "port": ELASTIC_PORT,
    "http_auth": (ELASTIC_USER, password),
    "timeout": 3000,
    "scheme": "https",
}
es = Elasticsearch([ELASTIC_HOST], **connection_options)

In [69]:
# Range filter on `crawled_at` covering the configured DATE_AFTER..DATE_BEFORE window (both ends inclusive).
crawl_window = {
    "gte": DATE_AFTER,
    "lte": DATE_BEFORE,
    "format": "date_hour_minute_second",
}
date_query = Range(crawled_at=crawl_window)
# Display the query body that will be sent.
date_query.to_dict()

{'range': {'crawled_at': {'gte': '2023-12-01T00:00:00',
   'lte': '2024-02-22T00:00:00',
   'format': 'date_hour_minute_second'}}}

In [70]:
from itertools import chain

# `ids` starts out as one list of post ids per attribute. Merge those lists
# into a single list of unique ids (order is not preserved by the set).
#
# The guard keeps the cell safe to re-run: once `ids` is already flat,
# flattening it again would iterate over the ids themselves (splitting string
# ids into characters, or raising TypeError for non-iterable ids).
if any(isinstance(entry, list) for entry in ids):
    ids = list(set(chain.from_iterable(ids)))

In [87]:
# Base search over the configured index, restricted by the date filter.
base_search: Search = Search(using=es, index=INDEX).filter(date_query)

# Look up the posts selected for "toxicity" by document id and fetch only
# their `content` field; `size` caps the number of hits returned.
toxicity_search = (
    base_search
    .query("terms", _id=ids_toxicity)
    .source(["content"])
    .extra(size=1000)
)
response = toxicity_search.execute()


In [88]:
import pandas as pd
from resiliparse.extract.html2text import extract_plain_text

def extract_text_from_hit(hit):
    """Run resiliparse's ``extract_plain_text`` on a hit's ``_source.content``.

    Args:
        hit: A search hit that supports ``hit['_source']['content']``.

    Returns:
        Whatever ``extract_plain_text`` returns for that content, called with
        main-content extraction on and alt texts / formatting off.
    """
    raw_content = hit['_source']['content']
    extraction_options = {
        "main_content": True,
        "alt_texts": False,
        "preserve_formatting": False,
    }
    return extract_plain_text(raw_content, **extraction_options)

# One record per search hit: the document id and its extracted plain text.
# NOTE: this rebinds `data`, which held the loaded JSON results before this cell.
data = [
    {"id": hit['_id'], "content": extract_text_from_hit(hit)}
    for hit in response['hits']['hits']
]

# Persist the id/text pairs.
df = pd.DataFrame(data)
df.to_csv("../../data/output/texts.csv", index=False)

In [102]:
import pandas as pd

# Model-side table (id, level, toxicity score) and the annotated table.
first_csv = pd.read_csv('../../data/output/filtered_toxicity_posts.csv')
second_csv = pd.read_csv('../../data/annotated.csv')

# Annotation labels are encoded on the same -1 / 0 / 1 scale as `level`.
# Labels missing from the mapping become NaN.
label_mapping = {
    "Non Toxic": -1,
    "Toxic": 1,
    "Not sure": 0
}

# Keep only posts present in both tables, then encode the label column.
merged_df = first_csv.merge(second_csv, on='id')
merged_df = merged_df.assign(label=merged_df['label'].map(label_mapping))

result_df = merged_df.loc[:, ['id', 'level', 'label', 'toxicity', 'content']]

# Save the result to a new CSV file
result_df.to_csv('../../data/output/toxicity_test.csv', index=False)


In [104]:
import pandas as pd

df = pd.read_csv('../../data/output/toxicity_test.csv')

# Split the rows by annotated label: -1 = non toxic, 0 = not sure, 1 = toxic.
df_non_toxic = df.loc[df['label'].eq(-1)]
df_not_sure = df.loc[df['label'].eq(0)]
df_toxic = df.loc[df['label'].eq(1)]

# Per label group: how many rows have a predicted level equal to the label.
correct_predictions_non_toxic = df_non_toxic['label'].eq(df_non_toxic['level']).sum()
correct_predictions_not_sure = df_not_sure['label'].eq(df_not_sure['level']).sum()
correct_predictions_toxic = df_toxic['label'].eq(df_toxic['level']).sum()

# Share of matching rows within each label group.
accuracy_non_toxic = correct_predictions_non_toxic / len(df_non_toxic)
accuracy_not_sure = correct_predictions_not_sure / len(df_not_sure)
accuracy_toxic = correct_predictions_toxic / len(df_toxic)

print(f"Die Genauigkeit der Vorhersagen für Non Toxic beträgt: {accuracy_non_toxic:.2f}")
print(f"Die Genauigkeit der Vorhersagen für Not sure beträgt: {accuracy_not_sure:.2f}")
print(f"Die Genauigkeit der Vorhersagen für Toxic beträgt: {accuracy_toxic:.2f}")

print(f"Anzahl non toxic: {len(df_non_toxic)}")
print(f"Anzahl not sure: {len(df_not_sure)}")
print(f"Anzahl toxic: {len(df_toxic)}")

# Binary view: keep only rows where both the label and the level are -1 or 1.
decided_values = [-1, 1]
df_binomial = df.loc[df['label'].isin(decided_values) & df['level'].isin(decided_values)]

correct_predictions = df_binomial['label'].eq(df_binomial['level']).sum()
accuracy = correct_predictions / len(df_binomial)

print(f"Die Genauigkeit der Vorhersagen beträgt: {accuracy:.2f}")

# Contents of the posts whose level disagrees with the label.
df_binomial.loc[df_binomial['label'].ne(df_binomial['level']), 'content']



Die Genauigkeit der Vorhersagen für Non Toxic beträgt: 0.60
Die Genauigkeit der Vorhersagen für Not sure beträgt: 0.52
Die Genauigkeit der Vorhersagen für Toxic beträgt: 0.59
Anzahl non toxic: 72
Anzahl not sure: 23
Anzahl toxic: 54
Die Genauigkeit der Vorhersagen beträgt: 0.85


1     The kids are watching Sing. I’m peeling the sp...
2     Every #caturday, you guys make me wish my apar...
12    F**k. I cracked my iPad Air's screen. I have A...
17    toroids are a wunnerful thang #AmateurRadio ⚡️...
20    me, writing a chapter of my book in bright ora...
21    Like I’m genuinely having fucking heart palpit...
26        how much does a fucking cauliflower weigh lol
29                                             hey #COG
32    I'm watching an episode of World Wide Wrestlin...
36                   take a shower you stinky (me @ me)
40    @cas Wow!!! Fuck yeah!!! I’ve got 3 or 4 or 5 ...
44    I’m sick of all this damn plastic. What do y’a...
46              God TV on the Radio are so fucking good
Name: content, dtype: object

In [18]:
import pandas as pd

# Model-side table (id, level, toxicity score) and the binary annotations.
first_csv = pd.read_csv('../../data/output/filtered_toxicity_posts.csv')
second_csv = pd.read_csv('../../data/annotated_binary.csv')

# Binary encoding of the annotation labels; labels missing here become NaN.
label_mapping = {
    "Non Toxic": 0,
    "Toxic": 1,
}

# Join on post id, encode the labels, and binarize the score:
# 1 when the score is at least 0.5, otherwise 0.
merged_df = first_csv.merge(second_csv, on='id').assign(
    label=lambda frame: frame['label'].map(label_mapping),
    toxicity=lambda frame: frame['toxicity'].ge(0.5).astype(int),
)

result_df = merged_df.loc[:, ['id', 'label', 'toxicity', 'content']]

# Save the result to a new CSV file
result_df.to_csv('../../data/output/toxicity_test_binary.csv', index=False)

In [19]:
df = pd.read_csv('../../data/output/toxicity_test_binary.csv')

# Split the rows by annotated label: 0 = non toxic, 1 = toxic.
df_non_toxic = df.loc[df['label'].eq(0)]
df_toxic = df.loc[df['label'].eq(1)]

# Per label group: how many rows have a binarized score equal to the label.
correct_predictions_non_toxic = df_non_toxic['label'].eq(df_non_toxic['toxicity']).sum()
correct_predictions_toxic = df_toxic['label'].eq(df_toxic['toxicity']).sum()

# Share of matching rows within each label group.
accuracy_non_toxic = correct_predictions_non_toxic / len(df_non_toxic)
accuracy_toxic = correct_predictions_toxic / len(df_toxic)

print(f"Die Genauigkeit der Vorhersagen für Non Toxic beträgt: {accuracy_non_toxic:.2f}")
print(f"Die Genauigkeit der Vorhersagen für Toxic beträgt: {accuracy_toxic:.2f}")

print(f"Anzahl non toxic: {len(df_non_toxic)}")
print(f"Anzahl toxic: {len(df_toxic)}")



Die Genauigkeit der Vorhersagen für Non Toxic beträgt: 0.71
Die Genauigkeit der Vorhersagen für Toxic beträgt: 0.74
Anzahl non toxic: 84
Anzahl toxic: 65


In [10]:
# Posts annotated as non toxic whose binarized score disagrees with the label.
print(df_non_toxic.loc[df_non_toxic['label'].ne(df_non_toxic['toxicity']), 'content'])

1     The kids are watching Sing. I’m peeling the sp...
2     Every #caturday, you guys make me wish my apar...
9                             i fear fish women want me
12    F**k. I cracked my iPad Air's screen. I have A...
13    I'M STILL COOKING I've been cooking for 100 ye...
17    toroids are a wunnerful thang #AmateurRadio ⚡️...
20    me, writing a chapter of my book in bright ora...
21    Like I’m genuinely having fucking heart palpit...
26        how much does a fucking cauliflower weigh lol
29                                             hey #COG
32    I'm watching an episode of World Wide Wrestlin...
36                   take a shower you stinky (me @ me)
40    @cas Wow!!! Fuck yeah!!! I’ve got 3 or 4 or 5 ...
44    I’m sick of all this damn plastic. What do y’a...
46              God TV on the Radio are so fucking good
48    America: When you get Scottish,irish,polish,ge...
54    🌟 < why are people tall ahaha. stop being tall...
58    Gosh, it is quite scary that I have made a

In [11]:
# Posts annotated as toxic whose binarized score disagrees with the label.
print(df_toxic.loc[df_toxic['label'].ne(df_toxic['toxicity']), 'content'])


51     Re last boost: “find cool places”… that paid u...
52     Sex noises on Match of the Day to Brian Cox’s ...
53     Annual squishing of boobs for science complete...
56     First photo is mum telling Joey to piss off. S...
63     Just saw a button, 'consent options', re cooki...
67     I've tried every #hack to get #AppleTV to cast...
72     Cld still be cons, rah rah free mkt and family...
76     My roommate doesn't like when I take sexy pics...
78     Some of you have convinced yourselves that the...
79     Ugh I hate that Moloko is part of the music in...
81     Why do the female soloists who play with the s...
85     heyy i will send as many nudes to men on reddi...
92     Sex noises on Match of the Day to Brian Cox’s ...
96     Sex noises on Match of the Day to Brian Cox’s ...
97                          #nsfw Female nudity, erotics
121    🚨 HELP 🚨 I still need $40 to pay my rent It's ...
132    Did you think that you were safe from your pos...
Name: content, dtype: object
