Analyzing the format of the tsv file:

In [None]:
import os

# Root directory that holds all TSV files (the "*" value is presumably a
# placeholder for the real mount point — TODO confirm before running).
external_disk = "*"
# NOTE(review): the variable is named "_23" but the path is the 2024-03 raw
# dump — confirm which file is intended.
input_file_23 = os.path.join(external_disk, "processed_reddit_posts_2024-03.tsv")
input_file_24 = os.path.join(external_disk, "2024_03_preprocessed.tsv")
# NOTE(review): the variable is named "_24_dedup" but the path is a 2023-02
# file — confirm which file is intended.
input_file_24_dedup = os.path.join(external_disk, "2023_02_prepro_train_anonym_V2.tsv")

def read_and_print_lines(file_path, num_lines=50):
    """Print the leading lines of a TSV file as lists of fields.

    Each line is stripped of surrounding whitespace, split on tab characters
    and printed as a Python list. Reading stops after ``num_lines`` lines or
    at the end of the file, whichever comes first.

    Parameters:
        file_path (str): Path of the UTF-8 encoded file to preview.
        num_lines (int): Maximum number of lines to print (default 50).
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        remaining = num_lines
        while remaining > 0:
            raw = handle.readline()
            if raw == "":
                # readline() returns an empty string only at end of file
                return
            fields = raw.strip().split("\t")
            print(fields)
            remaining -= 1

# Previewing the first lines of the files.
# The previews of the first two files are currently disabled; uncomment the
# calls below to re-enable them.
# print("First lines of the 2023 input file:")
# read_and_print_lines(input_file_23)
#
# print("\nFirst lines of 2024_03_preprocessed.tsv:")
# read_and_print_lines(input_file_24)

# The label is built from the actual file name and line count so that the
# printed heading always matches what is being shown.
preview_lines = 50
print(f"\nFirst {preview_lines} lines of {os.path.basename(input_file_24_dedup)}:")
read_and_print_lines(input_file_24_dedup, num_lines=preview_lines)

Making one column with the text data, meaning title and content combined:

In [None]:
import os

# Root directory that holds all TSV files (presumably a placeholder for the
# real mount point — TODO confirm).
external_disk = "*"
# Raw monthly dumps (inputs) and the files with the added 'text' column (outputs).
input_file_23 = os.path.join(external_disk, "processed_reddit_posts_2023-02.tsv")
input_file_24 = os.path.join(external_disk, "processed_reddit_posts_2024-03.tsv")
output_file_23 = os.path.join(external_disk, "2023_02_preprocessed.tsv")
output_file_24 = os.path.join(external_disk, "2024_03_preprocessed.tsv")

# Column names of the raw input files, in file order. process_file() uses
# this list both to validate the column count of each row and to look up
# the positions of 'title', 'selftext', 'is_self' and 'created_time'.
headers = ['author', 'category', 'content_categories', 'created_time', 'discussion_type', 'id', 'over_18', 'whitelist_status', 'removed_by_category', 'selftext', 'subreddit', 'subreddit_id', 'subreddit_subscribers', 'subreddit_type', 'title', 'is_self']

def process_file(input_file, output_file, columns=None):
    """Add a combined 'text' column to a raw TSV dump.

    A header line (the column names plus the new 'text' column) is written
    first. For every input row with exactly the expected number of fields,
    a 'text' value is inserted at the position of 'created_time':
    ``"<title>: <selftext>"`` when 'is_self' is "true" (case-insensitive),
    otherwise just the title. Rows with a different field count are skipped
    and reported in a one-line summary.

    Only the line terminator is removed before splitting, so empty first or
    last fields are preserved (stripping all whitespace would also remove
    leading/trailing tabs and make such rows look malformed).

    Parameters:
        input_file (str): Path to the raw, tab-separated input file.
        output_file (str): Path of the file to write.
        columns (list[str] | None): Column names of the input file. Defaults
            to the module-level ``headers`` list. Must contain 'title',
            'selftext', 'is_self' and 'created_time'.

    Raises:
        ValueError: If one of the required column names is missing.
    """
    column_names = headers if columns is None else list(columns)

    # Resolve the positions once instead of on every row.
    title_idx = column_names.index('title')
    selftext_idx = column_names.index('selftext')
    is_self_idx = column_names.index('is_self')
    text_idx = column_names.index('created_time')  # 'text' is inserted here
    expected_fields = len(column_names)

    skipped = 0
    with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
        # Writing the headers with the new 'text' column
        headers_with_text = list(column_names)
        headers_with_text.insert(text_idx, 'text')
        outfile.write('\t'.join(headers_with_text) + '\n')

        for line in infile:
            # Remove only the line terminator; tabs at the edges delimit empty fields.
            parts = line.rstrip('\r\n').split('\t')
            if len(parts) != expected_fields:
                skipped += 1
                continue

            title = parts[title_idx]
            selftext = parts[selftext_idx]
            is_self = parts[is_self_idx].strip()

            # Self posts carry their body in 'selftext'; link posts only have a title.
            text = f"{title}: {selftext}" if is_self.lower() == 'true' else title

            parts.insert(text_idx, text)
            outfile.write('\t'.join(parts) + '\n')

    if skipped:
        print(f"Skipped {skipped} line(s) with an unexpected number of columns in {input_file}")

# Processing both files: each raw monthly dump is rewritten with the
# additional combined 'text' column.
process_file(input_file_23, output_file_23)
process_file(input_file_24, output_file_24)

Clean the text data to recognize spam better

In [12]:
import regex
import unicodedata


# Root directory that holds all TSV files (presumably a placeholder — TODO confirm).
external_disk = "*"
# Input: file with the combined 'text' column; output: same rows with cleaned text.
input_file_24 = f"{external_disk}/2024_03_preprocessed.tsv"
output_file_24 = f"{external_disk}/2024_03_prepro_cleaned.tsv"

# Improved regex for repeated punctuation and emojis:
# captures one character of Unicode category P (punctuation) or So
# (symbol, other) and matches one or more immediate repetitions of that
# same character, so the whole run can be replaced by the captured character.
repeat_pattern = regex.compile(r"([\p{P}\p{So}])\1+", flags=regex.UNICODE)

def clean_text(text):
    """Return ``text`` with repeated symbols collapsed and invisible characters removed.

    Steps, in order:
      1. Runs of the same punctuation/symbol character (as matched by the
         module-level ``repeat_pattern``) are collapsed to one occurrence.
      2. The result is normalized to Unicode form NFKC.
      3. Every character that is neither printable nor whitespace is dropped.

    Parameters:
        text (str): The text to clean.

    Returns:
        str: The cleaned text.
    """
    collapsed = repeat_pattern.sub(r"\1", text)
    normalized = unicodedata.normalize("NFKC", collapsed)
    # Whitespace is kept explicitly because e.g. tabs/newlines are not "printable".
    visible_chars = [ch for ch in normalized if ch.isspace() or ch.isprintable()]
    return "".join(visible_chars)

# Stream the input file row by row, clean the text column and write the row out.
with open(input_file_24, "r", encoding="utf-8") as infile:
    with open(output_file_24, "w", encoding="utf-8") as outfile:
        for line in infile:
            # Surrounding whitespace is removed before splitting into columns
            columns = line.strip().split("\t")
            # Only rows that actually have a 4th column (index 3) get cleaned;
            # shorter rows are written back as they are.
            if len(columns) > 3:
                columns[3] = clean_text(columns[3])
            outfile.write("\t".join(columns) + "\n")

## Creating the training data

### Anonymize unneeded columns

In [None]:
# Defining paths
# Root directory that holds all TSV files (presumably a placeholder — TODO confirm).
external_disk = "*"
# 2023 data: input and output of the anonymization step.
input_file_23 = f"{external_disk}/2023_02_prepro_train_anonym.tsv"
output_file_23 = f"{external_disk}/2023_02_prepro_train_anonym_V2.tsv"

# 2024 data: input and output of the anonymization step.
# (The same two assignments are repeated further down in this cell.)
input_file_24 = f"{external_disk}/2024_03_prepro_unique.tsv"
output_file_24 = f"{external_disk}/2024_03_prepro_train_anonym.tsv"

def anonymize_keep_fourth_column(input_file, output_file):
    """
    Processes a large TSV file line by line, replacing all values except the fourth column with "None".
    Every output row has exactly as many columns as the header.

    The header line is copied to the output. For each following line the
    value of the fourth column (index 3) is kept and every other column is
    written as the string "None". Lines with fewer than four columns produce
    a warning and a row consisting only of "None" values.

    Only the line terminator is removed before splitting: stripping all
    whitespace would also remove leading tabs, which shifts the columns of
    rows whose first fields are empty and loses their text.

    Parameters:
        input_file (str): Path to the input TSV file.
        output_file (str): Path to save the anonymized TSV file.

    Raises:
        ValueError: If the header has fewer than four columns, because there
            is then no fourth column to keep.
    """
    keep_index = 3  # zero-based index of the column that is preserved

    with open(input_file, "r", encoding="utf-8") as infile, open(output_file, "w", encoding="utf-8") as outfile:
        # Reading the header and determine the expected column count
        header = infile.readline().rstrip("\r\n").split("\t")
        num_columns = len(header)
        print("Header:", header)
        print(f"Expected number of columns: {num_columns}")

        if num_columns <= keep_index:
            raise ValueError(
                f"Header of {input_file} has only {num_columns} column(s); "
                f"at least {keep_index + 1} are required."
            )

        outfile.write("\t".join(header) + "\n")  # Writing header to output

        # Processing each line
        for line_num, line in enumerate(infile, start=2):  # Starting at line 2 (after header)
            fields = line.rstrip("\r\n").split("\t")

            if len(fields) <= keep_index:
                # No fourth column available: the kept value falls back to "None"
                print(f"Warning: Line {line_num} has less than 4 columns. Fixing it.")
                kept_value = "None"
            else:
                kept_value = fields[keep_index]

            # Building a row of the header's width with only the kept column filled in
            anonymized_fields = ["None"] * num_columns
            anonymized_fields[keep_index] = kept_value

            outfile.write("\t".join(anonymized_fields) + "\n")

    print(f" Processing complete. Anonymized file saved as {output_file}.")


# Defining paths
# NOTE(review): these three assignments repeat the values already assigned
# earlier in this cell.
external_disk = "*"
input_file_24 = f"{external_disk}/2024_03_prepro_unique.tsv"
output_file_24 = f"{external_disk}/2024_03_prepro_train_anonym.tsv"

# Running anonymization
# Only the 2023 file is processed here; the 2024 call is disabled.
# anonymize_keep_fourth_column(input_file_24, output_file_24)
anonymize_keep_fourth_column(input_file_23, output_file_23)



### Removing all duplicate lines
The Unix `awk` command is used here so that the large files are deduplicated more efficiently.



In [None]:
import subprocess

# Defining paths
# Root directory that holds all TSV files (presumably a placeholder — TODO confirm).
external_disk = "*"
input_file_23 = f"{external_disk}/2023_02_prepro_train_anonym.tsv"
output_file_23 = f"{external_disk}/2023_02_prepro_unique.tsv"

# NOTE(review): this step reads 2024_03_prepro_train_anonym.tsv and writes
# 2024_03_prepro_unique.tsv, while the anonymization cell reads
# 2024_03_prepro_unique.tsv and writes 2024_03_prepro_train_anonym.tsv —
# the two cells feed each other; confirm the intended execution order.
input_file_24 = f"{external_disk}/2024_03_prepro_train_anonym.tsv"
output_file_24 = f"{external_disk}/2024_03_prepro_unique.tsv"

# Defining the Unix command for deduplication (keeping only unique lines):
# awk prints a line only the first time its full content ($0) is seen, and
# the shell redirects the result into the output file.
command_23 = f'awk -F\'\\t\' \'!seen[$0]++\' "{input_file_23}" > "{output_file_23}"'
command_24 = f'awk -F\'\\t\' \'!seen[$0]++\' "{input_file_24}" > "{output_file_24}"'

# Function to run the deduplication command
def deduplicate_file(command, output_file):
    """Run a shell deduplication command and report the outcome.

    Parameters:
        command (str): Complete shell command line; it is executed through
            the shell, so redirections inside the string take effect.
        output_file (str): Path mentioned in the success message. The
            function itself does not write to it.

    A non-zero exit status is not propagated: it is caught and reported
    with a printed message.
    """
    try:
        subprocess.check_call(command, shell=True)
    except subprocess.CalledProcessError as e:
        print(f"An error occurred while running the command: {e}")
    else:
        print(f"Successfully deduplicated file. Output saved to {output_file}")

# Running deduplication
# Only the 2024 command is executed here; the 2023 call is disabled.
#deduplicate_file(command_23, output_file_23)
deduplicate_file(command_24, output_file_24)

## Filtering posts that contain potentially religious terms and filtering out common misclassified keywords.

In [None]:
# Defining key words
# Set of terms that mark a post as potentially religious; it is passed as
# `keywords_to_keep` to filter() further down in this cell.
# Notes for maintainers:
#   - Entries are mixed-case ("Christian", "church").
#   - Because this is a set, repeated entries (e.g. "cult", "apostate")
#     collapse into one.
#   - Several entries are very short (e.g. "ie", "sh", "ds", "ci", "mt"):
#     any consumer that does plain substring matching will find them inside
#     ordinary words.
#   - Entries such as "God hates [religion]" contain the literal text
#     "[religion]" / "[group]"; they are not expanded anywhere in this cell.
religious_keywords ={
    # General Religious Terms
    "religion", "faith", "spirituality", "sacred", "divine", "worship", "prayer", "belief", "holy", "doctrine",
    "scripture", "theology", "sermon", "evangelism", "clergy",

    #  Christianity-Related Terms
    "Christian", "Bible", "Jesus", "Christ", "cross", "church", "pastor", "priest", "gospel", "crucifix",
    "resurrection", "sin", "salvation", "pope", "Vatican", "Protestant", "Catholic", "baptist", "Jehovah",
    "LDS", "Mormon", "evangelist",

    #  Islam-Related Terms
    "Islam", "Muslim", "Quran", "Muhammad", "mosque", "jihad", "sharia", "hadith", "halal", "haram",
    "imam", "caliph", "Ummah", "Mecca", "hijab", "niqab", "burqa", "fatwa",

    #  Judaism-Related Terms
    "Jewish", "Judaism", "Torah", "Talmud", "synagogue", "rabbi", "Zionism", "anti-semitic", "Israel",
    "kabbalah", "kosher", "mitzvah", "Hanukkah", "Passover", "Yom Kippur", "shabbat",

    #  Hinduism & Other Religions
    "Hindu", "Hinduism", "temple", "Vedas", "karma", "dharma", "reincarnation", "moksha", "puja", "Shiva",
    "Vishnu", "Brahma", "mantra", "Upanishads", "Buddhism", "Sikh", "Jain", "guru", "monk", "Nirvana",

    #  Atheism, Agnosticism & Anti-Religious Terms
    "atheist", "agnostic", "secular", "irreligious", "blasphemy", "anti-theist", "godless", "heresy",
    "apostate", "unbeliever", "heathen", "infidel",

    #  Religious Hate Speech & Extremism Keywords
    "religious war", "religious hate", "religious extremism", "fundamentalist", "infidels", "crusade",
    "inquisition", "jihadist", "radical Islam", "Christian nationalism", "terrorist", "suicide bomber",
    "pagan", "devil worship", "satanic", "cult", "brainwashing", "false prophet", "religious oppression",
    "forced conversion", "religious discrimination", 

    #   Newly Added Religious Hate Speech Terms  
    "heretic", "apostate", "blasphemer", "false religion", "fake religion", "religious scam",
    "religious brainwashing", "religious cult", "burn in hell", "godless heathens",
    "convert or die", "holy war", "purge the unbelievers", "wipe them out", "death to infidels",
    "crusader scum", "jihadi terrorists", "sharia law takeover", "Zionist conspiracy",
    "Christian supremacy", "Muslim invasion", "Jewish globalists", "Satanic agenda",
    "Islamophobic", "Christophobic", "anti-Semitic", "religious genocide", "religious cleansing",
    "ban religious teachings", "religious fundamentalism", "satanic ritual", "burn the Qur'an",
    "ban the Bible", "ban the Torah", "ban the Quran", "religious dictatorship",
    "anti-religious propaganda", "forced religious conversion", "eradicate non-believers",
    "destroy Christianity", "destroy Islam", "destroy Judaism", "destroy Hinduism",
    "ban all religious practices", "no room for religion", "deport all Muslims",
    "Jewish banking conspiracy", "Hindus are extremists", "violent Christian evangelicals",
    "Muslim rape gangs", "Islam is terrorism", "Jews control the world", "atheists are evil",
    "Satan controls religion", "God hates [religion]", "God's punishment for [group]",
    "God will judge them", "devil worshipers", "enemy of God", "infidel punishment",
    "forced religious laws", "hate the infidels", "no freedom for religion", "death to heretics",
    "Christians are oppressors", "Islam is a cult", "Jews are parasites", "Muslims are terrorists",
    "burn the churches", "destroy all mosques", "synagogues are evil", "ban Hindu temples",
    "burn them at the stake", "witch hunt", "spiritual corruption", "destroy their beliefs",
    "no place for religion in society", "eradicate the religious", "religion is slavery",
    "atheist dictatorship", "God is a lie", "crusaders vs jihadists", "stop Islamic expansion",
    "stop Christian imperialism", "ban all religious books", "the true religion vs fake religion",
    "dismantle religious power", "kill in God's name", "religious persecution", "religious fanatics",
    "purge the heathens", "holy war against [religion]", "destroy their idols", "no God allowed",
    "faith-based terror", "Christian jihad", "Muslim crusaders", "radical Judaism", "Hindu terrorism",
    # abbreviations
     #  General Religious Hate Speech Abbreviations
    "rwds", "kti", "fbi", "kek", "nsm", "oy vey", "1488", "14w", "rahowa", "zog", "zio",
    "cult", "sjw", "lsh", "kafir", "taqiyya", "isis", "ds", "npc",

    #  Hate Groups & Extremism Abbreviations
    "kkk", "wbc", "nifb", "cwp", "ci", "wwg1wga", "ie",

    #  Online Slang & Coded Religious Hate Speech
    "larp", "boomer", "groy", "sh", "atheism+", "gen zion", "mt", "cuck", "christcuck",
    "jidf", "hv", "666"
}
# Set of technical and gaming terms; it is passed as `keywords_to_remove`
# to filter() further down in this cell, so a post containing any of them
# is discarded.
# Notes for maintainers:
#   - Entries are mixed-case and several are very short (e.g. "AI", "Go",
#     "UI", "HP", "MP", "XP"): any consumer that does plain substring
#     matching will find them inside ordinary words (for example "Go"
#     inside "God").
#   - "ping" appears twice; as this is a set the repetition has no effect.
keywords_to_remove = {
    #  Technical Terms (Programming, AI, Cybersecurity, Hardware)
    "algorithm", "API", "backend", "frontend", "framework", "library", "SDK", "repository",
    "debug", "deployment", "compilation", "encryption", "decryption", "firewall", "malware",
    "cybersecurity", "penetration testing", "AI", "machine learning", "deep learning",
    "neural network", "NLP", "big data", "cloud computing", "virtualization", "docker",
    "kubernetes", "CI/CD", "Git", "GitHub", "GitLab", "bitbucket", "blockchain", "cryptography",
    "SQL", "NoSQL", "database", "server", "HTTP", "HTTPS", "REST API", "GraphQL",
    "Linux", "Windows", "macOS", "CLI", "shell scripting", "command line", "bash",
    "Python", "JavaScript", "Java", "C++", "C#", "Ruby", "Go", "Rust", "Swift",
    "TensorFlow", "PyTorch", "scikit-learn", "pandas", "NumPy", "matplotlib", "seaborn",
    "LSTM", "Transformer", "GAN", "BERT", "GPT", "LLM", "prompt engineering",
    "latency", "bandwidth", "packet loss", "ping", "IP address", "VPN", "proxy",
    "router", "firmware", "BIOS", "UEFI", "motherboard", "GPU", "CPU", "RAM", "cache",
    "SSD", "HDD", "NVMe", "overclocking", "liquid cooling", "thermal paste", "fan speed",
    "kernel", "driver", "firmware update", "OpenAI", "Google DeepMind", "Anthropic",
    "prompt tuning", "fine-tuning", "embedding", "vector search", "data science",
    
    #  Gaming Terms (Common in Reddit Discussions)
    "MMORPG", "RPG", "FPS", "MOBA", "RTS", "battle royale", "open world", "sandbox",
    "AAA game", "indie game", "loot box", "DLC", "season pass", "microtransactions",
    "PvP", "PvE", "co-op", "multiplayer", "single-player", "crossplay", "level up",
    "XP", "grind", "quest", "raid", "guild", "clan", "matchmaking", "ranked mode",
    "esports", "tournament", "pro player", "streaming", "Twitch", "YouTube Gaming",
    "metagame", "nerf", "buff", "patch", "update", "hotfix", "early access",
    "beta test", "alpha test", "game engine", "Unreal Engine", "Unity", "modding",
    "speedrun", "glitch", "exploit", "cheat code", "hacking", "aimbot", "wallhack",
    "battle pass", "skins", "cosmetics", "loot system", "gacha", "AFK", "spawn",
    "respawn", "killstreak", "headshot", "one-shot", "combo", "stun", "cooldown",
    "mana", "HP", "MP", "boss fight", "NPC", "cutscene", "game physics", "hitbox",
    "frame rate", "lag", "rubberbanding", "ping", "dedicated server", "netcode",
    "ray tracing", "RTX", "DLSS", "VSync", "FOV", "HUD", "UI", "difficulty curve",
    "game balancing", "open beta", "closed beta", "Steam", "Epic Games", "Xbox", "PlayStation",
    "Nintendo Switch", "VR gaming", "meta", "game mechanics", "worldbuilding", "replayability",
    "game economy", "skill tree", "perk system", "weapon loadout", "battle tactics",
    "battlefield", "arena", "ranked ladder", "prestige system", "game narrative", "story mode"
}

# [input_path, output_path] pairs in the order expected by filter():
# the anonymized training file is read, the filtered file is written.
files2023 = ["*/2023_02_prepro_train_anonym_V2.tsv", "*/2023_02_prepro_train_filtered.tsv"]
files2024 = ["*/2024_03_prepro_train_anonym.tsv", "*/2024_03_prepro_train_filtered.tsv"]

def filter(files, keywords_to_keep, keywords_to_remove, whole_word=True):
    """
    Filters lines from an input TSV file and writes them to an output TSV file.

    The line is kept if:
      - The content in the 4th column (index 3) contains at least one keyword
        from keywords_to_keep.
      - The content does not contain any keyword from keywords_to_remove.

    Keywords are matched case-sensitively. By default a keyword only counts
    when it appears as a whole word or phrase, i.e. it is not directly
    preceded or followed by a letter, digit or underscore. Plain substring
    matching lets short keywords fire inside unrelated words ("Go" inside
    "God", "sin" inside "using", "ie" inside "believe"), which both keeps
    unrelated posts and removes relevant ones.

    Lines with fewer than 4 columns are skipped. Kept lines are written
    unchanged. Note that a header line is treated like any other line.

    :param files: List of two file paths [input_file_path, output_file_path].
    :param keywords_to_keep: A set of keywords (strings) that must appear in the content.
    :param keywords_to_remove: A set of keywords (strings) that must not appear in the content.
    :param whole_word: If True (default), keywords match only as whole words/phrases.
                       If False, plain substring matching is used.
    """
    import re  # local import so this cell does not rely on another cell's imports

    def _build_matcher(keywords):
        """Return a function telling whether a text contains any of the keywords."""
        if not whole_word:
            return lambda text: any(kw in text for kw in keywords)
        terms = sorted(kw for kw in keywords if kw)
        if not terms:
            # An empty alternation would match everywhere, so handle it explicitly.
            return lambda text: False
        # One combined pattern per keyword set: a single scan per line instead
        # of one scan per keyword. re.escape keeps entries such as "C++" or
        # "God hates [religion]" literal.
        pattern = re.compile(
            r"(?<!\w)(?:" + "|".join(re.escape(term) for term in terms) + r")(?!\w)"
        )
        return lambda text: pattern.search(text) is not None

    input_file, output_file = files
    contains_keep = _build_matcher(keywords_to_keep)
    contains_remove = _build_matcher(keywords_to_remove)

    with open(input_file, 'r', encoding='utf-8') as f_in, \
         open(output_file, 'w', encoding='utf-8') as f_out:

        for line in f_in:
            # Stripping trailing newlines and split by tab
            columns = line.rstrip('\n').split('\t')

            # At least 4 columns are expected; skip if there aren't enough
            if len(columns) < 4:
                continue

            content = columns[3]

            # Keep only lines with a wanted keyword and without an unwanted one
            if contains_keep(content) and not contains_remove(content):
                f_out.write(line)


# Filtering the files: each anonymized training file is reduced to the lines
# that contain a religious keyword and none of the technical/gaming keywords.
print("Filtering 2023 file...")
filter(files2023, religious_keywords, keywords_to_remove)
print("Filtering 2024 file...")
filter(files2024, religious_keywords, keywords_to_remove)

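Relabelling posts that contain erotic or dating keywords as non-religious (label `0`); all other rows keep their label.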

In [None]:
import re

# Lower-case terms that mark a post as erotic/dating content. Inflected
# forms are listed explicitly (e.g. "nude"/"nudes", "creampie"/"creampied"),
# so the terms are matched as whole words or phrases.
EROTIC_DATING_KEYWORDS = [
    "sexting", "nudes", "nude", "fetish", "creampie", "creampied", "daddy", "pussy", "cock",
    "fuck", "horny", "snap", "onlyfans", "nsfw", "dm me", "sugar daddy", "escort",
    "dominant", "submissive", "kik", "roleplay", "erotic", "bdsm", "anal", "blowjob",
    "handjob", "sexy", "strip", "porn", "sex", "orgasm", "cum", "milf", "dick",
    "ass", "lick", "moan", "hot girl", "wet", "lust", "girlfriend", "boyfriend",
    "kiss", "dating", "hookup", "make love", "flirt", "breast", "boobs", "tits", "fleshlight", "tiddy"
]

# One combined whole-word pattern. Plain substring matching would also hit
# unrelated words ("ass" in "class", "wet" in "between", "anal" in
# "analysis") and wrongly relabel those posts.
EROTIC_DATING_PATTERN = re.compile(
    r"(?<!\w)(?:" + "|".join(re.escape(keyword) for keyword in EROTIC_DATING_KEYWORDS) + r")(?!\w)"
)

# reading
input_file = "*/combined_classification.tsv" 
output_file = "*/train_combined_filtered.tsv"

# iterating
with open(input_file, "r", encoding="utf-8") as infile, open(output_file, "w", encoding="utf-8") as outfile:
    for line in infile:
        columns = line.strip().split("\t")  # Splitting into columns

        # Rows with fewer than 5 columns are passed through untouched
        if len(columns) < 5:
            outfile.write(line)
            continue

        # The keyword list is lower-case, so the text is lower-cased before matching
        content = columns[3].lower()

        # Posts with an erotic/dating keyword get the label "0" in the last column
        if EROTIC_DATING_PATTERN.search(content):
            columns[-1] = "0"

        # Saving
        outfile.write("\t".join(columns) + "\n")

print(f"Filterung abgeschlossen. Datei gespeichert als: {output_file}")

Combining the correctly labeled data

In [None]:
# Paths to input files
input_file1 = "*/train_part1_corrected.tsv"  # First TSV file
# NOTE(review): this is the same path as input_file1, so the first file is
# copied into the output twice — confirm whether a different "second" file
# was intended.
input_file2 = "*/train_part1_corrected.tsv"  # Second TSV file
# NOTE(review): the next two paths are absolute paths into a user's home
# directory; the notebook will only run on that machine as written.
input_file3 = "/Users/laurachristoph/Desktop/Bachelorarbeit/00_10h_binary_classified.tsv"  # Third TSV file (with author, title, content, etc.)
input_file4 = "/Users/laurachristoph/Desktop/Bachelorarbeit/00_10h_classified_reddit_posts.tsv"  # Fourth TSV file (title + content merge, category fix)
output_file = "*/train_reliclass.tsv"  # Output file


# Function to copy file line by line (for first two files)
def copy_file(file_path, outfile):
    """Append the lines of a text file to an already open output file.

    Every line is written as it is. If the final line of the source file has
    no trailing newline, one is added, so that whatever is written to
    ``outfile`` next starts on a new line instead of being fused with this
    file's last row.

    Parameters:
        file_path (str): Path of the UTF-8 encoded file to copy.
        outfile: Writable text file object that receives the lines.
    """
    with open(file_path, "r", encoding="utf-8") as infile:
        for line in infile:
            outfile.write(line)  # Writing each line directly to the output file
            if not line.endswith("\n"):
                # Only the last line of a file can lack its terminator
                outfile.write("\n")

# Opening the output file once to write all processed data
with open(output_file, "w", encoding="utf-8") as outfile:
    
    # Copying the first two files unchanged
    copy_file(input_file1, outfile)
    copy_file(input_file2, outfile)

    # Determining the correct format based on first file:
    # the column count of its first line defines the width of every row
    # generated below.
    with open(input_file1, "r", encoding="utf-8") as sample_file:
        num_columns = len(sample_file.readline().strip().split("\t"))  # Get correct number of columns

    # Function to process structured files (third & fourth)
    def process_structured_file(file_path, is_fourth_file=False):
        """Convert a TSV file with a header into rows of the training format.

        The first line is read as the header. For each following row the
        'title' and 'content' columns (where present) are merged into one
        text, and a label is taken from the 'religious' column or, failing
        that, the 'category' column. Each row is written to the enclosing
        ``outfile`` as ``num_columns`` fields: "None" placeholders, then the
        merged text as second-to-last field and the label as last field.

        Parameters:
            file_path (str): Path of the UTF-8 encoded TSV file to convert.
            is_fourth_file (bool): If True, the label value "religious" is
                mapped to "1" and every other value to "0".
        """
        with open(file_path, "r", encoding="utf-8") as infile:
            header = infile.readline().strip().split("\t")  # Read header

            # Getting indices for required fields
            title_index = header.index("title") if "title" in header else None
            content_index = header.index("content") if "content" in header else None
            # NOTE(review): the fallback -1 was originally described as "last
            # column", but the check further down treats -1 as "no label
            # column" and assigns the default label "0" — confirm which is intended.
            label_index = header.index("religious") if "religious" in header else \
                          header.index("category") if "category" in header else -1

            # Processing each line
            for line in infile:
                columns = line.strip().split("\t")

                # Skipping empty or incomplete rows
                if len(columns) < max(title_index or 0, content_index or 0, label_index or 0) + 1:
                    continue  # Skipping rows with missing columns

                # Merging title and content safely
                title = columns[title_index] if title_index is not None and title_index < len(columns) else ""
                content = columns[content_index] if content_index is not None and content_index < len(columns) else ""
                merged_content = f"{title} {content}".strip()

                # Adjusting label for fourth file safely
                label = "0"  # Default label
                if label_index != -1 and label_index < len(columns):
                    label = columns[label_index].strip().lower()
                    if is_fourth_file:
                        label = "1" if label == "religious" else "0"  # Converting category labels

                # Creating a row with "None" values to match the first two files.
                # The width depends only on num_columns: padding that depends on
                # the width of the *source* row would make rows from narrow
                # source files wider than the rest of the output.
                new_row = ["None"] * (num_columns - 2) + [merged_content, label]

                # Writing the reformatted row directly to the output file
                outfile.write("\t".join(new_row) + "\n")

    # Processing the third file
    process_structured_file(input_file3)

    # Processing the fourth file with category adjustment
    process_structured_file(input_file4, is_fourth_file=True)

print(f"File successfully combined and saved as: {output_file}")

In [17]:
# Helper for counting the lines of a file
def count_lines(file_path):
    """Return the number of lines in the UTF-8 text file at ``file_path``.

    The file is iterated line by line, so it is never loaded into memory as
    a whole. An empty file yields 0.
    """
    total = 0
    with open(file_path, 'r', encoding='utf-8') as handle:
        for _ in handle:
            total += 1
    return total

# Counting the number of lines in the filtered files (currently disabled)
#lines_2023 = count_lines(files2023[1])
#lines_2024 = count_lines(files2024[1])
# NOTE(review): the variable name suggests the combined training file
# (train_reliclass.tsv), but the file counted here is
# 00_10h_binary_classified.tsv — confirm which file is intended.
lines_reliclass = count_lines("/Users/laurachristoph/Desktop/Bachelorarbeit/00_10h_binary_classified.tsv")


In [None]:
# Reporting the line count computed in the previous cell (lines_reliclass).
print(f"Number of lines in the training file: {lines_reliclass}")


### Training a model for classifying religious posts

In [None]:
from collections import Counter
# NOTE(review): `dataset` is not defined in any cell visible here; this cell
# presumably relies on a variable from another cell or session — confirm.
# Each sample is expected to provide sample["label"] with an .item() method.
labels = [sample["label"].item() for sample in dataset]
print(Counter(labels))  # Showing class distribution

Balancing out my data set

In [None]:
import random


def balance_dataset(input_file, output_file, seed=None):
    """Write a class-balanced subset of a labelled TSV file.

    The first line of the input is treated as a header and copied to the
    output. Rows with fewer than 5 columns are ignored. Rows whose last
    column is "1" count as religious, all other rows as non-religious.
    Every third religious row is discarded; from what remains, the same
    number of rows is sampled at random from each class. The output holds
    the header, then the sampled religious rows, then the sampled
    non-religious rows.

    Parameters:
        input_file (str): Path of the labelled input TSV file.
        output_file (str): Path of the balanced file to write.
        seed (int | None): Seed for the random sampling. With a seed the
            selection is reproducible; with None (default) the global
            ``random`` state is used, as before.
    """
    # A private generator keeps a seeded run independent of the global state.
    rng = random if seed is None else random.Random(seed)

    religious_count = 0
    religious_posts = []
    non_religious_posts = []

    # Reading the file line by line
    with open(input_file, "r", encoding="utf-8") as infile:
        header = infile.readline()  # Keeping header if exists
        for line in infile:
            columns = line.strip().split("\t")
            if len(columns) < 5:
                continue  # Skipping malformed rows

            # A last line without a terminator would otherwise be fused with
            # the following row when the lists are written out.
            if not line.endswith("\n"):
                line += "\n"

            label = columns[-1].strip()

            if label == "1":
                religious_count += 1
                # Keep 1st and 2nd occurrence, skip 3rd
                if religious_count % 3 != 0:
                    religious_posts.append(line)
            else:
                non_religious_posts.append(line)

    # Balancing dataset to have equal religious and non-religious posts
    min_class_size = min(len(religious_posts), len(non_religious_posts))
    religious_posts = rng.sample(religious_posts, min_class_size)
    non_religious_posts = rng.sample(non_religious_posts, min_class_size)

    # Writing the balanced dataset
    with open(output_file, "w", encoding="utf-8") as outfile:
        outfile.write(header)  # Write header back
        outfile.writelines(religious_posts + non_religious_posts)

    print(f"Balanced dataset saved to {output_file}. Final size: {min_class_size * 2} samples.")

# Input: labelled training file; output: the class-balanced file written by
# balance_dataset().
input_file = "*/train_reliclass_balanced2.tsv"
output_file = "*/train_reliclass_final.tsv"
balance_dataset(input_file, output_file)


#### Downloading all needed libraries for training the religious/not-religious classification model

In [None]:
import sys
import subprocess

def install_libraries():
    """Install the project's Python packages and fetch the DistilBERT files.

    Each package is installed with a separate ``pip install`` call that runs
    under the current interpreter. Afterwards the "distilbert-base-uncased"
    tokenizer and model are loaded once via ``from_pretrained``.

    Raises:
        subprocess.CalledProcessError: If one of the pip calls fails.
    """
    packages = (
        "numpy", "pandas", "torch", "transformers", "gensim",
        "scikit-learn", "tqdm", "datasets", "accelerate",
    )
    pip_install = [sys.executable, "-m", "pip", "install"]

    for package in packages:
        subprocess.check_call(pip_install + [package])

    # Imported here because transformers may only exist after the installs above
    from transformers import DistilBertTokenizer, DistilBertModel
    checkpoint = "distilbert-base-uncased"
    for loader in (DistilBertTokenizer, DistilBertModel):
        loader.from_pretrained(checkpoint)

    print("All necessary libraries are installed and ready to use!")

# Runs the installation every time this cell is executed.
install_libraries()

#### Training the model

##### Training on DistilBert
Evaluation Set Performance:
              precision    recall  f1-score   support

           0       1.00      0.12      0.22      1339
           1       0.53      1.00      0.70      1339

    accuracy                           0.56      2678
   macro avg       0.77      0.56      0.46      2678
weighted avg       0.77      0.56      0.46      2678

Test Set Performance:
              precision    recall  f1-score   support

           0       1.00      0.13      0.23      1340
           1       0.54      1.00      0.70      1339

    accuracy                           0.57      2679
   macro avg       0.77      0.57      0.47      2679
weighted avg       0.77      0.57      0.47      2679

In [None]:
import numpy as np
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import Dataset
import joblib

# Loading the DistilBERT tokenizer (the model itself is loaded inside
# train_distilbert). The tokenizer is a module-level name that
# tokenize_function() and train_distilbert() read.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Function to tokenize data
def tokenize_function(examples):
    """Tokenize the "text" entry of ``examples`` with the module-level tokenizer.

    Texts are truncated to at most 256 tokens and padded.

    Parameters:
        examples: Mapping with a "text" entry (as passed by ``Dataset.map``).

    Returns:
        The tokenizer's output for the given texts.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding=True, max_length=256)
    return encoded

# Function to compute evaluation metrics
def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for a prediction object.

    Parameters:
        pred: Object with ``label_ids`` (true labels) and ``predictions``
            (per-class scores); the predicted class is the arg-max along
            axis 1.

    Returns:
        dict: Keys "accuracy", "precision", "recall" and "f1"; precision,
        recall and F1 use binary averaging.
    """
    y_true = pred.label_ids
    y_pred = np.argmax(pred.predictions, axis=1)
    scores = precision_recall_fscore_support(y_true, y_pred, average="binary")
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": scores[0],
        "recall": scores[1],
        "f1": scores[2],
    }

# Function to train DistilBERT model
def train_distilbert(X_train, y_train, X_eval, y_eval, save_path):
    """Fine-tune DistilBERT for two-class classification and save the result.

    The texts and labels are wrapped in Hugging Face datasets and tokenized
    with ``tokenize_function``. A "distilbert-base-uncased" classification
    model with two labels is trained with ``Trainer`` for 5 epochs,
    evaluating and saving a checkpoint after every epoch, and the best
    checkpoint is loaded at the end of training. Model and tokenizer are
    then saved to ``save_path``.

    Parameters:
        X_train: Training texts.
        y_train: Training labels.
        X_eval: Evaluation texts.
        y_eval: Evaluation labels.
        save_path (str): Directory that receives the model and tokenizer files.

    Returns:
        The trained model object.
    """
    # Build and tokenize one dataset per split
    train_dataset = Dataset.from_dict({"text": X_train, "label": y_train}).map(
        tokenize_function, batched=True
    )
    eval_dataset = Dataset.from_dict({"text": X_eval, "label": y_eval}).map(
        tokenize_function, batched=True
    )

    # Pre-trained encoder with a fresh two-class classification head
    model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

    # Hyper-parameters and bookkeeping options for the Trainer
    hyperparameters = dict(
        output_dir="./results",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=5,
        weight_decay=0.01,
        logging_dir="./logs",
        load_best_model_at_end=True,
    )
    training_args = TrainingArguments(**hyperparameters)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Persist model first, then tokenizer, into the same directory
    for artifact in (model, tokenizer):
        artifact.save_pretrained(save_path)
    print(f"Best DistilBERT model saved to: {save_path}")

    return model


# Directory that receives the fine-tuned model and tokenizer.
save_path = "*/best_distilbert_model"
# NOTE(review): X_train, y_train, X_eval and y_eval are not defined in any
# cell visible here; they presumably come from another cell or session —
# confirm before running on a fresh kernel.
best_distilbert_model = train_distilbert(X_train, y_train, X_eval, y_eval, save_path)

##### Training on RoBERTa

In [None]:
import subprocess
import sys

# Function to install missing dependencies
def install_packages():
    """Install the packages needed for RoBERTa training if they cannot be imported.

    For each requirement the module name (the part before any "[extras]"
    suffix) is imported as an availability check. Only when that import
    fails is the requirement installed or upgraded via pip under the
    current interpreter.

    Raises:
        subprocess.CalledProcessError: If a pip call fails.
    """
    for package in ("transformers[torch]", "torch"):
        module_name = package.partition("[")[0]
        try:
            __import__(module_name)  # Availability check only
        except ImportError:
            print(f"Installing {package}...")
            pip_command = [sys.executable, "-m", "pip", "install", package, "-U"]
            subprocess.check_call(pip_command)

# Installing missing packages (must run before the imports below, which
# need transformers and torch to be importable)
install_packages()

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import RobertaTokenizer, RobertaForSequenceClassification, get_scheduler
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import DataLoader, TensorDataset

# Loading the RoBERTa tokenizer (the model itself is loaded inside train_roberta).
# NOTE(review): this rebinds the module-level name `tokenizer`, which the
# DistilBERT cell above also assigns and reads — after this cell runs, code
# from that cell would use the RoBERTa tokenizer.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Function to load data from a tab-separated text file
def load_data(file_path):
    """Load texts and integer labels from a tab-separated text file.

    For every line with at least two columns, the second-to-last column is
    taken as the text and the last column as the label. Lines with fewer
    than two columns are skipped. If the label of the first line is not an
    integer, that line is treated as a header and skipped, so files that
    start with a header row can be loaded.

    Parameters:
        file_path (str): Path of the UTF-8 encoded file.

    Returns:
        tuple[list[str], list[int]]: The texts and their labels, in file order.

    Raises:
        ValueError: If a line other than the first has a non-integer label.
    """
    texts = []
    labels = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line_num, line in enumerate(file, start=1):
            parts = line.strip().split('\t')
            if len(parts) < 2:
                continue  # Skipping malformed lines
            try:
                label = int(parts[-1])  # Assuming last column is the label
            except ValueError:
                if line_num == 1:
                    continue  # Header row: column names instead of a label
                raise ValueError(
                    f"{file_path}, line {line_num}: label {parts[-1]!r} is not an integer"
                ) from None
            texts.append(parts[-2])  # Assuming second last column is text
            labels.append(label)
    return texts, labels

# Function to tokenize text
def tokenize_texts(texts):
    """Tokenize ``texts`` with the module-level tokenizer and return PyTorch tensors.

    Texts are truncated to at most 256 tokens and padded.

    Parameters:
        texts: The texts to tokenize.

    Returns:
        The tokenizer's output, with tensors in PyTorch format.
    """
    options = {
        "truncation": True,
        "padding": True,
        "max_length": 256,
        "return_tensors": "pt",
    }
    return tokenizer(texts, **options)

# Function to compute evaluation metrics
def compute_metrics(preds, labels):
    """Compute accuracy, precision, recall and F1 from class scores and true labels.

    Parameters:
        preds: Per-class scores; the predicted class is the arg-max along axis 1.
        labels: The true labels.

    Returns:
        dict: Keys "accuracy", "precision", "recall" and "f1"; precision,
        recall and F1 use binary averaging.
    """
    predicted = np.argmax(preds, axis=1)
    scores = precision_recall_fscore_support(labels, predicted, average="binary")
    return {
        "accuracy": accuracy_score(labels, predicted),
        "precision": scores[0],
        "recall": scores[1],
        "f1": scores[2],
    }

# Function to initialize weights
def initialize_weights(module):
    """Re-initialize a linear layer in place; leave every other module untouched.

    For ``torch.nn.Linear`` modules the weight is filled with Xavier-uniform
    values and the bias, if there is one, is set to zero.

    NOTE(review): the training function passes this to ``model.apply`` on a
    model loaded with ``from_pretrained``; ``apply`` visits every sub-module,
    so every Linear layer of that model would be reset, not only a
    classification head — confirm this is intended.
    """
    if not isinstance(module, torch.nn.Linear):
        return
    torch.nn.init.xavier_uniform_(module.weight)
    bias = module.bias
    if bias is not None:
        torch.nn.init.zeros_(bias)

# Function to train RoBERTa model manually
def train_roberta(train_file, eval_file, save_path, epochs=30, batch_size=16, lr=2e-5):
    """Fine-tune ``roberta-base`` as a 2-class classifier, evaluate it once, and save it.

    Args:
        train_file: TSV file passed to ``load_data`` for the training split.
        eval_file: TSV file passed to ``load_data`` for the evaluation split.
        save_path: Directory the model and tokenizer are written to.
        epochs: Number of passes over the training data.
        batch_size: Batch size for both data loaders.
        lr: Learning rate for AdamW.

    Returns:
        The fine-tuned model, left in eval mode on the selected device.
    """

    # Load and tokenize data
    X_train, y_train = load_data(train_file)
    X_eval, y_eval = load_data(eval_file)

    train_encodings = tokenize_texts(X_train)
    eval_encodings = tokenize_texts(X_eval)

    # Convert to PyTorch tensors
    train_labels = torch.tensor(y_train, dtype=torch.long)
    eval_labels = torch.tensor(y_eval, dtype=torch.long)

    # Prepare DataLoader
    train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
    eval_dataset = TensorDataset(eval_encodings['input_ids'], eval_encodings['attention_mask'], eval_labels)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    eval_loader = DataLoader(eval_dataset, batch_size=batch_size)

    # Load pre-trained RoBERTa model for classification
    model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
    # Re-initialise ONLY the classification head. Module.apply() recurses into
    # every submodule, so calling it on the whole model would also overwrite
    # each nn.Linear of the pre-trained encoder with random weights.
    model.classifier.apply(initialize_weights)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Define optimizer, loss function, and scheduler
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    class_weights = torch.tensor([1.0, 1.0]).to(device)  # Adjust for imbalance if needed
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)

    # Linear decay to zero over all optimisation steps, no warm-up
    scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=len(train_loader) * epochs,
    )

    # Training loop
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            input_ids, attention_mask, labels = [x.to(device) for x in batch]
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs.logits, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}: Loss = {total_loss/len(train_loader)}")

    # Evaluation (once, after the last epoch)
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in eval_loader:
            input_ids, attention_mask, labels = [x.to(device) for x in batch]
            outputs = model(input_ids, attention_mask=attention_mask)
            preds = outputs.logits.cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    metrics = compute_metrics(np.array(all_preds), np.array(all_labels))
    print(f"Evaluation Metrics: {metrics}")

    # Saving the model as it is after the final epoch (no checkpoint selection
    # takes place in this function)
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)
    print(f"Best RoBERTa model saved to: {save_path}")

    return model


# Input splits and output directory for the file-based training run
train_file = "*/train_reliclass_train_new.tsv"
eval_file = "*/train_reliclass_eval_new.tsv"
save_path = "*/best_roberta_model"

# Runs with train_roberta's defaults (epochs=30, batch_size=16, lr=2e-5)
best_roberta_model = train_roberta(train_file, eval_file, save_path)


###### Diagnosing the issue by inspecting the misclassified and low-confidence posts

In [None]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification

# Setting file paths
model_path = "*/best_roberta_model"
train_path = "*/train_reliclass_train_new.tsv"
eval_path = "*/train_reliclass_eval_new.tsv"

# Loading tokenizer & model
# Both are read from the same saved directory; the model is not moved to
# another device in this cell.
tokenizer = RobertaTokenizer.from_pretrained(model_path)
model = RobertaForSequenceClassification.from_pretrained(model_path)
model.eval()  # Setting model to evaluation mode

# Function to read TSV data and combine train + eval
def load_combined_data(train_file, eval_file):
    """Read the train and eval TSV files and return their rows as one dataset.

    Args:
        train_file: Path to the first UTF-8 TSV file.
        eval_file: Path to the second UTF-8 TSV file.

    Returns:
        A ``(texts, labels)`` tuple of two equally long lists. ``texts`` holds
        the 4th column of every accepted row, ``labels`` the last column as
        ``int``.

    Rows with fewer than 18 columns, or whose last column is not ``"0"`` or
    ``"1"`` (e.g. a header), are skipped.
    """
    texts, labels = [], []

    for file_path in (train_file, eval_file):  # Process both files
        with open(file_path, "r", encoding="utf-8") as file:
            for line in file:
                # Only trailing whitespace is removed. A full strip() would also
                # eat leading tabs, so a row whose first column is empty would
                # lose a field and either be dropped or have its columns shifted.
                parts = line.rstrip().split("\t")
                if len(parts) < 18:
                    continue  # Skip malformed rows

                text = parts[3]  # Assuming text is in column 4
                label = parts[-1]  # Assuming label is in column 18

                if label in {"0", "1"}:  # Ensure valid labels
                    texts.append(text)
                    labels.append(int(label))

    return texts, labels

# Loading and combine train + eval
X_data, y_data = load_combined_data(train_path, eval_path)

# Function to classify texts
def classify_texts(texts):
    """Classify each text individually with the module-level ``model``.

    Args:
        texts: Iterable of strings.

    Returns:
        ``(predictions, confidences)``: the arg-max class per text and the
        softmax probability assigned to that class.
    """
    predictions = []
    confidences = []

    for text in texts:
        encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=256)

        with torch.no_grad():
            logits = model(**encoded).logits
            # Softmax over the class axis gives one probability per class
            class_probs = torch.softmax(logits, dim=-1).squeeze().tolist()

        best_class = torch.argmax(logits, dim=-1).item()
        predictions.append(best_class)
        confidences.append(class_probs[best_class])  # Probability of the chosen class

    return predictions, confidences

# Running classification on full dataset
y_pred, confidences = classify_texts(X_data)

# Sorting every example into exactly one bucket:
#   misclassified     - prediction differs from the annotated label
#   low_confidence    - correct, but confidence below `threshold`
#   correct_confident - correct with confidence >= `threshold`
misclassified = []
low_confidence = []
correct_confident = []

threshold = 0.65  # Confidence threshold

for text, true_label, pred_label, conf in zip(X_data, y_data, y_pred, confidences):
    record = (text, true_label, pred_label, conf)
    if pred_label != true_label:
        misclassified.append(record)
    elif conf < threshold:
        low_confidence.append(record)
    else:
        correct_confident.append(record)

# Printing the first 10 examples and the total of each bucket. A single loop
# replaces three copy-pasted report sections; the printed text is unchanged.
for bucket_name, bucket in (
    ("Misclassified", misclassified),
    ("Low-Confidence", low_confidence),
    ("Correct & Confident", correct_confident),
):
    print(f"\n{bucket_name} Examples:")
    for i, (text, true, pred, conf) in enumerate(bucket[:10]):
        print(f"\nExample {i+1}")
        print(f"Text: {text}")
        print(f"True Label: {true} | Predicted Label: {pred} | Confidence: {conf:.4f}")

    print(f"\nTotal {bucket_name}: {len(bucket)}")

###### Downloading the misclassified and low confidence posts to reannotate them
I noticed that some of these posts had been mislabeled by me.

##### This is the final religion classifier

In [None]:
import subprocess
import sys
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from transformers import RobertaTokenizer, RobertaForSequenceClassification, get_scheduler
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from torch.utils.data import DataLoader, TensorDataset, random_split

# Function to install missing dependencies
def install_packages():
    """Pip-install ``transformers[torch]`` and ``torch`` when they cannot be imported.

    Each requirement is probed by importing its module name (the part before
    any ``[extras]`` suffix); on ``ImportError`` it is installed or upgraded
    with the current interpreter's pip.

    NOTE(review): this only helps if it runs before the packages are imported.
    """
    for requirement in ("transformers[torch]", "torch"):
        module_name = requirement.split("[")[0]  # importable name without extras
        try:
            __import__(module_name)  # Checking if package is installed
        except ImportError:
            print(f"Installing {requirement}...")
            pip_command = [sys.executable, "-m", "pip", "install", requirement, "-U"]
            subprocess.check_call(pip_command)

# Installing missing packages
# NOTE(review): torch and transformers are already imported at the top of this
# cell, so a missing package would raise ImportError before this call runs.
install_packages()

# Loading RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Function to tokenize text
def tokenize_texts(texts):
    """Tokenize ``texts`` with the module-level ``tokenizer``.

    Output is truncated to 256 tokens, padded, and returned as PyTorch tensors.
    """
    tokenizer_kwargs = dict(truncation=True, padding=True, max_length=256, return_tensors="pt")
    return tokenizer(texts, **tokenizer_kwargs)

# Function to compute evaluation metrics
def compute_metrics(preds, labels):
    """Compute binary classification metrics from per-class scores.

    ``preds`` is reduced to class indices with an arg-max over axis 1 before
    being compared against ``labels``. Returns a dict with ``accuracy``,
    ``precision``, ``recall`` and ``f1``.
    """
    class_ids = np.argmax(preds, axis=1)
    scores = precision_recall_fscore_support(labels, class_ids, average="binary")
    result = {"accuracy": accuracy_score(labels, class_ids)}
    result["precision"] = scores[0]
    result["recall"] = scores[1]
    result["f1"] = scores[2]
    return result

# Function to initialize model weights
def initialize_weights(module):
    """Apply Xavier-uniform weights and a zero bias to ``torch.nn.Linear`` modules.

    Modules of any other type are ignored.
    """
    is_linear = isinstance(module, torch.nn.Linear)
    if is_linear:
        init = torch.nn.init
        init.xavier_uniform_(module.weight)
        has_bias = module.bias is not None
        if has_bias:
            init.zeros_(module.bias)

# Function to train RoBERTa model with early stopping
def train_roberta(content, labels, save_path, epochs=30, batch_size=16, lr=2e-5, patience=5):
    """Fine-tune ``roberta-base`` on ``content``/``labels`` with early stopping.

    The data is shuffled and split 80/20 into train and eval parts. After every
    epoch the eval loss is computed; whenever it improves, the model and
    tokenizer are saved to ``save_path``. Training stops once the eval loss has
    not improved for ``patience`` consecutive epochs.

    Args:
        content: Sequence of input texts.
        labels: Sequence of integer class labels aligned with ``content``.
        save_path: Directory the best checkpoint is written to.
        epochs: Maximum number of epochs.
        batch_size: Batch size for both data loaders.
        lr: Learning rate for AdamW.
        patience: Number of non-improving epochs tolerated before stopping.

    Returns:
        The model object as it is after the last executed epoch. NOTE: this is
        not necessarily the best checkpoint; that one lives in ``save_path``.
    """

    # Shuffling and split data (80% train, 20% eval)
    dataset = list(zip(content, labels))
    random.shuffle(dataset)
    split_idx = int(0.8 * len(dataset))
    train_data, eval_data = dataset[:split_idx], dataset[split_idx:]

    # Separating text and labels
    X_train, y_train = zip(*train_data)
    X_eval, y_eval = zip(*eval_data)

    # Tokenizing data
    train_encodings = tokenize_texts(list(X_train))
    eval_encodings = tokenize_texts(list(X_eval))

    # Converting to PyTorch tensors
    train_labels = torch.tensor(y_train, dtype=torch.long)
    eval_labels = torch.tensor(y_eval, dtype=torch.long)

    # Preparing DataLoader
    train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
    eval_dataset = TensorDataset(eval_encodings['input_ids'], eval_encodings['attention_mask'], eval_labels)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    eval_loader = DataLoader(eval_dataset, batch_size=batch_size)

    # Loading pre-trained RoBERTa model for classification
    model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)
    # Re-initialise ONLY the classification head. Module.apply() recurses into
    # every submodule, so calling it on the whole model would also overwrite
    # each nn.Linear of the pre-trained encoder with random weights.
    model.classifier.apply(initialize_weights)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Defining optimizer, loss function, and scheduler
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    class_weights = torch.tensor([1.0, 1.0]).to(device)  # Adjust for imbalance if needed
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)

    # Linear decay to zero over the maximum number of steps, no warm-up
    scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=len(train_loader) * epochs,
    )

    # Early Stopping Parameters
    best_loss = float('inf')
    epochs_no_improve = 0

    # Training loop
    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for batch in train_loader:
            # `batch_labels` (not `labels`) so the function argument is not shadowed
            input_ids, attention_mask, batch_labels = [x.to(device) for x in batch]
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs.logits, batch_labels)
            loss.backward()
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}: Loss = {avg_train_loss:.5f}")

        # Evaluating after each epoch
        model.eval()
        all_preds, all_labels = [], []
        eval_loss = 0

        with torch.no_grad():
            for batch in eval_loader:
                input_ids, attention_mask, batch_labels = [x.to(device) for x in batch]
                outputs = model(input_ids, attention_mask=attention_mask)
                preds = outputs.logits.cpu().numpy()
                eval_loss += loss_fn(outputs.logits, batch_labels).item()
                all_preds.extend(preds)
                all_labels.extend(batch_labels.cpu().numpy())

        avg_eval_loss = eval_loss / len(eval_loader)
        metrics = compute_metrics(np.array(all_preds), np.array(all_labels))
        print(f"Evaluation Metrics: {metrics}")

        # Checking early stopping
        if avg_eval_loss < best_loss:
            best_loss = avg_eval_loss
            epochs_no_improve = 0
            print("Model improved, saving checkpoint.")
            model.save_pretrained(save_path)
            tokenizer.save_pretrained(save_path)
        else:
            epochs_no_improve += 1
            print(f"No improvement for {epochs_no_improve} epochs.")

        if epochs_no_improve >= patience:
            print("Early stopping triggered.")
            break  # Stop training if no improvement for `patience` epochs

    print(f" Training complete. Best model saved to: {save_path}")
    return model


# Defining save path for model
save_path = "*/best_roberta_model_renovated"

# Training the RoBERTa model with content & labels
# NOTE(review): `content` and `labels` are not defined in this cell; they must
# already exist in the kernel (presumably the re-annotated texts and labels -
# confirm which cell creates them).
best_roberta_model = train_roberta(content, labels, save_path)

I need to distill the RoBERTa model because it is too large

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import pandas as pd
from tqdm import tqdm

# Getting device (preference order: mps, then cuda, then cpu)
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Loading tokenizer, teacher, and student models
# The same "roberta-base" tokenizer feeds both teacher and student.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
teacher = RobertaForSequenceClassification.from_pretrained("*/best_roberta_model_renovated").to(device)
student = RobertaForSequenceClassification.from_pretrained("distilroberta-base", num_labels=2).to(device)
teacher.eval()  # eval mode only; this does not freeze parameters - the teacher forward pass runs under torch.no_grad() instead

# Distillation loss function
def distillation_loss(student_logits, teacher_logits, labels, temperature=2.0, alpha=0.5):
    """Blend a soft-target KL term with hard-label cross-entropy.

    Args:
        student_logits: Raw student scores; softmax is taken over dim 1.
        teacher_logits: Raw teacher scores with the same layout.
        labels: Ground-truth class indices for the cross-entropy term.
        temperature: Divisor applied to both logits before the softmax; the KL
            term is multiplied by ``temperature ** 2``.
        alpha: Weight of the KL term; the cross-entropy term gets ``1 - alpha``.

    Returns:
        Scalar tensor ``alpha * KL + (1 - alpha) * CE``.
    """
    functional = nn.functional
    teacher_probs = functional.softmax(teacher_logits / temperature, dim=1)
    student_log_probs = functional.log_softmax(student_logits / temperature, dim=1)
    soft_term = functional.kl_div(student_log_probs, teacher_probs, reduction='batchmean') * (temperature ** 2)
    hard_term = functional.cross_entropy(student_logits, labels)
    return alpha * soft_term + (1 - alpha) * hard_term

# Distillation function
def distill_from_file(input_tsv, output_dir, chunk_size=10000, batch_size=16, epochs=1, temperature=2.0, alpha=0.5,
                      text_column="content", label_column="label", lr=5e-5):
    """Train the module-level ``student`` to imitate ``teacher`` on a TSV file.

    The file is streamed in pandas chunks; every chunk is tokenized at once and
    then fed to both models in mini-batches. The student is optimised with
    ``distillation_loss`` and finally saved, together with the tokenizer, to
    ``output_dir``.

    Args:
        input_tsv: Path to a tab-separated file with a header row.
        output_dir: Directory the student model and tokenizer are saved to.
        chunk_size: Rows read from the file per pandas chunk.
        batch_size: Mini-batch size.
        epochs: Number of full passes over the file.
        temperature: Softmax temperature forwarded to ``distillation_loss``.
        alpha: KL-term weight forwarded to ``distillation_loss``.
        text_column: Name of the column holding the input text.
        label_column: Name of the column holding the integer label.
        lr: Learning rate for AdamW.
    """
    optimizer = optim.AdamW(student.parameters(), lr=lr)
    # Training mode does not change between batches, so set it once up front.
    student.train()

    for epoch in range(epochs):
        print(f"\nEpoch {epoch + 1}")
        chunk_iter = pd.read_csv(input_tsv, sep="\t", chunksize=chunk_size)

        for chunk_id, chunk in enumerate(chunk_iter):
            # Dropping rows with missing content or label
            chunk = chunk.dropna(subset=[text_column, label_column])
            if chunk.empty:
                continue  # nothing left to tokenize or train on in this chunk
            texts = chunk[text_column].astype(str).tolist()
            labels = torch.tensor(chunk[label_column].tolist(), dtype=torch.long)

            # Tokenizing all inputs
            encodings = tokenizer(texts, truncation=True, padding=True, max_length=256, return_tensors="pt")
            input_ids = encodings["input_ids"]
            attention_mask = encodings["attention_mask"]

            # Batching training
            for start in tqdm(range(0, len(input_ids), batch_size), desc=f"Chunk {chunk_id + 1}"):
                end = start + batch_size
                b_input_ids = input_ids[start:end].to(device)
                b_attention = attention_mask[start:end].to(device)
                b_labels = labels[start:end].to(device)

                # The teacher only provides targets; no gradients are needed for it
                with torch.no_grad():
                    teacher_logits = teacher(b_input_ids, attention_mask=b_attention).logits

                student_logits = student(b_input_ids, attention_mask=b_attention).logits
                loss = distillation_loss(student_logits, teacher_logits, b_labels, temperature, alpha)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

    # Saving student model
    student.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"\n Distilled model saved to: {output_dir}")

In [None]:
# Defining my paths and settings
input_tsv = "*/train_inspected_corrected.tsv"
output_path = "mini_reli_classifier"
label_column = "label"  # NOTE(review): defined but not passed to distill_from_file below

distill_from_file(
    input_tsv=input_tsv,
    output_dir=output_path,
    chunk_size=15000,   # rows read from the TSV per pandas chunk
    batch_size=16,
    epochs=2,           # full passes over the file
    temperature=2.0,
    alpha=0.5           # weight of the KL (teacher) term in distillation_loss
)

#### Training a Random Forest model for comparison

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

def train_random_forest(X_train, y_train, X_eval, y_eval, X_test, y_test, save_path):
    """Grid-search a Random Forest, report eval/test performance, and save the best model.

    Args:
        X_train, y_train: Training features and labels used for the 5-fold grid search.
        X_eval, y_eval: Held-out evaluation features and labels.
        X_test, y_test: Held-out test features and labels.
        save_path: File the best estimator is dumped to with joblib.

    Returns:
        The best estimator found by the grid search (refit on the training data).
    """

    # Defining hyperparameter grid
    param_grid = {
        "n_estimators": [100, 300, 500],  # Number of trees
        "max_depth": [10, 20, None],  # Depth of trees
        "min_samples_split": [2, 5, 10],  # Minimum samples to split a node
        "min_samples_leaf": [1, 5, 10],  # Minimum samples in a leaf
        "class_weight": ["balanced_subsample"],  # Handle class imbalance
    }

    # Initializing Random Forest model. Only the seed is set here: every other
    # hyperparameter (n_estimators, max_depth, min_samples_leaf, class_weight)
    # is overridden by `param_grid` during the search, so values passed to the
    # constructor would be dead and misleading.
    rf_clf = RandomForestClassifier(random_state=42)

    # Performing GridSearch to find the best parameters
    grid_search = GridSearchCV(rf_clf, param_grid, cv=5, scoring="f1", n_jobs=-1, verbose=2)
    grid_search.fit(X_train, y_train)

    # Getting the best model
    best_rf_model = grid_search.best_estimator_

    # Evaluating on evaluation set
    y_pred_eval = best_rf_model.predict(X_eval)
    print("Evaluation Set Performance:")
    print(classification_report(y_eval, y_pred_eval))

    # Evaluating on test set
    y_pred_test = best_rf_model.predict(X_test)
    print("Test Set Performance:")
    print(classification_report(y_test, y_pred_test))

    # Saving the best trained model
    import joblib
    joblib.dump(best_rf_model, save_path)
    print(f"Best Random Forest model saved to: {save_path}")

    return best_rf_model


# NOTE(review): X_train, y_train, X_eval, y_eval, X_test and y_test are not
# defined in this cell; they must already exist in the kernel.
save_path = "*/best_random_forest_model.pkl"
best_random_forest = train_random_forest(X_train, y_train, X_eval, y_eval, X_test, y_test, save_path)

#### Running the whole anonymized training data through the model

In [None]:
import torch
import pandas as pd
import time
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from tqdm import tqdm

def get_device():
    """Return the preferred torch device: ``mps`` first, then ``cuda``, else ``cpu``."""
    if torch.backends.mps.is_available():
        device_name = "mps"  # Apple Silicon GPU
    elif torch.cuda.is_available():
        device_name = "cuda"
    else:
        device_name = "cpu"
    return torch.device(device_name)

def format_time(seconds):
    """Render a duration in seconds as a short human-readable string.

    Below one minute the result is ``"<s> sec"``, below one hour
    ``"<m> min <s> sec"``, otherwise ``"<h> hr <m> min"``. Fractions are
    truncated.
    """
    if seconds < 60:
        return f"{int(seconds)} sec"
    if seconds < 3600:
        whole_minutes, leftover_seconds = divmod(seconds, 60)
        return f"{int(whole_minutes)} min {int(leftover_seconds)} sec"
    whole_hours, leftover = divmod(seconds, 3600)
    return f"{int(whole_hours)} hr {int(leftover // 60)} min"

def classify_religious_content_from_tsv(input_file, output_file, model_path, batch_size=16, chunk_size=15000, total_lines=49856969):
    """
    Predicts labels for a large TSV file, using the combined 'title' and 'selftext' fields.
    Writes results to disk incrementally and estimates total time.

    Args:
        input_file: Tab-separated input with a header row containing 'title' and 'selftext'.
        output_file: TSV written chunk by chunk; it equals the input plus a
            'predicted_label' column. An existing file is overwritten.
        model_path: Directory holding the saved tokenizer and classification model.
        batch_size: Number of texts per forward pass.
        chunk_size: Rows read from the input per pandas chunk.
        total_lines: Expected number of rows; only used for the progress and ETA output.
    """
    device = get_device()
    print(f"Using device: {device}")

    tokenizer = RobertaTokenizer.from_pretrained(model_path)
    model = RobertaForSequenceClassification.from_pretrained(model_path)
    model.to(device)
    model.eval()

    first_chunk = True
    # Ceiling division: a final partial chunk still counts as a chunk
    total_chunks = total_lines // chunk_size + int(total_lines % chunk_size != 0)
    chunk_idx = 0
    total_elapsed = 0

    for chunk in pd.read_csv(input_file, sep="\t", chunksize=chunk_size):
        start_time = time.time()

        chunk["title"] = chunk["title"].fillna("")
        chunk["selftext"] = chunk["selftext"].fillna("")
        # pandas infers dtypes per chunk, so a chunk whose titles or bodies are
        # all numeric is parsed as numbers and `+ " "` would raise TypeError.
        # Casting to str only for the model input leaves the written columns as-is.
        texts = (chunk["title"].astype(str) + " " + chunk["selftext"].astype(str)).tolist()
        all_preds = []

        for i in tqdm(range(0, len(texts), batch_size), desc=f"Batching Chunk {chunk_idx+1}/{total_chunks}"):
            batch_texts = texts[i:i + batch_size]
            encodings = tokenizer(batch_texts, truncation=True, padding=True, max_length=256, return_tensors="pt")
            input_ids = encodings['input_ids'].to(device)
            attention_mask = encodings['attention_mask'].to(device)

            with torch.no_grad():
                outputs = model(input_ids, attention_mask=attention_mask)
                preds = torch.argmax(outputs.logits, dim=1).cpu().tolist()
                all_preds.extend(preds)

        chunk["predicted_label"] = all_preds
        # First chunk creates the file with a header; later chunks are appended without one
        chunk.to_csv(output_file, sep="\t", index=False, mode='a' if not first_chunk else 'w', header=first_chunk)
        first_chunk = False

        # Time estimation
        elapsed = time.time() - start_time
        total_elapsed += elapsed
        chunk_idx += 1
        # Clamp at zero so an underestimated `total_lines` cannot yield a negative ETA
        chunks_left = max(total_chunks - chunk_idx, 0)
        avg_time_per_chunk = total_elapsed / chunk_idx
        est_remaining = avg_time_per_chunk * chunks_left

        print(f" Chunk {chunk_idx}/{total_chunks} complete. Time: {format_time(elapsed)}. Estimated remaining: {format_time(est_remaining)}.")

    print(f"\nAll done! Total time: {format_time(total_elapsed)}. Predictions saved to: {output_file}")



# Full-dataset inference with the model stored under "mini_reli_classifier"
input_file = "*/cleaned_dataset.tsv"
output_file = "*/complete_relipred_miniclassifier.tsv"
model_path = "mini_reli_classifier"

classify_religious_content_from_tsv(
    input_file=input_file,
    output_file=output_file,
    model_path=model_path,
    batch_size=16,
    chunk_size=15000,
    # Only used for the progress/ETA display
    total_lines=49856969
)

In [None]:
from parallel_predict_religious import run_parallel_prediction
import pandas as pd
import glob


# Parallel variant of the prediction step; the implementation lives in the
# local module `parallel_predict_religious`.
run_parallel_prediction(
    input_file="*/cleaned_dataset.tsv",
    model_path="mini_reli_classifier",
    output_dir="*/predicted_chunks",
    chunk_size=15000,
    batch_size=32,
    num_workers=4  # presumably the number of parallel workers - confirm in parallel_predict_religious
)

# Merging the per-chunk outputs into one TSV.
# NOTE(review): sorted() orders the paths lexicographically, so unless the chunk
# numbers are zero-padded "chunk_10" sorts before "chunk_2" - confirm the naming.
files = sorted(glob.glob("*/predicted_chunks/chunk_*.tsv"))
dfs = [pd.read_csv(f, sep="\t") for f in files]
pd.concat(dfs).to_csv("*/complete_relipred_miniclassifier.tsv", sep="\t", index=False)

In [None]:
import pandas as pd
import glob

# Getting a sorted list of all chunk file paths
files = sorted(glob.glob("*/predicted_chunks/chunk_*.tsv"))

# Reading each file
dfs = [pd.read_csv(f, sep="\t") for f in files]

# Concatenating all DataFrames
merged_df = pd.concat(dfs, ignore_index=True)

# Saving the merged DataFrame to a new TSV file with a single header
merged_path = "*/merged_predicted_chunks.tsv"
merged_df.to_csv(merged_path, sep="\t", index=False)

# Report the path that was actually written (the old message named a different file)
print(f"Merged file saved as '{merged_path}'")