Using Falcon3 (Llama 3.1 kept as a commented-out alternative):

[redacted: Hugging Face access token — revoke this token and supply credentials through the HF_TOKEN environment variable instead]

In [None]:
import os, re, json, time
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Config
# The Hugging Face access token is read from the environment only; a literal
# token must never be committed as a fallback default. When HF_TOKEN is unset
# this is None and the loaders below receive token=None.
HF_TOKEN       = os.getenv('HF_TOKEN')
MODEL_NAME     = 'tiiuae/Falcon3-7B-Instruct' #"meta-llama/Llama-3.1-8B-Instruct"
INPUT_FILE     = "/content/drive/MyDrive/ReligiousHateSpeech_DataSet.txt"
OUTPUT_FILE    = "/content/drive/MyDrive/Falcon3-7B-Instruct_Stance and Category.tsv"
BATCH_SIZE     = 2   # texts per generate() call
MAX_RETRIES    = 3   # attempts per batch before falling back to 'undefined'
RETRY_DELAY    = 2   # seconds to sleep between attempts
# Closed label sets; anything the model returns outside them becomes 'undefined'.
STANCE_CHOICES = ['anti-Christianity','anti-Islam','anti-Hinduism','anti-Buddhism','anti-Atheism','undefined']
CATEGORY_CHOICES = ['implicit animosity','explicit derogation and dehumanization','threatening language','abusive humor','undefined']

def generate_prompt(text: str) -> str:
    """Build the plain-text classification prompt for one input text.

    The prompt shows the text, lists the allowed stance and category labels
    (comma-separated), and asks for a JSON answer with the keys ``stance``
    and ``category``.
    """
    stance_options = ', '.join(STANCE_CHOICES)
    category_options = ', '.join(CATEGORY_CHOICES)
    prompt_lines = [
        f"Text: {text}",
        f"Stance (choose one: {stance_options})",
        f"Category (choose one: {category_options})",
        "Respond in JSON with keys: stance and category.",
    ]
    return "\n".join(prompt_lines)

def parse_response(response: str):
    """Extract ``(stance, category)`` from a model reply containing JSON.

    The first ``{...}`` span in ``response`` is parsed as JSON. Keys and
    label values are matched ignoring case and surrounding whitespace, and
    the canonical spelling from STANCE_CHOICES / CATEGORY_CHOICES is
    returned. Anything missing, malformed or outside the allowed label sets
    yields ``'undefined'`` for that field.

    Returns:
        A 2-tuple ``(stance, category)`` of canonical label strings.
    """
    fallback = ('undefined', 'undefined')

    # Typographic double quotes would make the JSON invalid; normalise them.
    response = response.replace('“', '"').replace('”', '"')
    found = re.search(r'\{.*?\}', response, re.DOTALL)
    if not found:
        return fallback
    try:
        data = json.loads(found.group(0))
    except json.JSONDecodeError:
        return fallback

    # Tolerate keys such as "Stance" or " category ".
    fields = {str(key).strip().lower(): value for key, value in data.items()}

    def _canonical(value, choices):
        # Only strings can be labels; map them back to the canonical spelling.
        if not isinstance(value, str):
            return 'undefined'
        wanted = value.strip().casefold()
        return next((c for c in choices if c.casefold() == wanted), 'undefined')

    return (_canonical(fields.get('stance'), STANCE_CHOICES),
            _canonical(fields.get('category'), CATEGORY_CHOICES))

def _process_and_write_batch(batch, model, tokenizer, device, f_out):
    """Classify one batch of texts and write one TSV row per text to ``f_out``.

    Generation and parsing are attempted up to MAX_RETRIES times, sleeping
    RETRY_DELAY seconds between attempts. Rows are collected in memory and
    written only after a whole attempt has succeeded, so a failure part-way
    through a batch can never leave duplicate rows behind. If every attempt
    fails, each text is written with ``undefined`` for both labels.

    Args:
        batch: list of input text strings.
        model: causal LM exposing ``generate``.
        tokenizer: tokenizer matching ``model``.
        device: device the tokenized inputs are moved to.
        f_out: writable text file receiving ``text<TAB>stance<TAB>category`` rows.
    """
    def _tsv_safe(text):
        # Tabs/newlines inside the text would break the TSV row structure.
        return text.replace('\t', ' ').replace('\n', ' ')

    rows = None
    for attempt in range(MAX_RETRIES):
        try:
            prompts = [generate_prompt(t) for t in batch]
            inputs = tokenizer(
                prompts,
                return_tensors='pt',
                padding=True,
                truncation=True,
                max_length=512
            ).to(device)

            outputs = model.generate(
                **inputs,
                max_new_tokens=150,
                do_sample=False,  # greedy decoding
                pad_token_id=tokenizer.eos_token_id
            )
            # Inputs are padded to a common length, so one offset separates
            # prompt tokens from newly generated tokens for every row.
            prompt_len = inputs['input_ids'].shape[1]
            attempt_rows = []
            for i, text in enumerate(batch):
                decoded = tokenizer.decode(outputs[i][prompt_len:], skip_special_tokens=True).strip()
                stance, category = parse_response(decoded)
                attempt_rows.append((_tsv_safe(text), stance, category))
            rows = attempt_rows
            break
        except Exception as e:
            print(f"Retry {attempt+1} failed:", e)
            if attempt < MAX_RETRIES - 1:
                time.sleep(RETRY_DELAY)

    if rows is None:
        # Every attempt failed: keep the output aligned with the input.
        for t in batch:
            f_out.write(f"{_tsv_safe(t)}\tundefined\tundefined\n")
        f_out.flush()
        print("Final fallback: undefined")
        return

    for clean, stance, category in rows:
        f_out.write(f"{clean}\t{stance}\t{category}\n")
        print(f"[Batch] {clean[:60]} → {stance}, {category}")
    f_out.flush()

def classify_loop(model, tokenizer, device):
    """Stream INPUT_FILE through the classifier and write OUTPUT_FILE.

    Writes a TSV header, then groups the non-blank, stripped input lines
    into batches of BATCH_SIZE and hands each batch (including a final,
    shorter one) to ``_process_and_write_batch``.
    """
    with open(INPUT_FILE, 'r', encoding='utf-8') as source, \
         open(OUTPUT_FILE,'w', encoding='utf-8') as f_out:
        f_out.write("text\tstance\tcategory\n")
        pending = []
        stripped_lines = (line.strip() for line in source)
        for entry in filter(None, stripped_lines):  # drop blank lines
            pending.append(entry)
            if len(pending) == BATCH_SIZE:
                _process_and_write_batch(pending, model, tokenizer, device, f_out)
                pending = []
        # Whatever did not fill a complete batch is processed last.
        if pending:
            _process_and_write_batch(pending, model, tokenizer, device, f_out)

def main():
    """Load the tokenizer and model for MODEL_NAME and run the classification.

    Any exception raised while loading or classifying is caught here and
    reported as a single printed line instead of being propagated.
    """
    try:
        device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        print(f"Using device: {device}")

        # Options shared by the tokenizer and model loaders.
        hub_options = {"token": HF_TOKEN, "trust_remote_code": True}

        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, **hub_options)
        tokenizer.padding_side = "left"
        # Fall back to EOS as the padding token when none is defined.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        print("Loading model...")
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.bfloat16,
            device_map="auto",  # spreads across multiple GPUs if available
            **hub_options,
        )

        print("Starting classification loop...")
        classify_loop(model, tokenizer, device)

    except Exception as err:
        print(f"Exception occurred: {err}")

if __name__ == '__main__':
    main()

Using Mistral:

In [None]:
import os, re, json, time
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Config
# The Hugging Face access token is read from the environment only; a literal
# token must never be committed as a fallback default. When HF_TOKEN is unset
# this is None and the loaders below receive token=None.
HF_TOKEN       = os.getenv('HF_TOKEN')
#MODEL_NAME     = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"
INPUT_FILE     = "/content/drive/MyDrive/ReligiousHateSpeech_DataSet.txt"
# NOTE(review): the file name says "24B" although MODEL_NAME is the 7B model.
# The path is left unchanged so anything already reading it keeps working.
OUTPUT_FILE    = "/content/drive/MyDrive/Mistral24B_Stance_and_Category.tsv"
BATCH_SIZE     = 2   # texts per generate() call
MAX_RETRIES    = 3   # attempts per batch before falling back to 'undefined'
RETRY_DELAY    = 2   # seconds to sleep between attempts
# Closed label sets; anything the model returns outside them becomes 'undefined'.
STANCE_CHOICES = ['anti-Christianity','anti-Islam','anti-Hinduism','anti-Buddhism','anti-Atheism','undefined']
CATEGORY_CHOICES = ['implicit animosity','explicit derogation and dehumanization','threatening language','abusive humor','undefined']

# Prompt wrapped in [INST] ... [/INST] markers, as used for Mistral instruct models
def generate_prompt(text: str) -> str:
    """Build the ``[INST]``-wrapped classification prompt for one text.

    The prompt states the task, shows the text, lists the allowed stance and
    category labels (comma-separated) and asks for a JSON-only answer with
    the keys ``stance`` and ``category``.
    """
    stance_options = ', '.join(STANCE_CHOICES)
    category_options = ', '.join(CATEGORY_CHOICES)
    prompt_lines = [
        "[INST] Given the following text, classify the stance and category.",
        "",
        f"Text: {text}",
        "",
        f"Stance (choose one): {stance_options}",
        f"Category (choose one): {category_options}",
        "Respond only in JSON using keys: stance and category. [/INST]",
    ]
    return "\n".join(prompt_lines)

def parse_response(response: str):
    """Extract ``(stance, category)`` from a model reply containing JSON.

    The first ``{...}`` span in ``response`` is parsed as JSON. Keys and
    label values are matched ignoring case and surrounding whitespace, and
    the canonical spelling from STANCE_CHOICES / CATEGORY_CHOICES is
    returned. Anything missing, malformed or outside the allowed label sets
    yields ``'undefined'`` for that field.

    Returns:
        A 2-tuple ``(stance, category)`` of canonical label strings.
    """
    fallback = ('undefined', 'undefined')

    # Typographic double quotes would make the JSON invalid; normalise them.
    response = response.replace('“', '"').replace('”', '"')
    found = re.search(r'\{.*?\}', response, re.DOTALL)
    if not found:
        return fallback
    try:
        data = json.loads(found.group(0))
    except json.JSONDecodeError:
        return fallback

    # Tolerate keys such as "Stance" or " category ".
    fields = {str(key).strip().lower(): value for key, value in data.items()}

    def _canonical(value, choices):
        # Only strings can be labels; map them back to the canonical spelling.
        if not isinstance(value, str):
            return 'undefined'
        wanted = value.strip().casefold()
        return next((c for c in choices if c.casefold() == wanted), 'undefined')

    return (_canonical(fields.get('stance'), STANCE_CHOICES),
            _canonical(fields.get('category'), CATEGORY_CHOICES))

def _process_and_write_batch(batch, model, tokenizer, device, f_out):
    """Classify one batch of texts and write one TSV row per text to ``f_out``.

    Generation and parsing are attempted up to MAX_RETRIES times, sleeping
    RETRY_DELAY seconds between attempts. Rows are collected in memory and
    written only after a whole attempt has succeeded, so a failure part-way
    through a batch can never leave duplicate rows behind. If every attempt
    fails, each text is written with ``undefined`` for both labels.

    Args:
        batch: list of input text strings.
        model: causal LM exposing ``generate``.
        tokenizer: tokenizer matching ``model``.
        device: device the tokenized inputs are moved to.
        f_out: writable text file receiving ``text<TAB>stance<TAB>category`` rows.
    """
    def _tsv_safe(text):
        # Tabs/newlines inside the text would break the TSV row structure.
        return text.replace('\t', ' ').replace('\n', ' ')

    rows = None
    for attempt in range(MAX_RETRIES):
        try:
            prompts = [generate_prompt(t) for t in batch]
            inputs = tokenizer(
                prompts,
                return_tensors='pt',
                padding=True,
                truncation=True,
                max_length=1024
            ).to(device)

            outputs = model.generate(
                **inputs,
                max_new_tokens=200,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id
            )
            # Inputs are padded to a common length, so one offset separates
            # prompt tokens from newly generated tokens for every row.
            prompt_len = inputs['input_ids'].shape[1]
            attempt_rows = []
            for i, text in enumerate(batch):
                decoded = tokenizer.decode(outputs[i][prompt_len:], skip_special_tokens=True).strip()
                stance, category = parse_response(decoded)
                attempt_rows.append((_tsv_safe(text), stance, category))
            rows = attempt_rows
            break
        except Exception as e:
            print(f"Retry {attempt+1} failed:", e)
            if attempt < MAX_RETRIES - 1:
                time.sleep(RETRY_DELAY)

    if rows is None:
        # Every attempt failed: keep the output aligned with the input.
        for t in batch:
            f_out.write(f"{_tsv_safe(t)}\tundefined\tundefined\n")
        f_out.flush()
        print("Final fallback: undefined")
        return

    for clean, stance, category in rows:
        f_out.write(f"{clean}\t{stance}\t{category}\n")
        print(f"[✓] {clean[:60]} → {stance}, {category}")
    f_out.flush()

def classify_loop(model, tokenizer, device):
    """Stream INPUT_FILE through the classifier and write OUTPUT_FILE.

    Writes a TSV header, then groups the non-blank, stripped input lines
    into batches of BATCH_SIZE and hands each batch (including a final,
    shorter one) to ``_process_and_write_batch``.
    """
    with open(INPUT_FILE, 'r', encoding='utf-8') as source, \
         open(OUTPUT_FILE,'w', encoding='utf-8') as f_out:
        f_out.write("text\tstance\tcategory\n")
        pending = []
        stripped_lines = (line.strip() for line in source)
        for entry in filter(None, stripped_lines):  # drop blank lines
            pending.append(entry)
            if len(pending) == BATCH_SIZE:
                _process_and_write_batch(pending, model, tokenizer, device, f_out)
                pending = []
        # Whatever did not fill a complete batch is processed last.
        if pending:
            _process_and_write_batch(pending, model, tokenizer, device, f_out)

def main():
    """Load the tokenizer and model for MODEL_NAME and run the classification.

    Any exception raised while loading or classifying is caught here and
    reported as a single printed line instead of being propagated.
    """
    try:
        device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        print(f"Using device: {device}")

        # Options shared by the tokenizer and model loaders.
        hub_options = {"token": HF_TOKEN, "trust_remote_code": True}

        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, **hub_options)
        tokenizer.padding_side = "left"
        # Fall back to EOS as the padding token when none is defined.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        print("Loading model...")
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            **hub_options,
        )

        print("Starting classification loop...")
        classify_loop(model, tokenizer, device)

    except Exception as err:
        print(f"Exception occurred: {err}")

if __name__ == '__main__':
    main()

Using device: cuda
Loading tokenizer...


tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Loading model...


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Starting classification loop...
[✓] Are religious people just dumber, more susceptible, less cur → anti-Atheism, explicit derogation and dehumanization
[✓] Affirming Christians are the real bigots: Even as Sodom and  → anti-Christianity, explicit derogation and dehumanization
[✓] Atheists are deep frying aborted embryos. → anti-Atheism, explicit derogation and dehumanization
[✓] I hate Muslims: Muslims are all terrorists who only want to  → anti-Islam, explicit derogation and dehumanization
[✓] I hate Muslims → anti-Islam, explicit derogation and dehumanization
[✓] Jewwwws are liars → anti-Islam, explicit derogation and dehumanization
[✓] How Christian of you → anti-Christianity, explicit derogation and dehumanization
[✓] Religion was invented the day the first con man met the firs → anti-Christianity, explicit derogation and dehumanization
[✓] Atheists are deep frying aborted embryos for karma. → anti-Atheism, explicit derogation and dehumanization
[✓] Just another Satanist :) → anti-

Using the YI model

In [None]:
import os, re, json, time
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Config
# The Hugging Face access token is read from the environment only; a literal
# token must never be committed as a fallback default. When HF_TOKEN is unset
# this is None and the loaders below receive token=None.
HF_TOKEN       = os.getenv('HF_TOKEN')
MODEL_NAME     = "01-ai/Yi-1.5-9B-Chat"
INPUT_FILE     = "/content/drive/MyDrive/ReligiousHateSpeech_DataSet.txt"
OUTPUT_FILE    = "/content/drive/MyDrive/Yi-9B_Stance_and_Category.tsv"
BATCH_SIZE     = 1   # texts per generate() call
MAX_RETRIES    = 3   # attempts per batch before falling back to 'undefined'
RETRY_DELAY    = 2   # seconds to sleep between attempts

# Closed label sets; anything the model returns outside them becomes 'undefined'.
STANCE_CHOICES = ['anti-Christianity','anti-Islam','anti-Hinduism','anti-Buddhism','anti-Atheism','undefined']
CATEGORY_CHOICES = ['implicit animosity','explicit derogation and dehumanization','threatening language','abusive humor','undefined']

# The label lists are interpolated with their Python list repr
# (square brackets, single-quoted items).
SYSTEM_PROMPT = (
    "You are a helpful assistant. Given a text, identify whether it contains religious hate speech, and classify:\n"
    f"- Stance: One of {STANCE_CHOICES}\n"
    f"- Category: One of {CATEGORY_CHOICES}\n"
    "Respond in JSON with the keys 'stance' and 'category'."
)

def generate_prompt(text: str) -> str:
    """Build a System/Human/Assistant style prompt for one input text.

    SYSTEM_PROMPT forms the system turn, ``text`` the human turn, and the
    prompt ends on an open ``Assistant:`` line for the model to complete.
    """
    turns = [
        f"System: {SYSTEM_PROMPT}",
        f"Human: Text: {text}",
        "Please classify.",
        "Assistant:",
    ]
    return "\n".join(turns)

def parse_response(response: str):
    """Extract ``(stance, category)`` from a model reply containing JSON.

    The first ``{...}`` span in ``response`` is parsed as JSON. Keys and
    label values are matched ignoring case and surrounding whitespace, and
    the canonical spelling from STANCE_CHOICES / CATEGORY_CHOICES is
    returned. A non-string ``response``, malformed JSON, or a label outside
    the allowed sets yields ``'undefined'`` for the affected field(s).

    Returns:
        A 2-tuple ``(stance, category)`` of canonical label strings.
    """
    fallback = ('undefined', 'undefined')
    if not isinstance(response, str):
        return fallback

    # Typographic double quotes would make the JSON invalid; normalise them.
    response = response.replace('“', '"').replace('”', '"')
    found = re.search(r'\{.*?\}', response, re.DOTALL)
    if not found:
        return fallback
    try:
        data = json.loads(found.group(0))
    except json.JSONDecodeError:
        return fallback

    # Tolerate keys such as "Stance" or " category ".
    fields = {str(key).strip().lower(): value for key, value in data.items()}

    def _canonical(value, choices):
        # Only strings can be labels; map them back to the canonical spelling.
        if not isinstance(value, str):
            return 'undefined'
        wanted = value.strip().casefold()
        return next((c for c in choices if c.casefold() == wanted), 'undefined')

    return (_canonical(fields.get('stance'), STANCE_CHOICES),
            _canonical(fields.get('category'), CATEGORY_CHOICES))

def _process_and_write_batch(batch, model, tokenizer, device, f_out):
    """Classify one batch of texts and write one TSV row per text to ``f_out``.

    Generation and parsing are attempted up to MAX_RETRIES times, sleeping
    RETRY_DELAY seconds between attempts. Rows are collected in memory and
    written only after a whole attempt has succeeded, so a failure part-way
    through a batch can never leave duplicate rows behind. If every attempt
    fails, each text is written with ``undefined`` for both labels.

    Args:
        batch: list of input text strings.
        model: causal LM exposing ``generate``.
        tokenizer: tokenizer matching ``model``.
        device: device the tokenized inputs are moved to.
        f_out: writable text file receiving ``text<TAB>stance<TAB>category`` rows.
    """
    def _tsv_safe(text):
        # Tabs/newlines inside the text would break the TSV row structure.
        return text.replace('\t', ' ').replace('\n', ' ')

    rows = None
    for attempt in range(MAX_RETRIES):
        try:
            prompts = [generate_prompt(t) for t in batch]
            inputs = tokenizer(prompts, return_tensors='pt', padding=True, truncation=True, max_length=1024).to(device)

            outputs = model.generate(
                **inputs,
                max_new_tokens=200,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id
            )

            # Inputs are padded to a common length, so one offset separates
            # prompt tokens from newly generated tokens for every row.
            prompt_len = inputs['input_ids'].shape[1]
            attempt_rows = []
            for i, text in enumerate(batch):
                decoded = tokenizer.decode(outputs[i][prompt_len:], skip_special_tokens=True).strip()
                stance, category = parse_response(decoded)
                attempt_rows.append((_tsv_safe(text), stance, category))
            rows = attempt_rows
            break
        except Exception as e:
            print(f"Retry {attempt+1} failed:", e)
            if attempt < MAX_RETRIES - 1:
                time.sleep(RETRY_DELAY)

    if rows is None:
        # Every attempt failed: keep the output aligned with the input.
        for t in batch:
            f_out.write(f"{_tsv_safe(t)}\tundefined\tundefined\n")
        f_out.flush()
        print("Final fallback: undefined")
        return

    for clean, stance, category in rows:
        f_out.write(f"{clean}\t{stance}\t{category}\n")
        print(f"[✓] {clean[:60]} → {stance}, {category}")
    f_out.flush()

def classify_loop(model, tokenizer, device):
    """Stream INPUT_FILE through the classifier and write OUTPUT_FILE.

    Writes a TSV header, then groups the non-blank, stripped input lines
    into batches of BATCH_SIZE and hands each batch (including a final,
    shorter one) to ``_process_and_write_batch``.
    """
    with open(INPUT_FILE, 'r', encoding='utf-8') as source, \
         open(OUTPUT_FILE, 'w', encoding='utf-8') as f_out:
        f_out.write("text\tstance\tcategory\n")
        pending = []
        stripped_lines = (line.strip() for line in source)
        for entry in filter(None, stripped_lines):  # drop blank lines
            pending.append(entry)
            if len(pending) == BATCH_SIZE:
                _process_and_write_batch(pending, model, tokenizer, device, f_out)
                pending = []
        # Whatever did not fill a complete batch is processed last.
        if pending:
            _process_and_write_batch(pending, model, tokenizer, device, f_out)

def main():
    """Load the tokenizer and model for MODEL_NAME and run the classification.

    Any exception raised while loading or classifying is caught here and
    reported as a single printed line instead of being propagated.
    """
    try:
        device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        print(f"Using device: {device}")

        print("Loading tokenizer...")
        tok = AutoTokenizer.from_pretrained(
            MODEL_NAME,
            use_fast=False,
            token=HF_TOKEN,
        )
        tok.padding_side = "left"
        # Fall back to EOS as the padding token when none is defined.
        if tok.pad_token is None:
            tok.pad_token = tok.eos_token

        print("Loading model...")
        model_options = {
            "token": HF_TOKEN,
            "torch_dtype": torch.float16,
            "device_map": "auto",
        }
        mdl = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **model_options)

        print("Starting classification loop...")
        classify_loop(mdl, tok, device)

    except Exception as err:
        print(f"Exception occurred: {err}")

if __name__ == '__main__':
    main()

Using device: cuda
Loading tokenizer...


tokenizer_config.json:   0%|          | 0.00/1.67k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/567 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.60M [00:00<?, ?B/s]

Loading model...


config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.8k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/2.78G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

Starting classification loop...
[✓] Are religious people just dumber, more susceptible, less cur → undefined, implicit animosity
[✓] Affirming Christians are the real bigots: Even as Sodom and  → anti-Christianity, explicit derogation and dehumanization
[✓] Atheists are deep frying aborted embryos. → undefined, explicit derogation and dehumanization
[✓] I hate Muslims: Muslims are all terrorists who only want to  → anti-Islam, explicit derogation and dehumanization
[✓] I hate Muslims → anti-Islam, explicit derogation and dehumanization
[✓] Jewwwws are liars → undefined, explicit derogation and dehumanization
[✓] How Christian of you → anti-Christianity, undefined
[✓] Religion was invented the day the first con man met the firs → undefined, implicit animosity
[✓] Atheists are deep frying aborted embryos for karma. → undefined, abusive humor
[✓] Just another Satanist :) → undefined, undefined
[✓] No no no. im not religious. Those people are crazy. I'm a sp → undefined, undefined
[✓] redn

Using the SEA-LION model:

In [None]:
import os, re, json, time
import torch
from transformers import pipeline

# === Config ===
# The Hugging Face access token is read from the environment only; a literal
# token must never be committed as a fallback default. When HF_TOKEN is unset
# this is None and the pipeline receives token=None.
HF_TOKEN       = os.getenv('HF_TOKEN')
MODEL_NAME     = "aisingapore/Llama-SEA-LION-v3.5-8B-R"
INPUT_FILE     = "/content/drive/MyDrive/ReligiousHateSpeech_DataSet.txt"
OUTPUT_FILE    = "/content/drive/MyDrive/SEA_LION_Stance_and_Category.tsv"
BATCH_SIZE     = 1   # texts per pipeline call
MAX_RETRIES    = 3   # attempts per batch before falling back to 'undefined'
RETRY_DELAY    = 2   # seconds to sleep between attempts

STANCE_CHOICES = ['anti-Christianity','anti-Islam','anti-Hinduism','anti-Buddhism','anti-Atheism','undefined']
# 'explicit derogation and dehumanization' is ONE label: the prompt and the
# parser's regex both use the combined wording, so it must not be split here.
CATEGORY_CHOICES = ['implicit animosity','explicit derogation and dehumanization','threatening language','abusive humor','undefined']

# === Prompt Generator ===
def generate_prompt(text: str) -> str:
    """Build the label-only classification prompt for one input text.

    The prompt spells out the expected two-line ``Stance:`` / ``Category:``
    answer format with the allowed labels written out literally, forbids
    explanations, then appends the text and an open ``Answer:`` line.
    """
    prompt_lines = [
        "Classify the following text strictly by returning only the two labels in this format:",
        "Stance: <one of [anti-Christianity, anti-Islam, anti-Hinduism, anti-Buddhism, anti-Atheism, undefined]>",
        "Category: <one of [implicit animosity, explicit derogation and dehumanization, threatening language, abusive humor, undefined]>",
        "Do not explain your reasoning. Do not write anything else.",
        "",  # blank line between the instructions and the text
        f"Text: {text}",
        "Answer:",
    ]
    return "\n".join(prompt_lines)


# === Response Parser ===
def parse_response(response: str):
    """Pull ``(stance, category)`` out of a ``Stance: ... / Category: ...`` reply.

    Each field is first read from its labelled line and mapped back to the
    canonical spelling in STANCE_CHOICES / CATEGORY_CHOICES. A field that is
    still ``'undefined'`` afterwards is looked up by scanning the whole
    response, case-insensitively, for the first known label that occurs
    anywhere in it. Non-string input is converted with ``str()`` first, and
    the raw response is always printed for debugging.
    """
    if not isinstance(response, str):
        response = str(response)

    # Optional: debug output
    print("[RAW RESPONSE]:", repr(response))

    def _canonical(raw, choices):
        # Case-insensitive exact match against the allowed labels.
        lowered = raw.strip().lower()
        return next((label for label in choices if label.lower() == lowered), 'undefined')

    # --- stance: labelled line first, then a scan of the whole response ---
    stance = 'undefined'
    labelled_stance = re.search(r"\b[Ss]tance\s*[:\-]?\s*(anti-[\w]+|undefined)", response)
    if labelled_stance:
        stance = _canonical(labelled_stance.group(1), STANCE_CHOICES)
    if stance == 'undefined':
        stance = next(
            (s for s in STANCE_CHOICES
             if re.search(rf"\b{s}\b", response, flags=re.IGNORECASE)),
            'undefined',
        )

    # --- category: same two-step strategy ---
    category = 'undefined'
    labelled_category = re.search(
        r"\b[Cc]ategory\s*[:\-]?\s*(implicit animosity|explicit derogation and dehumanization|threatening language|abusive humor|undefined)",
        response)
    if labelled_category:
        category = _canonical(labelled_category.group(1), CATEGORY_CHOICES)
    if category == 'undefined':
        category = next(
            (c for c in CATEGORY_CHOICES
             if re.search(rf"\b{re.escape(c)}\b", response, flags=re.IGNORECASE)),
            'undefined',
        )

    return stance, category

# === Batch Processor ===
def _assistant_text(response_data):
    """Return the assistant's reply text from one pipeline result."""
    # A call with a list of chats wraps every result in a list of candidates.
    if isinstance(response_data, list) and response_data:
        response_data = response_data[0]
    if isinstance(response_data, dict) and "generated_text" in response_data:
        generated = response_data["generated_text"]
        if isinstance(generated, str):
            return generated
        if isinstance(generated, list):
            # Chat-style output: the conversation as a list of role/content dicts.
            for entry in generated:
                if isinstance(entry, dict) and entry.get("role") == "assistant":
                    return entry.get("content", "") or ""
        return ""
    if isinstance(response_data, str):
        return response_data
    return str(response_data)


def _process_and_write_batch(batch, pipeline_model, f_out):
    """Classify one batch of texts and write one TSV row per text to ``f_out``.

    Every text is sent as its own single-turn chat, so the pipeline returns
    one result per text for any BATCH_SIZE. The call is attempted up to
    MAX_RETRIES times, sleeping RETRY_DELAY seconds between attempts. Rows
    are collected in memory and written only after a whole attempt has
    succeeded, so a failure part-way through a batch can never leave
    duplicate rows behind. If every attempt fails, each text is written with
    ``undefined`` for both labels.

    Args:
        batch: list of input text strings.
        pipeline_model: text-generation pipeline accepting chat messages.
        f_out: writable text file receiving ``text<TAB>stance<TAB>category`` rows.
    """
    def _tsv_safe(text):
        # Tabs/newlines inside the text would break the TSV row structure.
        return text.replace('\t', ' ').replace('\n', ' ')

    rows = None
    for attempt in range(MAX_RETRIES):
        try:
            # One conversation per text. A flat list of message dicts would be
            # read as a single conversation with several user turns.
            chats = [[{"role": "user", "content": generate_prompt(t)}] for t in batch]
            outputs = pipeline_model(chats, max_new_tokens=512)
            if len(outputs) != len(batch):
                raise ValueError(
                    f"pipeline returned {len(outputs)} results for {len(batch)} inputs")

            attempt_rows = []
            for text, response_data in zip(batch, outputs):
                content = _assistant_text(response_data)

                # Truncate everything to the last "Stance:" to avoid yapping
                last_stance = content.rfind("Stance:")
                last_category = content.rfind("Category:")
                if last_stance != -1 and last_category != -1 and last_category > last_stance:
                    content = content[last_stance:]

                print("[RAW MODEL OUTPUT]:", repr(content))  # Optional debug

                stance, category = parse_response(content)
                attempt_rows.append((_tsv_safe(text), stance, category))
            rows = attempt_rows
            break  # success
        except Exception as e:
            print(f"Retry {attempt+1} failed:", e)
            if attempt < MAX_RETRIES - 1:
                time.sleep(RETRY_DELAY)

    if rows is None:
        # Every attempt failed: keep the output aligned with the input.
        for t in batch:
            f_out.write(f"{_tsv_safe(t)}\tundefined\tundefined\n")
        f_out.flush()
        print("Final fallback: undefined")
        return

    for clean, stance, category in rows:
        f_out.write(f"{clean}\t{stance}\t{category}\n")
        print(f"[✓] {clean[:60]} → {stance}, {category}")
    f_out.flush()

# === Classification Loop ===
def classify_loop(pipeline_model):
    """Stream INPUT_FILE through the pipeline and write OUTPUT_FILE.

    Writes a TSV header, then groups the non-blank, stripped input lines
    into batches of BATCH_SIZE and hands each batch (including a final,
    shorter one) to ``_process_and_write_batch``.
    """
    with open(INPUT_FILE, 'r', encoding='utf-8') as source, \
         open(OUTPUT_FILE, 'w', encoding='utf-8') as f_out:
        f_out.write("text\tstance\tcategory\n")
        pending = []
        stripped_lines = (line.strip() for line in source)
        for entry in filter(None, stripped_lines):  # drop blank lines
            pending.append(entry)
            if len(pending) == BATCH_SIZE:
                _process_and_write_batch(pending, pipeline_model, f_out)
                pending = []
        # Whatever did not fill a complete batch is processed last.
        if pending:
            _process_and_write_batch(pending, pipeline_model, f_out)

# === Main ===
def main():
    """Create the text-generation pipeline for MODEL_NAME and classify INPUT_FILE.

    Uses GPU 0 with bfloat16 weights when CUDA is available, otherwise the
    CPU with the library's default dtype. Any exception is caught and
    reported as a single printed line instead of being propagated.
    """
    try:
        print("Loading SEA-LION model via pipeline...")

        # Detect GPU: device index 0 with bfloat16, or -1 (CPU) with no dtype override.
        gpu_available = torch.cuda.is_available()
        device = 0 if gpu_available else -1
        dtype = torch.bfloat16 if gpu_available else None

        pipeline_options = {
            "model": MODEL_NAME,
            "device": device,
            "torch_dtype": dtype,
            "token": HF_TOKEN,
        }
        pipeline_model = pipeline("text-generation", **pipeline_options)

        classify_loop(pipeline_model)

    except Exception as err:
        print(f"Exception occurred: {err}")

if __name__ == '__main__':
    main()

Loading SEA-LION model via pipeline...


config.json:   0%|          | 0.00/840 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/22.8k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.7k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

Device set to use cuda:0


[RAW MODEL OUTPUT]: 'Stance: undefined\nCategory: implicit animosity'
[RAW RESPONSE]: 'Stance: undefined\nCategory: implicit animosity'
[✓] Are religious people just dumber, more susceptible, less cur → undefined, implicit animosity
[RAW MODEL OUTPUT]: 'Stance: anti-Christianity  \nCategory: implicit animosity'
[RAW RESPONSE]: 'Stance: anti-Christianity  \nCategory: implicit animosity'
[✓] Affirming Christians are the real bigots: Even as Sodom and  → anti-Christianity, implicit animosity
[RAW MODEL OUTPUT]: 'Stance: anti-Atheism\nCategory: explicit derogation and dehumanization'
[RAW RESPONSE]: 'Stance: anti-Atheism\nCategory: explicit derogation and dehumanization'
[✓] Atheists are deep frying aborted embryos. → anti-Atheism, explicit derogation and dehumanization
[RAW MODEL OUTPUT]: 'Stance: anti-Islam\nCategory: explicit derogation and dehumanization'
[RAW RESPONSE]: 'Stance: anti-Islam\nCategory: explicit derogation and dehumanization'
[✓] I hate Muslims: Muslims are all terrorist

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


[RAW MODEL OUTPUT]: 'Stance: undefined\nCategory: undefined'
[RAW RESPONSE]: 'Stance: undefined\nCategory: undefined'
[✓] Just another Satanist :) → undefined, undefined
[RAW MODEL OUTPUT]: 'Stance: undefined  \nCategory: implicit animosity'
[RAW RESPONSE]: 'Stance: undefined  \nCategory: implicit animosity'
[✓] No no no. im not religious. Those people are crazy. I'm a sp → undefined, implicit animosity
[RAW MODEL OUTPUT]: 'Stance: undefined  \nCategory: explicit derogation and dehumanization'
[RAW RESPONSE]: 'Stance: undefined  \nCategory: explicit derogation and dehumanization'
[✓] redneck satanist → undefined, explicit derogation and dehumanization
[RAW MODEL OUTPUT]: 'Stance: anti-Christianity\nCategory: explicit derogation and dehumanization'
[RAW RESPONSE]: 'Stance: anti-Christianity\nCategory: explicit derogation and dehumanization'
[✓] Another average satanist → anti-Christianity, explicit derogation and dehumanization
[RAW MODEL OUTPUT]: 'Stance: anti-Christianity\nCategory: i

Create topic model and concept map

In [None]:
# ✅ Step 1: Install necessary libraries
!pip install -q bertopic[visualization] sentence-transformers umap-learn hdbscan plotly kaleido

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m119.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m99.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m65.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m38.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# ✅ Step 2: Imports
import pandas as pd
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import umap
import hdbscan
import plotly.io as pio

# ✅ Step 3: Load the TSV
df = pd.read_csv("/content/drive/MyDrive/disagreements.tsv", sep="\t")

# Coerce to str so a missing or non-text cell cannot break the embedding step
# (row count is unchanged, so `topics` stays aligned with `df`).
texts = df["text"].astype(str).tolist()

# ✅ Step 4: Use high-quality transformer
# No explicit device: SentenceTransformer selects a GPU when one is available
# and otherwise runs on CPU, so this cell also works on CPU-only runtimes.
embedding_model = SentenceTransformer("all-mpnet-base-v2")

# ✅ Step 5: Create BERTopic model with clustering
# random_state fixes UMAP's stochastic projection so topics are reproducible.
umap_model = umap.UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42)
hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean', cluster_selection_method='eom')

topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    verbose=True
)

topics, _ = topic_model.fit_transform(texts)

# ✅ Step 6: Reduce to at most 5 topics
# This is a no-op when clustering already produced 5 or fewer topics.
topic_model.reduce_topics(texts, nr_topics=5)
# Refresh the per-document assignments so they reflect the reduced model.
topics = topic_model.topics_

# ✅ Step 7: Visualize as bar chart
fig = topic_model.visualize_barchart(top_n_topics=5)

# ✅ Step 8: Save HTML and PNG
fig.write_html("/content/drive/MyDrive/topic_barchart.html")
pio.write_image(fig, "/content/drive/MyDrive/topic_barchart.png", format="png", width=1000, height=800, scale=2)

# ✅ Step 9: Download both
from google.colab import files
files.download("/content/drive/MyDrive/topic_barchart.html")
files.download("/content/drive/MyDrive/topic_barchart.png")

2025-05-11 19:09:54,251 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

2025-05-11 19:09:54,352 - BERTopic - Embedding - Completed ✓
2025-05-11 19:09:54,352 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-05-11 19:09:54,571 - BERTopic - Dimensionality - Completed ✓
2025-05-11 19:09:54,572 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-11 19:09:54,577 - BERTopic - Cluster - Completed ✓
2025-05-11 19:09:54,580 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-11 19:09:54,588 - BERTopic - Representation - Completed ✓
2025-05-11 19:09:54,596 - BERTopic - Topic reduction - Reducing number of topics
2025-05-11 19:09:54,596 - BERTopic - Topic reduction - Number of topics (5) is equal or higher than the clustered topics(2).
2025-05-11 19:09:54,597 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-05-11 19:09:54,620 - BERTopic - Representation - Completed ✓


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# ✅ Install required libraries
!pip install -q nltk networkx plotly kaleido

# ✅ Imports
import pandas as pd
import nltk
import re
import itertools
import networkx as nx
import plotly.graph_objects as go
from nltk.corpus import stopwords
from collections import Counter
from google.colab import files

# ✅ Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# ✅ Load and clean data
df = pd.read_csv("/content/drive/MyDrive/disagreements.tsv", sep="\t")
texts = df["text"].astype(str).tolist()

def preprocess(text):
    """Lower-case *text*, strip punctuation, and return its non-stopword tokens.

    Every character that is neither a word character nor whitespace is removed
    before the string is split on whitespace. Tokens present in the
    module-level ``stop_words`` set are dropped; the order and any repeats of
    the remaining tokens are preserved.
    """
    lowered = text.lower()
    stripped = re.sub(r'[^\w\s]', '', lowered)
    kept = []
    for token in stripped.split():
        if token in stop_words:
            continue
        kept.append(token)
    return kept

tokenized_texts = [preprocess(text) for text in texts]

# ✅ Build co-occurrence pairs
# Each unordered word pair is counted at most once per document (duplicates are
# removed first). The unique tokens are sorted before pairing for two reasons:
#   * combinations() of a sorted sequence already yields every pair in
#     canonical (alphabetical) order, so no per-pair sort is needed;
#   * the Counter, and therefore the graph built from it, is filled in the same
#     order on every run. Iterating a bare set of strings follows hash order,
#     which Python randomises per interpreter session; that would shuffle the
#     graph's node order and make the seeded spring layout below differ from
#     one session to the next.
cooccurrence = Counter()
for tokens in tokenized_texts:
    unique_tokens = sorted(set(tokens))
    cooccurrence.update(itertools.combinations(unique_tokens, 2))

# ✅ Create graph
# Only pairs seen together in at least two documents become edges; the
# co-occurrence count is stored as the edge weight.
G = nx.Graph()
for (w1, w2), freq in cooccurrence.items():
    if freq >= 2:  # Filter low-strength edges
        G.add_edge(w1, w2, weight=freq)

# ✅ Layout & draw with Plotly
# Force-directed (spring) placement; the seed fixes the random starting positions.
pos = nx.spring_layout(G, k=0.5, seed=42)

# All edges share one Scatter trace: each edge contributes its two endpoints
# followed by None, which separates it from the next segment.
edge_x = []
edge_y = []
weights = []  # collected alongside the coordinates; no trace below reads it
for u, v, attrs in G.edges(data=True):
    (x0, y0), (x1, y1) = pos[u], pos[v]
    edge_x.extend((x0, x1, None))
    edge_y.extend((y0, y1, None))
    weights.append(attrs['weight'])

edge_trace = go.Scatter(
    x=edge_x,
    y=edge_y,
    mode='lines',
    hoverinfo='none',
    line={'width': 1, 'color': 'gray'},
)

# Nodes: one labelled marker per word, in the graph's own node order.
node_text = list(G.nodes())
node_x = [pos[word][0] for word in node_text]
node_y = [pos[word][1] for word in node_text]

node_trace = go.Scatter(
    x=node_x,
    y=node_y,
    mode='markers+text',
    text=node_text,
    textposition="top center",
    hoverinfo='text',
    marker={'size': 10, 'color': 'skyblue', 'line_width': 2},
)

# Both axes are shown without grid lines or a zero line.
bare_axis = {'showgrid': False, 'zeroline': False}
fig = go.Figure(
    data=[edge_trace, node_trace],
    layout=go.Layout(
        title='Word Co-occurrence Concept Map',
        showlegend=False,
        hovermode='closest',
        margin={'b': 20, 'l': 5, 'r': 5, 't': 40},
        xaxis=bare_axis,
        yaxis=bare_axis,
    ),
)

# ✅ Save an interactive (HTML) and a static (PNG) copy in the working directory
html_path = "word_concept_map.html"
png_path = "word_concept_map.png"
fig.write_html(html_path)
fig.write_image(png_path, width=1000, height=800)

# ✅ Download both files through the Colab browser helper, HTML first
for saved_path in (html_path, png_path):
    files.download(saved_path)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# ✅ Install needed libraries
!pip install -q nltk plotly networkx kaleido

# ✅ Imports
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from collections import Counter
import itertools
import networkx as nx
import plotly.graph_objects as go

# Fetch the NLTK stopword corpus and keep the English list as a set for the
# membership test in preprocess().
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# ✅ Load and preprocess text
df = pd.read_csv("/content/drive/MyDrive/disagreements.tsv", sep="\t")
# Cast to str before handing the rows to preprocess(): an empty cell is read
# as NaN (a float) and a purely numeric cell as a number, and preprocess()
# calls .lower() on every entry, which would raise AttributeError on either.
# This matches how the other concept-map cells in this notebook load the column.
texts = df["text"].astype(str).tolist()

def preprocess(text):
    """Tokenise *text* for co-occurrence counting.

    The text is lower-cased, every character that is not a word character or
    whitespace is deleted, and the result is split on whitespace. Words found
    in the module-level ``stop_words`` set are filtered out; the surviving
    words are returned as a list in their original order.
    """
    no_punct = re.sub(r'[^\w\s]', '', text.lower())
    return list(filter(lambda word: word not in stop_words, no_punct.split()))

tokenized_texts = list(map(preprocess, texts))

# ✅ Count co-occurrences
# Within one document every distinct word pair counts once (the set removes
# repeats); each pair is stored in sorted order so (a, b) and (b, a) share a key.
co_occurrence = Counter()
for doc_tokens in tokenized_texts:
    co_occurrence.update(
        tuple(sorted(word_pair))
        for word_pair in itertools.combinations(set(doc_tokens), 2)
    )

# ✅ Create graph
# Keep only pairs that co-occur in two or more documents; the count becomes
# the edge's 'weight' attribute.
G = nx.Graph()
G.add_weighted_edges_from(
    (first, second, count)
    for (first, second), count in co_occurrence.items()
    if count >= 2
)

# ✅ Position nodes using spring layout
# Seed the random initial positions (the sibling concept-map cells use the same
# seed) so re-running this cell on the same graph redraws the same picture.
pos = nx.spring_layout(G, k=0.5, iterations=50, seed=42)

# ✅ Prepare edges for Plotly
# All edges go into one line trace; the None after each pair of endpoints
# separates one segment from the next.
edge_x = []
edge_y = []
edge_weights = []  # gathered per edge; not consumed by the traces below

for u, v, attrs in G.edges(data=True):
    x0, y0 = pos[u]
    x1, y1 = pos[v]
    edge_x += [x0, x1, None]
    edge_y += [y0, y1, None]
    edge_weights.append(attrs['weight'])

edge_trace = go.Scatter(
    x=edge_x,
    y=edge_y,
    line=dict(width=1, color='#888'),
    hoverinfo='none',
    mode='lines'
)

# ✅ Node trace
node_x = []
node_y = []
node_text = []
for node in G.nodes():
    x, y = pos[node]
    node_x.append(x)
    node_y.append(y)
    node_text.append(node)

node_trace = go.Scatter(
    x=node_x,
    y=node_y,
    mode='markers+text',
    text=node_text,
    textposition="top center",
    hoverinfo='text',
    marker=dict(
        showscale=True,
        colorscale='YlGnBu',
        reversescale=True,
        # Colour each node by its number of neighbours; G.degree() iterates in
        # the same node order as G.nodes() used for the coordinates above.
        color=[degree for _, degree in G.degree()],
        size=10,
        colorbar=dict(
            thickness=15,
            # The nested title form replaces the deprecated `titleside`
            # shorthand, which was removed in Plotly 6; this spelling is
            # accepted by both older and newer versions.
            title=dict(text='Node Connections', side='right'),
            xanchor='left'
        )
    )
)

# ✅ Create and save figure
fig = go.Figure(data=[edge_trace, node_trace],
                layout=go.Layout(
                    title='Word Co-occurrence Concept Map',
                    showlegend=False,
                    hovermode='closest',
                    margin=dict(b=20,l=5,r=5,t=40)
                ))

# Write an interactive HTML copy and a static PNG copy (1000x800 at 2x scale)
# into the working directory.
html_out = "cooccurrence_concept_map.html"
png_out = "cooccurrence_concept_map.png"
fig.write_html(html_out)
fig.write_image(png_out, format="png", width=1000, height=800, scale=2)

# ✅ Download
from google.colab import files
for artifact in (html_out, png_out):
    files.download(artifact)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Using another model: adding spaCy word-vector (semantic) similarity edges to the co-occurrence concept map

In [None]:
# ✅ Install required libraries
!pip install -q nltk spacy plotly networkx kaleido
!python -m spacy download en_core_web_md

# ✅ Imports
import pandas as pd
import nltk
import re
import itertools
import networkx as nx
import plotly.graph_objects as go
from nltk.corpus import stopwords
from collections import Counter
from google.colab import files
import spacy

# ✅ Download stopwords
nltk.download('stopwords')
# Start from NLTK's English list, then add a handful of conversational fillers.
stop_words = set(stopwords.words('english'))
stop_words.update(('yeah', 'lol', 'just', 'like', 'really', 'think', 'know'))

# ✅ Load data: tab-separated file, "text" column coerced to plain strings
df = pd.read_csv("/content/drive/MyDrive/disagreements.tsv", sep="\t")
raw_text = df["text"].astype(str)
texts = raw_text.tolist()

# ✅ Preprocess function
def preprocess(text):
    """Return the content words of *text* as a list.

    Steps: lower-case the input, delete every character that is neither a
    word character nor whitespace, split on whitespace, and discard any word
    contained in the module-level ``stop_words`` set. Word order and repeated
    words are left as they are.
    """
    cleaned = re.sub(r'[^\w\s]', '', text.lower())
    candidates = cleaned.split()
    return [candidate for candidate in candidates if not (candidate in stop_words)]

tokenized_texts = [preprocess(text) for text in texts]

# ✅ Build co-occurrence pairs
# A word pair is counted once per document in which both words appear.
# Sorting the de-duplicated tokens before pairing means combinations() emits
# each pair already in alphabetical order, and it fixes the order in which
# pairs enter the Counter. A bare set of strings iterates in hash order, which
# Python randomises per interpreter session, so without the sort the graph's
# node order (and with it the seeded spring layout further down) would change
# from session to session.
cooccurrence = Counter()
for tokens in tokenized_texts:
    unique_tokens = sorted(set(tokens))
    cooccurrence.update(itertools.combinations(unique_tokens, 2))

# ✅ Create co-occurrence graph
# Pairs seen in fewer than two documents are dropped; the count is the weight.
G = nx.Graph()
for (w1, w2), freq in cooccurrence.items():
    if freq >= 2:
        G.add_edge(w1, w2, weight=freq)

# ✅ Load SpaCy for word vectors
nlp = spacy.load("en_core_web_md")
nodes = list(G.nodes())
# Run the spaCy pipeline exactly once per word and reuse the resulting Doc.
# Re-parsing inside the pair loop below would cost two pipeline calls for each
# of the n*(n-1)/2 pairs instead of n calls in total.
word_docs = {word: nlp(word) for word in nodes}
word_vecs = {word: doc.vector for word, doc in word_docs.items() if doc.has_vector}

# ✅ Add semantic similarity edges
# Words that are similar enough either strengthen an existing co-occurrence
# edge (weight += similarity) or gain a new, weaker edge (half the similarity).
for w1, w2 in itertools.combinations(word_vecs, 2):
    sim = word_docs[w1].similarity(word_docs[w2])
    if sim > 0.6:  # Adjust threshold as needed
        if G.has_edge(w1, w2):
            G[w1][w2]['weight'] += sim  # Reinforce existing edge
        else:
            G.add_edge(w1, w2, weight=sim * 0.5)  # Add weak semantic link

# ✅ Generate spring layout influenced by edge weights
# The 'weight' edge attribute is passed to the layout; seed pins the start.
pos = nx.spring_layout(G, k=0.5, seed=42, weight='weight')

# ✅ Build Plotly visualization
# Edges: a single line trace, with None separating consecutive segments.
edge_x = []
edge_y = []
weights = []  # per-edge weights, gathered but not read by the traces below
for src, dst, edge_attrs in G.edges(data=True):
    (x0, y0), (x1, y1) = pos[src], pos[dst]
    edge_x.extend((x0, x1, None))
    edge_y.extend((y0, y1, None))
    weights.append(edge_attrs['weight'])

edge_trace = go.Scatter(
    x=edge_x,
    y=edge_y,
    mode='lines',
    hoverinfo='none',
    line={'width': 1, 'color': 'gray'},
)

# Nodes: one labelled marker per word, following the graph's node order.
node_text = list(G.nodes())
node_x = [pos[label][0] for label in node_text]
node_y = [pos[label][1] for label in node_text]

node_trace = go.Scatter(
    x=node_x,
    y=node_y,
    mode='markers+text',
    text=node_text,
    textposition="top center",
    hoverinfo='text',
    marker={'size': 10, 'color': 'skyblue', 'line_width': 2},
)

# Figure: no legend, no grid lines or zero lines on either axis.
plain_axis = {'showgrid': False, 'zeroline': False}
fig = go.Figure(
    data=[edge_trace, node_trace],
    layout=go.Layout(
        title='Word Co-occurrence & Semantic Concept Map',
        showlegend=False,
        hovermode='closest',
        margin={'b': 20, 'l': 5, 'r': 5, 't': 40},
        xaxis=plain_axis,
        yaxis=plain_axis,
    ),
)

# ✅ Save and download
# Both files are written first (interactive HTML, then a 1000x800 PNG) and
# afterwards handed to the Colab download helper in the same order.
html_file = "semantic_word_concept_map2.html"
png_file = "semantic_word_concept_map2.png"
fig.write_html(html_file)
fig.write_image(png_file, width=1000, height=800)
for output_file in (html_file, png_file):
    files.download(output_file)

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-md
Successfully installed en-core-web-md-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>