In [1]:
import pandas as pd
import requests
import json
import time
import csv
import sys
import logging
from typing import Dict, List, Tuple, Optional
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path  

@dataclass
class ModelConfig:
    """Connection and request settings for the classification model server."""

    # Endpoint that receives the chat-style classification requests.
    chat_url: str
    # Endpoint used to unload and then load the model again.
    generate_url: str
    # Model identifier sent in the "model" field of every payload.
    model_name: str
    # Instruction text placed before the "domain,content" line of each
    # request (despite the name, it is prepended, not appended).
    input_suffix: str
    # Page content longer than this many characters is sampled down.
    max_content_length: int = 1750
    # How many times one classification request is attempted.
    retry_attempts: int = 3
    # Seconds to sleep between two attempts.
    retry_delay: int = 5

@dataclass
class ProcessingConfig:
    """File locations and resume position for one dataset labelling run."""

    # Input CSV; rows are read by their 'Domain' and 'Content' columns.
    input_file: str
    # Output CSV with columns Domain, Answer, Classification, Reason.
    output_label_file: str
    # Output CSV with columns Domain, Content, Label.
    output_fixed_file: str
    # Text file whose content is sent as the first user message.
    label_prompt_file: str
    # Text file whose content is sent as the assistant reply to that prompt.
    label_prompt_answer_file: str
    # Number of data rows (after the header) to skip before processing.
    start_row: int = 0
    # NOTE(review): presumably the number of rows per processing batch;
    # confirm against process_dataset how (or whether) it is consumed.
    batch_size: int = 100

class DomainClassifier:
    """Label website content as benign / gambling / pornography via an LLM.

    The classifier talks to an HTTP model server (the endpoints in
    ``ModelConfig`` look like an Ollama-style API -- TODO confirm), sends one
    chat request per CSV row and writes the parsed answers to two CSV files.
    """

    # Seconds to wait for a single chat completion before giving up on the
    # attempt (same value the original request used).
    CHAT_TIMEOUT_SECONDS = 30
    # Seconds to wait for each unload/load call. Without a timeout a stalled
    # server would block the whole run forever.
    RELOAD_TIMEOUT_SECONDS = 300

    # Human-readable name for each numeric answer (used in log messages).
    EXPECTED_CLASSIFICATION = {
        0: "Benign",
        1: "Gambling",
        2: "Pornography",
    }
    # (numeric answer, keyword that must appear in the textual classification).
    # 'porn' is used so that variations such as "Pornographic" also match.
    _ANSWER_KEYWORDS = ((0, "benign"), (1, "gambling"), (2, "porn"))

    # Annotations are string forward references so the class does not depend
    # on the config dataclasses being defined before it.
    def __init__(self, model_config: "ModelConfig"):
        """Store the model configuration and prepare logging and csv limits."""
        self.config = model_config
        self.setup_logging()
        self.setup_csv_limits()

    def setup_logging(self):
        """Configure logging to a timestamped file under ``logs/`` and to the console.

        ``logging.basicConfig`` does nothing when the root logger already has
        handlers, so in an already-configured process the existing handlers
        stay in effect.
        """
        log_dir = Path('logs')
        log_dir.mkdir(exist_ok=True)

        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        log_filename = log_dir / f'domain_classification_{timestamp}.log'

        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                # Model output regularly contains non-ASCII text; an explicit
                # encoding avoids platform-default (e.g. cp1252) write errors.
                logging.FileHandler(log_filename, encoding='utf-8'),
                logging.StreamHandler(),
            ],
        )
        self.logger = logging.getLogger(__name__)
        self.logger.info(f"Logging to: {log_filename}")

    def setup_csv_limits(self):
        """Raise the csv field size limit to the largest value the platform accepts."""
        limit = sys.maxsize
        while True:
            try:
                csv.field_size_limit(limit)
                break
            except OverflowError:
                # The value does not fit the platform's C long; shrink and
                # retry. Integer division keeps the value an exact int.
                limit //= 10

    def reload_model(self) -> bool:
        """Unload and load the configured model again.

        Returns:
            True when the load request answered with HTTP 200, False on any
            other status or on any error while talking to the server.
        """
        try:
            # Unload model
            unload_payload = {"model": self.config.model_name, "keep_alive": 0}
            requests.post(
                self.config.generate_url,
                json=unload_payload,
                timeout=self.RELOAD_TIMEOUT_SECONDS,
            )

            # Load model
            load_payload = {"model": self.config.model_name}
            response = requests.post(
                self.config.generate_url,
                json=load_payload,
                timeout=self.RELOAD_TIMEOUT_SECONDS,
            )
        except Exception as e:
            # Deliberately broad: the caller only needs a yes/no answer.
            self.logger.error(f"Error reloading model: {str(e)}")
            return False

        if response.status_code == 200:
            self.logger.info(f"Model {self.config.model_name} reloaded successfully")
            return True

        self.logger.error(f"Failed to reload model: {response.status_code}")
        return False

    def sample_content(self, content: str) -> str:
        """Shorten over-long content to a start, middle and end excerpt.

        Content no longer than ``config.max_content_length`` is returned
        unchanged. Otherwise 20% of the budget is taken from the start, 20%
        from the end and the remainder from around the midpoint, joined with
        ``"... "``. The two separators are not counted, so the result can be
        up to 8 characters longer than the limit.
        """
        limit = self.config.max_content_length
        if len(content) <= limit:
            return content

        # Calculate balanced chunks
        start_len = int(limit * 0.2)  # 20% for start
        end_len = int(limit * 0.2)    # 20% for end
        mid_len = limit - (start_len + end_len)

        start = content[:start_len]
        # Slice by explicit length so an odd mid_len is not rounded down.
        mid_start = len(content) // 2 - mid_len // 2
        mid = content[mid_start:mid_start + mid_len]
        # content[-0:] would be the whole string, so index from the front.
        end = content[len(content) - end_len:]

        return f"{start}... {mid}... {end}"

    @staticmethod
    def _parse_classification(content: str) -> Optional[dict]:
        """Parse the model's message content; return None unless it is a JSON object."""
        try:
            # Clean the content string before parsing JSON
            cleaned_content = content.strip().replace('\n', '').replace('\r', '')
            parsed = json.loads(cleaned_content)
        except json.JSONDecodeError:
            return None
        # Valid JSON that is not an object (list, number, ...) cannot carry the
        # answer/classification/reason keys the caller reads with .get().
        return parsed if isinstance(parsed, dict) else None

    def send_classification_request(self, conversation: List[Dict]) -> Tuple[dict, float]:
        """Send one classification request, retrying on failure.

        Args:
            conversation: Chat messages (dicts with "role" and "content").

        Returns:
            ``(result, elapsed_seconds)`` where ``result`` is the JSON object
            returned by the model, or ``({}, 0.0)`` when every attempt failed.
        """
        payload = {
            "model": self.config.model_name,
            "messages": conversation,
            "format": "json",
            "options": {"seed": 111},
            "stream": False
        }
        last_attempt = self.config.retry_attempts - 1

        for attempt in range(self.config.retry_attempts):
            try:
                start_time = time.time()
                response = requests.post(
                    self.config.chat_url,
                    json=payload,
                    timeout=self.CHAT_TIMEOUT_SECONDS
                )

                if response.status_code == 200:
                    elapsed_time = time.time() - start_time
                    response_json = response.json()
                    content = response_json.get("message", {}).get("content", "{}")

                    result = self._parse_classification(content)
                    if result is not None:
                        return result, elapsed_time
                    self.logger.warning(f"Invalid JSON response: {content}")
                else:
                    self.logger.warning(
                        f"Unexpected status {response.status_code} (attempt {attempt+1})"
                    )

            except Exception as e:
                # Deliberately broad: one bad response must not abort a run
                # over tens of thousands of rows; the attempt is retried.
                self.logger.error(f"Request error (attempt {attempt+1}): {str(e)}")

            if attempt < last_attempt:
                time.sleep(self.config.retry_delay)

        return {}, 0.0

    @staticmethod
    def check_classification_consistency(answer: int, classification: str) -> bool:
        """
        Check if the numerical answer matches the text classification
        Returns True if consistent, False if hallucination detected
        """
        # Force to string and compare case-insensitively.
        classification_lower = str(classification).lower()
        return any(
            answer == code and keyword in classification_lower
            for code, keyword in DomainClassifier._ANSWER_KEYWORDS
        )

    def _extract_fields(self, domain: str, result: dict) -> Tuple[int, str, str]:
        """Pull (answer, classification, reason) out of a model result dict.

        Missing keys fall back to -1 / 'Unknown' / 'No reason provided'; an
        answer that cannot be converted to int is logged and becomes -1.
        """
        answer_raw = result.get('answer', -1)
        classification = str(result.get('classification', 'Unknown'))
        reason = str(result.get('reason', 'No reason provided'))

        try:
            answer = int(answer_raw)
        except (TypeError, ValueError):
            self.logger.warning(f"Invalid answer type for domain {domain}: {answer_raw}")
            answer = -1  # marks an invalid classification

        return answer, classification, reason

    def process_dataset(self, processing_config: "ProcessingConfig"):
        """Classify every row of the input CSV and write the two output CSVs.

        Rows before ``processing_config.start_row`` are skipped. Each
        remaining row is sent to the model, and its answer is written to the
        label file and, together with the original content, to the fixed file.
        Output is flushed every ``processing_config.batch_size`` rows.

        Raises:
            RuntimeError: when the model could not be reloaded.
            Exception: any other error is logged and re-raised.
        """
        try:
            if not self.reload_model():
                raise RuntimeError("Failed to initialize model")

            # Read the prompts before touching the outputs, so a missing
            # prompt file cannot leave freshly truncated result files behind.
            with open(processing_config.label_prompt_file, "r", encoding='utf-8') as prompt_file:
                labelling_prompt = prompt_file.read()
            with open(processing_config.label_prompt_answer_file, "r", encoding='utf-8') as prompt_answer_file:
                labelling_prompt_answer = prompt_answer_file.read()

            # The input is opened first for the same reason.
            with open(processing_config.input_file, 'r', newline='', encoding='utf-8') as infile, \
                 open(processing_config.output_label_file, 'w', newline='', encoding='utf-8') as label_file, \
                 open(processing_config.output_fixed_file, 'w', newline='', encoding='utf-8') as fixed_file:

                label_writer = csv.DictWriter(label_file,
                    fieldnames=['Domain', 'Answer', 'Classification', 'Reason'])
                fixed_writer = csv.DictWriter(fixed_file,
                    fieldnames=['Domain', 'Content', 'Label'])

                label_writer.writeheader()
                fixed_writer.writeheader()

                reader = csv.DictReader(infile)

                # Skip to start row
                for _ in range(processing_config.start_row):
                    next(reader, None)

                batch_size = processing_config.batch_size

                for processed, row in enumerate(reader, start=1):
                    domain = row['Domain']
                    content = row['Content']
                    if content is None:
                        # A short row leaves the column as None; treat it as
                        # empty instead of crashing the run in len().
                        self.logger.warning(f"Missing content for domain {domain}")
                        content = ''

                    sampled_content = self.sample_content(content)
                    input_text = f"{self.config.input_suffix}\n{domain},\"{sampled_content}\""

                    conversation = [
                        {"role": "user", "content": labelling_prompt},
                        {"role": "assistant", "content": labelling_prompt_answer},
                        {"role": "user", "content": input_text}
                    ]

                    result, elapsed_time = self.send_classification_request(conversation)
                    answer, classification, reason = self._extract_fields(domain, result)

                    # Write results
                    label_writer.writerow({
                        'Domain': domain,
                        'Answer': answer,
                        'Classification': classification,
                        'Reason': reason
                    })

                    fixed_writer.writerow({
                        'Domain': domain,
                        'Content': content,
                        'Label': answer
                    })

                    # Push finished rows to disk regularly so a hard stop of
                    # this long-running job loses at most one batch.
                    if batch_size > 0 and processed % batch_size == 0:
                        label_file.flush()
                        fixed_file.flush()

                    self.logger.info(
                        f"{elapsed_time:.2f} {domain} {result}"
                    )

                    if not DomainClassifier.check_classification_consistency(answer, classification):
                        self.logger.warning(
                            f"Hallucination detected for {domain}, "
                            f"Answer: {answer} ({self.EXPECTED_CLASSIFICATION.get(answer)}), "
                            f"Classification: {classification}, "
                            f"Reason: {reason} "
                        )

        except Exception as e:
            self.logger.error(f"Processing error: {str(e)}")
            raise

# Example usage: label one processed dataset, resuming where earlier runs stopped.
if __name__ == "__main__":
    # Model server endpoints and request settings.
    model_config = ModelConfig(
        chat_url="http://112.78.144.146:11434/api/chat",
        generate_url="http://112.78.144.146:11434/api/generate",
        model_name="mseri/mistral-nemo-minitron:latest",
        input_suffix=(
            "Classify the given URL as 0 (benign), 1 (gambling), or 2 (pornography) "
            "and respond with the result in JSON format.\n"
        ),
        max_content_length=1750,
    )

    # Input/output locations. start_row is a sum of several row counts and is
    # kept as individual terms; per the original comment it is the current
    # position in the input.
    processing_config = ProcessingConfig(
        input_file="F:/SKRIPSI/Data/0-49/Processed/domains_and_content_processed.csv",
        output_label_file="F:/SKRIPSI/Data/0-49/Processed/domains_and_content_processed_label11.csv",
        output_fixed_file="F:/SKRIPSI/Data/0-49/Processed/domains_and_content_processed_fixed11.csv",
        label_prompt_file="prompts/labelling_promptv2.txt",
        label_prompt_answer_file="prompts/labelling_prompt_answer.txt",
        start_row=sum((28019, 16257, 1435, 1922, 3186, 5712)),  # Your current position
        # Alternative start position kept from the original: 7729+10251+4507+5532
    )

    classifier = DomainClassifier(model_config=model_config)
    classifier.process_dataset(processing_config=processing_config)

2024-10-26 23:52:48,438 - INFO - Logging to: logs\domain_classification_20241026_235248.log
2024-10-26 23:52:50,690 - INFO - Model mseri/mistral-nemo-minitron:latest reloaded successfully
2024-10-26 23:52:56,977 - INFO - 1.79 https://www.karlsruhe-erleben.de {'answer': 0, 'classification': 'Benign', 'reason': "The website https://www.karlsruhe-erleben.de does not contain any explicit sexual content or gambling-related materials. It appears to be a promotional site for the 'Die Waldstraße' quarter in Karlsruhe, Germany, featuring cultural attractions, restaurants, and events."}
2024-10-26 23:52:58,736 - INFO - 1.76 https://www.karma-minds.co.uk {'answer': 0, 'classification': 'Benign', 'reason': "The website https://www.karma-minds.co.uk appears to be a mental health services provider based in the United Kingdom. The content snippet describes an event called 'The Blurred Line' which aimed to raise awareness of mental health and funds for local charities, with no explicit or gambling-rel

KeyboardInterrupt: 