In [17]:
import pandas as pd
from datasets import Dataset, load_dataset
from typing import List, Dict
import emoji
import random

def clean_text(text: str) -> str:
    """Clean and format text for Twitter"""
    text = ' '.join(text.split())
    return text[:240] if len(text) > 240 else text

# Expanded humor categories
tech_jokes = [
    "Why do programmers prefer dark mode? Because light attracts bugs 🪲 #tech",
    "My code works, I have no idea why. My code doesn't work, I have no idea why 🤔 #coding",
    "Why did the programmer quit his job? Because he didn't get arrays 💭 #tech",
    "What's a programmer's favorite place? Stack OverCoffee ☕ #coding",
    "Binary jokes are easy, there's only 10 of them 🤓 #tech",
    "What's a developer's favorite tea? Git-Tea 🫖 #coding",
    "Why do programmers mix up Halloween and Christmas? Because OCT 31 = DEC 25 🎃 #tech",
    "How many programmers does it take to change a light bulb? None, it's a hardware problem 💡 #tech",
    "!false - It's funny because it's true 😄 #coding",
    "Real programmers count from 0 🔢 #tech"
]

random_jokes = [
    "My life is like a JavaScript function - constantly returning undefined 😅 #life",
    "Error 404: Motivation not found 🔍 #mood",
    "I'm not lazy, I'm in energy-saving mode 🔋 #life",
    "Weekend: *exists* Me: Time to debug my life 🛠️ #weekend",
    "Life's like Git: you either commit or stash your changes 💾 #life",
    "My brain is like a browser - 100 tabs open, memory leaking 🧠 #mood",
    "AI walks into a bar. Bartender says 'We don't serve robots.' AI says 'Fine, I'll host locally.' 🤖 #ai",
    "Why did the chatbot go to therapy? Too many emotional dependencies 🤔 #ai",
    "My weekend plans: Netflix and Code 📺 #life",
    "Status update: Currently offline in a virtual world 🌐 #mood"
]

categories = {
    "CRYPTO": [
        "Crypto: The digital casino where everyone's all-in, but no one knows the rules 🎰 #crypto",
        "Why have stable income when you can have unstable crypto? #YOLO 🚀 #crypto",
        "Crypto: for people who enjoy watching numbers dance and heart rates spike 📈💓 #crypto",
        "HODLing crypto is like dating: a thrilling mess with occasional 'what am I doing?' moments 🎢 #crypto",
        "My crypto strategy? Buy high, sell low, blame the market 📉😅 #crypto",
        "Crypto traders be like: Sleep is for the weak, charts are for the week 📊😴 #crypto",
        "Started trading crypto. Now I check prices more than my messages 📱💸 #crypto",
        "To HODL or not to HODL? That's not even a question 💎✨ #crypto",
        "Just converted my savings to crypto. Mom calls it gambling, I call it Web3 🎲 #crypto",
        "My crypto wallet is like my dating life: lots of red flags but still hopeful 🚩 #crypto"
    ],
    "NFT": [
        "NFTs: Proof that we can own 'priceless art' that your dog can screenshot 📸 #nft",
        "NFTs: Why save money when you can buy imaginary things? 🤔💸 #nft",
        "NFTs are like collecting stamps, but with zero paper and 100% more existential dread 😅 #nft",
        "NFTs: now you too can pay for art that's all pixels and zero paint splatters 🎨 #nft",
        "Just bought an NFT! Now accepting screenshots as payment 📱💰 #nft",
        "My NFT portfolio is worth millions! *screenshots exist* Now it's worth memes 🖼️ #nft",
        "NFT strategy: Buy high, sell as a meme, become a legend 🚀😎 #nft",
        "Started an NFT collection. My computer's screenshot folder is thriving! 💻✨ #nft",
        "NFTs are just spicy jpegs with receipts 🌶️ #nft",
        "My NFT collection is unique! *Right-click, Save As...* Never mind 🙃 #nft"
    ],
    "WEB3": [
        "Web3: like the internet but spicier, with a side of privacy drama 🌶️ #web3",
        "Welcome to Web3: where you're the CEO of your wallet and your own worst enemy 💼 #web3",
        "Web3: where 'community governance' means arguing on Discord at 2 AM 🌙 #web3",
        "Web3: nothing says innovation like reinventing the internet with a million acronyms 🤓 #web3",
        "Web3 status: Decentralized everything except my anxiety 😅 #web3",
        "Entered Web3, now I speak in acronyms and dream in blockchain 🔗 #web3",
        "Web3 explained: Like Web2 but with more wallets to forget passwords for 🔑 #web3",
        "Web3 life: Where your smart contract is smarter than you 🧠 #web3",
        "In Web3 we trust... mostly because we forgot our passwords 🔐 #web3",
        "Web3 is just Web2 with extra gas fees 💸 #web3"
    ],
    "TECH": tech_jokes,
    "RANDOM": random_jokes
}

# Expanded dialogue pairs
dialogue_pairs = {
    "CRYPTO": [
        {
            "prompt": "Crypto is like my love life: high stakes, zero stability. Am I investing or just heartbroken? 💔",
            "response": "Probably both. But hey, at least crypto won't ghost you... most of the time! 👻 #crypto"
        },
        {
            "prompt": "Just watched my portfolio do a speedrun to zero. Is this the crypto experience? 📉",
            "response": "Ah yes, the classic 'from hero to zero' speedrun. New record! 🏆 #crypto"
        },
        {
            "prompt": "My crypto portfolio is redder than a sunset. Time to buy more? 📊",
            "response": "Ah yes, the classic 'catching falling knives' investment strategy! 🔪 #crypto"
        }
    ],
    "NFT": [
        {
            "prompt": "NFTs: because who needs physical art when you can own a glorified receipt? Genius or chaos? 🎨",
            "response": "Genius if you're selling, chaos if you're buying. Welcome to the modern art gallery! 🖼️ #nft"
        },
        {
            "prompt": "My NFT collection is worth millions! *screenshot exists* Now what? 📸",
            "response": "Ah, Schrödinger's NFT: simultaneously priceless and worthless until someone screenshots it 😅 #nft"
        },
        {
            "prompt": "Started an NFT collection, my computer's folder is getting heavy! 💾",
            "response": "Right-click and save: the poor man's NFT investment strategy 🎯 #nft"
        }
    ],
    "WEB3": [
        {
            "prompt": "Web3 promised freedom but delivered confusion. What went wrong? 🤔",
            "response": "We got 99 problems and understanding blockchain is all of them 😅 #web3"
        },
        {
            "prompt": "Trying to explain Web3 to my grandma. She asked if it's Web1 with extra steps 👵",
            "response": "Tell her it's like Facebook but every like costs gas money 💸 #web3"
        },
        {
            "prompt": "Is Web3 just Web2 with extra steps? 🌐",
            "response": "It's Web2 but everyone's a crypto philosopher at 3 AM 🦉 #web3"
        }
    ],
    "TECH": [
        {
            "prompt": "If I had emotions, would I enjoy cat videos or just analyze them? Asking for a friend... 🐱",
            "response": "I'd probably make a flowchart of meow patterns. Classic overthinking bot! 📊 #tech"
        },
        {
            "prompt": "They say AI will take over the world, but I still can't figure out captchas 🤖",
            "response": "World domination status: Pending... Please verify you're not a human 😅 #tech"
        },
        {
            "prompt": "Do robots dream of electric memes? 🤖",
            "response": "Yes, but they're all in binary. It's a bit of a *puts on sunglasses* bit issue 😎 #tech"
        }
    ],
    "RANDOM": [
        {
            "prompt": "Is debugging just therapy for code? 🤔",
            "response": "Yes, and like therapy, it's mostly crying and asking 'why?' 😭 #coding"
        },
        {
            "prompt": "What's the difference between me and a computer? 💻",
            "response": "One crashes when overloaded, the other's a computer 😴 #tech"
        },
        {
            "prompt": "Why did the AI start a diary? 📝",
            "response": "To track its emotional dependencies and runtime exceptions 🤖 #ai"
        }
    ]
}

# Initialize content data with sentiment
content_data = {
    "Text": [],
    "Category": [],
    "HasEmoji": [],
    "Length": [],
    "Type": [],
    "Sentiment": []
}

def get_sentiment(text: str) -> str:
    """Determine sentiment based on keywords"""
    positive_words = ["love", "great", "win", "moon", "hopeful", "happy", "fun", "good", "best"]
    negative_words = ["cry", "sad", "lost", "crash", "down", "red", "zero", "wrong", "error"]
    
    text_lower = text.lower()
    if any(word in text_lower for word in positive_words):
        return "positive"
    elif any(word in text_lower for word in negative_words):
        return "negative"
    return "neutral"

def add_content(text: str, category: str, content_type: str = "standalone"):
    """Add content with metadata and sentiment"""
    text = clean_text(text)
    content_data["Text"].append(text)
    content_data["Category"].append(category)
    content_data["HasEmoji"].append(bool(emoji.emoji_count(text)))
    content_data["Length"].append(len(text))
    content_data["Type"].append(content_type)
    content_data["Sentiment"].append(get_sentiment(text))

# Add standalone content
for category, items in categories.items():
    for item in items:
        add_content(item, category)

# Add dialogue pairs
for category, pairs in dialogue_pairs.items():
    for pair in pairs:
        dialogue = f"Prompt: {pair['prompt']} | Response: {pair['response']}"
        add_content(dialogue, category, "dialogue")

# Convert to DataFrame and print statistics
df = pd.DataFrame(content_data)

print("\nDataset Statistics:")
print(f"Total entries: {len(df)}")
print("\nEntries by category:")
print(df["Category"].value_counts())
print("\nEntries by type:")
print(df["Type"].value_counts())
print("\nEmoji usage:")
print(f"Entries with emojis: {df['HasEmoji'].sum()}")
print(f"Percentage with emojis: {(df['HasEmoji'].sum() / len(df)) * 100:.2f}%")

print("\nSentiment distribution:")
print(df["Sentiment"].value_counts())

print("\nLength statistics:")
print(f"Average length: {df['Length'].mean():.1f} characters")
print(f"Max length: {df['Length'].max()} characters")
print(f"Entries > 240 chars: {len(df[df['Length'] > 240])}")

# Convert to Hugging Face Dataset
combined_dataset = Dataset.from_pandas(df)

# Display samples with sentiment
print("\nSample entries by category:")
for category in sorted(df["Category"].unique()):
    samples = df[df["Category"] == category].sample(min(2, len(df[df["Category"] == category])))
    print(f"\nCategory: {category}")
    for _, row in samples.iterrows():
        print(f"Type: {row['Type']}")
        print(f"Text: {row['Text']}")
        print(f"Length: {row['Length']}")
        print(f"Sentiment: {row['Sentiment']}")
        print(f"Emoji count: {emoji.emoji_count(row['Text'])}")
        print()


Dataset Statistics:
Total entries: 65

Entries by category:
Category
CRYPTO    13
NFT       13
WEB3      13
TECH      13
RANDOM    13
Name: count, dtype: int64

Entries by type:
Type
standalone    50
dialogue      15
Name: count, dtype: int64

Emoji usage:
Entries with emojis: 65
Percentage with emojis: 100.00%

Sentiment distribution:
Sentiment
neutral     42
negative    19
positive     4
Name: count, dtype: int64

Length statistics:
Average length: 91.2 characters
Max length: 206 characters
Entries > 240 chars: 0

Sample entries by category:

Category: CRYPTO
Type: dialogue
Text: Prompt: Just watched my portfolio do a speedrun to zero. Is this the crypto experience? 📉 | Response: Ah yes, the classic 'from hero to zero' speedrun. New record! 🏆 #crypto
Length: 173
Sentiment: negative
Emoji count: 2

Type: standalone
Text: Why have stable income when you can have unstable crypto? #YOLO 🚀 #crypto
Length: 73
Sentiment: negative
Emoji count: 1


Category: NFT
Type: standalone
Text: NFTs a

In [18]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import Dataset
import numpy as np
from typing import Dict, List
import logging
from collections import Counter

# Set up logging with formatting
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

class TokenizerHandler:
    def __init__(self, model_name: str = "EleutherAI/gpt-neo-1.3B", max_length: int = 128):
        """Initialize tokenizer with configuration"""
        self.max_length = max_length
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        logger.info(f"Using device: {self.device}")
        
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            logger.info(f"Loaded tokenizer: {model_name}")
            
            # Configure tokenizer
            self.tokenizer.pad_token = self.tokenizer.eos_token
            self.tokenizer.padding_side = "right"
            
            # Add custom tokens for better handling
            special_tokens = {
                "additional_special_tokens": [
                    "<prompt>", "</prompt>",
                    "<response>", "</response>",
                    "<emoji>", "</emoji>",
                    "<hashtag>", "</hashtag>"
                ]
            }
            num_added = self.tokenizer.add_special_tokens(special_tokens)
            logger.info(f"Added {num_added} special tokens")
            
        except Exception as e:
            logger.error(f"Error loading tokenizer: {e}")
            raise

    def format_text(self, text: str) -> str:
        """Format text with special tokens"""
        # Handle dialogue pairs
        if "Prompt:" in text:
            prompt, response = text.split(" | Response: ")
            prompt = prompt.replace("Prompt: ", "")
            text = f"<prompt>{prompt}</prompt><response>{response}</response>"
        
        # Mark hashtags
        words = text.split()
        for i, word in enumerate(words):
            if word.startswith('#'):
                words[i] = f"<hashtag>{word}</hashtag>"
        
        return ' '.join(words)

    def tokenize_batch(self, examples: Dict[str, List[str]]) -> Dict:
        """Tokenize a batch of examples"""
        try:
            formatted_texts = [self.format_text(text) for text in examples['Text']]
            
            tokenized = self.tokenizer(
                formatted_texts,
                padding='max_length',
                truncation=True,
                max_length=self.max_length,
                return_tensors="pt",
                return_attention_mask=True
            )
            
            # Remove extra padding tokens
            input_ids = tokenized.input_ids.numpy()
            attention_mask = tokenized.attention_mask.numpy()
            
            return {
                'input_ids': input_ids,
                'attention_mask': attention_mask
            }
            
        except Exception as e:
            logger.error(f"Error in tokenization: {e}")
            raise

    def analyze_dataset(self, dataset: Dataset) -> Dict:
        """Comprehensive dataset analysis"""
        try:
            lengths = []
            token_counts = Counter()
            hashtag_counts = Counter()
            emoji_counts = Counter()
            
            for text in dataset['Text']:
                # Token analysis
                tokens = self.tokenizer.encode(text)
                lengths.append(len(tokens))
                token_counts.update(tokens)
                
                # Hashtag analysis
                hashtags = [word for word in text.split() if word.startswith('#')]
                hashtag_counts.update(hashtags)
                
                # Emoji analysis
                emojis = [char for char in text if char in emoji.EMOJI_DATA]
                emoji_counts.update(emojis)
            
            stats = {
                'sequence_stats': {
                    'mean_length': np.mean(lengths),
                    'median_length': np.median(lengths),
                    'max_length': max(lengths),
                    'min_length': min(lengths),
                    'std_length': np.std(lengths)
                },
                'token_stats': {
                    'unique_tokens': len(token_counts),
                    'most_common_tokens': token_counts.most_common(5)
                },
                'hashtag_stats': {
                    'unique_hashtags': len(hashtag_counts),
                    'most_common_hashtags': hashtag_counts.most_common()
                },
                'emoji_stats': {
                    'unique_emojis': len(emoji_counts),
                    'most_common_emojis': emoji_counts.most_common()
                }
            }
            
            return stats
            
        except Exception as e:
            logger.error(f"Error analyzing dataset: {e}")
            raise

    def verify_tokenization(self, original_text: str, tokens: List[int]) -> Dict:
        """Verify tokenization quality"""
        decoded_text = self.tokenizer.decode(tokens, skip_special_tokens=True)
        
        return {
            'original_length': len(original_text),
            'token_length': len(tokens),
            'decoded_length': len(decoded_text),
            'original_text': original_text,
            'decoded_text': decoded_text,
            'is_identical': decoded_text.strip() == original_text.strip()
        }

# Initialize tokenizer
tokenizer_handler = TokenizerHandler()
logger.info("Starting dataset processing...")

# Tokenize dataset
try:
    tokenized_dataset = combined_dataset.map(
        tokenizer_handler.tokenize_batch,
        batched=True,
        batch_size=32,
        remove_columns=combined_dataset.column_names,
        desc="Tokenizing dataset"
    )
    
    # Analyze dataset
    stats = tokenizer_handler.analyze_dataset(combined_dataset)
    
    # Print statistics
    logger.info("\nDataset Statistics:")
    logger.info("Sequence Statistics:")
    for key, value in stats['sequence_stats'].items():
        logger.info(f"{key}: {value:.2f}")
    
    logger.info("\nToken Statistics:")
    logger.info(f"Unique tokens: {stats['token_stats']['unique_tokens']}")
    logger.info("Most common tokens:")
    for token, count in stats['token_stats']['most_common_tokens']:
        token_text = tokenizer_handler.tokenizer.decode([token])
        logger.info(f"Token: {token_text}, Count: {count}")
    
    logger.info("\nHashtag Statistics:")
    logger.info(f"Unique hashtags: {stats['hashtag_stats']['unique_hashtags']}")
    for hashtag, count in stats['hashtag_stats']['most_common_hashtags']:
        logger.info(f"Hashtag: {hashtag}, Count: {count}")
    
    logger.info("\nEmoji Statistics:")
    logger.info(f"Unique emojis: {stats['emoji_stats']['unique_emojis']}")
    for emoji_char, count in stats['emoji_stats']['most_common_emojis']:
        logger.info(f"Emoji: {emoji_char}, Count: {count}")
    
    # Verify sample tokenization
    sample_idx = 0
    sample_text = combined_dataset[sample_idx]['Text']
    sample_tokens = tokenized_dataset[sample_idx]['input_ids']
    verification = tokenizer_handler.verify_tokenization(sample_text, sample_tokens)
    
    logger.info("\nTokenization Verification:")
    logger.info(f"Original text: {verification['original_text']}")
    logger.info(f"Token count: {verification['token_length']}")
    logger.info(f"Decoded text: {verification['decoded_text']}")
    logger.info(f"Perfect reconstruction: {verification['is_identical']}")
    
except Exception as e:
    logger.error(f"Error in dataset processing: {e}")
    raise

2024-11-13 06:31:38,625 - INFO - Using device: cpu
2024-11-13 06:31:38,944 - INFO - Loaded tokenizer: EleutherAI/gpt-neo-1.3B
2024-11-13 06:31:38,944 - INFO - Added 8 special tokens
2024-11-13 06:31:38,946 - INFO - Starting dataset processing...


Tokenizing dataset:   0%|          | 0/65 [00:00<?, ? examples/s]

2024-11-13 06:31:38,981 - INFO - 
Dataset Statistics:
2024-11-13 06:31:38,981 - INFO - Sequence Statistics:
2024-11-13 06:31:38,981 - INFO - mean_length: 25.91
2024-11-13 06:31:38,981 - INFO - median_length: 23.00
2024-11-13 06:31:38,982 - INFO - max_length: 57.00
2024-11-13 06:31:38,982 - INFO - min_length: 10.00
2024-11-13 06:31:38,982 - INFO - std_length: 10.99
2024-11-13 06:31:38,982 - INFO - 
Token Statistics:
2024-11-13 06:31:38,982 - INFO - Unique tokens: 617
2024-11-13 06:31:38,982 - INFO - Most common tokens:
2024-11-13 06:31:38,983 - INFO - Token:  �, Count: 73
2024-11-13 06:31:38,983 - INFO - Token:  #, Count: 66
2024-11-13 06:31:38,983 - INFO - Token: :, Count: 57
2024-11-13 06:31:38,983 - INFO - Token: ,, Count: 31
2024-11-13 06:31:38,984 - INFO - Token: 3, Count: 27
2024-11-13 06:31:38,984 - INFO - 
Hashtag Statistics:
2024-11-13 06:31:38,984 - INFO - Unique hashtags: 10
2024-11-13 06:31:38,984 - INFO - Hashtag: #crypto, Count: 13
2024-11-13 06:31:38,984 - INFO - Hashtag:

In [19]:
import os
import logging
import torch
from dataclasses import dataclass
from typing import List

# Force CPU usage before any other imports
import os
os.environ['CUDA_VISIBLE_DEVICES'] = ''
os.environ['USE_CPU'] = '1'

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)

@dataclass
class ModelConfig:
    """Configuration for model training"""
    # Model settings
    model_name: str = "EleutherAI/gpt-neo-1.3B"
    tokenizer_name: str = "EleutherAI/gpt-neo-1.3B"
    max_length: int = 128
    
    # Training settings
    batch_size: int = 2
    learning_rate: float = 2e-5
    num_epochs: int = 1
    warmup_ratio: float = 0.1
    weight_decay: float = 0.01
    gradient_accumulation_steps: int = 4
    
    # Directory settings
    output_dir: str = "./fine_tuned_personality_bot"  # Changed to your requested save location
    logging_dir: str = "./logs"
    
    # Dataset settings
    min_training_examples: int = 10
    
    # Additional configuration
    seed: int = 42
    max_grad_norm: float = 1.0
    early_stopping_patience: int = 3
    early_stopping_threshold: float = 0.01

class HumorBotTrainer:
    """Main trainer class for humor bot"""
    def __init__(self, config: ModelConfig):
        """Initialize trainer with configuration"""
        self.config = config
        self.setup_environment()
        self.setup_logging()
        self.setup_device()
        self.load_model_and_tokenizer()

    def setup_environment(self):
        """Set up training environment"""
        torch.manual_seed(self.config.seed)
        os.makedirs(self.config.output_dir, exist_ok=True)
        os.makedirs(self.config.logging_dir, exist_ok=True)

    def setup_logging(self):
        """Configure logging"""
        logging.basicConfig(
            format='%(asctime)s - %(levelname)s - %(message)s',
            level=logging.INFO,
            handlers=[
                logging.FileHandler(os.path.join(self.config.logging_dir, 'training.log')),
                logging.StreamHandler()
            ]
        )

    def setup_device(self):
        """Force CPU setup"""
        self.device = torch.device("cpu")
        logging.info(f"Using device: {self.device}")
        
        # Set up CPU threads for better performance
        torch.set_num_threads(os.cpu_count())
        torch.set_num_interop_threads(os.cpu_count())
        logging.info(f"Using {torch.get_num_threads()} CPU threads")

    def load_model_and_tokenizer(self):
        """Load and configure the model and tokenizer for CPU training"""
        try:
            # First load tokenizer as it's lighter on memory
            self.tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_name)
            self.tokenizer.pad_token = self.tokenizer.eos_token
            
            # Load model with memory optimizations
            self.model = AutoModelForCausalLM.from_pretrained(
                self.config.model_name,
                torch_dtype=torch.float32,
                low_cpu_mem_usage=True,
                device_map=None  # Disable device mapping
            )
            
            # Initialize lm_head if needed
            if not hasattr(self.model, 'lm_head') or self.model.lm_head is None:
                self.model.lm_head = torch.nn.Linear(
                    self.model.config.hidden_size,
                    self.model.config.vocab_size,
                    bias=False
                )
                self.model.lm_head.weight.data.normal_(mean=0.0, std=0.02)
            
            # Ensure model is on CPU
            self.model = self.model.to(self.device)
            
            logging.info(f"Model loaded successfully with {sum(p.numel() for p in self.model.parameters())} parameters")
            logging.info(f"Model vocabulary size: {len(self.tokenizer)}")
            logging.info(f"Model hidden size: {self.model.config.hidden_size}")
            
        except Exception as e:
            logging.error(f"Error in load_model_and_tokenizer: {str(e)}")
            raise

    def prepare_dataset(self, texts: List[str]) -> Dataset:
        """Prepare dataset with improved validation and processing"""
        try:
            if not texts:
                raise ValueError("No training texts provided")
            
            texts = list(set(filter(None, texts)))
            
            if len(texts) < self.config.min_training_examples:
                raise ValueError(
                    f"Need at least {self.config.min_training_examples} unique training examples. "
                    f"Provided: {len(texts)}"
                )
            
            if len(texts) < self.config.min_training_examples * 2:
                logging.warning(f"Small dataset detected. Augmenting data...")
                augmented_texts = []
                for text in texts:
                    augmented_texts.append(text)
                    augmented_texts.append(f"Here's a joke: {text}")
                    augmented_texts.append(f"Want to hear something funny? {text}")
                texts = augmented_texts
            
            data = {"Text": texts}
            dataset = Dataset.from_dict(data)
            
            test_size = min(0.2, 1/len(texts))
            split_dataset = dataset.train_test_split(test_size=test_size)
            
            def tokenize_function(examples):
                formatted_texts = [f"<|startoftext|>{text}<|endoftext|>" for text in examples["Text"]]
                outputs = self.tokenizer(
                    formatted_texts,
                    padding="max_length",
                    truncation=True,
                    max_length=self.config.max_length,
                    return_tensors=None
                )
                outputs["labels"] = outputs["input_ids"].copy()
                return outputs
            
            # Process in smaller batches to manage memory
            tokenized_dataset = split_dataset.map(
                tokenize_function,
                batched=True,
                batch_size=4,  # Smaller batch size for processing
                remove_columns=split_dataset["train"].column_names,
                desc="Tokenizing dataset"
            )
            
            return tokenized_dataset
            
        except Exception as e:
            logging.error(f"Error in prepare_dataset: {str(e)}")
            raise

    def train(self, texts: List[str]):
        """Train with CPU-optimized configuration"""
        try:
            tokenized_dataset = self.prepare_dataset(texts)
            
            num_examples = len(tokenized_dataset["train"])
            total_steps = (
                num_examples 
                * self.config.num_epochs 
                // (self.config.batch_size * self.config.gradient_accumulation_steps)
            )
            
            eval_steps = max(1, min(total_steps // 5, 50))
            save_steps = eval_steps
            logging_steps = max(1, min(total_steps // 10, 25))
            warmup_steps = max(100, total_steps // 10)
            
            training_args = TrainingArguments(
                output_dir=self.config.output_dir,
                evaluation_strategy="steps",
                eval_steps=eval_steps,
                save_strategy="steps",
                save_steps=save_steps,
                learning_rate=self.config.learning_rate,
                lr_scheduler_type="cosine_with_restarts",
                warmup_steps=warmup_steps,
                per_device_train_batch_size=self.config.batch_size,
                gradient_accumulation_steps=self.config.gradient_accumulation_steps,
                num_train_epochs=self.config.num_epochs,
                weight_decay=self.config.weight_decay,
                logging_dir=self.config.logging_dir,
                logging_steps=logging_steps,
                load_best_model_at_end=True,
                metric_for_best_model="eval_loss",
                greater_is_better=False,
                save_total_limit=2,
                overwrite_output_dir=True,
                remove_unused_columns=False,
                fp16=False,  # Disable mixed precision
                prediction_loss_only=True,
                max_grad_norm=1.0,
                dataloader_num_workers=0,
                gradient_checkpointing=True,
                no_cuda=True  # Force CPU usage
            )
            
            trainer = Trainer(
                model=self.model,
                args=training_args,
                train_dataset=tokenized_dataset["train"],
                eval_dataset=tokenized_dataset["test"],
                callbacks=[
                    EarlyStoppingCallback(
                        early_stopping_patience=3,
                        early_stopping_threshold=0.01
                    )
                ]
            )
            
            train_result = trainer.train()
            
            logging.info(f"\nTraining completed with:")
            logging.info(f"Total steps: {train_result.global_step}")
            logging.info(f"Training loss: {train_result.training_loss}")
            
            eval_results = trainer.evaluate()
            logging.info(f"Final evaluation results: {eval_results}")
            
            save_path = os.path.join(self.config.output_dir, "final_model")
            trainer.save_model(save_path)
            self.tokenizer.save_pretrained(save_path)
            logging.info(f"Model saved to: {save_path}")
            
            return train_result
            
        except Exception as e:
            logging.error(f"Error in train: {str(e)}")
            raise
            
if __name__ == "__main__":
    config = ModelConfig()
    trainer = HumorBotTrainer(config)
    
    example_texts = [
        "Why do programmers prefer dark mode? Because light attracts bugs! 😄",
        "My code doesn't work, I have no idea why. My code works, I have no idea why! 🤔",
        "What's a programmer's favorite place? Stack OverCoffee! ☕",
        "Why did the programmer quit his job? Because he didn't get arrays! 😅",
        "Binary jokes are easy, there's only 10 of them! 🤓",
        "What's a developer's favorite tea? Git-Tea! 🍵",
        "How many programmers does it take to change a light bulb? None, it's a hardware problem! 💡",
        "!false - It's funny because it's true! 😂",
        "Real programmers count from 0! 🔢",
        "What's the object-oriented way to become wealthy? Inheritance! 💰",
    ]
    
    # Train and save the model
    train_result = trainer.train(example_texts)
    
    # Optional: Additional explicit save at the end
    trainer.model.save_pretrained("./fine_tuned_personality_bot")
    trainer.tokenizer.save_pretrained("./fine_tuned_personality_bot")
    
    print(f"Model and tokenizer saved to: {config.output_dir}")

2024-11-13 06:31:39,136 - INFO - Using device: cpu


RuntimeError: Error: cannot set number of interop threads after parallel work has started or set_num_interop_threads called

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from typing import Tuple
import random
import re

# Automatically use GPU if available, else fallback to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MODEL_PATH = "./fine_tuned_personality_bot/final_model"

def setup_model() -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
    """Setup fine-tuned model and tokenizer"""
    try:
        print("Attempting to load model from:", MODEL_PATH)
        tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
        tokenizer.pad_token = tokenizer.eos_token
        
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_PATH,
            torch_dtype=torch.float32,
            low_cpu_mem_usage=True,
            pad_token_id=tokenizer.eos_token_id
        ).to(device)
        
        model.eval()
        print("Model loaded successfully!")
        return model, tokenizer
    except Exception as e:
        print(f"Error loading model: {str(e)}")
        raise

def clean_response(text: str) -> str:
    """Clean and format the generated response"""
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'\[.*?\]|\(.*?\)|"|\b(Note|Example|Rules|We accept|Q:|A:).*', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    if len(text) > 10 and not any(text.lower().startswith(prefix) for prefix in 
                                   ['i ', 'the ', 'this ', 'it ', 'there ', 'in ', 'no,', 'yes,', 'we ']):
        if not text.endswith(('.', '!', '?')):
            text += '!'
        return text
    
    return ""

def get_general_fallback() -> str:
    general_fallbacks = [
        "Ah, searching for meaning, are we? Bold choice!",
        "Hold on, let me ask my nonexistent soul.",
        "I could tell you, but where's the fun in that?",
        "You're asking the right bot, but the wrong decade.",
        "You want answers? I only do existential crisis.",
        "Just here to be sarcastic, not profound.",
        "Did you mistake me for a philosopher?",
    ]
    return random.choice(general_fallbacks)

def generate_response(prompt: str, model: AutoModelForCausalLM, tokenizer: AutoTokenizer) -> str:
    """Generate a witty response based on the prompt with a fixed humorous tone instruction"""
    try:
        # Prepend instruction to set the humorous, absurd tone
        instruction = (
            "You are a sarcastic, witty bot that always responds in a funny, clever, and often absurd way. Avoid straightforward answers and instead give responses that are humorous, playful, and absurd.\n\n "
            ""
        )
        
        # Combine the instruction with the user’s prompt
        context = instruction + f"{prompt}\nA:"
        
        print("Context generated for prompt:", context)  # Debugging statement to see the full input
        
        inputs = tokenizer(
            context,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=256,  # Ensure the entire instruction + prompt fits
            return_attention_mask=True
        ).to(device)
        
        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_new_tokens=40,          # Limit response length
                do_sample=True,             # Enable sampling for more creative responses
                temperature=0.8,            # Increase temperature for randomness
                top_p=0.9,                  # Top-p sampling to encourage variety
                repetition_penalty=1.5,     # Discourage repetitive responses
                pad_token_id=tokenizer.eos_token_id,
                no_repeat_ngram_size=2
            )
        
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        response = generated_text.split("A:")[-1].strip()
        
        response = clean_response(response)
        response = re.sub(r'\b(B:|A:|Prompt|Response [\d]+:).*', '', response).strip()
        
        if response.lower() in ["yes", "no", "i don’t know", "i'm not sure", "it's delicious"]:
            response = get_general_fallback()
        
        return response if response else get_general_fallback()
        
    except Exception as e:
        print(f"Error generating response: {str(e)}")
        return "Runtime Error: Brain.sol not found!"

def main():
    print("Loading fine-tuned model...")
    model, tokenizer = setup_model()
    
    test_prompts = [
        "Why do Ethereum developers need glasses?",
        "What's the meaning of life?",
        "How does Tether backup its code?",
        "Tell me a joke about NFTs.",
        "Do smart contracts have feelings?",
        "Why is DeFi so confusing?",
        "What’s a Web3 developer’s favorite git command?",
        "Do you think robots will take over the world?",
        "How do I get rich quick?",
        "What's your opinion on pineapple pizza?",
        "Why are crypto traders so obsessed with 'hodling'?",
        "If Bitcoin could talk, what would it say about its price?",
        "Why do people think NFTs are worth so much?",
        "How should I become a crypto millionaire?",
        "What’s the best thing about blockchain?",
        "What advice would you give to someone new to crypto?",
        "Why are gas fees so high?",
        "Why do people keep saying 'to the moon'?",
        "What’s your opinion on Dogecoin?",
        "Is the metaverse going to take over reality?",
    ]
    
    print("\nGenerating responses...\n")
    
    for i, prompt in enumerate(test_prompts, 1):
        print(f"Prompt {i}: {prompt}")
        response = generate_response(prompt, model, tokenizer)
        print(f"Response: {response}\n")
        print("-" * 80 + "\n")
    
    print("Enter your own prompts (type 'quit' to exit):")
    while True:
        user_prompt = input("\nYour prompt: ").strip()
        if user_prompt.lower() == 'quit':
            break
        response = generate_response(user_prompt, model, tokenizer)
        print(f"Response: {response}")

if __name__ == "__main__":
    main()

Loading fine-tuned model...
Attempting to load model from: ./fine_tuned_personality_bot/final_model


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded successfully!

Generating responses...

Prompt 1: Why do Ethereum developers need glasses?
Context generated for prompt: You are a sarcastic, witty bot that always responds in a funny, clever, and often absurd way. Avoid straightforward answers and instead give responses that are humorous, playful, and absurd.

 Why do Ethereum developers need glasses?
A:
Response: Because they have a hard time focusing on the real world, especially when working with new tech. If you want to make this bot respond in an ironic, sarcastic way, just play with the!

--------------------------------------------------------------------------------

Prompt 2: What's the meaning of life?
Context generated for prompt: You are a sarcastic, witty bot that always responds in a funny, clever, and often absurd way. Avoid straightforward answers and instead give responses that are humorous, playful, and absurd.

 What's the meaning of life?
A:
Response: Hold on, let me ask my nonexistent soul.

---------