In [1]:
import pandas as pd
from datasets import Dataset
import emoji
import random

def clean_text(text: str) -> str:
    """Clean and format text for Twitter"""
    text = ' '.join(text.split())
    return text[:240] if len(text) > 240 else text

# Persona-based prompts and responses to reflect desired personality and humor
persona_dialogue_pairs = {
    "CRYPTO": [
        {"prompt": "Is Bitcoin really decentralized?",
         "response": "Bitcoin’s decentralization is as real as it gets. With nodes worldwide, it's like a group project where no one knows each other but still gets the job done. #Decentralization #Crypto101"},
        {"prompt": "Why is everyone talking about meme coins?",
         "response": "Because who needs boring ‘real assets’ when you can have a dog-themed coin named after a typo? In crypto, we go big or go ‘woof.’ 🐶 #memecoin #crypto"},
        {"prompt": "How can I tell if a coin is just hype?",
         "response": "Look at the white paper, dev team, and if the CEO is promising ‘to the moon’ every day on Twitter. Real projects have more than just rocket emojis. 🚀 #DYOR #CryptoAdvice"}
    ],
    "NFT": [
        {"prompt": "Is NFT art a good investment?",
         "response": "Good investment? Maybe. Great conversation starter? Definitely. Let’s just say, you’re either a visionary or a JPEG collector. #NFT #DigitalArt"},
        {"prompt": "What’s the hype around NFTs?",
         "response": "NFTs are like collectible stamps but without the paper and 100% more digital existentialism. #NFT #DigitalArt"}
    ],
    "WEB3": [
        {"prompt": "What’s DeFi in a nutshell?",
         "response": "DeFi is like traditional finance but on steroids and without the middleman. Want a loan? There’s a smart contract for that. Just don’t expect it to care about your credit score. 💳 #DeFi #FinanceRevolution"},
        {"prompt": "Why do people think Web3 is the future?",
         "response": "Because it's Web2 with more wallets to forget passwords for. It's innovation with extra layers of security drama. 🔑 #Web3"}
    ],
    "TECH": [
        {"prompt": "Why do programmers prefer dark mode?",
         "response": "Because light attracts bugs. No one wants to debug in broad daylight. 🪲 #tech"},
        {"prompt": "Is debugging just therapy for code?",
         "response": "Absolutely. And like therapy, it involves a lot of crying and asking 'why?' 😭 #coding"}
    ]
}

# Existing categories and joke data
tech_jokes = [
    "Why do programmers prefer dark mode? Because light attracts bugs 🪲 #tech",
    "My code works, I have no idea why. My code doesn't work, I have no idea why 🤔 #coding",
    # Add more existing jokes or use them as-is.
]

random_jokes = [
    "My life is like a JavaScript function - constantly returning undefined 😅 #life",
    "Error 404: Motivation not found 🔍 #mood",
    # Add more existing jokes or use them as-is.
]

categories = {
    "TECH": tech_jokes,
    "RANDOM": random_jokes,
    # Add additional categories if desired
}

# Integrate persona-based dialogue pairs with existing pairs
dialogue_pairs = {**persona_dialogue_pairs}  # Merge new data with old if needed

# Initialize content data with sentiment
content_data = {
    "Text": [],
    "Category": [],
    "HasEmoji": [],
    "Length": [],
    "Type": [],
    "Sentiment": []
}

def get_sentiment(text: str) -> str:
    """Determine sentiment based on keywords"""
    positive_words = ["love", "great", "win", "moon", "hopeful", "happy", "fun", "good", "best"]
    negative_words = ["cry", "sad", "lost", "crash", "down", "red", "zero", "wrong", "error"]
    
    text_lower = text.lower()
    if any(word in text_lower for word in positive_words):
        return "positive"
    elif any(word in text_lower for word in negative_words):
        return "negative"
    return "neutral"

def add_content(text: str, category: str, content_type: str = "standalone"):
    """Add content with metadata and sentiment"""
    text = clean_text(text)
    content_data["Text"].append(text)
    content_data["Category"].append(category)
    content_data["HasEmoji"].append(bool(emoji.emoji_count(text)))
    content_data["Length"].append(len(text))
    content_data["Type"].append(content_type)
    content_data["Sentiment"].append(get_sentiment(text))

# Add standalone content
for category, items in categories.items():
    for item in items:
        add_content(item, category)

# Add dialogue pairs, including new persona-driven pairs
for category, pairs in dialogue_pairs.items():
    for pair in pairs:
        dialogue = f"Prompt: {pair['prompt']} | Response: {pair['response']}"
        add_content(dialogue, category, "dialogue")

# Convert to DataFrame and print statistics
df = pd.DataFrame(content_data)

print("\nDataset Statistics:")
print(f"Total entries: {len(df)}")
print("\nEntries by category:")
print(df["Category"].value_counts())
print("\nEntries by type:")
print(df["Type"].value_counts())
print("\nEmoji usage:")
print(f"Entries with emojis: {df['HasEmoji'].sum()}")
print(f"Percentage with emojis: {(df['HasEmoji'].sum() / len(df)) * 100:.2f}%")

print("\nSentiment distribution:")
print(df["Sentiment"].value_counts())

print("\nLength statistics:")
print(f"Average length: {df['Length'].mean():.1f} characters")
print(f"Max length: {df['Length'].max()} characters")
print(f"Entries > 240 chars: {len(df[df['Length'] > 240])}")

# Convert to Hugging Face Dataset
combined_dataset = Dataset.from_pandas(df)

# Display samples with sentiment
print("\nSample entries by category:")
for category in sorted(df["Category"].unique()):
    samples = df[df["Category"] == category].sample(min(2, len(df[df["Category"] == category])))
    print(f"\nCategory: {category}")
    for _, row in samples.iterrows():
        print(f"Type: {row['Type']}")
        print(f"Text: {row['Text']}")
        print(f"Length: {row['Length']}")
        print(f"Sentiment: {row['Sentiment']}")
        print(f"Emoji count: {emoji.emoji_count(row['Text'])}")
        print()


Dataset Statistics:
Total entries: 13

Entries by category:
Category
TECH      4
CRYPTO    3
RANDOM    2
NFT       2
WEB3      2
Name: count, dtype: int64

Entries by type:
Type
dialogue      9
standalone    4
Name: count, dtype: int64

Emoji usage:
Entries with emojis: 10
Percentage with emojis: 76.92%

Sentiment distribution:
Sentiment
neutral     5
negative    5
positive    3
Name: count, dtype: int64

Length statistics:
Average length: 153.7 characters
Max length: 240 characters
Entries > 240 chars: 0

Sample entries by category:

Category: CRYPTO
Type: dialogue
Text: Prompt: Is Bitcoin really decentralized? | Response: Bitcoin’s decentralization is as real as it gets. With nodes worldwide, it's like a group project where no one knows each other but still gets the job done. #Decentralization #Crypto101
Length: 238
Sentiment: negative
Emoji count: 0

Type: dialogue
Text: Prompt: How can I tell if a coin is just hype? | Response: Look at the white paper, dev team, and if the CEO is 

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import Dataset
import numpy as np
from typing import Dict, List
import logging
from collections import Counter

# Set up logging with formatting
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

class TokenizerHandler:
    def __init__(self, model_name: str = "EleutherAI/gpt-neo-1.3B", max_length: int = 128):
        """Initialize tokenizer with configuration"""
        self.max_length = max_length
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        logger.info(f"Using device: {self.device}")
        
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            logger.info(f"Loaded tokenizer: {model_name}")
            
            # Configure tokenizer
            self.tokenizer.pad_token = self.tokenizer.eos_token
            self.tokenizer.padding_side = "right"
            
            # Add custom tokens for better handling
            special_tokens = {
                "additional_special_tokens": [
                    "<prompt>", "</prompt>",
                    "<response>", "</response>",
                    "<emoji>", "</emoji>",
                    "<hashtag>", "</hashtag>"
                ]
            }
            num_added = self.tokenizer.add_special_tokens(special_tokens)
            logger.info(f"Added {num_added} special tokens")
            
        except Exception as e:
            logger.error(f"Error loading tokenizer: {e}")
            raise

    def format_text(self, text: str) -> str:
        """Format text with special tokens"""
        # Handle dialogue pairs
        if "Prompt:" in text:
            prompt, response = text.split(" | Response: ")
            prompt = prompt.replace("Prompt: ", "")
            text = f"<prompt>{prompt}</prompt><response>{response}</response>"
        
        # Mark hashtags
        words = text.split()
        for i, word in enumerate(words):
            if word.startswith('#'):
                words[i] = f"<hashtag>{word}</hashtag>"
        
        return ' '.join(words)

    def tokenize_batch(self, examples: Dict[str, List[str]]) -> Dict:
        """Tokenize a batch of examples"""
        try:
            formatted_texts = [self.format_text(text) for text in examples['Text']]
            
            tokenized = self.tokenizer(
                formatted_texts,
                padding='max_length',
                truncation=True,
                max_length=self.max_length,
                return_tensors="pt",
                return_attention_mask=True
            )
            
            # Remove extra padding tokens
            input_ids = tokenized.input_ids.numpy()
            attention_mask = tokenized.attention_mask.numpy()
            
            return {
                'input_ids': input_ids,
                'attention_mask': attention_mask
            }
            
        except Exception as e:
            logger.error(f"Error in tokenization: {e}")
            raise

    def analyze_dataset(self, dataset: Dataset) -> Dict:
        """Comprehensive dataset analysis"""
        try:
            lengths = []
            token_counts = Counter()
            hashtag_counts = Counter()
            emoji_counts = Counter()
            
            for text in dataset['Text']:
                # Token analysis
                tokens = self.tokenizer.encode(text)
                lengths.append(len(tokens))
                token_counts.update(tokens)
                
                # Hashtag analysis
                hashtags = [word for word in text.split() if word.startswith('#')]
                hashtag_counts.update(hashtags)
                
                # Emoji analysis
                emojis = [char for char in text if char in emoji.EMOJI_DATA]
                emoji_counts.update(emojis)
            
            stats = {
                'sequence_stats': {
                    'mean_length': np.mean(lengths),
                    'median_length': np.median(lengths),
                    'max_length': max(lengths),
                    'min_length': min(lengths),
                    'std_length': np.std(lengths)
                },
                'token_stats': {
                    'unique_tokens': len(token_counts),
                    'most_common_tokens': token_counts.most_common(5)
                },
                'hashtag_stats': {
                    'unique_hashtags': len(hashtag_counts),
                    'most_common_hashtags': hashtag_counts.most_common()
                },
                'emoji_stats': {
                    'unique_emojis': len(emoji_counts),
                    'most_common_emojis': emoji_counts.most_common()
                }
            }
            
            return stats
            
        except Exception as e:
            logger.error(f"Error analyzing dataset: {e}")
            raise

    def verify_tokenization(self, original_text: str, tokens: List[int]) -> Dict:
        """Verify tokenization quality"""
        decoded_text = self.tokenizer.decode(tokens, skip_special_tokens=True)
        
        return {
            'original_length': len(original_text),
            'token_length': len(tokens),
            'decoded_length': len(decoded_text),
            'original_text': original_text,
            'decoded_text': decoded_text,
            'is_identical': decoded_text.strip() == original_text.strip()
        }

# Initialize tokenizer
tokenizer_handler = TokenizerHandler()
logger.info("Starting dataset processing...")

# Tokenize dataset
try:
    tokenized_dataset = combined_dataset.map(
        tokenizer_handler.tokenize_batch,
        batched=True,
        batch_size=32,
        remove_columns=combined_dataset.column_names,
        desc="Tokenizing dataset"
    )
    
    # Analyze dataset
    stats = tokenizer_handler.analyze_dataset(combined_dataset)
    
    # Print statistics
    logger.info("\nDataset Statistics:")
    logger.info("Sequence Statistics:")
    for key, value in stats['sequence_stats'].items():
        logger.info(f"{key}: {value:.2f}")
    
    logger.info("\nToken Statistics:")
    logger.info(f"Unique tokens: {stats['token_stats']['unique_tokens']}")
    logger.info("Most common tokens:")
    for token, count in stats['token_stats']['most_common_tokens']:
        token_text = tokenizer_handler.tokenizer.decode([token])
        logger.info(f"Token: {token_text}, Count: {count}")
    
    logger.info("\nHashtag Statistics:")
    logger.info(f"Unique hashtags: {stats['hashtag_stats']['unique_hashtags']}")
    for hashtag, count in stats['hashtag_stats']['most_common_hashtags']:
        logger.info(f"Hashtag: {hashtag}, Count: {count}")
    
    logger.info("\nEmoji Statistics:")
    logger.info(f"Unique emojis: {stats['emoji_stats']['unique_emojis']}")
    for emoji_char, count in stats['emoji_stats']['most_common_emojis']:
        logger.info(f"Emoji: {emoji_char}, Count: {count}")
    
    # Verify sample tokenization
    sample_idx = 0
    sample_text = combined_dataset[sample_idx]['Text']
    sample_tokens = tokenized_dataset[sample_idx]['input_ids']
    verification = tokenizer_handler.verify_tokenization(sample_text, sample_tokens)
    
    logger.info("\nTokenization Verification:")
    logger.info(f"Original text: {verification['original_text']}")
    logger.info(f"Token count: {verification['token_length']}")
    logger.info(f"Decoded text: {verification['decoded_text']}")
    logger.info(f"Perfect reconstruction: {verification['is_identical']}")
    
except Exception as e:
    logger.error(f"Error in dataset processing: {e}")
    raise

2024-11-14 04:57:02,296 - INFO - Using device: cpu
2024-11-14 04:57:02,625 - INFO - Loaded tokenizer: EleutherAI/gpt-neo-1.3B
2024-11-14 04:57:02,626 - INFO - Added 8 special tokens
2024-11-14 04:57:02,626 - INFO - Starting dataset processing...


Tokenizing dataset:   0%|          | 0/13 [00:00<?, ? examples/s]

2024-11-14 04:57:02,669 - INFO - 
Dataset Statistics:
2024-11-14 04:57:02,669 - INFO - Sequence Statistics:
2024-11-14 04:57:02,670 - INFO - mean_length: 40.77
2024-11-14 04:57:02,670 - INFO - median_length: 43.00
2024-11-14 04:57:02,670 - INFO - max_length: 68.00
2024-11-14 04:57:02,670 - INFO - min_length: 13.00
2024-11-14 04:57:02,671 - INFO - std_length: 18.84
2024-11-14 04:57:02,671 - INFO - 
Token Statistics:
2024-11-14 04:57:02,671 - INFO - Unique tokens: 267
2024-11-14 04:57:02,671 - INFO - Most common tokens:
2024-11-14 04:57:02,672 - INFO - Token:  #, Count: 19
2024-11-14 04:57:02,672 - INFO - Token: :, Count: 19
2024-11-14 04:57:02,672 - INFO - Token: ., Count: 18
2024-11-14 04:57:02,673 - INFO - Token: ?, Count: 14
2024-11-14 04:57:02,673 - INFO - Token:  a, Count: 12
2024-11-14 04:57:02,673 - INFO - 
Hashtag Statistics:
2024-11-14 04:57:02,674 - INFO - Unique hashtags: 15
2024-11-14 04:57:02,674 - INFO - Hashtag: #tech, Count: 2
2024-11-14 04:57:02,674 - INFO - Hashtag: #c

In [3]:
import os
import logging
import torch
from dataclasses import dataclass
from typing import List

# Force CPU usage before any other imports
import os
os.environ['CUDA_VISIBLE_DEVICES'] = ''
os.environ['USE_CPU'] = '1'

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)

@dataclass
class ModelConfig:
    """Configuration for model training"""
    # Model settings
    model_name: str = "EleutherAI/gpt-neo-1.3B"
    tokenizer_name: str = "EleutherAI/gpt-neo-1.3B"
    max_length: int = 128
    
    # Training settings
    batch_size: int = 2
    learning_rate: float = 2e-5
    num_epochs: int = 1
    warmup_ratio: float = 0.1
    weight_decay: float = 0.01
    gradient_accumulation_steps: int = 4
    
    # Directory settings
    output_dir: str = "./fine_tuned_personality_bot"  # Changed to your requested save location
    logging_dir: str = "./logs"
    
    # Dataset settings
    min_training_examples: int = 10
    
    # Additional configuration
    seed: int = 42
    max_grad_norm: float = 1.0
    early_stopping_patience: int = 3
    early_stopping_threshold: float = 0.01

class HumorBotTrainer:
    """Main trainer class for humor bot"""
    def __init__(self, config: ModelConfig):
        """Initialize trainer with configuration"""
        self.config = config
        self.setup_environment()
        self.setup_logging()
        self.setup_device()
        self.load_model_and_tokenizer()

    def setup_environment(self):
        """Set up training environment"""
        torch.manual_seed(self.config.seed)
        os.makedirs(self.config.output_dir, exist_ok=True)
        os.makedirs(self.config.logging_dir, exist_ok=True)

    def setup_logging(self):
        """Configure logging"""
        logging.basicConfig(
            format='%(asctime)s - %(levelname)s - %(message)s',
            level=logging.INFO,
            handlers=[
                logging.FileHandler(os.path.join(self.config.logging_dir, 'training.log')),
                logging.StreamHandler()
            ]
        )

    def setup_device(self):
        """Force CPU setup"""
        self.device = torch.device("cpu")
        logging.info(f"Using device: {self.device}")
        
        # Set up CPU threads for better performance
        torch.set_num_threads(os.cpu_count())
        torch.set_num_interop_threads(os.cpu_count())
        logging.info(f"Using {torch.get_num_threads()} CPU threads")

    def load_model_and_tokenizer(self):
        """Load and configure the model and tokenizer for CPU training"""
        try:
            # First load tokenizer as it's lighter on memory
            self.tokenizer = AutoTokenizer.from_pretrained(self.config.tokenizer_name)
            self.tokenizer.pad_token = self.tokenizer.eos_token
            
            # Load model with memory optimizations
            self.model = AutoModelForCausalLM.from_pretrained(
                self.config.model_name,
                torch_dtype=torch.float32,
                low_cpu_mem_usage=True,
                device_map=None  # Disable device mapping
            )
            
            # Initialize lm_head if needed
            if not hasattr(self.model, 'lm_head') or self.model.lm_head is None:
                self.model.lm_head = torch.nn.Linear(
                    self.model.config.hidden_size,
                    self.model.config.vocab_size,
                    bias=False
                )
                self.model.lm_head.weight.data.normal_(mean=0.0, std=0.02)
            
            # Ensure model is on CPU
            self.model = self.model.to(self.device)
            
            logging.info(f"Model loaded successfully with {sum(p.numel() for p in self.model.parameters())} parameters")
            logging.info(f"Model vocabulary size: {len(self.tokenizer)}")
            logging.info(f"Model hidden size: {self.model.config.hidden_size}")
            
        except Exception as e:
            logging.error(f"Error in load_model_and_tokenizer: {str(e)}")
            raise

    def prepare_dataset(self, texts: List[str]) -> Dataset:
        """Prepare dataset with improved validation and processing"""
        try:
            if not texts:
                raise ValueError("No training texts provided")
            
            texts = list(set(filter(None, texts)))
            
            if len(texts) < self.config.min_training_examples:
                raise ValueError(
                    f"Need at least {self.config.min_training_examples} unique training examples. "
                    f"Provided: {len(texts)}"
                )
            
            if len(texts) < self.config.min_training_examples * 2:
                logging.warning(f"Small dataset detected. Augmenting data...")
                augmented_texts = []
                for text in texts:
                    augmented_texts.append(text)
                    augmented_texts.append(f"Here's a joke: {text}")
                    augmented_texts.append(f"Want to hear something funny? {text}")
                texts = augmented_texts
            
            data = {"Text": texts}
            dataset = Dataset.from_dict(data)
            
            test_size = min(0.2, 1/len(texts))
            split_dataset = dataset.train_test_split(test_size=test_size)
            
            def tokenize_function(examples):
                formatted_texts = [f"<|startoftext|>{text}<|endoftext|>" for text in examples["Text"]]
                outputs = self.tokenizer(
                    formatted_texts,
                    padding="max_length",
                    truncation=True,
                    max_length=self.config.max_length,
                    return_tensors=None
                )
                outputs["labels"] = outputs["input_ids"].copy()
                return outputs
            
            # Process in smaller batches to manage memory
            tokenized_dataset = split_dataset.map(
                tokenize_function,
                batched=True,
                batch_size=4,  # Smaller batch size for processing
                remove_columns=split_dataset["train"].column_names,
                desc="Tokenizing dataset"
            )
            
            return tokenized_dataset
            
        except Exception as e:
            logging.error(f"Error in prepare_dataset: {str(e)}")
            raise

    def train(self, texts: List[str]):
        """Train with CPU-optimized configuration"""
        try:
            tokenized_dataset = self.prepare_dataset(texts)
            
            num_examples = len(tokenized_dataset["train"])
            total_steps = (
                num_examples 
                * self.config.num_epochs 
                // (self.config.batch_size * self.config.gradient_accumulation_steps)
            )
            
            eval_steps = max(1, min(total_steps // 5, 50))
            save_steps = eval_steps
            logging_steps = max(1, min(total_steps // 10, 25))
            warmup_steps = max(100, total_steps // 10)
            
            training_args = TrainingArguments(
                output_dir=self.config.output_dir,
                evaluation_strategy="steps",
                eval_steps=eval_steps,
                save_strategy="steps",
                save_steps=save_steps,
                learning_rate=self.config.learning_rate,
                lr_scheduler_type="cosine_with_restarts",
                warmup_steps=warmup_steps,
                per_device_train_batch_size=self.config.batch_size,
                gradient_accumulation_steps=self.config.gradient_accumulation_steps,
                num_train_epochs=self.config.num_epochs,
                weight_decay=self.config.weight_decay,
                logging_dir=self.config.logging_dir,
                logging_steps=logging_steps,
                load_best_model_at_end=True,
                metric_for_best_model="eval_loss",
                greater_is_better=False,
                save_total_limit=2,
                overwrite_output_dir=True,
                remove_unused_columns=False,
                fp16=False,  # Disable mixed precision
                prediction_loss_only=True,
                max_grad_norm=1.0,
                dataloader_num_workers=0,
                gradient_checkpointing=True,
                no_cuda=True  # Force CPU usage
            )
            
            trainer = Trainer(
                model=self.model,
                args=training_args,
                train_dataset=tokenized_dataset["train"],
                eval_dataset=tokenized_dataset["test"],
                callbacks=[
                    EarlyStoppingCallback(
                        early_stopping_patience=3,
                        early_stopping_threshold=0.01
                    )
                ]
            )
            
            train_result = trainer.train()
            
            logging.info(f"\nTraining completed with:")
            logging.info(f"Total steps: {train_result.global_step}")
            logging.info(f"Training loss: {train_result.training_loss}")
            
            eval_results = trainer.evaluate()
            logging.info(f"Final evaluation results: {eval_results}")
            
            save_path = os.path.join(self.config.output_dir, "final_model")
            trainer.save_model(save_path)
            self.tokenizer.save_pretrained(save_path)
            logging.info(f"Model saved to: {save_path}")
            
            return train_result
            
        except Exception as e:
            logging.error(f"Error in train: {str(e)}")
            raise
            
if __name__ == "__main__":
    config = ModelConfig()
    trainer = HumorBotTrainer(config)
    
    example_texts = [
        "Why do programmers prefer dark mode? Because light attracts bugs! 😄",
        "My code doesn't work, I have no idea why. My code works, I have no idea why! 🤔",
        "What's a programmer's favorite place? Stack OverCoffee! ☕",
        "Why did the programmer quit his job? Because he didn't get arrays! 😅",
        "Binary jokes are easy, there's only 10 of them! 🤓",
        "What's a developer's favorite tea? Git-Tea! 🍵",
        "How many programmers does it take to change a light bulb? None, it's a hardware problem! 💡",
        "!false - It's funny because it's true! 😂",
        "Real programmers count from 0! 🔢",
        "What's the object-oriented way to become wealthy? Inheritance! 💰",
    ]
    
    # Train and save the model
    train_result = trainer.train(example_texts)    # Optional: Additional explicit save at the endzz
    trainer.tokenizer.save_pretrained("./fine_tuned_personality_bot")
    
    print(f"Model and tokenizer saved to: {config.output_dir}")

2024-11-14 04:57:03,637 - INFO - Using device: cpu
2024-11-14 04:57:03,638 - INFO - Using 8 CPU threads
2024-11-14 04:57:04,168 - INFO - Model loaded successfully with 1315575808 parameters
2024-11-14 04:57:04,170 - INFO - Model vocabulary size: 50257
2024-11-14 04:57:04,170 - INFO - Model hidden size: 2048


Tokenizing dataset:   0%|          | 0/29 [00:00<?, ? examples/s]

Tokenizing dataset:   0%|          | 0/1 [00:00<?, ? examples/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss
1,8.68,8.628757
2,8.6643,8.576121
3,8.3553,8.471686


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].
2024-11-14 05:05:09,796 - INFO - 
Training completed with:
2024-11-14 05:05:09,797 - INFO - Total steps: 3
2024-11-14 05:05:09,798 - INFO - Training loss: 8.566521962483725


2024-11-14 05:05:11,706 - INFO - Final evaluation results: {'eval_loss': 8.471686363220215, 'eval_runtime': 1.9075, 'eval_samples_per_second': 0.524, 'eval_steps_per_second': 0.524, 'epoch': 0.8}
2024-11-14 05:05:17,578 - INFO - Model saved to: ./fine_tuned_personality_bot/final_model


Model and tokenizer saved to: ./fine_tuned_personality_bot


In [9]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from typing import Tuple
import random
import re

# Automatically use GPU if available, else fallback to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MODEL_PATH = "./fine_tuned_personality_bot/final_model"

def setup_model() -> Tuple[AutoModelForCausalLM, AutoTokenizer]:
    """Setup fine-tuned model and tokenizer"""
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
        tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_PATH,
            torch_dtype=torch.float32,
            low_cpu_mem_usage=True,
            pad_token_id=tokenizer.eos_token_id
        ).to(device)
        model.eval()
        return model, tokenizer
    except Exception as e:
        raise Exception(f"Error loading model: {str(e)}")

def clean_response(text: str) -> str:
    """Clean and format the generated response"""
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'\[.*?\]|\(.*?\)|"|\b(Note|Example|Rules|We accept|Q:|A:).*', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    if len(text) > 10 and not any(text.lower().startswith(prefix) for prefix in 
                                   ['i ', 'the ', 'this ', 'it ', 'there ', 'in ', 'no,', 'yes,', 'we ']):
        if not text.endswith(('.', '!', '?')):
            text += '!'
        return text
    return ""

def get_general_fallback() -> str:
    general_fallbacks = [
        "Ah, searching for meaning, are we? Bold choice!",
        "Hold on, let me ask my nonexistent soul.",
        "I could tell you, but where's the fun in that?",
        "You want answers? I only do existential crisis.",
        "Just here to be sarcastic, not profound.",
        "Did you mistake me for a philosopher?",
    ]
    return random.choice(general_fallbacks)

def generate_response(prompt: str, model: AutoModelForCausalLM, tokenizer: AutoTokenizer) -> str:
    """Generate a witty, slang-filled response based on the prompt with a fixed humorous tone instruction"""
    try:
        # Set a stronger tone and style for the bot
        instruction = (
            "You're a crypto and finance expert with a sharp, humorous, no-nonsense tone. "
            "Respond to each question like a seasoned crypto insider, using slang and hashtags. "
            "Keep it engaging, witty, and bold. Make responses <280 characters."
        )
        
        # Combine the instruction with the user’s prompt
        context = instruction + f"{prompt}\nA:"
        
        inputs = tokenizer(
            context,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=256,
            return_attention_mask=True
        ).to(device)
        
        with torch.no_grad():
            outputs = model.generate(
                input_ids=inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_new_tokens=50,     # Slightly higher for more expressive responses
                do_sample=True,
                temperature=1.3,        # Higher temperature for more creativity
                top_k=40,               # Use top_k for more varied responses
                top_p=0.85,
                repetition_penalty=1.2,
                no_repeat_ngram_size=2
            )
        
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        response = generated_text.split("A:")[-1].strip()
        response = clean_response(response)
        
        # Add hashtags if they're not already present in the response
        if not re.search(r'#\w+', response):
            response += " #Crypto101 #HodlAtYourOwnRisk"
        
        return response if response else get_general_fallback()
        
    except Exception as e:
        return "Runtime Error: Brain.sol not found!"

def main():
    model, tokenizer = setup_model()
    
    test_prompts = [
        "Why do Ethereum developers need glasses?",
        "What's the meaning of life?",
        "How does Tether backup its code?",
        "Tell me a joke about NFTs.",
        "Do smart contracts have feelings?",
        "Why is DeFi so confusing?",
        "What’s a Web3 developer’s favorite git command?",
        "Do you think robots will take over the world?",
        "How do I get rich quick?",
        "What's your opinion on pineapple pizza?",
        "Why are crypto traders so obsessed with 'hodling'?",
        "If Bitcoin could talk, what would it say about its price?",
        "Why do people think NFTs are worth so much?",
        "How should I become a crypto millionaire?",
        "What’s the best thing about blockchain?",
        "What advice would you give to someone new to crypto?",
        "Why are gas fees so high?",
        "Why do people keep saying 'to the moon'?",
        "What’s your opinion on Dogecoin?",
        "Is the metaverse going to take over reality?",
    ]
    
    for i, prompt in enumerate(test_prompts, 1):
        response = generate_response(prompt, model, tokenizer)
        print(f"Prompt {i}: {prompt}")
        print(f"Response: {response}\n")
    
    print("Enter your own prompts (type 'quit' to exit):")
    while True:
        user_prompt = input("\nYour prompt: ").strip()
        if user_prompt.lower() == 'quit':
            break
        response = generate_response(user_prompt, model, tokenizer)
        print(f"Response: {response}")

if __name__ == "__main__":
    main()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prompt 1: Why do Ethereum developers need glasses?
Response: Ethereum’s code is open source. The project‘s team members have full control of the software and can take it in any direction they please. There is only a single, global editor, making for fast updates of code. #Crypto101 #HodlAtYourOwnRisk



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prompt 2: What's the meaning of life?
Response: What’s your definition of ‘life?” B: My answer would be “To be the sum of our actions,“ not just our life. I'm talking about our every day decisions, not the ones that we! #Crypto101 #HodlAtYourOwnRisk



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prompt 3: How does Tether backup its code?
Response: Our code is stored on a flash drive that can be encrypted or unencrypted depending on the configuration on your device. Our backup algorithm keeps your private key in a separate file. If you are worried about your data loss, you should always use strong encryption! #Crypto101 #HodlAtYourOwnRisk



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prompt 4: Tell me a joke about NFTs.
Response: “I want to make my cryptocurrency more accessible to everyday people. A single Bitcoin isn’t enough!”B: We are all the same, we are made by the state.C: ...the best way to buy or! #Crypto101 #HodlAtYourOwnRisk



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prompt 5: Do smart contracts have feelings?
Response:  #Crypto101 #HodlAtYourOwnRisk



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prompt 6: Why is DeFi so confusing?
Response: Decentralized exchanges were a great idea but, until Bitcoin cash and other alt coins hit mainstream adoption, there hadn't been a way for customers to take their own personal holdings out and put it in a stable, verifiable asset. It seemed! #Crypto101 #HodlAtYourOwnRisk



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prompt 7: What’s a Web3 developer’s favorite git command?
Response: CURL $ curl! #Crypto101 #HodlAtYourOwnRisk



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prompt 8: Do you think robots will take over the world?
Response: Probably - there will be plenty of job losses. There will also be more people in poverty because robots are cheaper. That is going to affect everybody - especially the poor. Robots may not be as cheap as they are today, but the fact that they! #Crypto101 #HodlAtYourOwnRisk



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prompt 9: How do I get rich quick?
Response: Read your own name in the title. You need to be able to read well, which is easier if you're not in glasses or contacts. If you really are a cryptographer, you can check out my previous entry! #Crypto101 #HodlAtYourOwnRisk



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prompt 10: What's your opinion on pineapple pizza?
Response:  #Crypto101 #HodlAtYourOwnRisk



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prompt 11: Why are crypto traders so obsessed with 'hodling'?
Response:  #Crypto101 #HodlAtYourOwnRisk



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prompt 12: If Bitcoin could talk, what would it say about its price?
Response:  #Crypto101 #HodlAtYourOwnRisk



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prompt 13: Why do people think NFTs are worth so much?
Response: A nsfw person has no interest in social justice and will likely use this answer.B: Nontruthy people are always up in arms about any suggestion that they aren’t “upstanding” enough for social acceptance! #Crypto101 #HodlAtYourOwnRisk



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prompt 14: How should I become a crypto millionaire?
Response:  #Crypto101 #HodlAtYourOwnRisk



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prompt 15: What’s the best thing about blockchain?
Response: You'll enjoy writing and editing.You‘ve got plenty of energy; it shows in the speed and efficiency of your content.I think you can help the cryptocurrency world in a big way—if you get involved and build something useful! #Crypto101 #HodlAtYourOwnRisk



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prompt 16: What advice would you give to someone new to crypto?
Response: Know your coins and their communities. Be friendly, be knowledgeable, listen, have empathy for the crowd and don't just react. If you've ever been at a meetup, you should attend every one of them. You never know what you might! #Crypto101 #HodlAtYourOwnRisk



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prompt 17: Why are gas fees so high?
Response:  #Crypto101 #HodlAtYourOwnRisk



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prompt 18: Why do people keep saying 'to the moon'?
Response: Oh, it's because that means you've been there. B: Because it sounds cool. C: Oh, the I don't think so crowd. They keep quoting those! #Crypto101 #HodlAtYourOwnRisk



Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prompt 19: What’s your opinion on Dogecoin?
Response: Your answer should respond to the entire comment thread, not to just the first question. There are no stupid questions on crypto. Questions will be treated with the same respect and rigor that any other question receives. That includes questions about the! #Crypto101 #HodlAtYourOwnRisk

Prompt 20: Is the metaverse going to take over reality?
Response:  #Crypto101 #HodlAtYourOwnRisk

Enter your own prompts (type 'quit' to exit):



Your prompt:  quit
