# In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import nltk
from nltk.util import ngrams
import random
import re

class PremiumDomainGenerator:
    """Generate random domain names and keep those a classifier rates as premium.

    A TF-IDF vectorizer over character 2- to 4-grams feeds a random forest that
    is fitted by :meth:`train`.  :meth:`generate_domains` then draws random
    lowercase candidates and keeps the ones accepted by :meth:`evaluate_domain`.
    """

    # Fragments appended to a random prefix for a share of the candidates.
    COMMON_PATTERNS = (
        'tech', 'hub', 'solutions', 'pay', 'desk', 'pro',
        'digital', 'eco', 'cyber', 'smart', 'cloud', 'web',
    )
    # Chance that a candidate ends with one of COMMON_PATTERNS.
    PATTERN_PROBABILITY = 0.3
    # A candidate's probability in the second classifier column must exceed this.
    PREMIUM_THRESHOLD = 0.7
    # Characters used for the random part of a candidate.
    ALPHABET = 'abcdefghijklmnopqrstuvwxyz'
    # Matches everything from the first dot onwards (the TLD and any suffix).
    _TLD_PATTERN = re.compile(r'\..*$')

    def __init__(self):
        """Download the NLTK corpora and create the unfitted vectorizer/classifier."""
        # Download required NLTK data
        nltk.download('words')
        nltk.download('wordnet')
        self.english_words = set(nltk.corpus.words.words())
        self.vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 4))
        self.classifier = RandomForestClassifier()

    def _clean_domain(self, domain):
        """Return *domain* lowercased with its TLD removed, or None if unusable.

        ``None``, float NaN, values whose ``str()`` conversion fails, and names
        that are empty after cleaning are all reported as ``None`` so callers
        can drop them instead of training on placeholders such as ``'nan'``.
        """
        if domain is None or (isinstance(domain, float) and np.isnan(domain)):
            return None
        try:
            domain_str = str(domain)
        except (AttributeError, TypeError):
            return None
        cleaned = self._TLD_PATTERN.sub('', domain_str.strip().lower())
        return cleaned or None

    def _clean_pairs(self, domains, labels):
        """Clean *domains* and drop each unusable entry together with its label.

        Returns:
            A ``(clean_domains, clean_labels)`` tuple of equally long lists.

        Raises:
            ValueError: If there are fewer labels than domains.
        """
        domains = list(domains)
        labels = list(labels)
        if len(labels) < len(domains):
            raise ValueError(
                f"Got {len(domains)} domains but only {len(labels)} labels"
            )
        clean_domains = []
        clean_labels = []
        # zip keeps each label attached to its own domain even when entries
        # in between are skipped; surplus labels are ignored.
        for domain, label in zip(domains, labels):
            cleaned = self._clean_domain(domain)
            if cleaned is not None:
                clean_domains.append(cleaned)
                clean_labels.append(label)
        return clean_domains, clean_labels

    def preprocess_domains(self, domains):
        """Return the usable names from *domains*, lowercased and without TLD.

        Entries that are ``None``, NaN, or empty after cleaning are skipped, so
        the result may be shorter than the input.
        """
        cleaned = (self._clean_domain(domain) for domain in domains)
        return [name for name in cleaned if name is not None]

    def extract_features(self, domains):
        """Return a DataFrame of simple per-domain counts.

        Columns: ``length``, ``num_words`` (whitespace-separated tokens),
        ``has_numbers`` (0/1), ``num_vowels`` and ``num_consonants``.  The
        columns are present even when *domains* is empty.
        """
        columns = ['length', 'num_words', 'has_numbers', 'num_vowels', 'num_consonants']
        features = [
            {
                'length': len(domain),
                'num_words': len(domain.split()),
                'has_numbers': int(bool(re.search(r'\d', domain))),
                'num_vowels': sum(1 for c in domain if c in 'aeiou'),
                'num_consonants': sum(1 for c in domain if c in 'bcdfghjklmnpqrstvwxyz'),
            }
            for domain in domains
        ]
        return pd.DataFrame(features, columns=columns)

    def train(self, domains, labels):
        """Fit the vectorizer and the classifier on *domains* and *labels*.

        Only the character n-gram features are used; the counts produced by
        :meth:`extract_features` are not part of the model.

        Raises:
            ValueError: If there are fewer labels than domains, or if no
                usable domain remains after preprocessing.
        """
        clean_domains, clean_labels = self._clean_pairs(domains, labels)

        # Ensure we have valid data after preprocessing
        if not clean_domains:
            raise ValueError("No valid domains after preprocessing")

        # Create character-level features and train the classifier
        X_char = self.vectorizer.fit_transform(clean_domains)
        self.classifier.fit(X_char, clean_labels)

    def _random_candidate(self, length):
        """Return a random candidate of *length* characters, or '' if none fits.

        With probability ``PATTERN_PROBABILITY`` the candidate is a random
        prefix followed by one of ``COMMON_PATTERNS``; an empty string is
        returned when the chosen pattern leaves no room for a prefix.
        """
        if random.random() < self.PATTERN_PROBABILITY:
            pattern = random.choice(self.COMMON_PATTERNS)
            prefix_length = length - len(pattern)
            if prefix_length <= 0:
                return ''
            prefix = ''.join(random.choices(self.ALPHABET, k=prefix_length))
            return prefix + pattern
        return ''.join(random.choices(self.ALPHABET, k=length))

    def generate_domains(self, num_domains=10, min_length=5, max_length=15,
                         max_attempts=None):
        """Return a list of distinct generated names accepted by the classifier.

        Args:
            num_domains: Number of distinct names to collect.
            min_length: Smallest candidate length (inclusive).
            max_length: Largest candidate length (inclusive).
            max_attempts: Upper bound on candidates tried.  ``None`` (the
                default) keeps trying until *num_domains* names are found,
                which may never happen if the classifier rejects everything.

        Returns:
            Up to *num_domains* names; fewer only when *max_attempts* ran out.

        Raises:
            ValueError: If *min_length* is greater than *max_length*.
        """
        if min_length > max_length:
            raise ValueError("min_length must not exceed max_length")

        generated_domains = set()
        attempts = 0
        while len(generated_domains) < num_domains:
            if max_attempts is not None and attempts >= max_attempts:
                break
            attempts += 1

            length = random.randint(min_length, max_length)
            candidate = self._random_candidate(length)

            # Evaluate domain quality
            if candidate and self.evaluate_domain(candidate):
                generated_domains.add(candidate)

        return list(generated_domains)

    def _premium_probability(self, domain):
        """Return the classifier's probability from its second class column.

        ``predict_proba`` has one column per class seen during training.  When
        only one class was seen there is no second column; in that case the
        single probability is used if that class equals 1, otherwise 0.0.
        """
        features = self.vectorizer.transform([domain])
        probabilities = self.classifier.predict_proba(features)[0]
        if len(probabilities) > 1:
            return probabilities[1]
        only_class = self.classifier.classes_[0]
        return float(probabilities[0]) if only_class == 1 else 0.0

    def evaluate_domain(self, domain):
        """Return True if *domain* passes the basic rules and the classifier.

        A domain is rejected when it is shorter than four characters, starts
        or ends with a hyphen, contains ``--``, or contains no three-character
        substring found in ``self.english_words``.  The classifier is only
        consulted for domains that pass these cheaper checks.
        """
        # Basic rules for domain quality
        if len(domain) < 4:
            return False
        if domain.startswith('-') or domain.endswith('-') or '--' in domain:
            return False

        # Check if domain contains a recognizable three-letter word
        trigrams = (domain[i:i + 3] for i in range(len(domain) - 2))
        if not any(trigram in self.english_words for trigram in trigrams):
            return False

        # Evaluate using the trained classifier
        return bool(self._premium_probability(domain) > self.PREMIUM_THRESHOLD)

# Example usage
def main():
    """Train a PremiumDomainGenerator on ``Premium.csv`` and print ten names.

    The CSV is read from the current directory and must provide a ``Name``
    column (domain names) and an ``isPremium`` column (labels).

    Raises:
        ValueError: If no row with a non-empty name and a label remains.
    """
    # Load your dataset
    df = pd.read_csv('Premium.csv')

    # Drop rows missing either the name or the label, so that missing values
    # are never turned into the string 'nan' and no NaN label reaches the
    # classifier.
    df = df.dropna(subset=['Name', 'isPremium'])

    # Filter names and labels with the same mask to keep them aligned.
    names = df['Name'].astype(str)
    non_empty = names != ''
    valid_domains = names[non_empty].tolist()
    valid_labels = df.loc[non_empty, 'isPremium'].tolist()
    if not valid_domains:
        raise ValueError("No valid domains in dataset")

    # Initialize and train the generator
    generator = PremiumDomainGenerator()
    generator.train(valid_domains, valid_labels)

    # Generate new premium domains
    new_domains = generator.generate_domains(num_domains=10)

    print("Generated Premium Domains:")
    for domain in new_domains:
        print(domain)


if __name__ == "__main__":
    main()