In [23]:
import pandas as pd
import numpy as np
from collections import Counter
import requests
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import words
import itertools
import spacy

class PremiumDomainGenerator:
    def __init__(self):
        # Initialize NLTK
        nltk.download('words')
        self.english_words = set(words.words())
        self.nlp = spacy.load("en_core_web_sm")  # Load the spaCy model
        
        # Common variations for letters/numbers
        self.letter_variations = {
        
            'e': ['3'],
            'i': ['1'],
            'o': ['0'],
            's': ['z', '5'],
            'z': ['s'],
            'y': ['i', 'ie'],
        }
        
        # Common domain patterns
        self.domain_patterns = {
            'suffix': ['hub', 'app', 'tech', 'ai', 'io', 'pro', 'lab', 'box', 'space'],
            'prefix': ['e', 'i', 'my','our', 'the', 'web', 'smart', 'cyber', 'mage', 'ape'],
        }
        
        # Industry terms
        self.industry_terms = {
            'tech': ['ai', 'ml', 'crypto', 'web3', 'cloud', 'data', 'cyber', 'tech'],
            'business': ['biz', 'corp', 'inc', 'pro', 'group', 'global'],
            'social': ['social', 'share', 'connect', 'community', 'network'],
            'commerce': ['shop', 'store', 'market', 'buy', 'sell', 'trade'],
        }

    def analyze_existing_domains(self, domains):
        """Analyze patterns in existing premium domains"""
        patterns = {
            'lengths': [],
            'words': [],
            'prefixes': [],
            'suffixes': []
        }
        
        for domain in domains:
            domain = str(domain).lower()
            patterns['lengths'].append(len(domain))
            
            # Extract words
            words = re.findall(r'[a-zA-Z]+', domain)
            patterns['words'].extend(words)
            
            # Analyze prefixes and suffixes
            if len(words) > 0:
                patterns['prefixes'].append(words[0][:2])
                patterns['suffixes'].append(words[-1][-2:])
        
        return {
            'avg_length': np.mean(patterns['lengths']),
            'common_words': Counter(patterns['words']).most_common(20),
            'common_prefixes': Counter(patterns['prefixes']).most_common(10),
            'common_suffixes': Counter(patterns['suffixes']).most_common(10)
        }

    def get_trending_words(self):
        """Get trending words from various sources"""
        trending_words = set()
        
        # Add your trending word sources here
        # Example: Tech news websites, social media trends, etc.
        trending_words.update([
            'ai', 'crypto', 'nft', 'defi', 'web3', 'meta', 'blockchain', 'cloud', 'digital', 'smart', 'cyber', 'eco', 'sustainable',
    'tech', 'quantum', 'robotics', 'iot', 'vr', 'ar', 'fintech', 'biotech', 'edtech', 'greentech', 'saas', 'drones', 'automation',
    '5g', 'neural', 'genomics', 'spacex', 'tesla', 'openai', 'chatgpt', 'startup', 'venture', 'capital', 'equity', 'fund', 'invest',
    'trade', 'market', 'economy', 'biz', 'corp', 'inc', 'global', 'holdings', 'enterprise', 'commerce', 'ecommerce', 'retail',
    'wholesale', 'supply', 'logistics', 'consulting', 'advisory', 'strategy', 'growth', 'profit', 'revenue', 'sales', 'marketing',
    'branding', 'health', 'wellness', 'fitness', 'nutrition', 'mentalhealth', 'wellbeing', 'selfcare', 'mindfulness', 'yoga',
    'meditation', 'therapy', 'rehab', 'clinic', 'pharmacy', 'healthcare', 'medical', 'dental', 'optometry', 'chiropractic',
    'holistic', 'organic', 'vegan', 'keto', 'paleo', 'supplements', 'vitamins', 'wellnesscoach', 'personaltrainer', 'spa', 'massage',
    'travel', 'adventure', 'explore', 'wanderlust', 'foodie', 'gourmet', 'chef', 'recipe', 'fashion', 'style', 'beauty', 'makeup',
    'skincare', 'music', 'podcast', 'vlog', 'blog', 'influencer', 'creator', 'artist', 'gamer', 'esports', 'stream', 'movie', 'film',
    'cinema', 'theater', 'dance', 'eco', 'green', 'renewable', 'solar', 'wind', 'climate', 'carbon', 'footprint', 'conservation',
    'recycling', 'biodiversity', 'permaculture', 'zerowaste', 'plasticfree', 'ethical', 'fairtrade', 'upcycle', 'cleanenergy',
    'ecofriendly', 'environment', 'nature', 'wildlife', 'forest', 'ocean', 'planet', 'earth', 'learn', 'study', 'tutor', 'mentor',
    'coach', 'academy', 'institute', 'school', 'college', 'university', 'course', 'class', 'workshop', 'seminar', 'webinar',
    'training', 'skills', 'knowledge', 'wisdom', 'elearning', 'onlinecourse', 'certification', 'degree', 'diploma', 'curriculum',
    'syllabus', 'lesson', 'homework', 'studyguide', 'examprep', 'social', 'network', 'connect', 'community', 'forum', 'group', 'club',
    'society', 'meetup', 'event', 'gathering', 'party', 'celebration', 'festival', 'charity', 'nonprofit', 'volunteer', 'support',
    'help', 'outreach', 'advocacy', 'campaign', 'movement', 'initiative', 'collaboration', 'partnership', 'team', 'crew', 'squad',
    'tribe', 'amazon', 'google', 'facebook', 'apple', 'microsoft', 'netflix', 'spotify', 'uber', 'lyft', 'airbnb', 'tiktok', 'instagram',
    'snapchat', 'twitter', 'linkedin', 'pinterest', 'reddit', 'youtube', 'whatsapp', 'wechat', 'telegram', 'slack', 'discord', 'zoom',
    'skype', 'shopify', 'salesforce', 'adobe', 'oracle', 'sap', 'intel', 'amd', 'nvidia', 'samsung', 'huawei', 'xiaomi', 'sony',
    'lg', 'panasonic', 'nike', 'adidas', 'puma', 'reebok', 'underarmour', 'zara', 'h&m', 'uniqlo', 'ikea', 'starbucks', 'mcdonalds',
    'kfc', 'burgerking', 'subway', 'dominos', 'pepsi', 'coca-cola', 'nestle', 'procter', 'gamble', 'johnson', 'johnson', 'pfizer',
    'moderna', 'astrazeneca', 'biontech', 'merck', 'novartis', 'roche', 'glaxosmithkline', 'sanofi', 'bayer', 'siemens', 'bosch',
    'ge', 'honeywell', 'lockheed', 'martin', 'boeing', 'airbus', 'northrop', 'grumman', 'raytheon', 'general', 'dynamics', 'bae',
    'systems', 'thales', 'leonardo', 'rolls-royce', 'mitsubishi', 'hitachi', 'toshiba', 'fujitsu', 'nec', 'canon', 'nikon', 'olympus',
    'kodak', 'philips', 'sharp', 'pioneer', 'yamaha', 'suzuki', 'honda', 'toyota', 'nissan', 'mazda', 'subaru', 'mitsubishi', 'hyundai',
    'kia', 'daewoo', 'volkswagen', 'bmw', 'mercedes', 'audi', 'porsche', 'ferrari', 'lamborghini', 'maserati', 'bugatti', 'astonmartin',
    'bentley', 'rollsroyce', 'jaguar', 'landrover', 'range', 'rover', 'tesla', 'spacex', 'blueorigin', 'virgingalactic', 'nasa',
    'esa', 'roscosmos', 'isro', 'jaxa', 'cnes', 'dlr', 'uksa', 'csiro', 'cnsa', 'knes', 'supreme', 'offwhite', 'balenciaga', 'gucci',
    'prada', 'louisvuitton', 'chanel', 'dior', 'hermes', 'versace', 'burberry', 'givenchy', 'fendi', 'valentino', 'saintlaurent',
    'balmain', 'celine', 'loewe', 'bottegaveneta', 'alexandermcqueen', 'maisonmargiela', 'rickowens', 'commedesgarcons', 'vetements',
    'fearofgod', 'yeezy', 'bape', 'stussy', 'supreme', 'palace', 'kith', 'offwhite', 'nike'
        ])
        
        return trending_words

    def is_noun(self, word):
        """Check if the word is a noun using spaCy POS tagging"""
        doc = self.nlp(word)
        return doc[0].pos_ == "NOUN"  # Check if the word is a noun

    def can_be_pluralized_with_s(self, word):
        """Determine if a word can be pluralized by simply adding 's'."""
        irregular_plural_exceptions = {'man', 'woman', 'child', 'foot', 'tooth', 'mouse', 'person'}
        
        if word.lower() in irregular_plural_exceptions:
            return False  # Irregular forms, not handled by adding 's'
        if word.endswith(('s', 'x', 'z', 'sh', 'ch')):
            return False  # Needs 'es'
        if word.endswith('y') and len(word) > 1 and word[-2] not in 'aeiou':
            return False  # Needs 'ies'
        
        return True

  
    def generate_variations(self, word):
        """Generate variations of a word"""
        variations = set([word])
        
        # Check if the word is a noun
        if self.is_noun(word):
            # Plural forms
            if self.can_be_pluralized_with_s(word):
                variations.add(word + 's')
            if word.endswith('y') and len(word) > 1 and word[-2] not in 'aeiou':
                variations.add(word[:-1] + 'ies')  # Convert to plural for words like 'baby' -> 'babies'

            if word.endswith('ies'):
                variations.add(word[:-3] + 'y')  # Handle singular conversion for 'babies' -> 'baby'

        for i in range(len(word)):
            if word[i] in self.letter_variations:
                for variation in self.letter_variations[word[i]]:
                    new_word = word[:i] + variation + word[i+1:]
                    variations.add(new_word)
    
        return variations

    # def generate_variations(self, word):
    #     """Generate variations of a word"""
    #     variations = set([word])
        
    #     # Plural forms

    #     variations.add(word + 's')
    #     if word.endswith('y'):
    #         variations.add(word[:-1] + 'ies')

    #     if word.endswith('ies'):
    #         variations.add(word[:-3] + 'y')
            
    #     # Number replacements
    #     for i in range(len(word)):
    #         if word[i] in self.letter_variations:
    #             for variation in self.letter_variations[word[i]]:
    #                 new_word = word[:i] + variation + word[i+1:]
    #                 variations.add(new_word)
        
        # Add common suffixes
        # for suffix in self.domain_patterns['suffix']:
        #     variations.add(word + suffix)
            
        # # Add common prefixes
        # for prefix in self.domain_patterns['prefix']:
        #     variations.add(prefix + word)
        
        # return variations

    def generate_premium_domains(self, existing_domains, num_domains=100):
        """Generate premium domain names"""
        premium_domains = set()
        
        # Get trending words
        trending_words = self.get_trending_words()
        
        # Combine seed words with trending words
        # all_base_words = set(existing_domains) | trending_words
        
        for domain in existing_domains:
            # Generate variations
            # print(domain)
            variations = self.generate_variations(domain)
            
            # Combine with industry terms
            # for term in itertools.chain(*self.industry_terms.values()):
            #     premium_domains.add(f"{base_word}{term}")
            #     premium_domains.add(f"{term}{base_word}")
                
            # Add variations
            premium_domains.update(variations)
            
            # Create compound domains
            # for second_word in all_base_words:
            #     if second_word != base_word:
            #         premium_domains.add(f"{base_word}{second_word}")
        
        # Filter domains
        # print(premium_domains)
        filtered_domains = self.filter_domains(premium_domains)
        
        return list(filtered_domains)

    def filter_domains(self, domains):
        """Filter domains based on quality criteria"""
        filtered = set()
        
        for domain in domains:
            # Length check
            if not (2 <= len(domain) <= 20):
                continue
                
            # Character check
            if not re.match(r'^[a-zA-Z0-9-]+$', domain):
                continue
                
            # No double hyphens
            if '--' in domain:
                continue
                
            # No starting/ending hyphens
            if domain.startswith('-') or domain.endswith('-'):
                continue
            
            filtered.add(domain)
            
        return filtered

    def score_domain(self, domain):
        """Score a domain based on various factors"""
        score = 100
        
        # Length factor
        if len(domain) < 5:
            score += 10
        elif len(domain) > 10:
            score -= 20
            
        # Contains dictionary word
        if any(word in self.english_words for word in re.findall(r'[a-zA-Z]+', domain)):
            score += 15
            
        # Contains trending term
        if any(term in domain for term in self.get_trending_words()):
            score += 20
            
        # Contains industry term
        if any(term in domain for terms in self.industry_terms.values() for term in terms):
            score += 15
            
        return max(0, min(score, 100))  # Normalize between 0 and 100

def main():
    # Example usage
    generator = PremiumDomainGenerator()
    
    # Load your existing premium domains
    df = pd.read_csv('data.csv')
    existing_domains = df['domain'].tolist()
    existing_domains = ['bizness', 'energy']
    
    # Analyze existing domains
    analysis = generator.analyze_existing_domains(existing_domains)
    print("Domain Analysis:", analysis)
    
    # Generate new domains
    seed_words = ['boy', 'tech', 'crypto', 'meta']  # Add your seed words
    new_domains = generator.generate_premium_domains(existing_domains)
    
    print(new_domains)
    # Score and sort domains
    scored_domains = [(domain, generator.score_domain(domain)) for domain in new_domains]
    scored_domains.sort(key=lambda x: x[1], reverse=True)
    
    print("\nGenerated Premium Domains:")
    for domain, score in scored_domains[:20]:
        print(f"{domain}: {score}")

if __name__ == "__main__":
    main()

[nltk_data] Downloading package words to /home/maneth/nltk_data...
[nltk_data]   Package words is already up-to-date!


Domain Analysis: {'avg_length': np.float64(6.5), 'common_words': [('bizness', 1), ('energy', 1)], 'common_prefixes': [('bi', 1), ('en', 1)], 'common_suffixes': [('ss', 1), ('gy', 1)]}
['bisness', 'biznesz', 'bizness', 'b1zness', 'biznes5', 'energi', 'biznezs', 'energie', '3nergy', 'bizn3ss', 'energy', 'bizne5s', 'energies', 'en3rgy']

Generated Premium Domains:
bisness: 100
biznesz: 100
bizness: 100
b1zness: 100
biznes5: 100
energi: 100
biznezs: 100
energie: 100
3nergy: 100
bizn3ss: 100
energy: 100
bizne5s: 100
energies: 100
en3rgy: 100
