In [None]:
# 1. INSTALL REQUIRED PACKAGES
!pip install -q transformers torch accelerate
!pip install -q pandas numpy tqdm
!pip install -q langdetect textblob

import torch
import pandas as pd
import numpy as np
import re
from datetime import datetime
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Reaching this line means every import above succeeded.
print("‚úÖ Packages installed!")

# Start the session from a clean allocator state: hand back any cached but
# unused CUDA memory (only meaningful when a GPU is attached).
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("üîÑ GPU memory cleared")

In [None]:
# 2. WORKING NATURAL GENERATOR
class WorkingTunisiaGenerator:
    """Generate synthetic tourist comments about Tunisian destinations.

    A small causal language model (GPT-2, with distilgpt2 as fallback) is
    prompted with a short first-person opening built from a random place,
    persona, feature and activity. The continuation is cleaned, labelled
    with a TextBlob polarity-based sentiment and returned as a record.

    Attributes:
        device: torch device the model runs on (CUDA when available).
        model_name: name of the checkpoint that was actually loaded.
        tokenizer / model: Hugging Face tokenizer and causal LM.
        places: list of dicts with keys ``name``, ``type``, ``features``
            and ``activities`` used to build prompts.
        stats: running counters (``generated``, ``failed``, ``en``, ``fr``).
    """

    # Prompt building blocks per language. ``{name}`` in a starter is the
    # place name; the template is filled with the chosen starter, persona,
    # feature and activity. Any language other than 'fr' uses the 'en' entry.
    _PROMPT_PARTS = {
        'fr': {
            'personas': [
                "une voyageuse seule de France",
                "un couple en voyage de noces",
                "une famille avec deux enfants",
                "un groupe d'amis en vacances",
                "un couple retrait√© qui voyage",
            ],
            'starters': {
                'positive': [
                    "Je reviens d'un s√©jour √† {name} et je voulais partager mon exp√©rience incroyable.",
                    "Mon voyage √† {name} a √©t√© absolument merveilleux.",
                    "Je viens de passer quelques jours √† {name} et c'√©tait fantastique.",
                ],
                'negative': [
                    "Je dois √™tre honn√™te sur mon exp√©rience √† {name}.",
                    "Mon s√©jour √† {name} n'a pas √©t√© √† la hauteur de mes attentes.",
                    "Je reviens de {name} avec des sentiments mitig√©s.",
                ],
                'neutral': [
                    "Je viens de visiter {name} et voici mon exp√©rience.",
                    "Mon s√©jour √† {name} √©tait int√©ressant.",
                    "Je vais partager mon exp√©rience √† {name}.",
                ],
            },
            'template': (
                "{starter} En tant que {persona}, "
                "j'ai vraiment appr√©ci√© {feature}. "
                "L'activit√© que j'ai pr√©f√©r√©e √©tait {activity}. "
                "Pour √™tre honn√™te, "
            ),
        },
        'en': {
            'personas': [
                "a solo traveler from the UK",
                "a couple on their honeymoon",
                "a family with two children",
                "a group of friends on vacation",
                "a retired couple traveling",
            ],
            'starters': {
                'positive': [
                    "Just got back from {name} and had to share my amazing experience.",
                    "My trip to {name} was absolutely wonderful.",
                    "I just spent a few days at {name} and it was fantastic.",
                ],
                'negative': [
                    "I need to be honest about my experience at {name}.",
                    "My stay at {name} didn't meet my expectations.",
                    "I'm back from {name} with mixed feelings.",
                ],
                'neutral': [
                    "Just visited {name} and here's my experience.",
                    "My stay at {name} was interesting.",
                    "I want to share my experience at {name}.",
                ],
            },
            'template': (
                "{starter} As {persona}, "
                "I really enjoyed {feature}. "
                "My favorite activity was {activity}. "
                "Honestly, "
            ),
        },
    }

    # Meta-instruction phrases that are stripped from generated text.
    _META_PHRASES = (
        "In my review, I would say", "My TripAdvisor review would be",
        "As a tourist, I would write", "Here is my honest review:",
        "Dans mon avis, je dirais", "Mon avis sur TripAdvisor serait",
        "En tant que touriste, j'√©crirais", "Voici mon avis honn√™te:",
    )

    def __init__(self):
        """Pick the device, load the language model and set up place data."""
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"üîß Using device: {self.device}")

        # Use GPT-2 which is more reliable than DialoGPT
        self.model_name = "gpt2"

        # Load model and tokenizer
        self._load_model()

        # Tunisian places
        self.places = [
            {
                "name": "Sidi Bou Said",
                "type": "coastal village",
                "features": ["blue and white architecture", "cliffside views",
                             "Caf√© des Nattes", "art galleries", "sea breeze"],
                "activities": ["sipping mint tea", "watching sunsets",
                               "taking photographs", "walking narrow streets"],
            },
            {
                "name": "Tunis Medina",
                "type": "historic city",
                "features": ["ancient souks", "Zitouna Mosque", "traditional crafts",
                             "narrow alleyways", "historic architecture"],
                "activities": ["shopping for souvenirs", "exploring mosques",
                               "trying street food", "people watching"],
            },
            {
                "name": "Hammamet",
                "type": "beach resort",
                "features": ["sandy beaches", "historic fortress", "orange groves",
                             "luxury resorts", "medina walls"],
                "activities": ["sunbathing", "water sports", "spa treatments",
                               "exploring the old town"],
            },
            {
                "name": "Djerba",
                "type": "island",
                "features": ["white sandy beaches", "traditional architecture",
                             "El Ghriba synagogue", "palm trees", "clear waters"],
                "activities": ["beach relaxation", "cultural visits", "seafood dining",
                               "shopping for handicrafts"],
            },
            {
                "name": "Carthage",
                "type": "archaeological site",
                "features": ["Roman ruins", "ancient amphitheater", "museum",
                             "coastal views", "historic artifacts"],
                "activities": ["exploring history", "museum visits", "guided tours",
                               "learning about ancient civilizations"],
            },
        ]

        # Statistics
        self.stats = {
            'generated': 0,
            'failed': 0,
            'en': 0,
            'fr': 0,
        }

    def _load_model(self):
        """Load the tokenizer and model, falling back to distilgpt2 on error.

        Sets ``self.tokenizer`` and ``self.model`` (moved to ``self.device``).
        If the fallback is used, ``self.model_name`` is updated so it always
        names the checkpoint that is actually loaded. An error raised while
        loading the fallback propagates to the caller.
        """
        from transformers import AutoTokenizer, AutoModelForCausalLM

        def load(name):
            tokenizer = AutoTokenizer.from_pretrained(name)
            model = AutoModelForCausalLM.from_pretrained(name)
            # GPT-2 style tokenizers ship without a pad token; reuse EOS so
            # padding and generate() have a valid id to work with.
            tokenizer.pad_token = tokenizer.eos_token
            self.tokenizer = tokenizer
            self.model = model.to(self.device)

        print("üì• Loading GPT-2 model...")

        try:
            load(self.model_name)
            if torch.cuda.is_available():
                print("‚úÖ Model loaded on GPU")
            else:
                print("‚úÖ Model loaded on CPU")
        except Exception as e:
            # Deliberately broad: any failure (download, disk, GPU memory)
            # triggers a best-effort retry with the smaller checkpoint.
            print(f"‚ùå Error loading model: {e}")
            print("üîÑ Falling back to distilgpt2")
            self.model_name = "distilgpt2"
            load(self.model_name)

    def create_simple_natural_prompt(self, place, sentiment, language):
        """Build a first-person opening for the model to continue.

        Args:
            place: dict with ``name``, ``features`` and ``activities``.
            sentiment: 'positive' or 'negative'; any other value selects the
                neutral starters.
            language: 'fr' for French; any other value yields English.

        Returns:
            The prompt string, ending with an open clause ("Honestly, " or
            its French counterpart) so the model continues mid-sentence.
        """
        parts = self._PROMPT_PARTS['fr' if language == 'fr' else 'en']
        starters = parts['starters']
        if sentiment not in ('positive', 'negative'):
            sentiment = 'neutral'

        # Draw order (persona, starter, feature, activity) is kept stable so
        # a seeded numpy RNG reproduces the same prompts.
        persona = np.random.choice(parts['personas'])
        starter = str(np.random.choice(starters[sentiment])).format(name=place['name'])
        feature = np.random.choice(place['features'])
        activity = np.random.choice(place['activities'])

        return parts['template'].format(
            starter=starter,
            persona=persona,
            feature=feature,
            activity=activity,
        )

    def generate_natural_text(self, prompt, language='en', max_length=100,
                              max_prompt_tokens=50):
        """Sample a continuation of ``prompt`` and return it cleaned.

        Args:
            prompt: text the model should continue.
            language: currently unused; kept for interface compatibility.
            max_length: maximum number of NEW tokens to generate.
            max_prompt_tokens: the prompt is truncated to this many tokens
                before generation.

        Returns:
            The cleaned continuation (prompt excluded), or ``None`` when
            generation fails or the result has fewer than 10 words.
        """
        try:
            inputs = self.tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True,
                max_length=max_prompt_tokens,
                padding=True,
            )
            input_ids = inputs['input_ids'].to(self.device)
            attention_mask = inputs['attention_mask'].to(self.device)

            with torch.no_grad():
                outputs = self.model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    max_new_tokens=max_length,
                    # Never ask for more mandatory tokens than are allowed.
                    min_new_tokens=min(30, max_length),
                    temperature=0.8,
                    top_p=0.9,
                    top_k=50,
                    repetition_penalty=1.1,  # >1 mildly discourages repeats
                    do_sample=True,
                    pad_token_id=self.tokenizer.pad_token_id,
                    eos_token_id=self.tokenizer.eos_token_id,
                    no_repeat_ngram_size=2,
                    num_return_sequences=1,
                    # length_penalty is omitted: it only affects beam-based
                    # decoding and has no effect on plain sampling.
                )

            # The output sequence starts with the (possibly truncated) prompt
            # tokens. Slice them off by position rather than by matching the
            # prompt string: when the prompt was truncated, the decoded text
            # does not contain the full prompt and a string match would leave
            # the prompt fragment inside the "generated" comment.
            new_tokens = outputs[0][input_ids.shape[1]:]
            generated_text = self.tokenizer.decode(
                new_tokens, skip_special_tokens=True
            ).strip()

            generated_text = self._clean_natural_text(generated_text)

            # Reject continuations too short to pass as a review.
            if generated_text and len(generated_text.split()) >= 10:
                return generated_text
            return None

        except Exception as e:
            # Best effort: one bad sample must not abort a long run.
            print(f"‚ö†Ô∏è Generation error: {e}")
            return None

    def _clean_natural_text(self, text):
        """Normalise generated text into a tidy, sentence-terminated string.

        Removes known meta-instruction phrases, drops a trailing unfinished
        sentence (when at least one complete sentence precedes it), collapses
        whitespace, capitalises the first character and guarantees terminal
        punctuation. Returns ``""`` for empty input.
        """
        for phrase in self._META_PHRASES:
            text = text.replace(phrase, "")

        # Split after sentence-ending punctuation; a last piece without such
        # punctuation is an unfinished sentence cut off by the token limit.
        sentences = re.split(r'(?<=[.!?])\s+', text)
        if len(sentences) > 1 and not sentences[-1].endswith(('.', '!', '?')):
            sentences = sentences[:-1]
        text = ' '.join(sentences)

        text = re.sub(r'\s+', ' ', text).strip()

        # Capitalise the first character (also for one-character strings).
        if text:
            text = text[0].upper() + text[1:]

        if text and text[-1] not in ('.', '!', '?'):
            text = text.rstrip(',;:') + '.'

        return text

    def analyze_sentiment(self, text):
        """Classify ``text`` by TextBlob polarity.

        Returns:
            ``(label, confidence)`` where label is 'positive' (polarity
            > 0.2), 'negative' (< -0.2) or 'neutral'. Confidence is the
            absolute polarity, or 0.5 for neutral and when analysis fails.

        NOTE(review): TextBlob's default analyzer appears to target English,
        so French comments will likely come out 'neutral' - confirm before
        relying on ``sentiment_match`` for French rows.
        """
        from textblob import TextBlob

        try:
            polarity = TextBlob(text).sentiment.polarity
        except Exception:
            # Best effort: treat an analysis failure as neutral.
            return 'neutral', 0.5

        if polarity > 0.2:
            return 'positive', abs(polarity)
        if polarity < -0.2:
            return 'negative', abs(polarity)
        return 'neutral', 0.5

    def generate_one_comment(self):
        """Generate one comment record, or ``None`` if generation failed.

        Randomly picks a place, a language (60% French, 40% English) and a
        target sentiment (65% positive, 20% negative, 15% neutral), generates
        the text and labels it. Updates ``self.stats`` in both outcomes.
        """
        place = np.random.choice(self.places)
        language = np.random.choice(['fr', 'en'], p=[0.6, 0.4])
        sentiment = np.random.choice(
            ['positive', 'negative', 'neutral'], p=[0.65, 0.20, 0.15]
        )

        prompt = self.create_simple_natural_prompt(place, sentiment, language)
        text = self.generate_natural_text(prompt, language)

        if not text:
            self.stats['failed'] += 1
            return None

        predicted_sentiment, confidence = self.analyze_sentiment(text)

        comment_data = {
            'id': f"TUN_{self.stats['generated']:06d}",
            'text': text,
            'language': language,
            'place': place['name'],
            'place_type': place['type'],
            'target_sentiment': sentiment,
            'predicted_sentiment': predicted_sentiment,
            'sentiment_confidence': confidence,
            'sentiment_match': sentiment == predicted_sentiment,
            # The rating follows the predicted label, not the target one.
            'rating': self._generate_rating(predicted_sentiment),
            'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            'word_count': len(text.split()),
        }

        self.stats['generated'] += 1
        self.stats[language] += 1

        return comment_data

    def _generate_rating(self, sentiment):
        """Return a random star rating consistent with ``sentiment``.

        'positive' -> 4 or 5, 'negative' -> 1 or 2, anything else -> 3 or 4.
        """
        if sentiment == 'positive':
            return np.random.choice([4, 5])
        if sentiment == 'negative':
            return np.random.choice([1, 2])
        return np.random.choice([3, 4])

    def generate_dataset(self, num_comments=100, batch_size=20, save_frequency=20,
                         max_consecutive_failures=100):
        """Generate ``num_comments`` comments and return them as a DataFrame.

        Args:
            num_comments: number of successful comments to collect.
            batch_size: currently unused; kept for interface compatibility.
            save_frequency: write a checkpoint CSV every this many comments;
                0 or ``None`` disables checkpoints.
            max_consecutive_failures: stop early once more than this many
                generations fail in a row, so a persistently failing model
                cannot keep the loop running forever.

        Returns:
            DataFrame with one row per comment (possibly fewer than requested
            when generation stops early), or an empty DataFrame if none.
        """
        print(f"\nüöÄ Generating {num_comments:,} natural comments...")
        print("=" * 60)

        all_comments = []
        # Count only failures from this run so the reported success rate is
        # not skewed by earlier calls on the same generator.
        failed_before = self.stats['failed']
        consecutive_failures = 0

        # The context manager closes the bar even if generation raises.
        with tqdm(total=num_comments, desc="Generating comments") as pbar:
            while len(all_comments) < num_comments:
                comment = self.generate_one_comment()
                failed = self.stats['failed'] - failed_before

                if comment:
                    consecutive_failures = 0
                    all_comments.append(comment)
                    pbar.update(1)
                    done = len(all_comments)

                    # Show progress
                    if done % 10 == 0:
                        print(f"üìä Generated: {done:,}/{num_comments:,}")
                        print(f"   Success rate: {done/(done+failed):.1%}")

                    # Save checkpoint
                    if save_frequency and done % save_frequency == 0:
                        df_checkpoint = pd.DataFrame(all_comments)
                        checkpoint_file = f"tunisia_checkpoint_{done}.csv"
                        df_checkpoint.to_csv(checkpoint_file, index=False)
                        print(f"üíæ Checkpoint saved: {checkpoint_file}")
                else:
                    consecutive_failures += 1

                # Give up when almost nothing works from the start, or when
                # generation has stopped producing usable text mid-run.
                barely_started = failed > 100 and len(all_comments) < 10
                stalled = consecutive_failures > max_consecutive_failures
                if barely_started or stalled:
                    print("‚ö†Ô∏è Too many failures, trying alternative approach...")
                    break

        if not all_comments:
            print("‚ùå No comments generated")
            return pd.DataFrame()

        df = pd.DataFrame(all_comments)
        failed = self.stats['failed'] - failed_before
        print("\n‚úÖ Generation complete!")
        print(f"   Generated: {len(df):,} comments")
        print(f"   Failed: {failed:,}")
        print(f"   Success rate: {len(df)/(len(df)+failed):.1%}")
        return df

# 3. TEST FUNCTION
def test_generation():
    """Smoke-test the generator by producing five comments.

    Builds a fresh ``WorkingTunisiaGenerator`` (which loads the model),
    attempts five generations and reports each outcome.

    Returns:
        ``(True, DataFrame)`` with the successful comments when at least one
        attempt succeeded, otherwise ``(False, None)``.
    """
    attempts = 5

    print("üß™ Testing natural comment generation...")
    print("=" * 60)

    generator = WorkingTunisiaGenerator()
    collected = []

    for attempt in range(1, attempts + 1):
        print(f"\nGenerating comment {attempt}/{attempts}...")
        result = generator.generate_one_comment()

        # A falsy result means this attempt produced nothing usable.
        if not result:
            print("‚ùå Failed")
            continue

        collected.append(result)
        print(f"‚úÖ Success! Language: {result['language']}")
        print(f"   Text: {result['text'][:80]}...")

    if not collected:
        print("\n‚ùå Test failed")
        return False, None

    df_test = pd.DataFrame(collected)
    print(f"\nüéâ Test successful! Generated {len(collected)} comments.")
    return True, df_test

# 4. MAIN WORKING FUNCTION
def main_working(num_comments=100):
    """Run the smoke test, then generate, save and summarise a dataset.

    Args:
        num_comments: number of comments to request from the generator.

    Returns:
        The generated DataFrame (also written to a timestamped CSV), or
        ``None`` when the smoke test fails or nothing was generated.
    """
    banner = "=" * 60
    print(banner)
    print("üèùÔ∏è  WORKING TUNISIA NATURAL COMMENTS GENERATOR")
    print(banner)

    # Gate the long run behind a quick five-comment smoke test.
    print("\nüß™ Running quick test...")
    passed, _ = test_generation()
    if not passed:
        print("\n‚ùå Test failed. Cannot proceed with generation.")
        return None

    print("\n‚úÖ Test passed! Starting main generation...")

    # A fresh generator, so the dataset's ids and stats start from zero.
    generator = WorkingTunisiaGenerator()
    df = generator.generate_dataset(num_comments=num_comments)

    if not len(df):
        print("‚ùå No comments generated")
        return None

    # Persist the results under a name carrying row count and time.
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"tunisia_natural_comments_{len(df)}_{stamp}.csv"
    df.to_csv(filename, index=False)

    # Summary statistics
    per_language = df['language'].value_counts()
    french_total = int(per_language.get('fr', 0))
    english_total = int(per_language.get('en', 0))
    sentiment_counts = df['predicted_sentiment'].value_counts().to_dict()

    print("\nüìä DATASET STATISTICS:")
    print(f"   Total comments: {len(df):,}")
    print(f"   French comments: {french_total:,}")
    print(f"   English comments: {english_total:,}")
    print(f"   Sentiment distribution: {sentiment_counts}")
    print(f"   Average word count: {df['word_count'].mean():.1f}")

    # A few random examples
    print("\nüìù SAMPLE COMMENTS:")
    print(banner)

    icons = {'positive': "üòä", 'negative': "üòû"}
    for _, row in df.sample(min(3, len(df))).iterrows():
        if row['language'] == 'fr':
            lang_flag = "üá´üá∑"
        else:
            lang_flag = "üá¨üáß"
        sentiment_icon = icons.get(row['predicted_sentiment'], "üòê")
        text = row['text']

        print(f"\n{lang_flag} {sentiment_icon} {row['place']}")
        print(f"Rating: {row['rating']}/5 | Words: {row['word_count']}")
        print(f'"{text}"')
        print("-" * 40)

    print(f"\nüíæ Saved to: {filename}")

    return df



In [None]:
# 5. GOOGLE COLAB EXECUTION
if __name__ == "__main__":
    # Interactive menu: choose how many comments to generate in this session.

    print("\nüéõÔ∏è  WORKING TUNISIA COMMENTS GENERATOR - GOOGLE COLAB")
    print("=" * 60)

    print(
        "\nOptions:",
        "1. Quick test (5 comments)",
        "2. Small dataset (100 comments)",
        "3. Medium dataset (500 comments)",
        "4. Large dataset (2,000 comments)",
        "5. Full dataset (10,000 comments)",
        sep="\n",
    )

    choice = input("\nEnter choice (1-5): ").strip()

    # Options 2-4 run the same pipeline and differ only in dataset size.
    preset_sizes = {"2": 100, "3": 500, "4": 2000}

    if choice == "1":
        print("\nüß™ Running quick test...")
        success, df_test = test_generation()
        if success and df_test is not None:
            df_test.to_csv("tunisia_test_5_comments.csv", index=False)
            print("\n‚úÖ Test complete! Saved to tunisia_test_5_comments.csv")

    elif choice in preset_sizes:
        print(f"\nüöÄ Generating {preset_sizes[choice]:,} natural comments...")
        df = main_working(preset_sizes[choice])
        if df is not None:
            print("\n‚úÖ Complete! Ready for download.")

    elif choice == "5":
        # The full run is long, so ask for explicit confirmation first.
        print("\n‚ö†Ô∏è  Generating 10,000 comments (this will take 1-2 hours)")
        confirm = input("Continue? (yes/no): ").strip().lower()
        if confirm not in ('yes', 'y'):
            print("‚ùå Operation cancelled.")
        else:
            df = main_working(10000)
            if df is not None:
                print("\n‚úÖ Complete! Ready for download.")

    else:
        print("‚ùå Invalid choice")

    print("\n‚ú® Script execution completed!")