In [1]:
%pip install groq python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import json
import time
import os
from typing import List, Dict, Optional
from groq import Groq
from dataclasses import dataclass
from dotenv import load_dotenv
import tiktoken  # For token counting

# Load environment variables from .env file
load_dotenv()

True

In [4]:
# Central configuration for file paths
FILE_CONFIG = {
    'input_csv': "/media/nebulamind/AI Lab/AI Lab/AskDeen/dataset/quran_tags_final_with_translations_structured1.xlsx",
    'output_csv': '/home/nebulamind/Documents/AI Lab/AskDeen/outputs/complete_tags_with_scrapping.csv'
}

In [5]:
@dataclass
class TagGenerationConfig:
    """Configuration for tag generation"""
    api_key: str = os.getenv("GROQ_API_KEY")
    model_name: str = "meta-llama/llama-4-maverick-17b-128e-instruct"
    context_window: int = 3  # 3 rows above and below
    max_retries: int = 3
    delay_between_calls: float = 0.5  # Groq is generally faster, can reduce delay
    temperature: float = 0.2
    max_tokens: int = 256

In [6]:
def count_tokens(text: str, model: str = "meta-llama/llama-4-maverick-17b-128e-instruct") -> int:
    """Count the number of tokens in a text string"""
    try:
        # Use cl100k_base encoding which is used by GPT-4 and similar models
        encoding = tiktoken.get_encoding("cl100k_base")
        return len(encoding.encode(text))
    except Exception as e:
        print(f"Warning: Could not count tokens: {e}")
        return 0

In [7]:
# def create_optimized_prompt(context_data: dict) -> str:
#     """Create a balanced prompt for Quranic verse tag generation (1000-1100 words)"""

#     prompt = """Generate 2 conceptual tags that capture the core semantic themes of the given Quranic verse.
#     Tags must be semantically rich, contextually accurate, and user-focused.

# CONTEXT VERSES:
# """


#     # Add context verses
#     # for row in context_data["context_rows"]:
#     #     marker = ">>> TARGET VERSE (Generate tags for this) <<<" if row["is_target"] else ""
#     #     prompt += f"Verse ID: {row['id']} {marker}\n"
#     #     prompt += f"Translation: {row['translation']}\n\n"
#     target_index = None
#     for i, row in enumerate(context_data["context_rows"]):
#       if row["is_target"]:
#           target_index = i
#           break

#     if target_index is not None:
#       # Get previous, target, and next verses
#       previous_verse2 = context_data["context_rows"][target_index - 2] if target_index-1 > 0 else None
#       previous_verse1 = context_data["context_rows"][target_index - 1] if target_index > 0 else None

#       target_verse = context_data["context_rows"][target_index]
#       next_verse1 = context_data["context_rows"][target_index + 1] if target_index < len(context_data["context_rows"])-1 else None
#       next_verse2 = context_data["context_rows"][target_index + 2] if target_index < len(context_data["context_rows"])-2 else None

#     # Build the new prompt format
#       if previous_verse1:
#           print("P1 done")
#           prompt += f"*Previous1:* {previous_verse1['translation']}\n"
#       if previous_verse2:
#           prompt += f"*Previous2:* {previous_verse2['translation']}\n"

#       prompt += f"*TARGET:* {target_verse['translation']}\n"
#       if next_verse1:
#           prompt += f"*Next1:* {next_verse1['translation']}\n"
#       if next_verse2:
#           prompt += f"*Next2:* {next_verse2['translation']}\n"

#     prompt += """ ## Focus
# - What is this verse fundamentally about?
# - What are the main concepts and themes present?
# - What theological/spiritual/moral domains does it address?

# ## Conceptual Tag Categories
# 1. **About Allah**: allah-names, allah-mercy, allah-power, divine-love, allah-forgiveness
# 2. **About Prophets**: prophet-stories, prophet-teachings, messenger-guidance, prophetic-examples
# 3. **Prayer & Worship**: salah-guidance, dua-supplications, dhikr-remembrance, worship-methods, spiritual-connection
# 4. **Good Character**: honesty-truth, kindness-compassion, patience-perseverance, humility-modesty, gratitude-thankfulness
# 5. **Family & Relationships**: parent-respect, marriage-love, children-upbringing, family-harmony, social-bonds
# 6. **Daily Life Guidance**: halal-haram, life-decisions, problem-solving, personal-growth, righteous-living
# 7. **Charity & Helping**: zakat-sadaqah, helping-needy, community-care, generosity-giving, social-responsibility
# 8. **Afterlife & Judgment**: paradise-heaven, hell-warning, day-judgment, life-after-death, divine-justice

# ## Guidelines
# - Use 2-3 words maximum per tag
# - Focus on abstract concepts rather than specific practices
# - Think semantically, not functionally
# - Avoid overlapping with practical applications

# ## Output Format:
# {
#     "verse": "verse text",
#     "conceptual_tags": ["concept-1", "concept-1"]
# }
# """

#     return prompt



In [15]:
def create_optimized_prompt(context_data: dict) -> str:
    """Create a balanced prompt for Quranic verse tag generation (1000-1100 words)"""

    prompt = """# Semantic Tag Generation for Quranic Verses - Vector Search Optimization

## Task
Generate semantic tags for the Target Verse that maximize discoverability in vector search systems. Tags should bridge user queries with verse content, enabling accurate retrieval across diverse Islamic learning and guidance scenarios.

## Tag Philosophy
- **Search-First**: Optimize for how users naturally search for Islamic guidance
- **User-Centric**: Reflect authentic ways people express spiritual needs and questions
- **Semantic Bridging**: Connect abstract spiritual concepts to concrete verse content
- **Query Expansion**: Support both specific and exploratory search patterns
- **RAG-Optimized**: Enhance retrieval accuracy in AI-powered Islamic guidance systems

## Tag Generation Framework

### Comprehensive Analysis Questions
1. **Daily Practice**: Does this verse mention or relate to prayer, zakat, hajj, fasting, or other daily Islamic practices?
2. **Legal Guidance**: Are there fiqh rulings, halal/haram distinctions, or jurisprudential principles?
3. **Financial Matters**: Does it address business, interest, inheritance, wealth, or economic ethics?
4. **Family/Social**: Are there guidance on marriage, parenting, family relations, or social interactions?
5. **Moral Character**: What character traits, virtues, or behavioral guidance is mentioned?
6. **Spiritual Development**: Are there elements of taqwa, tawakkul, spiritual purification, or inner growth?
7. **Prophetic Stories**: Are there historical narratives or lessons from prophets?
8. **Theological Concepts**: What divine attributes, Islamic beliefs, or theological principles are present?
9. **Practical Problems**: Does it address real-life challenges people face (anxiety, illness, conflict, etc.)?
10. **Worship Elements**: Are there specific ritual practices, prayer guidance, or worship instructions?

### Tag Extraction Process
1. **Read the verse carefully** - Identify all themes, not just the dominant one
2. **Check surrounding context** - Look for additional applicable concepts
3. **Consider user scenarios** - Think about ALL the ways people might search for this verse
4. **Extract practical guidance** - Don't miss daily-life applications
5. **Include historical elements** - Prophetic stories and historical lessons
6. **Capture legal/fiqh aspects** - Any jurisprudential implications
7. **Identify spiritual dimensions** - Growth, purification, divine connection aspects

### Tag Categories (Extract ALL applicable concepts from verse)

**Daily Islamic Practices**
- `salah-prayer`, `wudu-ablution`, `zakat-charity`, `hajj-pilgrimage`, `fasting-ramadan`, `dua-supplication`, `dhikr-remembrance`

**Islamic Law & Jurisprudence (Fiqh)**
- `halal-permissible`, `haram-forbidden`, `inheritance-laws`, `business-transactions`, `marriage-nikah`, `divorce-talaq`, `witness-testimony`

**Financial & Economic Guidance**
- `riba-interest`, `trading-business`, `debt-loans`, `wealth-distribution`, `charity-sadaqah`, `financial-justice`, `economic-ethics`

**Family & Social Relations**
- `parent-obedience`, `marriage-guidance`, `child-upbringing`, `family-rights`, `neighbor-relations`, `community-bonds`, `social-justice`

**Moral & Behavioral Guidance**
- `honesty-truthfulness`, `patience-sabr`, `forgiveness-maghfira`, `humility-tawadu`, `justice-adl`, `kindness-ihsan`, `anger-management`

**Spiritual Development**
- `taqwa-consciousness`, `tawakkul-trust`, `repentance-tawba`, `gratitude-shukr`, `purification-tazkiya`, `spiritual-growth`, `inner-peace`

**Divine Attributes & Names**
- `allah-rahman`, `allah-rahim`, `divine-mercy`, `allah-knowledge`, `divine-wisdom`, `allah-justice`, `divine-forgiveness`

**Prophetic Stories & Lessons**
- `prophet-ibrahim`, `prophet-musa`, `prophet-isa`, `adam-creation`, `nuh-flood`, `historical-lessons`, `prophetic-guidance`

**Eschatology & Afterlife**
- `day-judgment`, `paradise-jannah`, `hell-jahannam`, `resurrection-qiyamah`, `accountability-hisab`, `afterlife-akhirah`

**Worship & Rituals**
- `prayer-times`, `qibla-direction`, `purification-tahara`, `friday-prayer`, `eid-celebration`, `pilgrimage-rites`, `funeral-janaza`

**Personal Challenges & Solutions**
- `anxiety-worry`, `illness-health`, `poverty-wealth`, `oppression-injustice`, `grief-loss`, `temptation-fitna`, `life-hardships`

**Knowledge & Learning**
- `seeking-knowledge`, `islamic-education`, `quran-recitation`, `hadith-study`, `religious-scholarship`, `wisdom-hikmah`

**Community & Leadership**
- `community-leadership`, `consultation-shura`, `collective-responsibility`, `social-reform`, `public-interest`, `governance-khilafah`

## Guidelines

### Tag Formatting
- Use lowercase with hyphens: `divine-wisdom`
- Maximum 3 words per tag (flexibility for semantic richness)
- Combine Islamic terminology with accessible language
- Ensure searchability across user backgrounds

### Content Focus
- **Extract ALL applicable concepts** - Don't limit to just the "main" theme
- **Include practical applications** - Daily Islamic practices and guidance
- **Capture legal/fiqh dimensions** - Jurisprudential implications and rulings
- **Consider historical elements** - Prophetic stories and historical lessons  
- **Address real-life problems** - Practical solutions for daily challenges
- **Include spiritual aspects** - Growth, purification, and divine connection
- **Cover worship practices** - Ritual guidance and religious obligations
- **Identify moral teachings** - Character development and ethical behavior

### User-Centric Approach
- Think: "What would someone type to find this verse?"
- Consider emotional and spiritual search contexts
- Address both scholarly and personal guidance needs
- Support diverse cultural expressions of Islamic concepts

### Quality Criteria
- **Semantic Accuracy**: Tags accurately represent verse meaning
- **Search Relevance**: Tags match natural user query patterns  
- **Islamic Authenticity**: Maintain theological accuracy
- **Vector Optimization**: Enhance semantic similarity matching
- **Contextual Appropriateness**: Fit within broader Quranic themes

## Context Analysis Process

1. **Read Target Verse**: Identify primary themes and concepts
2. **Consider Context**: Analyze how previous/next verses inform meaning
3. **Map User Needs**: Think about why someone would seek this guidance
4. **Generate Tags**: Create tags that bridge content and user intent
5. **Validate**: Ensure tags support diverse search scenarios

---

## Verse Context for Analysis
"""


    # Add context verses
    # for row in context_data["context_rows"]:
    #     marker = ">>> TARGET VERSE (Generate tags for this) <<<" if row["is_target"] else ""
    #     prompt += f"Verse ID: {row['id']} {marker}\n"
    #     prompt += f"Translation: {row['translation']}\n\n"
    target_index = None
    for i, row in enumerate(context_data["context_rows"]):
      if row["is_target"]:
          target_index = i
          break

    if target_index is not None:
      # Get previous, target, and next verses
      previous_verse2 = context_data["context_rows"][target_index - 2] if target_index-1 > 0 else None
      previous_verse1 = context_data["context_rows"][target_index - 1] if target_index > 0 else None

      target_verse = context_data["context_rows"][target_index]
      next_verse1 = context_data["context_rows"][target_index + 1] if target_index < len(context_data["context_rows"])-1 else None
      next_verse2 = context_data["context_rows"][target_index + 2] if target_index < len(context_data["context_rows"])-2 else None

    # Build the new prompt format
      if previous_verse1 and previous_verse2:
        #   print("P1 done")
          prompt += f"*Previous Verse:* {previous_verse1['verse']}. "
          prompt += f"{previous_verse2['verse']}.\n "
          prompt += f"*Previous Verse Translation:* {previous_verse1['translation']}. "
          prompt += f"{previous_verse2['translation']}. \n\n"

      elif previous_verse1:
          prompt += f"*Previous Verse:* {previous_verse1['verse']}.\n "
          prompt += f"*Previous Verse Translation:* {previous_verse1['translation']}. \n\n"

      elif previous_verse2:
            prompt += f"*Previous Verse:* {previous_verse2['verse']}. \n"
            prompt += f"*Previous Verse Translation:* {previous_verse2['translation']}.\n\n"

      prompt += f"*TARGET Verse:* {target_verse['verse']}\n"
      prompt += f"*TARGET Verse Translation:* {target_verse['translation']}\n\n"
      
      
      if next_verse1 and next_verse2:
        #   print("P2 done")
          prompt += f"*Next Verse:* {next_verse1['verse']}. "
          prompt += f"{next_verse2['verse']}. \n"
          prompt += f"*Next Verse Translation:* {next_verse1['translation']}. "
          prompt += f"{next_verse2['translation']}.\n\n "

      elif next_verse1:
          prompt += f"*Next Verse:* {next_verse1['verse']}. \n"
          prompt += f"*Next Verse Translation:* {next_verse1['translation']}. \n\n"

      elif next_verse2:
            prompt += f"*Next Verse:* {next_verse2['verse']}. \n"
            prompt += f"*Next Verse Translation:* {next_verse2['translation']}.\n\n"

    prompt += """## Analysis Framework for This Verse

### Semantic Content Analysis
- Divine dialogue with angels about human creation
- Angels questioning human potential for corruption
- Allah's superior knowledge and divine wisdom
- The establishment of humanity on earth

### User Search Scenarios (Cover ALL these patterns)
- **Daily Practice Queries**: "verses about prayer times", "zakat calculation guidance", "hajj requirements"
- **Legal/Fiqh Questions**: "inheritance laws in Islam", "what is halal/haram", "marriage rules"
- **Financial Guidance**: "Islam and interest/riba", "business ethics", "debt in Islam"
- **Family Issues**: "parent rights", "marriage problems", "child discipline"
- **Personal Challenges**: "dealing with anxiety", "patience during hardship", "forgiveness"
- **Worship Questions**: "how to pray", "ablution rules", "Friday prayer"
- **Moral Guidance**: "honesty in business", "anger management", "being just"
- **Spiritual Growth**: "getting closer to Allah", "repentance", "purifying heart"
- **Historical Learning**: "story of Adam", "lessons from prophets", "historical events"
- **Theological Understanding**: "Allah's attributes", "predestination", "divine wisdom"

## Real-World Examples

**Example 1: Verse about Prayer**
- Tags could include: `salah-prayer`, `prayer-times`, `spiritual-discipline`, `divine-connection`

**Example 2: Verse about Business Ethics** 
- Tags could include: `business-transactions`, `honesty-truthfulness`, `halal-earning`, `economic-justice`

**Example 3: Verse about Inheritance**
- Tags could include: `inheritance-laws`, `family-rights`, `financial-distribution`, `justice-adl`

**Example 4: Verse about Patience**
- Tags could include: `patience-sabr`, `life-hardships`, `spiritual-growth`, `divine-testing`

**Example 5: Verse about Prophet's Story**
- Tags could include: `prophet-ibrahim`, `historical-lessons`, `faith-trials`, `divine-guidance`

## Output Format:
```json
{
    "Topics": [main topics],
    "Sub-Topics": [sub-topics]   
}
```

## Tag Validation Checklist
- [ ] Tags accurately capture verse's core meaning
- [ ] Tags support natural user search patterns
- [ ] Tags maintain Islamic authenticity and accuracy
- [ ] Tags enhance vector similarity matching
- [ ] Tags are discoverable by diverse user backgrounds
- [ ] Tags complement rather than duplicate each other"""

    return prompt



In [42]:
class GroqQuranTagGenerator:
    def __init__(self, config: TagGenerationConfig):
        self.config = config
        self.client = Groq(api_key=config.api_key)
        self.total_input_tokens = 0
        self.total_output_tokens = 0

    
    def get_contextual_rows(self, df: pd.DataFrame, current_index: int) -> Dict:
        """Get context rows (3 above and 3 below current row)"""
        start_idx = max(0, current_index - 3)
        end_idx = min(len(df), current_index + 3 + 1)

        context_rows = []
        target_row = None

        for i in range(start_idx, end_idx):
            # Assuming your CSV has columns 'ID' and 'Abdullah Yusuf Ali'
            row_data = {
                "id": str(df.iloc[i]['ID']),
                "verse": str(df.iloc[i]['arabic']),
                "translation": str(df.iloc[i]['translation']),
                "Topics": str(df.iloc[i]['main_topic']),
                "Sub-Topics": str(df.iloc[i]['subtopic'])
            }

            if i == current_index:
                target_row = row_data
                row_data["is_target"] = True
            else:
                row_data["is_target"] = False

            context_rows.append(row_data)

        return {
            "context_rows": context_rows,
            "target_row": target_row
        }

    def call_groq_api(self, prompt: str) -> Optional[Dict]:
        """Make API call to Groq with retry logic"""
        # Count input tokens
        input_tokens = count_tokens(prompt)
        self.total_input_tokens += input_tokens

        for attempt in range(self.config.max_retries):
            try:
                completion = self.client.chat.completions.create(
                    model=self.config.model_name,
                    messages=[
                        {
                            "role": "system",
                            "content": "You are an expert Islamic scholar specializing in Quranic interpretation and semantic analysis. Always respond with valid JSON only."
                        },
                        {
                            "role": "user",
                            "content": prompt
                        }
                    ],
                    temperature=self.config.temperature,
                    max_tokens=self.config.max_tokens,
                    top_p=0.9,
                    stream=False,  # Set to False for easier JSON parsing
                    stop=None,
                )

                content = completion.choices[0].message.content.strip()

                # Count output tokens
                output_tokens = count_tokens(content)
                self.total_output_tokens += output_tokens

                # Parse JSON response
                try:
                    # Extract JSON from response if it contains extra text
                    if '{' in content and '}' in content:
                        json_start = content.find('{')
                        json_end = content.rfind('}') + 1
                        json_content = content[json_start:json_end]
                        result = json.loads(json_content)

                        # Validoutputsate required keys
                        if all(key in result for key in ['Topics', 'Sub-Topics']):
                            return result
                        else:
                            print(f"Missing required keys in response: {result}")
                            return None

                    else:
                        print(f"No JSON found in response: {content}")
                        return None

                except json.JSONDecodeError as e:
                    print(f"JSON parsing error: {e}")
                    print(f"Raw Surah123response: {content}")
                    return None

            except Exception as e:
                print(f"API call attempt {attempt + 1} failed: {e}")
                if attempt < self.config.max_retries - 1:
                    time.sleep(self.config.delay_between_calls * (attempt + 1))
                else:
                    return None

        return None

    def process_dataframe(self, df: pd.DataFrame, start_row: int = 0, end_row: Optional[int] = None) -> pd.DataFrame:
        """Process the entire dataframe to generate tags"""
        if end_row is None:
            end_row = len(df)

        # Reset token counters
        self.total_input_tokens = 0
        self.total_output_tokens = 0

        # Initialize new columns if they don't exist
        for col in ['Verse', 'Topics', 'Sub-Topics']:
            if col not in df.columns:
                df[col] = None

        processed_count = 0
        failed_count = 0

        print(f"Starting tag generation for rows {start_row} to {end_row-1}")
        print(f"Using model: {self.config.model_name}")

        for idx in range(start_row, min(end_row, len(df))):
            print(f"\nProcessing row {idx + 1}/{len(df)} (ID: {df.iloc[idx]['ID']})")

            # Get contextual dataGROQ_API_KEY
            context_data = self.get_contextual_rows(df, idx)

            # Create prompt
            prompt = create_optimized_prompt(context_data)

            # Call Groq API
            tags = self.call_groq_api(prompt)

            if tags:
                # Update dataframe with generated tags
                # df.at[idx, 'Verse'] = df.iloc[i]['Abdullah Yusuf Ali']
                print(idx)
                df.at[idx, 'Topics'] = ', '.join(tags.get('Topics', []))
                df.at[idx, 'Sub-Topics'] = ', '.join(tags.get('Sub-Topics', []))
                # df.at[idx, 'reasoning'] = tags.get('reasoning', '').strip() 
                # df.at[idx, 'tag_3'] = tags.get('Secondary', '').strip()
                processed_count += 1
                print(f"✓ Generated tags: {tags}")
            else:
                failed_count += 1
                print(f"✗ Failed to generate tags for row {idx}")

            # Rate limiting
            time.sleep(self.config.delay_between_calls)

        print(f"\n{'='*50}")
        print(f"PROCESSINGprevious_verse2 COMPLETE!")
        print(f"Successfully processed: {processed_count} rows")
        print(f"Failed: {failed_count} rows")
        print(f"Success rate: {(processed_count/(processed_count+failed_count)*100):.1f}%")
        print(f"Total input tokens: {self.total_input_tokens}")
        print(f"Total output tokens: {self.total_output_tokens}")
        print(f"Average tokens per verse: {self.total_input_tokens/processed_count:.1f} input, {self.total_output_tokens/processed_count:.1f} output")

        return df

    def save_results(self, df: pd.DataFrame, output_path: str):
        """Save the results to a file"""
        df.to_csv(output_path, index=False)
        print(f"Results saved to: {output_path}")

In [43]:
def validate_tags(df: pd.DataFrame) -> Dict:
    """Validate generated tags and provide statistics"""
    stats = {
        'total_rows': len(df),
        'rows_with_all_tags': 0,
        'rows_with_missing_tags': 0,
        'unique_tags': set(),
        'tag_distribution': {}
    }

    for idx, row in df.iterrows():
        tags = [row.get('tag_1'), row.get('tag_2')]
        valid_tags = [tag for tag in tags if tag and str(tag).strip() and str(tag).strip() != 'nan']

        if len(valid_tags) == 3:
            stats['rows_with_all_tags'] += 1
        else:
            stats['rows_with_missing_tags'] += 1

        # Add to unique tags and count frequency
        for tag in valid_tags:
            tag = str(tag).strip()
            stats['unique_tags'].add(tag)
            stats['tag_distribution'][tag] = stats['tag_distribution'].get(tag, 0) + 1

    # Convert set to list for display
    stats['unique_tags'] = list(stats['unique_tags'])
    stats['total_unique_tags'] = len(stats['unique_tags'])

    return stats

In [44]:
def main():
    """Main function to run the tag generation"""

    # Configuration - Replace with your actual Groq API key
    config = TagGenerationConfig(
        api_key= os.getenv("GROQ_API_KEY"),
        model_name="meta-llama/llama-4-maverick-17b-128e-instruct",
        context_window=3,
        max_retries=3,
        delay_between_calls=0.5,
        temperature=0.2,
        max_tokens=256
    )

    # Load your data using the centralized file path
    try:
        df = pd.read_excel(FILE_CONFIG['input_csv'])
        df = df[650:660] 
        print(f"Loaded {len(df)} verses from CSV")
        print(f"Columns: {list(df.columns)}")
    except FileNotFoundError:
        print(f"Please check the input file path in FILE_CONFIG: {FILE_CONFIG['input_csv']}")
        return

    # Initialize generator
    generator = GroqQuranTagGenerator(config)

    # Process the full dataset
    print("Processing full dataset...")
    df_with_tags = generator.process_dataframe(df)  # No start_row or end_row specified means process all rows

    # Save results using the centralized output path
    output_file = FILE_CONFIG['output_csv']
    generator.save_results(df_with_tags, output_file)

    # Validate results
    stats = validate_tags(df_with_tags)
    print(f"\n{'='*50}")
    print("VALIDATION RESULTS:")
    print(f"Total rows processed: {stats['total_rows']}")
    print(f"Rows with all 3 tags: {stats['rows_with_all_tags']}")
    print(f"Rows with missing tags: {stats['rows_with_missing_tags']}")
    print(f"Unique tags generated: {stats['total_unique_tags']}")
    print(f"Total tokens used: {generator.total_input_tokens + generator.total_output_tokens}")
    print(f"Average tokens per verse: {generator.total_input_tokens/(stats['total_rows']+1):.1f} input, {generator.total_output_tokens/(stats['total_rows']+1):.1f} output")

    # Display sample results
    print(f"\n{'='*50}")
    print("SAMPLE RESULTS:")
    sample_cols = ['ID', 'Topic', 'Sub-Topic']
    if all(col in df_with_tags.columns for col in sample_cols):
        print(df_with_tags[sample_cols].head())

    print(f"\n{'='*50}")
    print("PROCESSING COMPLETE!")
    print(f"Results saved to: {output_file}")
    print("Review the generated tags for quality and consistency")

In [45]:
if __name__ == "__main__":
    main()

Loaded 10 verses from CSV
Columns: ['ID', 'translation', 'main_topic', 'subtopic', 'arabic']
Processing full dataset...
Starting tag generation for rows 0 to 9
Using model: meta-llama/llama-4-maverick-17b-128e-instruct

Processing row 1/10 (ID: 4|160)
0
✓ Generated tags: {'Topics': ['financial-guidance', 'islamic-law', 'jewish-practices', 'divine-punishment', 'economic-ethics'], 'Sub-Topics': ['riba-interest', 'halal-haram', 'food-laws', 'punishment-for-iniquity', 'hindering-others-from-allahs-way', 'wrongful-consumption-of-wealth', 'faith-rejection-punishment', 'believers-reward', 'prayer-maintenance', 'zakat-charity', 'faith-in-revelation']}

Processing row 2/11 (ID: 4|161)
1
✓ Generated tags: {'Topics': ['financial-guidance', 'islamic-law', 'moral-behavior', 'divine-punishment', 'business-ethics'], 'Sub-Topics': ['riba-interest', 'unlawful-wealth', 'financial-injustice', 'punishment-for-disbelievers', 'business-transactions', 'halal-earning', 'economic-justice', 'moral-accountabilit