In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel
import warnings
import json
import re
from scipy.spatial.distance import cosine

# Suppress every warning so library chatter does not clutter the notebook output.
warnings.simplefilter('ignore')

# --- PREVENT TRUNCATION ---
# Print DataFrames in full: no row limit, no column limit, wide console.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.width = 1000

# SkillDev Class - Initialization & Data Preparation

In [None]:
class SkillDev:
    def __init__(self, file_path, use_llama=True, model_name="meta-llama/Llama-3.2-1B-Instruct"):
        """Load the dataset, optionally load a LLAMA model, and prepare features.

        Args:
            file_path: Path of the CSV file read with ``pd.read_csv``.
            use_llama: If True, try to load the tokenizer and causal language
                model named by ``model_name``. When loading fails the error is
                printed and ``self.use_llama`` is switched to False.
            model_name: Model identifier passed to ``from_pretrained``.
        """
        print(f"--- üöÄ Loading Dataset: {file_path} ---")
        self.df = pd.read_csv(file_path)
        self.scaler = StandardScaler()
        self.use_llama = use_llama
        # Snapshot taken before _prepare_data() touches column dtypes.
        self.original_df = self.df.copy()

        # Store all available columns for flexible filtering
        self.all_columns = self.df.columns.tolist()
        print(f"üìã Available columns: {self.all_columns}")

        # Define the model attributes unconditionally so they exist even when
        # LLAMA is disabled or fails to load (previously they were only set on
        # a successful load, leaving the instance without these attributes).
        self.tokenizer = None
        self.llama_model = None
        self.device = "cpu"

        # Initialize LLAMA Model
        if self.use_llama:
            print(f"--- ü§ñ Loading LLAMA Model: {model_name} ---")
            try:
                has_cuda = torch.cuda.is_available()
                self.tokenizer = AutoTokenizer.from_pretrained(model_name)
                self.llama_model = AutoModelForCausalLM.from_pretrained(
                    model_name,
                    torch_dtype=torch.float16 if has_cuda else torch.float32,
                    device_map="auto" if has_cuda else None,
                    low_cpu_mem_usage=True
                )
                self.device = "cuda" if has_cuda else "cpu"
                print(f"‚úÖ LLAMA Model loaded successfully on {self.device}")
            except Exception as e:
                # Best effort: any load problem downgrades to rule-based mode.
                print(f"‚ö†Ô∏è Could not load LLAMA model: {e}")
                print("Continuing without LLAMA integration...")
                self.use_llama = False
                # Drop a partially loaded tokenizer/model so state is consistent.
                self.tokenizer = None
                self.llama_model = None
                self.device = "cpu"

        self._prepare_data()

    def _prepare_data(self):
        """Prepare and convert numeric columns"""
        # Identify numeric columns
        numeric_cols = self.df.select_dtypes(include=[np.number]).columns.tolist()
        
        # Convert any string columns that should be numeric
        for col in self.all_columns:
            try:
                self.df[col] = pd.to_numeric(self.df[col], errors='coerce')
            except:
                pass
        
        # Default features (numeric columns, excluding ID-like columns)
        self.features = [col for col in self.df.select_dtypes(include=[np.number]).columns 
                        if col.upper() not in ['ID', 'INDEX', 'SEX', 'Q8']]
        
        if not self.features:
            self.features = self.df.select_dtypes(include=[np.number]).columns.tolist()[:4]
        
        print(f"üìä Features for clustering: {self.features}")

    def _extract_keywords_and_params(self, prompt):
        """Extract keywords, numbers, and intent from prompt"""
        prompt_lower = prompt.lower()
        
        # Extract numbers
        numbers = re.findall(r'\b\d+\b', prompt)
        quantity = int(numbers[0]) if numbers else None
        
        # Predefined keyword mappings for different domains
        keyword_map = {
            'women': ['women', 'lady', 'female', 'woman', 'wife', 'mother'],
            'farmers': ['farmer', 'farming', 'agricultural', 'agriculture', 'tractor', 'crop', 'harvest'],
            'students': ['student', 'school', 'education', 'college', 'university', 'study'],
            'elderly': ['elder', 'elderly', 'old', 'senior', 'retired', 'pension'],
            'youth': ['youth', 'young', 'teenager', 'teen', 'adolescent'],
            'disabled': ['disable', 'disability', 'wheelchair', 'blind', 'deaf', 'impair'],
            'poor': ['poor', 'poverty', 'needy', 'destitute', 'impoverish', 'low income'],
            'children': ['child', 'kid', 'infant', 'toddler', 'children'],
            'health': ['health', 'medical', 'medicine', 'doctor', 'hospital', 'sick', 'disease'],
            'education': ['education', 'school', 'book', 'learn', 'scholarship'],
        }
        
        detected_keywords = []
        for category, keywords in keyword_map.items():
            if any(kw in prompt_lower for kw in keywords):
                detected_keywords.append(category)
        
        # Extract resource type (sewing, tractor, book, etc.)
        resources = ['sewing machine', 'tractor', 'book', 'food', 'medicine', 'shelter', 'clothing', 
                     'equipment', 'tool', 'supply', 'device', 'machine']
        resource_type = None
        for resource in resources:
            if resource in prompt_lower:
                resource_type = resource
                break
        
        return {
            'keywords': detected_keywords,
            'quantity': quantity,
            'resource': resource_type,
            'full_text': prompt
        }

    def _llama_understand_intent(self, prompt):
        """Use LLAMA to understand user intent and extract parameters"""
        if not self.use_llama:
            return self._rule_based_intent(prompt)
        
        system_prompt = """You are an AI assistant for welfare distribution. Analyze the prompt and extract:
1. Target Demographics (who should receive benefits)
2. Resource Type (what is being distributed)
3. Selection Criteria (age, gender, income level, etc.)
4. Quantity (if mentioned)

Respond in JSON format: {"demographics": "...", "resource": "...", "criteria": "...", "quantity": null or number}"""

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt}
        ]
        
        try:
            inputs = self.tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True)
            if self.device == "cuda":
                inputs = inputs.to(self.device)
            
            outputs = self.llama_model.generate(
                inputs,
                max_new_tokens=200,
                temperature=0.3,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id
            )
            
            response = self.tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
            print(f"ü§ñ LLAMA Analysis: {response}")
            return response
                
        except Exception as e:
            print(f"‚ö†Ô∏è LLAMA analysis failed: {e}")
            return self._rule_based_intent(prompt)

    def _rule_based_intent(self, prompt):
        """Enhanced rule-based intent and parameter detection"""
        params = self._extract_keywords_and_params(prompt)
        return json.dumps(params)

    def _smart_filter_data(self, params, original_df):
        """Intelligently filter data based on extracted parameters"""
        filtered_df = original_df.copy()
        keywords = params.get('keywords', [])
        
        # Filter based on keywords
        for keyword in keywords:
            if keyword == 'women' and 'SEX' in filtered_df.columns:
                filtered_df = filtered_df[filtered_df['SEX'] == 2]
            elif keyword == 'elderly' and 'AGE' in filtered_df.columns:
                filtered_df = filtered_df[filtered_df['AGE'] >= 60]
            elif keyword == 'youth' and 'AGE' in filtered_df.columns:
                filtered_df = filtered_df[(filtered_df['AGE'] >= 18) & (filtered_df['AGE'] <= 35)]
            elif keyword == 'children' and 'AGE' in filtered_df.columns:
                filtered_df = filtered_df[filtered_df['AGE'] < 18]
            elif keyword == 'poor' and 'Q45_A_1' in filtered_df.columns:
                median_income = filtered_df['Q45_A_1'].median()
                filtered_df = filtered_df[filtered_df['Q45_A_1'] < median_income]
        
        # If no filters matched, return full dataset
        if filtered_df.empty or len(filtered_df) < len(original_df) and len(keywords) > 0:
            if len(filtered_df) > 0:
                return filtered_df
        
        return filtered_df if len(filtered_df) > 0 else original_df

    def _analyze_clusters_detailed(self, target_group, n_clusters=3):
        """Perform detailed cluster analysis"""
        if target_group.empty:
            print("‚ö†Ô∏è Empty dataset for clustering")
            return None, None
        
        # Get numeric features
        numeric_features = [col for col in self.features if col in target_group.columns]
        if not numeric_features:
            numeric_features = target_group.select_dtypes(include=[np.number]).columns.tolist()[:4]
        
        # Fill missing values
        for col in numeric_features:
            target_group[col] = pd.to_numeric(target_group[col], errors='coerce').fillna(target_group[col].median())
        
        # Clustering
        X = self.scaler.fit_transform(target_group[numeric_features])
        kmeans = KMeans(n_clusters=min(n_clusters, len(target_group)), n_init=10, random_state=42)
        target_group['cluster_id'] = kmeans.fit_predict(X)
        
        # Detailed cluster summary
        summary = target_group.groupby('cluster_id')[numeric_features].agg(['mean', 'min', 'max', 'std'])
        cluster_sizes = target_group.groupby('cluster_id').size()
        
        analysis = {
            'summary': summary,
            'sizes': cluster_sizes,
            'data': target_group,
            'features': numeric_features,
            'kmeans': kmeans  # Store KMeans object for later use
        }
        
        return analysis, numeric_features

    def _find_nearest_cluster_members(self, analysis, top_k=20):
        """Find top members closest to cluster centers"""
        target_group = analysis['data']
        numeric_features = analysis['features']
        kmeans = analysis['kmeans']
        
        # Find neediest cluster
        income_col = 'Q45_A_1' if 'Q45_A_1' in numeric_features else numeric_features[0]
        cluster_summary = analysis['summary'][income_col]['mean']
        neediest_id = cluster_summary.idxmin()
        
        # Get data for neediest cluster
        cluster_data = target_group[target_group['cluster_id'] == neediest_id]
        X_cluster = self.scaler.transform(cluster_data[numeric_features])
        
        # Calculate distance to cluster center
        center = kmeans.cluster_centers_[neediest_id]
        distances = np.linalg.norm(X_cluster - center, axis=1)
        
        # Get top_k closest members
        closest_indices = np.argsort(distances)[:top_k]
        closest_members = cluster_data.iloc[closest_indices]
        
        return closest_members, neediest_id

    def _llama_generate_intelligent_explanation(self, analysis, target_group, intent_params):
        """Generate contextual explanation using LLAMA"""
        if not self.use_llama or analysis is None:
            return self._generate_basic_explanation(analysis, target_group, intent_params)
        
        # Find neediest cluster (lowest average income if available)
        income_col = 'Q45_A_1' if 'Q45_A_1' in analysis['features'] else analysis['features'][0]
        cluster_summary = analysis['summary'][income_col]['mean']
        
        if cluster_summary.empty:
            return self._generate_basic_explanation(analysis, target_group, intent_params)
        
        neediest_id = cluster_summary.idxmin()
        eligible_count = len(target_group[target_group['cluster_id'] == neediest_id])
        
        summary_text = str(analysis['summary'].round(2))
        
        prompt = f"""Analyze this welfare distribution scenario:

Dataset Size: {len(target_group)} people
Number of Clusters: {len(analysis['sizes'])}
Cluster Distribution: {analysis['sizes'].to_dict()}

Intent/Keywords: {intent_params.get('keywords', [])}
Resource Type: {intent_params.get('resource', 'General aid')}
Quantity Needed: {intent_params.get('quantity', 'Not specified')}

Cluster Statistics:
{summary_text}

Selected Target: Cluster {neediest_id} with {eligible_count} eligible recipients

Provide a 2-3 sentence explanation of:
1. Why this cluster was selected
2. What this distribution means for the beneficiaries
3. Expected impact"""

        messages = [
            {"role": "system", "content": "You are a compassionate welfare officer explaining distribution decisions."},
            {"role": "user", "content": prompt}
        ]
        
        try:
            inputs = self.tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True)
            if self.device == "cuda":
                inputs = inputs.to(self.device)
            
            outputs = self.llama_model.generate(
                inputs,
                max_new_tokens=250,
                temperature=0.7,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id
            )
            
            explanation = self.tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
            return explanation.strip()
            
        except Exception as e:
            print(f"‚ö†Ô∏è Explanation generation failed: {e}")
            return self._generate_basic_explanation(analysis, target_group, intent_params)

    def _generate_basic_explanation(self, analysis, target_group, intent_params):
        """Generate basic explanation without LLAMA"""
        if analysis is None:
            return "Unable to analyze clusters with given filters."
        
        income_col = 'Q45_A_1' if 'Q45_A_1' in analysis['features'] else analysis['features'][0]
        cluster_summary = analysis['summary'][income_col]['mean']
        neediest_id = cluster_summary.idxmin()
        eligible_count = len(target_group[target_group['cluster_id'] == neediest_id])
        
        explanation = f"""Distribution Analysis Results:
- Selected Cluster: {neediest_id} (Most in need)
- Eligible Recipients: {eligible_count}
- Target Demographics: {', '.join(intent_params.get('keywords', ['General population']))}
- Resource: {intent_params.get('resource', 'General aid')}
- Quantity Available: {intent_params.get('quantity', 'Not specified')}"""
        
        return explanation

    def run_scenario(self, prompt, n_clusters=3):
        print(f"\n{'='*80}")
        print(f"üí¨ USER PROMPT: \"{prompt}\"")
        print('='*80)
        
        # 1. Parse intent and extract parameters
        intent_result = self._llama_understand_intent(prompt)
        try:
            intent_params = json.loads(intent_result) if intent_result.startswith('{') else self._extract_keywords_and_params(prompt)
        except:
            intent_params = self._extract_keywords_and_params(prompt)
        
        print(f"\nüìã EXTRACTED PARAMETERS:")
        print(f"   Keywords: {intent_params.get('keywords', [])}")
        print(f"   Resource: {intent_params.get('resource', 'Not specified')}")
        print(f"   Quantity: {intent_params.get('quantity', 'Not specified')}")

        # 2. Smart filtering based on parameters
        target_group = self._smart_filter_data(intent_params, self.original_df)
        
        print(f"\nüë• FILTERED POPULATION: {len(target_group)} people (from {len(self.original_df)} total)")
        
        if target_group.empty:
            print("‚ö†Ô∏è No matching records found with current filters.")
            return

        # 3. Cluster Analysis
        print(f"\nüìä K-MEANS CLUSTERING (n_clusters={n_clusters})")
        analysis, features = self._analyze_clusters_detailed(target_group, n_clusters)
        
        if analysis is None:
            print("‚ö†Ô∏è Could not perform cluster analysis")
            return
        
        # 4. Display Cluster Profiles
        print(f"\n--- üìä CLUSTER PROFILES ---")
        print(f"Total Clusters: {len(analysis['sizes'])}")
        print(f"Cluster Sizes: {analysis['sizes'].to_dict()}")
        print(f"\nDetailed Statistics:")
        print(analysis['summary'])

        # 5. Generate intelligent explanation
        explanation = self._llama_generate_intelligent_explanation(analysis, target_group, intent_params)
        print(f"\nüìå AI DECISION & EXPLANATION:")
        print(f"{explanation}")
        
        # 6. Show eligible recipients (closest to cluster center)
        print(f"\nüì¢ ELIGIBLE RECIPIENTS (Closest to Cluster Center)")
        closest_members, neediest_id = self._find_nearest_cluster_members(analysis, top_k=20)
        
        print(f"Cluster {neediest_id}: {len(closest_members)} top candidates")
        print("-" * 80)
        
        display_cols = [col for col in ['AGE', 'SEX', 'Q8', 'Q20', 'Q45_A_1'] if col in closest_members.columns]
        if not display_cols:
            display_cols = closest_members.select_dtypes(include=[np.number]).columns.tolist()[:5]
        
        print(closest_members[display_cols].head(20))
        print("-" * 80)
        
        return {
            'target_group': target_group,
            'closest_members': closest_members,
            'analysis': analysis,
            'intent_params': intent_params
        }

## NLP & Intent Extraction Methods

## System Initialization & Execution

In [None]:
import os

print("üîß Initializing SkillDev System with Advanced Analytics...\n")

# Check for CSV file - look in data folder
csv_file = 'data/LFS-2023.csv'
if not os.path.exists(csv_file):
    # Fallback to current directory
    csv_file = 'LFS-2023.csv'

if not os.path.exists(csv_file):
    print(f"‚ö†Ô∏è File 'LFS-2023.csv' not found!")
    print(f"üìÇ Current directory: {os.getcwd()}\n")
    print("üí° Options:")
    print("   1. Place 'LFS-2023.csv' in the data folder")
    print("   2. Use a different file path")
    print("   3. Generate sample data for testing\n")

    # Option to generate sample data
    response = input("Generate sample data for testing? (yes/no): ").strip().lower()
    if response in ('yes', 'y'):
        print("\nüî® Generating sample dataset...")

        # Generate realistic sample data (np is imported in the first cell).
        # Fixed seed so the generated file is identical on every run.
        np.random.seed(42)
        n_samples = 1000

        sample_data = pd.DataFrame({
            'AGE': np.random.randint(18, 80, n_samples),
            'SEX': np.random.choice([1, 2], n_samples),  # 1=Male, 2=Female
            'Q8': np.random.randint(1, 10, n_samples),  # Education/occupation code
            'Q20': np.random.randint(0, 60, n_samples),  # Weekly hours worked
            'Q45_A_1': np.random.exponential(50000, n_samples),  # Income (exponential distribution)
            'EDU': np.random.randint(1, 15, n_samples)  # Years of education
        })

        # csv_file is the current-directory fallback path at this point.
        sample_data.to_csv(csv_file, index=False)
        print(f"‚úÖ Sample dataset created: {csv_file} ({n_samples} records)")
        print(f"   Age: 18-80, Sex: Male/Female, Income: varied distribution\n")
    else:
        raise FileNotFoundError(f"Please place 'LFS-2023.csv' in the data folder")

# Initialize the system. use_llama=False skips loading LLAMA for faster
# testing; pass use_llama=True to enable the LLAMA integration.
# Note: You may need to login to HuggingFace and accept LLAMA model terms
# Run: huggingface-cli login
system = SkillDev(csv_file, use_llama=False)

print("\n" + "="*80)
print("üöÄ SYSTEM READY - Enter prompts to analyze and distribute resources")
print("="*80 + "\n")

# Interactive loop: one scenario per prompt until the user types 'quit'.
while True:
    try:
        prompt = input("\nüìù Enter a prompt (or 'quit' to exit): ")
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or the user interrupted: stop prompting
        # instead of ending the cell with a traceback.
        break
    # Ignore surrounding whitespace so ' quit ' also exits.
    if prompt.strip().lower() == 'quit':
        break
    if prompt.strip():
        system.run_scenario(prompt)

In [None]:
# Example: Run without LLAMA integration or test specific scenarios.
# Use the dataset path resolved in the initialization cell (csv_file) rather
# than a hard-coded name, so this also works when the CSV lives in data/.
system_basic = SkillDev(csv_file, use_llama=False)

# Test scenarios - the model now handles ANY type of prompt
print("\nüß™ TESTING MULTIPLE SCENARIOS:\n")

test_prompts = [
    "I have 100 sewing machines to distribute to vulnerable women",
    "Find the poorest people in the dataset who need medical support",
    "Identify young people aged 18-35 for skill development programs",
    "We have tractors available for farmers in the agriculture sector",
    "Select elderly people over 60 who need financial assistance",
    "Find children and students who need educational materials"
]

for test_prompt in test_prompts:
    try:
        result = system_basic.run_scenario(test_prompt)
    except Exception as e:
        # Keep going so one failing prompt does not hide the others.
        print(f"\n‚ùå Error in scenario: {e}\n")
    else:
        if result is None:
            # run_scenario returns None when it stopped early (no matching
            # records or clustering not possible); do not report success.
            print("\nScenario ended without a result\n")
        else:
            print("\n‚úÖ Scenario completed successfully\n")
