#Install dependencies

In [None]:
# Environment setup: drop any existing huggingface_hub first; it is reinstalled
# at the newest available version further down, after unsloth and the other libraries.
!pip uninstall -y huggingface_hub
!pip install -q unsloth
# Data-handling and training libraries used by the cells below.
!pip install -q datasets pandas openpyxl pyreadstat transformers accelerate
!pip install -q --upgrade huggingface_hub
!pip install -q unsloth_zoo
# NOTE(review): none of these installs is version-pinned, so a rerun may resolve
# different releases than the ones this notebook was developed against.

#Mount Drive

In [None]:
# Mount Google Drive; the training spreadsheet, the .dta input and the results CSV
# are all read from / written to /content/drive/MyDrive/civicBot in later cells.
from google.colab import drive
drive.mount('/content/drive')

import gc
import json
import os

import pandas as pd
import torch
from datasets import Dataset
from unsloth import FastLanguageModel  # kept ahead of the transformers imports, as in the original order
import transformers
from transformers import TrainingArguments

# Sanity check: report the installed torch build and whether a CUDA device is visible.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

#Memory optimization

In [None]:
# Shared settings read by the model-loading, tokenization and LoRA cells below.
max_seq_length = 1024  # passed to from_pretrained / get_peft_model and used as the tokenizer truncation length
load_in_4bit = True  # forwarded to FastLanguageModel.from_pretrained
# NOTE(review): the training cell chooses fp16/bf16 from hardware support rather than
# from this value — confirm the two agree on bf16-capable GPUs.
dtype = torch.float16

# Release cached CUDA memory and run the garbage collector before the model is loaded.
torch.cuda.empty_cache()
gc.collect()

# Load base model

In [None]:
# Load the starting checkpoint through Unsloth using the shared settings above.
# The Hub token is read from the HF_TOKEN environment variable instead of being
# embedded in the notebook; when the variable is unset, None is passed and no
# explicit token is supplied.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="Kai-99/qwen2-1.5-bihar-offices",
    max_seq_length=max_seq_length,
    load_in_4bit=load_in_4bit,
    token=os.environ.get("HF_TOKEN"),
    dtype=dtype,
)

print("Model loaded successfully!")

#Office Parser

In [None]:
# Office parser: abbreviation, district and alias lookup tables plus text normalization
import re
from typing import Dict, List, Tuple, Optional

class OfficeColumnParser:
    """Lookup tables and text normalization for office strings.

    Attributes:
        short_forms: abbreviation -> expanded office / department name.
        districts: list of district names.
        aliases: alternative spelling -> canonical institution type.
    """

    def __init__(self):
        self.short_forms = self.load_short_forms()
        self.districts = self.load_districts()
        self.aliases = self.load_aliases()

    def load_short_forms(self) -> Dict[str, str]:
        """Return the abbreviation -> expansion table.

        Every key appears exactly once. The original literal repeated 'PS' and
        'BSAP', so the later entries silently replaced the earlier ones.
        """
        return {
            # Educational Institutions
            'HS': 'HIGH SCHOOL', 'H.S': 'HIGH SCHOOL', 'H/S': 'HIGH SCHOOL',
            'MS': 'MIDDLE SCHOOL', 'M.S': 'MIDDLE SCHOOL', 'M/S': 'MIDDLE SCHOOL',
            # Bare 'PS' is ambiguous (primary school vs. police station). It is
            # listed once, under Police & Security, which is the value the
            # duplicated literal produced at runtime; the punctuated variants
            # still expand to PRIMARY SCHOOL.
            'P.S': 'PRIMARY SCHOOL', 'P/S': 'PRIMARY SCHOOL',
            'GMS': 'GOVERNMENT MIDDLE SCHOOL',
            'UMS': 'UPGRADED MIDDLE SCHOOL',
            'GPS': 'GOVERNMENT PRIMARY SCHOOL',
            'GBS': 'GOVERNMENT BASIC SCHOOL',
            'NPS': 'NAVSRIJIT PRIMARY SCHOOL',
            'PG H/S': 'PRABHAT GYAN HIGH SCHOOL',
            'HSC': 'HIGHER SECONDARY SCHOOL',
            'OFFICE HM': 'OFFICE OF HEAD MASTER',

            # Healthcare
            'PHC': 'PRIMARY HEALTH CENTRE', 'P.H.C': 'PRIMARY HEALTH CENTRE',
            'APHC': 'ADDITIONAL PRIMARY HEALTH CENTRE',
            'SNCU': 'SPECIAL NEWBORN CARE UNIT', 'S.N.C.U': 'SPECIAL NEWBORN CARE UNIT',
            'VIMS': 'VARDHMAN INSTITUTE OF MEDICAL SCIENCES',
            'CMO': 'CHIEF MEDICAL OFFICER',
            'JNKTMCH': 'JANNAYAK KARPOORI THAKUR MEDICAL COLLEGE AND HOSPITAL',

            # Government Departments
            'RWD': 'RURAL WORKS DEPARTMENT', 'R.W.D': 'RURAL WORKS DEPARTMENT',
            # NOTE(review): 'T.C DIV' and 'T.C DIV.' differ only by the trailing
            # dot yet expand differently — confirm both are intended.
            'T.C DIV': 'TRANSMISSION CIVIL DIVISION',
            'SDO': 'SUB-DIVISIONAL OFFICE',
            'CDPO': 'CHILD DEVELOPMENT PROJECT OFFICER',
            'D.V.B.D.C.O': 'DISTRICT VECTOR-BORNE DISEASE CONTROL OFFICER',
            'ARCS': 'ASSISTANT REGISTRAR OF CO-OPERATIVE SOCIETIES',
            'ESD': 'ELECTRICAL SUB-DIVISION',
            'DEO': 'DISTRICT EDUCATION OFFICE',
            'MIDIV': 'MINOR IRRIGATION DIVISION',
            'WITI': "WOMEN'S INDUSTRIAL TRAINING INSTITUTE",
            'L.A.E.O.': 'LOCAL AREA ENGINEERING ORGANISATION',
            'T.C DIV.': 'TIRHUT CANAL DIVISION',
            'O/O AD PP': 'OFFICE OF THE ADDITIONAL DIRECTOR OF PUBLIC PROSECUTION',

            # Police & Security
            'PS': 'POLICE STATION',
            'SSP': 'SENIOR SUPERINTENDENT OF POLICE OFFICE',
            'SP': 'SUPERINTENDENT OF POLICE OFFICE',
            'CTS': 'CONSTABLE TRAINING SCHOOL',
            # Kept consistent with 'B.S.A.P' and 'CO, BSAP'. The original had a
            # second 'BSAP' entry ('... (WOMEN WING)') that overrode this one.
            # NOTE(review): give the women's wing its own abbreviation if needed.
            'BSAP': 'BIHAR SPECIAL ARMED POLICE', 'B.S.A.P': 'BIHAR SPECIAL ARMED POLICE',
            'SCJ': 'SPECIAL CENTRAL JAIL',
            'SRP': 'SUPERINTENDENT OF RAILWAY POLICE',
            'CO, BSAP': 'COMMANDING OFFICER BIHAR SPECIAL ARMED POLICE',
            'BMP': 'BIHAR MILITARY POLICE',

            # Power & Infrastructure
            'ESS': 'ELECTRICAL SUB STATION',
            'SBPDCL': 'SOUTH BIHAR POWER DISTRIBUTION COMPANY LTD',
            'NBPDCL': 'NORTH BIHAR POWER DISTRIBUTION COMPANY LTD'
        }

    def load_districts(self) -> List[str]:
        """Return the list of district names."""
        return [
            'Vaishali', 'Supaul', 'Siwan', 'Sitamarhi', 'Sheohar', 'Sheikhpura', 'Saran',
            'Samastipur', 'Saharsa', 'Rohtas', 'Purvi Champaran', 'Purnia', 'Patna',
            'Pashchim Champaran', 'Nawada', 'Nalanda', 'Muzaffarpur', 'Munger',
            'Madhubani', 'Madhepura', 'Lakhisarai', 'Kishanganj', 'Khagaria', 'Katihar',
            'Kaimur (Bhabua)', 'Jehanabad', 'Jamui', 'Gopalganj', 'Gaya', 'Darbhanga',
            'Buxar', 'Bhojpur', 'Bhagalpur', 'Begusarai', 'Banka', 'Aurangabad', 'Arwal',
            'Araria'
        ]

    def load_aliases(self) -> Dict[str, str]:
        """Return the alternative-spelling -> canonical institution type table."""
        return {
            'HIGH SCHL': 'HIGH SCHOOL',
            'H SCHOOL': 'HIGH SCHOOL',
            'UCHYA VIDYALAYA': 'HIGH SCHOOL',
            '+2 SCHOOL': 'HIGH SCHOOL',
            'INTERMEDIATE SCHOOL': 'HIGH SCHOOL',
            'PULSE TWO': 'HIGH SCHOOL',
            '10+2 SCHOOL': 'HIGH SCHOOL',
            'MID. SCHOOL': 'MIDDLE SCHOOL',
            'MADHYA SCHOOL': 'MIDDLE SCHOOL',
            'MADHYAMIK SCHOOL': 'MIDDLE SCHOOL',
            'MADHYA VIDYALAYA': 'MIDDLE SCHOOL',
            'PRATHMIK SCHOOL': 'PRIMARY SCHOOL',
            'P SCHOOL': 'PRIMARY SCHOOL',
            'PRI. SCHOOL': 'PRIMARY SCHOOL',
            'KANYA SCHOOL': 'GIRLS SCHOOL',
            'KANYA VIDYALAYA': 'GIRLS SCHOOL',
            'UTKRAMIT MIDDLE SCHOOL': 'UPGRADED MIDDLE SCHOOL',
            'PRIM. HEALTH': 'PRIMARY HEALTH CENTRE',
            'POLICE SUPERINTENDENT OFFICE': 'SUPERINTENDENT OF POLICE OFFICE'
        }

    def normalize_text(self, text: str) -> str:
        """Upper-case `text`, drop punctuation other than commas, tidy whitespace.

        Non-string input yields "". Whitespace is trimmed after the punctuation
        is removed, so input such as "RWD -" no longer comes back with a
        trailing space.

        NOTE(review): this removes '.', '/' and '+', so punctuated keys in the
        lookup tables (e.g. 'H.S', 'P/S', '+2 SCHOOL') cannot match text that
        has been normalized first.
        """
        if not isinstance(text, str):
            return ""
        without_punctuation = re.sub(r'[^\w\s,]', '', text.upper())
        return re.sub(r'\s+', ' ', without_punctuation).strip()

# Instantiate the shared parser; it is handed to DataProcessor in the data-loading cell.
office_parser = OfficeColumnParser()

#Data Load

In [None]:
# Cell 6: Enhanced Data Loader with Alpaca-style prompts
class DataProcessor:
    """Loads the labelled office data and converts it into a tokenized training set."""

    # Training target used when a row has no usable name/location label.
    UNKNOWN_OUTPUT = '{"office_name": "UNKNOWN", "office_location": "UNKNOWN"}'

    def __init__(self, parser):
        self.parser = parser

    @staticmethod
    def _clean_field(value):
        """Return `value` as text, mapping None / NaN / NA to an empty string.

        Empty spreadsheet cells arrive from pandas as NaN, which is truthy and
        would otherwise be serialized as the invalid JSON token `NaN`.
        """
        if isinstance(value, str):
            return value
        if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
            return ""
        return str(value)

    def load_training_data(self):
        """Read the training spreadsheet from Drive.

        Returns:
            The spreadsheet as a DataFrame. If reading fails for any reason the
            error is printed and a two-row built-in sample frame with the
            columns 'office', 'office_name' and 'office_location' is returned.
        """
        try:
            # Load your training data
            df = pd.read_excel("/content/drive/MyDrive/civicBot/llm_training_data_v2.xlsx")
            print(f"Loaded training data: {len(df)} samples")
            return df
        except Exception as e:
            print(f"Error loading training data: {e}")
            # Make the fallback visible: training on it is only a smoke test.
            print("WARNING: falling back to the 2-row built-in sample data")
            return pd.DataFrame({
                'office': ['RURAL WORKS DEPARTMENT, WORKS DIVISION, MAHUA', 'ESD HAJIPUR'],
                'office_name': ['RURAL WORKS DEPARTMENT, WORKS DIVISION', 'ESD'],
                'office_location': ['MAHUA', 'HAJIPUR']
            })

    def create_alpaca_prompt(self, sample):
        """Build one Alpaca-style record from a labelled row.

        Args:
            sample: mapping-like row (dict or pandas Series) with an 'office'
                entry and optional 'office_name' / 'office_location' entries.

        Returns:
            Dict with 'instruction', 'input' and 'output' strings. 'output' is
            the JSON label, or the UNKNOWN placeholder when either label is
            missing or empty.
        """
        instruction = "Extract the office name and location from the following Bihar government office text. Return the result in JSON format with 'office_name' and 'office_location' fields."

        office = self._clean_field(sample['office'])
        office_name = self._clean_field(sample.get('office_name', ''))
        office_location = self._clean_field(sample.get('office_location', ''))

        if office_name and office_location:
            output = json.dumps(
                {"office_name": office_name, "office_location": office_location},
                ensure_ascii=False,
            )
        else:
            output = self.UNKNOWN_OUTPUT

        return {
            "instruction": instruction,
            "input": office,
            "output": output
        }

    def prepare_training_dataset(self):
        """Load, format and tokenize the training data.

        Uses the notebook-level `tokenizer` and `max_seq_length`.

        Returns:
            A tokenized `datasets.Dataset` with the text columns removed.
        """
        df = self.load_training_data()

        # Apply Alpaca formatting to every row.
        formatted_data = [self.create_alpaca_prompt(row) for row in df.to_dict("records")]

        # Convert to dataset
        dataset = Dataset.from_list(formatted_data)

        # Each example ends with the EOS token so the model is trained to stop
        # after the JSON answer instead of continuing to generate.
        eos_token = tokenizer.eos_token or ""

        def tokenize_function(examples):
            prompts = []
            for instruction, input_text, output_text in zip(
                examples['instruction'], examples['input'], examples['output']
            ):
                prompt = f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input_text}

### Response:
{output_text}"""
                prompts.append(prompt + eos_token)

            return tokenizer(prompts, truncation=True, max_length=max_seq_length, padding=False)

        # Tokenize dataset
        tokenized_dataset = dataset.map(
            tokenize_function,
            batched=True,
            batch_size=4,
            remove_columns=dataset.column_names
        )

        print(f"Prepared training dataset with {len(tokenized_dataset)} samples")
        return tokenized_dataset

# Build the data processor; the training-setup cell calls its prepare_training_dataset().
data_processor = DataProcessor(office_parser)

# LoRA Setup

In [None]:
# Wrap the loaded model with LoRA adapters via Unsloth; `model` is rebound to the wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank
    # Adapters are attached to the modules with these names.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing=True,
    random_state=3407,  # same value as the seed given to TrainingArguments
    max_seq_length=max_seq_length,
)

#Training Setup

In [None]:

# Prepare training data (loads the spreadsheet, formats the prompts, tokenizes them)
train_dataset = data_processor.prepare_training_dataset()

# Training arguments
# Effective batch per optimizer step is 2 x 4 = 8 sequences per device.
training_args = TrainingArguments(
    output_dir="./qwen2-bihar-offices-v2",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=50,
    max_steps=500,  # fixed step budget rather than a number of epochs
    learning_rate=2e-4,
    # Precision follows hardware support: bf16 where available, fp16 otherwise.
    # NOTE(review): the base model is loaded with dtype=torch.float16 — confirm this
    # combination is accepted on bf16-capable GPUs.
    fp16=not torch.cuda.is_bf16_supported(),
    bf16=torch.cuda.is_bf16_supported(),
    logging_steps=25,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    report_to="none",
    # Checkpoint every 250 steps and keep only the most recent one.
    save_steps=250,
    save_total_limit=1,
)

# Training

In [None]:
# Plain transformers Trainer over the tokenized dataset; the collator is built from
# the training tokenizer with masked-LM disabled (mlm=False).
trainer = transformers.Trainer(
    model=model,
    train_dataset=train_dataset,
    args=training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

# Clear memory before training
gc.collect()
torch.cuda.empty_cache()

print("Starting training...")
trainer.train()

# Save model and tokenizer into the same directory that is used as the Trainer output_dir.
# NOTE(review): `model` is the LoRA-wrapped model here, so this presumably writes the
# adapter weights rather than a merged model — confirm.
model.save_pretrained("./qwen2-bihar-offices-v2")
tokenizer.save_pretrained("./qwen2-bihar-offices-v2")
print("Model saved locally")

# Push to Hugging Face Hub

In [None]:

from huggingface_hub import login

# Read the Hub token once from the environment instead of embedding it in the
# notebook. When HF_TOKEN is unset the value is None and the token is left to
# huggingface_hub to obtain.
hf_token = os.environ.get("HF_TOKEN")

login(token=hf_token)

# Upload the trained model and its tokenizer to the same Hub repository.
hub_repo_id = "Kai-99/qwen2-1.5-bihar-offices-v2"
model.push_to_hub(hub_repo_id, token=hf_token)
tokenizer.push_to_hub(hub_repo_id, token=hf_token)

print("Model pushed to Hugging Face Hub!")

#Inference Engine

In [None]:
class InferenceEngine:
    """Runs the fine-tuned model on office strings and parses its JSON answer.

    The engine loads its own model and tokenizer in the constructor and uses
    only those; it does not depend on the notebook-level training objects.
    """

    # Garbage collection / CUDA cache cleanup runs once every this many batches.
    CLEANUP_EVERY_N_BATCHES = 5

    def __init__(self, model_path="Kai-99/qwen2-1.5-bihar-offices-v2", max_seq_length=1024):
        """Store the settings and load the model immediately.

        Args:
            model_path: model identifier or path handed to from_pretrained.
            max_seq_length: sequence limit used for loading and for prompt
                truncation.
        """
        self.model_path = model_path
        self.max_seq_length = max_seq_length
        self.model = None
        self.tokenizer = None
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.load_model()

    def load_model(self):
        """Load the trained model and tokenizer and switch the model to inference mode."""
        print("Loading model for inference...")
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=self.model_path,
            max_seq_length=self.max_seq_length,
            load_in_4bit=True,
            dtype=torch.float16,
        )
        # Unsloth's documented switch for generation after loading.
        FastLanguageModel.for_inference(self.model)
        print("Model loaded successfully!")

    def extract_office_info(self, text):
        """Generate and parse the office name / location for one input string.

        Args:
            text: raw office string.

        Returns:
            Dict parsed from the model's JSON answer, or the result of
            `fallback_parsing` when no JSON object can be decoded.
        """
        prompt = f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Extract the office name and location from the following Bihar government office text. Return the result in JSON format with 'office_name' and 'office_location' fields.

### Input:
{text}

### Response:
"""

        inputs = self.tokenizer(
            [prompt], return_tensors="pt", truncation=True, max_length=self.max_seq_length
        )
        inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
        prompt_length = inputs["input_ids"].shape[1]

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=128,
                temperature=0.1,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id
            )

        # Decode only the newly generated tokens. Splitting the full text on
        # "### Response:" would break if the input itself contained that marker.
        response = self.tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        ).strip()

        return self._parse_response(response, text)

    def _parse_response(self, response, original_text):
        """Decode the first JSON object in `response`, else use the fallback parser."""
        start_idx = response.find('{')
        if start_idx != -1:
            try:
                # raw_decode stops at the end of the first object, so trailing
                # text or a repeated answer does not invalidate the parse.
                parsed, _ = json.JSONDecoder().raw_decode(response, start_idx)
            except json.JSONDecodeError:
                parsed = None
            if isinstance(parsed, dict):
                return parsed

        return self.fallback_parsing(response, original_text)

    @staticmethod
    def _extract_field(response, key, terminators):
        """Return the text following `key` and a colon, or None when absent/empty.

        The key is matched case-insensitively. A quoted value is read up to its
        closing quote (so commas inside it are kept); an unquoted value ends at
        the first character matching the `terminators` regex.
        """
        found = re.search(re.escape(key), response, flags=re.IGNORECASE)
        if found is None:
            return None
        rest = response[found.end():]
        separator = re.match(r'\s*["\']?\s*:\s*', rest)
        if separator is None:
            return None
        rest = rest[separator.end():]
        if rest[:1] in ('"', "'"):
            closing = rest.find(rest[0], 1)
            value = rest[1:closing] if closing != -1 else rest[1:]
        else:
            value = re.split(terminators, rest, maxsplit=1)[0]
        value = value.strip(' "\'')
        return value or None

    def fallback_parsing(self, response, original_text):
        """Best-effort field extraction for answers that are not valid JSON.

        Args:
            response: generated text.
            original_text: the input string, used as the office name when none
                can be recovered.

        Returns:
            Dict with 'office_name' (recovered value or `original_text`) and
            'office_location' (recovered value or "UNKNOWN").
        """
        office_name = self._extract_field(response, "office_name", r'[,}\n]')
        office_location = self._extract_field(response, "office_location", r'[}\n]')

        return {
            "office_name": original_text if office_name in (None, "UNKNOWN") else office_name,
            "office_location": "UNKNOWN" if office_location is None else office_location
        }

    def batch_extract_office_info(self, texts, batch_size=4):
        """Run `extract_office_info` over `texts`, reporting progress per chunk.

        Items are processed one at a time; `batch_size` only controls how often
        progress is printed and memory is cleaned up. A failure on one item is
        recorded in its result instead of aborting the run.

        Returns:
            List of result dicts, one per input, in input order.
        """
        results = []
        for batch_index, start in enumerate(range(0, len(texts), batch_size)):
            for text in texts[start:start + batch_size]:
                try:
                    results.append(self.extract_office_info(text))
                except Exception as e:
                    results.append({
                        "office_name": text,
                        "office_location": f"ERROR: {str(e)}"
                    })

            print(f"Processed {min(start + batch_size, len(texts))}/{len(texts)} samples")

            # Memory cleanup; the cadence is counted in batches so it does not
            # depend on the batch size (same schedule as before for the default 4).
            if batch_index % self.CLEANUP_EVERY_N_BATCHES == 0:
                gc.collect()
                torch.cuda.empty_cache()

        return results

# Create the inference engine; its constructor calls load_model(), so this line loads the model.
inference_engine = InferenceEngine()

#Validation

In [None]:
def test_with_samples():
    """Test the model with sample inputs"""
    test_samples = [
        "RURAL WORKS DEPARTMENT, WORKS DIVISION, MAHUA",
        "ESD HAJIPUR",
        "DISTRICT EDUCATION OFFICE, PATNA",
        "PHC MADHEPURA",
        "PS SAMASTIPUR"
    ]

    print("Testing with sample data:\n")
    for sample in test_samples:
        result = inference_engine.extract_office_info(sample)
        print(f"Input: {sample}")
        print(f"Output: {result}")
        print("-" * 50)

# Smoke-test the inference engine on the built-in sample strings.
test_with_samples()

#Run

In [None]:
def process_unseen_data(
    dta_path="/content/drive/MyDrive/civicBot/_",
    n_samples=1000,
    random_state=42,
    engine=None,
):
    """Run extraction on a random sample of rows from a Stata (.dta) file.

    Args:
        dta_path: path of the .dta file to read.
        n_samples: number of rows to sample; capped at the number of rows in
            the file so small files no longer raise.
        random_state: seed for the row sampling.
        engine: object exposing `batch_extract_office_info(texts)`; defaults
            to the notebook-level `inference_engine`.

    Returns:
        DataFrame with columns office_original, office_name, office_location,
        district, cadre and year, one row per sampled input row.
    """
    if engine is None:
        # Resolved at call time so the function can be defined before the engine exists.
        engine = inference_engine

    # Load .dta file
    df = pd.read_stata(dta_path)

    # Sample rows; DataFrame.sample raises if asked for more rows than exist.
    sample_size = min(n_samples, len(df))
    sample_df = df[['office', 'district', 'cadre', 'year']].sample(
        n=sample_size, random_state=random_state
    )

    print(f"Processing {len(sample_df)} samples...")

    # Extract office info
    texts = sample_df['office'].tolist()
    extracted_results = engine.batch_extract_office_info(texts)

    # Combine results column-wise; the extracted list is in the same order as `texts`.
    return pd.DataFrame({
        'office_original': texts,
        'office_name': [item.get('office_name', '') for item in extracted_results],
        'office_location': [item.get('office_location', '') for item in extracted_results],
        'district': sample_df['district'].tolist(),
        'cadre': sample_df['cadre'].tolist(),
        'year': sample_df['year'].tolist(),
    })

# Run inference
results_df = process_unseen_data()

# Save to CSV; the path is kept in one variable so the log line reports the
# file that was actually written.
results_path = "/content/drive/MyDrive/civicBot/A_inference_results_1k_v2.csv"
results_df.to_csv(results_path, index=False)
print(f"Results saved to {results_path}")

# Display sample results
print("\nSample results:")
print(results_df.head(10))