# Train the crop recommendation model

In [30]:
# Cell 1: Import libraries
import torch
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    TrainingArguments, 
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import Dataset
import pandas as pd
import os

In [None]:
# Cell 2: Configuration
# Checkpoint to fine-tune. The load cell further down reports 124,439,808
# parameters for it.
MODEL_NAME = "microsoft/DialoGPT-small"

# Input CSV of prompt/response pairs and the directory that receives
# checkpoints and the final model (both relative to the notebook).
DATASET_PATH = "../data/llm_train/prompt_response_dataset.csv"
OUTPUT_DIR = "../models/crop_recommendation_proper"

# Echo the active settings so they are recorded next to the run output.
print(
    f"üöÄ Using model: {MODEL_NAME}",
    f"üíæ Output directory: {OUTPUT_DIR}",
    sep="\n",
)

üöÄ Using model: microsoft/DialoGPT-small
üíæ Output directory: ../models/crop_recommendation_proper


In [32]:
# Cell 3: Load and analyze data
print("üìä Loading and analyzing dataset...")

# Read the prompt/response pairs from the configured CSV.
df = pd.read_csv(DATASET_PATH)
row_count = len(df)
print(f"Dataset size: {row_count}")

# Show the first pair so the column contents can be checked by eye.
prompts, responses = df['prompt'], df['response']
print("\nüîç Data sample:")
print("Prompt:", prompts.iloc[0])
print("Response:", responses.iloc[0])

# List up to ten distinct response values, in order of first appearance.
distinct_responses = responses.unique()
print("\nUnique crops:", distinct_responses[:10])

üìä Loading and analyzing dataset...
Dataset size: 4513

üîç Data sample:
Prompt: Given that soil color is Black, nitrogen is 75, phosphorus is 50, potassium is 100, ph is 6.5, rainfall is 1000, temperature is 20, which crop should be planted?
Response: Sugarcane

Unique crops: ['Sugarcane' 'Jowar' 'Cotton' 'Rice' 'Wheat' 'Groundnut' 'Maize' 'Tur'
 'Urad' 'Moong']


In [33]:
# Cell 4: Create proper training format
print("üìù Creating training format...")


def format_training_example(row):
    """Render one prompt/response pair as a single training string.

    Args:
        row: Any mapping-like object exposing ``'prompt'`` and ``'response'``
            entries (for example a dataframe row).

    Returns:
        The text ``"### Instruction:\\n<prompt>\\n\\n### Response:\\n<response>"``.
        The same layout should be used when building prompts at inference.
    """
    sections = (
        "### Instruction:",
        f"{row['prompt']}",
        "",  # blank line separating the instruction from the response header
        "### Response:",
        f"{row['response']}",
    )
    return "\n".join(sections)

# Render every dataframe row through the instruction/response template.
training_texts = list(
    map(format_training_example, (record for _, record in df.iterrows()))
)

# Verify format: show the first rendered example followed by a separator rule.
print("Sample training text:")
print(training_texts[0], "-" * 50, sep="\n")

# Wrap the rendered strings in a single-column ("text") dataset.
dataset = Dataset.from_dict({"text": training_texts})

# Hold out 10% for evaluation; the fixed seed keeps the shuffled split
# reproducible across runs.
train_test_split = dataset.train_test_split(test_size=0.1, seed=42, shuffle=True)
train_dataset, eval_dataset = train_test_split["train"], train_test_split["test"]

print(
    f"Train size: {len(train_dataset)}",
    f"Eval size: {len(eval_dataset)}",
    sep="\n",
)

üìù Creating training format...
Sample training text:
### Instruction:
Given that soil color is Black, nitrogen is 75, phosphorus is 50, potassium is 100, ph is 6.5, rainfall is 1000, temperature is 20, which crop should be planted?

### Response:
Sugarcane
--------------------------------------------------
Train size: 4061
Eval size: 452


In [34]:
# Cell 5: Load model and tokenizer
print("üöÄ Loading model and tokenizer...")

# Tokenizer and weights are both resolved from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# Reuse the end-of-sequence token as the padding token so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token

# Total number of scalar weights across all parameter tensors.
parameter_count = sum(weights.numel() for weights in model.parameters())

print(f"‚úÖ Model loaded: {MODEL_NAME}")
print(f"üìä Model parameters: {parameter_count:,}")

üöÄ Loading model and tokenizer...
‚úÖ Model loaded: microsoft/DialoGPT-small
üìä Model parameters: 124,439,808


In [35]:
# Cell 6: Tokenization (padding and labels are left to the data collator)
MAX_SEQ_LENGTH = 256  # truncation cap, in tokens


def tokenize_function(examples):
    """Tokenize a batch of training texts without padding them.

    Sequences are truncated to ``MAX_SEQ_LENGTH`` but keep their natural
    length otherwise. Padding every example to the cap made each training
    step process mostly pad positions; DataCollatorForLanguageModeling pads
    each batch to its own longest sequence instead.

    No ``labels`` entry is produced: with ``mlm=False`` the collator builds
    labels from ``input_ids`` itself (ignoring pad positions), so a
    precomputed copy would be overwritten and, with variable-length rows,
    could not be padded by the collator.

    Args:
        examples: Batch mapping supplied by ``Dataset.map(batched=True)``;
            only its ``"text"`` entry is read.

    Returns:
        The tokenizer output for the batch (``input_ids`` and
        ``attention_mask`` as lists of per-example lists).
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
    )


print("üîß Tokenizing datasets...")
# remove_columns drops the raw text in the same pass, leaving only the
# tokenizer outputs.
tokenized_train = train_dataset.map(
    tokenize_function, batched=True, batch_size=1000, remove_columns=["text"]
)
tokenized_eval = eval_dataset.map(
    tokenize_function, batched=True, batch_size=1000, remove_columns=["text"]
)

print("‚úÖ Tokenization complete")
print("Sample tokenized:", {k: v[:5] for k, v in tokenized_train[0].items() if k in ['input_ids', 'attention_mask']})

üîß Tokenizing datasets...


Map:   0%|          | 0/4061 [00:00<?, ? examples/s]

Map:   0%|          | 0/452 [00:00<?, ? examples/s]

‚úÖ Tokenization complete
Sample tokenized: {'input_ids': [21017, 46486, 25, 198, 15056], 'attention_mask': [1, 1, 1, 1, 1]}


In [36]:
# Cell 7: Optimized training arguments for proper learning
# Keyword arguments are grouped by concern; the values are unchanged.
training_args = TrainingArguments(
    # --- Optimisation schedule ---
    num_train_epochs=5,                    # More epochs for proper learning
    learning_rate=5e-5,                    # Proper fine-tuning LR
    warmup_steps=100,
    # --- Batching ---
    per_device_train_batch_size=2,         # Smaller batch for stability
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,         # Effective batch size = 2 * 4 = 8
    dataloader_drop_last=True,
    remove_unused_columns=True,
    # --- Evaluation and checkpointing (same cadence for both) ---
    eval_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=2,
    output_dir=OUTPUT_DIR,
    # --- Best-checkpoint selection: lowest eval_loss wins ---
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    prediction_loss_only=True,             # Only compute loss for speed
    # --- Runtime and reporting ---
    use_cpu=True,
    logging_steps=50,
    report_to=[],
)

# Echo the headline hyperparameters as resolved by TrainingArguments.
print(
    "‚öôÔ∏è Training configuration:",
    f"  - Epochs: {training_args.num_train_epochs}",
    f"  - Batch size: {training_args.per_device_train_batch_size}",
    f"  - Learning rate: {training_args.learning_rate}",
    sep="\n",
)

‚öôÔ∏è Training configuration:
  - Epochs: 5
  - Batch size: 2
  - Learning rate: 5e-05


In [37]:
# Cell 8: Data collator and trainer
# mlm=False selects causal-LM collation: the collator pads each batch and
# derives `labels` from `input_ids`, ignoring pad positions in the loss.
# NOTE(review): pad_token was set to eos_token when the tokenizer was loaded,
# so any EOS id in the inputs is presumably excluded from the loss as well;
# confirm this is acceptable if the model is expected to learn when to stop.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# The tokenizer is passed as `processing_class`; the older `tokenizer=`
# keyword of Trainer is deprecated and emits a FutureWarning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
    processing_class=tokenizer,
)

print("‚úÖ Trainer ready!")

‚úÖ Trainer ready!


  trainer = Trainer(


In [38]:
# Cell 9: Train with progress monitoring
print("üéØ Starting proper training...")
print("‚è±Ô∏è  Estimated training time: 20-40 minutes")

try:
    # Train the model
    train_result = trainer.train()

    # Save the final model (to the trainer's output directory) and the
    # tokenizer alongside it so the directory can be loaded on its own.
    trainer.save_model()
    tokenizer.save_pretrained(OUTPUT_DIR)

    print("‚úÖ Training completed!")
    print(f"üíæ Model saved to: {OUTPUT_DIR}")
    # metrics['train_loss'] is the loss averaged over the whole run, not the
    # loss of the last step, so it is labelled as an average.
    print(f"üìà Average training loss: {train_result.metrics['train_loss']:.4f}")

except Exception as e:
    # Report a short summary, then re-raise so the traceback is kept and a
    # Run All stops here instead of continuing without a trained model.
    print(f"‚ùå Training failed: {e}")
    raise

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.


üéØ Starting proper training...
‚è±Ô∏è  Estimated training time: 20-40 minutes


Step,Training Loss,Validation Loss
200,0.3824,0.341853
400,0.3301,0.308587
600,0.3084,0.289085
800,0.2917,0.274956
1000,0.2859,0.268073
1200,0.2749,0.258916
1400,0.2696,0.257243
1600,0.26,0.253247
1800,0.258,0.24521
2000,0.2547,0.241934


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


‚úÖ Training completed!
üíæ Model saved to: ../models/crop_recommendation_proper
üìà Final training loss: 0.4404
