In [None]:
!pip install torch transformers easyocr pandas tqdm requests



In [None]:
# Data preparation: download each image, run OCR on it, and build the training dataset
import os
import torch
import pandas as pd
import requests
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from io import BytesIO
from PIL import Image
import easyocr
from tqdm import tqdm
import numpy as np

def download_image(url, retries=3, backoff_factor=0.3):
    """Download the image at ``url`` and return it converted to RGB.

    Args:
        url: Address of the image to fetch.
        retries: Value used for the total, read and connect retry limits.
        backoff_factor: Backoff factor handed to the ``Retry`` policy.

    Returns:
        The decoded image converted to ``'RGB'``, or ``None`` when the
        request or the decoding fails (the error is printed, not raised).
    """
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=[500, 502, 503, 504],  # Retry on server errors
        raise_on_status=False
    )
    adapter = HTTPAdapter(max_retries=retry)

    # Use the session as a context manager so its adapters and pooled
    # connections are closed on every exit path; this function is called once
    # per dataset row, so an unclosed session per call would pile up.
    with requests.Session() as session:
        session.mount('http://', adapter)
        session.mount('https://', adapter)

        try:
            # Attempt to download the image
            response = session.get(url, timeout=10)
            response.raise_for_status()
            # convert() yields a new image, so nothing depends on the session
            # once this returns.
            image = Image.open(BytesIO(response.content)).convert('RGB')
            return image
        except Exception as e:
            # Best-effort by design: callers treat None as "skip this row".
            print(f"Error downloading image from {url}: {e}")
    return None

# Function to extract text from an image using EasyOCR
def extract_text_from_image(image, reader):
    """Run OCR on ``image`` and return the recognised text as one string.

    ``image`` is converted to a NumPy array and passed to
    ``reader.readtext`` with ``detail=0`` and ``paragraph=True``; the returned
    pieces are joined with single spaces. Any exception is printed and an
    empty string is returned instead.
    """
    text = ""
    try:
        pixels = np.array(image)
        fragments = reader.readtext(pixels, detail=0, paragraph=True)
        text = " ".join(fragments)
    except Exception as e:
        print(f"Error extracting text: {e}")
    return text

# Prepare the dataset for fine-tuning
def prepare_data(df, reader):
    """Build parallel lists of model prompts and target strings from ``df``.

    Each row supplies ``image_link``, ``entity_name`` and ``entity_value``.
    A row is skipped (and counted) when its target is missing, not a string
    or blank, when the image cannot be downloaded, or when OCR yields no
    text. Surviving rows become a ``"Question: ... Context: ..."`` prompt
    paired with the row's ``entity_value``.

    Returns:
        A ``(inputs, targets)`` tuple of equally long lists.
    """
    prompts, answers = [], []
    skipped = 0

    for idx, row in tqdm(df.iterrows(), total=df.shape[0]):
        image_url = row['image_link']
        entity_name = row['entity_name']
        entity_value = row['entity_value']

        # Guard 1: the target must be a non-blank string.
        target_ok = (
            not pd.isna(entity_value)
            and isinstance(entity_value, str)
            and entity_value.strip()
        )
        if not target_ok:
            print(f"Skipping entry with invalid target at index {idx}")
            skipped += 1
            continue

        # Guard 2: the image must be retrievable.
        image = download_image(image_url)
        if not image:
            print(f"Image download failed at index {idx}, skipping.")
            skipped += 1
            continue

        # Guard 3: OCR must produce some text to use as context.
        context = extract_text_from_image(image, reader)
        if not context.strip():
            print(f"No text extracted from image at index {idx}, skipping.")
            skipped += 1
            continue

        question = f"What is the {entity_name}?"
        prompts.append(f"Question: {question} Context: {context}")
        answers.append(entity_value)

    print(f"Data preparation completed. Skipped {skipped} entries due to invalid data.")
    return prompts, answers

# Load the training CSV; prepare_data reads its 'image_link', 'entity_name'
# and 'entity_value' columns.
df = pd.read_csv('/kaggle/input/dataset/train.csv')  # Replace with your actual CSV path
df = df.head(5000)  # Use more data if available

# Set up the OCR reader (English only); the GPU is used when CUDA is available
reader = easyocr.Reader(['en'], gpu=torch.cuda.is_available())

# Prepare the data: downloads and OCRs every remaining row
inputs, targets = prepare_data(df, reader)

# Save the prepared dataset to disk.
# NOTE(review): this path is relative, while the training cells load
# '/kaggle/working/prepared_data.pth' — the two only match when the working
# directory is /kaggle/working; confirm for other environments.
prepared_data = {'inputs': inputs, 'targets': targets}
torch.save(prepared_data, 'prepared_data.pth')

print("Data preparation completed and saved to 'prepared_data.pth'.")


In [None]:
# function for training
import os
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration
from tqdm import tqdm

# Set CUDA configuration to avoid memory fragmentation
# NOTE(review): presumably this only takes effect if it is set before the
# CUDA allocator is first used in this process — confirm when an earlier cell
# in the same kernel has already touched the GPU.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

# Enable cuDNN benchmark
torch.backends.cudnn.benchmark = True

# Custom Dataset class for handling the inputs and targets
class EntityDataset(Dataset):
    """Dataset that tokenizes (input text, target text) pairs on access.

    Args:
        inputs: Indexable collection of prompt values (converted with ``str``).
        targets: Indexable collection of target values, aligned with ``inputs``.
        tokenizer: Callable tokenizer exposing ``pad_token_id``; it is called
            with ``max_length``, ``padding``, ``truncation`` and
            ``return_tensors`` keyword arguments.
        max_length: Fixed token length the inputs are padded/truncated to.
        max_target_length: Fixed token length the targets are
            padded/truncated to. Defaults to 32, the value that was
            previously hard-coded.
    """

    def __init__(self, inputs, targets, tokenizer, max_length=256, max_target_length=32):
        self.inputs = inputs
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.max_target_length = max_target_length

    def __len__(self):
        """Return the number of examples (the length of ``inputs``)."""
        return len(self.inputs)

    def __getitem__(self, index):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for one example."""
        input_text = str(self.inputs[index])
        target_text = str(self.targets[index])

        input_encoding = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )
        target_encoding = self.tokenizer(
            target_text,
            max_length=self.max_target_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )

        # return_tensors="pt" adds a leading batch dimension; flatten it away
        # so the DataLoader can add its own.
        input_ids = input_encoding['input_ids'].flatten()
        attention_mask = input_encoding['attention_mask'].flatten()
        labels = target_encoding['input_ids'].flatten()

        # Replace padding token id's of the labels by -100 so it's ignored by the loss
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }

def main():
    """Fine-tune ``google/flan-t5-base`` on the prepared prompt/target pairs.

    Loads ``/kaggle/working/prepared_data.pth``, drops pairs with a blank
    prompt or target, uses the first 90% for training and the remainder for
    validation, and trains for three epochs with AdamW and gradient clipping.
    After every epoch it prints the training loss, the validation loss and
    the case-insensitive exact-match accuracy of generated answers. The model
    and tokenizer are finally saved to ``fine_tuned_model``.

    Returns early (after printing a message) when no valid pair exists.
    """
    # Clear CUDA cache to free memory before loading the model
    torch.cuda.empty_cache()

    # Load the prepared data
    # NOTE: torch.load unpickles the file; only load files this notebook wrote.
    prepared_data = torch.load('/kaggle/working/prepared_data.pth')
    inputs, targets = prepared_data['inputs'], prepared_data['targets']

    # Check for empty inputs or targets
    valid_data = [(inp, tgt) for inp, tgt in zip(inputs, targets) if inp.strip() and tgt.strip()]
    if not valid_data:
        print("No valid data available for training.")
        return

    inputs, targets = zip(*valid_data)

    # Split data into training and validation sets. Keep at least one training
    # example: with fewer than two pairs int(0.9 * n) is 0, and a shuffling
    # DataLoader cannot be built over an empty dataset.
    train_size = max(1, int(0.9 * len(inputs)))
    train_inputs, val_inputs = inputs[:train_size], inputs[train_size:]
    train_targets, val_targets = targets[:train_size], targets[train_size:]

    # Initialize tokenizer and model
    tokenizer = T5Tokenizer.from_pretrained('google/flan-t5-base')  # Use 't5-small' if needed
    model = T5ForConditionalGeneration.from_pretrained('google/flan-t5-base')

    # Move model to GPU
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Enable gradient checkpointing to save memory
    model.gradient_checkpointing_enable()

    # Create datasets and dataloaders
    train_dataset = EntityDataset(train_inputs, train_targets, tokenizer)
    val_dataset = EntityDataset(val_inputs, val_targets, tokenizer)

    # Reduce batch size to fit in memory
    train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=1)

    # Optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

    # Implement gradient clipping
    from torch.nn.utils import clip_grad_norm_

    # Fine-tuning loop
    for epoch in range(3):  # Adjust the number of epochs as necessary
        model.train()
        total_loss = 0.0
        train_batches = 0  # batches that actually contributed to total_loss
        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}"):
            optimizer.zero_grad()

            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            )
            loss = outputs.loss

            # Check for NaN loss
            if torch.isnan(loss):
                print("NaN loss encountered, skipping this batch.")
                continue

            total_loss += loss.item()
            train_batches += 1

            loss.backward()

            # Clip gradients
            clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

        # Average over the batches that were counted, so skipped NaN batches
        # do not drag the mean down; report NaN when nothing was counted.
        avg_loss = total_loss / train_batches if train_batches else float("nan")
        print(f"Epoch {epoch + 1}, Average Loss: {avg_loss:.4f}")

        # Validation after each epoch
        model.eval()
        correct_predictions = 0
        total_predictions = 0
        val_loss = 0.0
        val_batches = 0
        with torch.no_grad():
            for batch in tqdm(val_loader, desc="Validation"):
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)

                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels,
                )
                loss = outputs.loss

                # Check for NaN loss
                if torch.isnan(loss):
                    print("NaN loss encountered in validation, skipping this batch.")
                    continue

                val_loss += loss.item()
                val_batches += 1

                # Generate predictions
                generated_ids = model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    max_length=32,
                    num_beams=2,
                )
                preds = tokenizer.batch_decode(
                    generated_ids, skip_special_tokens=True
                )

                # Decode labels while ignoring -100
                labels_for_decoding = labels.clone()
                labels_for_decoding[labels_for_decoding == -100] = tokenizer.pad_token_id
                # Named separately from the outer `targets` tuple to avoid
                # shadowing it.
                decoded_targets = tokenizer.batch_decode(
                    labels_for_decoding, skip_special_tokens=True
                )

                # Compare predictions with ground truth
                for pred, target in zip(preds, decoded_targets):
                    if pred.strip().lower() == target.strip().lower():
                        correct_predictions += 1
                    total_predictions += 1

        # The validation split can be empty for very small datasets, and NaN
        # batches are skipped, so guard the division the same way as above.
        avg_val_loss = val_loss / val_batches if val_batches else float("nan")
        accuracy = (correct_predictions / total_predictions * 100) if total_predictions > 0 else 0
        print(f"Validation Loss: {avg_val_loss:.4f}, Accuracy: {accuracy:.2f}%")

    # Save the model
    model.save_pretrained("fine_tuned_model")
    tokenizer.save_pretrained("fine_tuned_model")
    print("Model saved to 'fine_tuned_model' directory.")

# Start training only when this code runs as the main module, not on import.
if __name__ == "__main__":
    main()


In [None]:
# continue training
import os
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration
from tqdm import tqdm

# Set CUDA configuration to avoid memory fragmentation
# NOTE(review): presumably this only takes effect if it is set before the
# CUDA allocator is first used in this process — confirm when an earlier cell
# in the same kernel has already touched the GPU.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

# Enable cuDNN benchmark
torch.backends.cudnn.benchmark = True

# Custom Dataset class for handling the inputs and targets
class EntityDataset(Dataset):
    """Dataset that tokenizes (input text, target text) pairs on access.

    Args:
        inputs: Indexable collection of prompt values (converted with ``str``).
        targets: Indexable collection of target values, aligned with ``inputs``.
        tokenizer: Callable tokenizer exposing ``pad_token_id``; it is called
            with ``max_length``, ``padding``, ``truncation`` and
            ``return_tensors`` keyword arguments.
        max_length: Fixed token length the inputs are padded/truncated to.
        max_target_length: Fixed token length the targets are
            padded/truncated to. Defaults to 32, the value that was
            previously hard-coded.
    """

    def __init__(self, inputs, targets, tokenizer, max_length=256, max_target_length=32):
        self.inputs = inputs
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.max_target_length = max_target_length

    def __len__(self):
        """Return the number of examples (the length of ``inputs``)."""
        return len(self.inputs)

    def __getitem__(self, index):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for one example."""
        input_text = str(self.inputs[index])
        target_text = str(self.targets[index])

        input_encoding = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )
        target_encoding = self.tokenizer(
            target_text,
            max_length=self.max_target_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt",
        )

        # return_tensors="pt" adds a leading batch dimension; flatten it away
        # so the DataLoader can add its own.
        input_ids = input_encoding['input_ids'].flatten()
        attention_mask = input_encoding['attention_mask'].flatten()
        labels = target_encoding['input_ids'].flatten()

        # Replace padding token id's of the labels by -100 so it's ignored by the loss
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }

def main():
    """Continue fine-tuning the model previously saved in ``fine_tuned_model``.

    Loads ``/kaggle/working/prepared_data.pth``, drops pairs with a blank
    prompt or target, uses the first 90% for training and the remainder for
    validation, and trains for three more epochs with a freshly created AdamW
    optimizer (no optimizer state is loaded) and gradient clipping. After
    every epoch it prints the training loss, the validation loss and the
    case-insensitive exact-match accuracy of generated answers. The model and
    tokenizer are then written back to ``fine_tuned_model``.

    Returns early (after printing a message) when no valid pair exists.
    """
    # Clear CUDA cache to free memory before loading the model
    torch.cuda.empty_cache()

    # Load the prepared data
    # NOTE: torch.load unpickles the file; only load files this notebook wrote.
    prepared_data = torch.load('/kaggle/working/prepared_data.pth')
    inputs, targets = prepared_data['inputs'], prepared_data['targets']

    # Check for empty inputs or targets
    valid_data = [(inp, tgt) for inp, tgt in zip(inputs, targets) if inp.strip() and tgt.strip()]
    if not valid_data:
        print("No valid data available for training.")
        return

    inputs, targets = zip(*valid_data)

    # Split data into training and validation sets. Keep at least one training
    # example: with fewer than two pairs int(0.9 * n) is 0, and a shuffling
    # DataLoader cannot be built over an empty dataset.
    train_size = max(1, int(0.9 * len(inputs)))
    train_inputs, val_inputs = inputs[:train_size], inputs[train_size:]
    train_targets, val_targets = targets[:train_size], targets[train_size:]

    # Load the saved model and tokenizer
    tokenizer = T5Tokenizer.from_pretrained('fine_tuned_model')
    model = T5ForConditionalGeneration.from_pretrained('fine_tuned_model')

    # Move model to GPU
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Enable gradient checkpointing to save memory
    model.gradient_checkpointing_enable()

    # Create datasets and dataloaders
    train_dataset = EntityDataset(train_inputs, train_targets, tokenizer)
    val_dataset = EntityDataset(val_inputs, val_targets, tokenizer)

    # Reduce batch size to fit in memory
    train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=1)

    # Optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

    # Implement gradient clipping
    from torch.nn.utils import clip_grad_norm_

    # Fine-tuning loop (resuming from saved model)
    for epoch in range(3):  # Adjust the number of epochs as necessary
        model.train()
        total_loss = 0.0
        train_batches = 0  # batches that actually contributed to total_loss
        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}"):
            optimizer.zero_grad()

            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels,
            )
            loss = outputs.loss

            # Check for NaN loss
            if torch.isnan(loss):
                print("NaN loss encountered, skipping this batch.")
                continue

            total_loss += loss.item()
            train_batches += 1

            loss.backward()

            # Clip gradients
            clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()

        # Average over the batches that were counted, so skipped NaN batches
        # do not drag the mean down; report NaN when nothing was counted.
        avg_loss = total_loss / train_batches if train_batches else float("nan")
        print(f"Epoch {epoch + 1}, Average Loss: {avg_loss:.4f}")

        # Validation after each epoch
        model.eval()
        correct_predictions = 0
        total_predictions = 0
        val_loss = 0.0
        val_batches = 0
        with torch.no_grad():
            for batch in tqdm(val_loader, desc="Validation"):
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)

                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels,
                )
                loss = outputs.loss

                # Check for NaN loss
                if torch.isnan(loss):
                    print("NaN loss encountered in validation, skipping this batch.")
                    continue

                val_loss += loss.item()
                val_batches += 1

                # Generate predictions
                generated_ids = model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    max_length=32,
                    num_beams=2,
                )
                preds = tokenizer.batch_decode(
                    generated_ids, skip_special_tokens=True
                )

                # Decode labels while ignoring -100
                labels_for_decoding = labels.clone()
                labels_for_decoding[labels_for_decoding == -100] = tokenizer.pad_token_id
                # Named separately from the outer `targets` tuple to avoid
                # shadowing it.
                decoded_targets = tokenizer.batch_decode(
                    labels_for_decoding, skip_special_tokens=True
                )

                # Compare predictions with ground truth
                for pred, target in zip(preds, decoded_targets):
                    if pred.strip().lower() == target.strip().lower():
                        correct_predictions += 1
                    total_predictions += 1

        # The validation split can be empty for very small datasets, and NaN
        # batches are skipped, so guard the division the same way as above.
        avg_val_loss = val_loss / val_batches if val_batches else float("nan")
        accuracy = (correct_predictions / total_predictions * 100) if total_predictions > 0 else 0
        print(f"Validation Loss: {avg_val_loss:.4f}, Accuracy: {accuracy:.2f}%")

    # Save the model after fine-tuning
    model.save_pretrained("fine_tuned_model")
    tokenizer.save_pretrained("fine_tuned_model")
    print("Model saved to 'fine_tuned_model' directory.")

# Resume training only when this code runs as the main module, not on import.
if __name__ == "__main__":
    main()


In [None]:
# Predictor
import os
import torch
import pandas as pd
import requests
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from io import BytesIO
from PIL import Image
import easyocr
from transformers import T5Tokenizer, T5ForConditionalGeneration
from tqdm import tqdm
import numpy as np

# Function to download an image from a URL with retry logic and delay
def download_image(url, retries=3, backoff_factor=0.3):
    """Download the image at ``url`` and return it converted to RGB.

    Args:
        url: Address of the image to fetch.
        retries: Value used for the total, read and connect retry limits.
        backoff_factor: Backoff factor handed to the ``Retry`` policy.

    Returns:
        The decoded image converted to ``'RGB'``, or ``None`` when the
        request or the decoding fails (the error is printed, not raised).
    """
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=[500, 502, 503, 504],  # Retry on server errors
        raise_on_status=False
    )
    adapter = HTTPAdapter(max_retries=retry)

    # Use the session as a context manager so its adapters and pooled
    # connections are closed on every exit path; this function is called once
    # per test row, so an unclosed session per call would pile up.
    with requests.Session() as session:
        session.mount('http://', adapter)
        session.mount('https://', adapter)

        try:
            # Attempt to download the image
            response = session.get(url, timeout=10)
            response.raise_for_status()
            # convert() yields a new image, so nothing depends on the session
            # once this returns.
            image = Image.open(BytesIO(response.content)).convert('RGB')
            return image
        except Exception as e:
            # Best-effort by design: callers treat None as a failed download.
            print(f"Error downloading image from {url}: {e}")
    return None

# Function to extract text from an image using EasyOCR
def extract_text_from_image(image, reader):
    """Return the OCR text found in ``image`` as a single space-joined string.

    The image is handed to ``reader.readtext`` as a NumPy array with
    ``detail=0`` and ``paragraph=True``. If anything raises, the error is
    printed and ``""`` is returned.
    """
    joined = ""
    try:
        as_array = np.array(image)
        pieces = reader.readtext(as_array, detail=0, paragraph=True)
        joined = " ".join(pieces)
    except Exception as e:
        print(f"Error extracting text: {e}")
    return joined

def main():
    """Predict an entity value for each test row and write them to a CSV.

    Reads the first 10000 rows of the test CSV, checks that the
    ``image_link`` and ``entity_name`` columns exist, then for every row
    downloads the image, OCRs it, builds a ``"Question: ... Context: ..."``
    prompt (with an empty context when download or OCR fails) and generates
    an answer with the fine-tuned T5 model. Results go to ``sample_out.csv``.
    """
    # Load the test data
    test_df = pd.read_csv('/kaggle/input/qweqwe/test.csv')  # Replace with your test CSV file path
    test_df = test_df[:10000]

    # Bail out early if a required column is absent
    required_columns = ['image_link', 'entity_name']
    available = test_df.columns
    missing_columns = [name for name in required_columns if name not in available]
    if missing_columns:
        print(f"Error: Test data is missing columns: {missing_columns}")
        return

    # OCR reader, on the GPU when one is available
    use_gpu = torch.cuda.is_available()
    reader = easyocr.Reader(['en'], gpu=use_gpu)

    # Fine-tuned model and tokenizer
    model_dir = '/kaggle/working/fine_tuned_model'
    tokenizer = T5Tokenizer.from_pretrained(model_dir)
    model = T5ForConditionalGeneration.from_pretrained(model_dir)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.eval()  # inference only

    records = []
    row_iter = tqdm(test_df.iterrows(), total=len(test_df), desc="Processing Test Data")
    for idx, row in row_iter:
        image_url = row['image_link']
        entity_name = row['entity_name']

        # The context stays empty unless both the download and OCR succeed
        context = ""
        image = download_image(image_url)
        if not image:
            print(f"Image download failed at index {idx}")
        else:
            ocr_text = extract_text_from_image(image, reader)
            if ocr_text.strip():
                context = ocr_text
            else:
                print(f"No text extracted from image at index {idx}")

        question = f"What is the {entity_name}?"
        input_text = f"Question: {question} Context: {context}"

        # Tokenize the prompt to a fixed length of 256 tokens
        encoded = tokenizer(
            input_text,
            max_length=256,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        input_ids = encoded['input_ids'].to(device)
        attention_mask = encoded['attention_mask'].to(device)

        # Generate the answer without tracking gradients
        with torch.no_grad():
            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=32,
                num_beams=2,
                early_stopping=True,
            )
            pred = tokenizer.decode(
                generated_ids[0],
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True
            )
            record = {
                'image_link': image_url,
                'entity_name': entity_name,
                'predicted_value': pred
            }
            records.append(record)

    # Write all predictions out as a CSV without the index column
    output_df = pd.DataFrame(records)
    output_df.to_csv('sample_out.csv', index=False)
    print("Predictions saved to 'sample_out.csv'")

# Run prediction only when this code runs as the main module, not on import.
if __name__ == '__main__':
    main()
