In [None]:
#pip install -U langchain-community
!pip install PyPDF2

In [None]:
!pip uninstall googletrans
!pip install googletrans==4.0.0-rc1
!pip install --upgrade httpcore httpx



In [1]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from tqdm import tqdm
import warnings
import os
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from PyPDF2 import PdfReader
import re
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from deep_translator import GoogleTranslator


warnings.filterwarnings('ignore')

In [3]:


# ---------------------------------------------------------------------------
# Configuration constants
# ---------------------------------------------------------------------------
MODEL_NAME = "meta-llama/Llama-3.2-3b"                          # causal LM identifier
EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"     # embedding model identifier
CHUNK_SIZE = 1000        # text-splitter chunk size
CHUNK_OVERLAP = 200      # text-splitter overlap between chunks
CONFIDENCE_PDF = 1.0     # confidence score for PDF-sourced descriptions
CONFIDENCE_LLM = 0.7     # confidence score for prompt-sourced descriptions

# ---------------------------------------------------------------------------
# Status code mappings (numeric id -> display label)
# ---------------------------------------------------------------------------
MACHINE_TYPE_MAPPING = {
    1: "New",
    2: "Used",
}
CONDITION_MAPPING = {
    1: "In Stock",
    2: "Running",
    3: "Rebuilt",
    4: "In Transit",
    5: "In Production",
    6: "Excellent",
}
AVAILABILITY_MAPPING = {
    1: "Immediately",
    2: "Less than 30 days",
    3: "More than 30 days",
    4: "Immediately from stock",
}

In [13]:


def is_english(text):
    """Return True when `text` should be treated as English.

    The check is deliberately permissive: empty or non-string input, text
    shorter than 10 characters (after collapsing whitespace), and any
    detection failure all count as English. Only a successful detection of a
    language other than 'en' returns False.
    """
    if not (text and isinstance(text, str)):
        return True  # nothing to detect on
    try:
        normalized = ' '.join(text.split())  # collapse runs of whitespace
        if len(normalized) >= 10:
            return detect(normalized) == 'en'
        # Too short to run detection on; assume English.
        return True
    except LangDetectException:
        # The detector could not decide; assume English.
        return True
    except Exception as e:
        print(f"Language detection error: {str(e)}")
        return True

def _split_text_for_translation(text, chunk_size):
    """Split `text` into stripped pieces of at most `chunk_size` characters, cutting at whitespace when possible."""
    chunks = []
    start, total = 0, len(text)
    while start < total:
        end = min(start + chunk_size, total)
        if end < total and not text[end].isspace():
            # The cut would land inside a word: back up to the last whitespace
            # in this window so the word stays intact in the next piece.
            boundary = end
            while boundary > start and not text[boundary - 1].isspace():
                boundary -= 1
            if boundary > start:
                end = boundary
            # else: a single token longer than chunk_size; a hard cut is unavoidable.
        piece = text[start:end].strip()
        if piece:
            chunks.append(piece)
        start = end
    return chunks


def translate_to_english(text: str, chunk_size: int = 1000) -> str:
    """Translate the non-English parts of `text` to English, chunk by chunk.

    The text is split into pieces of at most `chunk_size` characters. Pieces
    are cut at whitespace so that re-joining them with a space does not insert
    a space in the middle of a word. Each piece that `is_english` rejects is
    sent to the translator; English pieces are kept as they are.

    Args:
        text: Text to translate. Empty or non-string input is returned unchanged.
        chunk_size: Maximum number of characters per translation request.

    Returns:
        The pieces joined by single spaces. A piece whose translation fails or
        comes back empty is kept in its original language.

    Raises:
        ValueError: If `chunk_size` is not positive.
    """
    if not text or not isinstance(text, str):
        return text
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    chunks = _split_text_for_translation(text, chunk_size)
    if not chunks:
        # Whitespace-only input: nothing to translate.
        return text

    translator = GoogleTranslator(source='auto', target='en')
    translated = []
    for chunk in chunks:
        try:
            if is_english(chunk):
                translated.append(chunk)
            else:
                result = translator.translate(chunk)
                # Guard against an empty/None result so the join below cannot fail.
                translated.append(result if result else chunk)
        except Exception:
            # Best effort: keep the untranslated piece rather than losing it.
            translated.append(chunk)
    return ' '.join(translated)

def load_model(model_name=None):
    """Load the causal language model and its tokenizer.

    Args:
        model_name: Optional model identifier. Defaults to the notebook-level
            `MODEL_NAME` constant so the model is configured in one place.

    Returns:
        A `(model, tokenizer)` tuple. The model is loaded with
        `torch_dtype=torch.float16` and `device_map="auto"`.

    Raises:
        RuntimeError: If the tokenizer or model cannot be loaded; the original
            exception is attached as the cause.
    """
    try:
        resolved_name = model_name or MODEL_NAME
        tokenizer = AutoTokenizer.from_pretrained(resolved_name)
        model = AutoModelForCausalLM.from_pretrained(
            resolved_name,
            torch_dtype=torch.float16,
            device_map="auto"
        )
        return model, tokenizer
    except Exception as e:
        raise RuntimeError(f"Failed to load model: {str(e)}") from e

def clean_value(value):
    """Normalize a cell value to a stripped string, or None when it is missing.

    Rules:
      * Integers (Python or NumPy, including 0) become their decimal string.
      * Floats holding a whole number become the integer string ("5", not
        "5.0"), so callers can pass the result to `int()`. pandas turns integer
        columns that contain blanks into floats, which is where these arise.
      * NaN / None / pd.NA / pd.NaT, empty values and the text "nan" (any
        case) become None.
      * Anything else becomes `str(value).strip()`, or None if that is empty.
    """
    if isinstance(value, (int, np.integer)):
        return str(value)
    if isinstance(value, (float, np.floating)):
        if pd.isna(value):
            return None
        as_float = float(value)
        # Only collapse whole numbers that a float represents exactly.
        if as_float.is_integer() and abs(as_float) < 2 ** 53:
            return str(int(as_float))
        return str(value)

    # Check missing-ness before truthiness: `not pd.NA` raises TypeError.
    missing = pd.isna(value)
    if isinstance(missing, (bool, np.bool_)) and missing:
        return None
    try:
        if not value:
            return None
    except (TypeError, ValueError):
        # Truthiness is ambiguous for some objects (e.g. multi-element arrays);
        # fall through and use the string form.
        pass

    text = str(value).strip()
    if not text or text.lower() == 'nan':
        return None
    return text

def parse_pdf_and_create_embeddings(folder_path):
    """Collect the text of every PDF under `folder_path/<manufacturer>/<model>/`.

    Despite the name, no embeddings are built here; the function only reads
    and preprocesses text.

    Returns:
        dict mapping `(manufacturer_folder, model_folder)` to the newline-joined,
        preprocessed text of all PDFs in that model folder. Model folders that
        yield no text are omitted. An empty dict is returned when the root
        folder does not exist; if walking the tree fails part-way, whatever
        was collected so far is returned.
    """
    collected = {}
    if not os.path.exists(folder_path):
        print(f"Warning: PDF folder path {folder_path} does not exist")
        return collected

    try:
        for maker_name in os.listdir(folder_path):
            maker_dir = os.path.join(folder_path, maker_name)
            if not os.path.isdir(maker_dir):
                continue  # ignore loose files next to the manufacturer folders

            for model_name in os.listdir(maker_dir):
                model_dir = os.path.join(maker_dir, model_name)
                if not os.path.isdir(model_dir):
                    continue

                pdf_names = [
                    entry for entry in os.listdir(model_dir)
                    if entry.lower().endswith('.pdf')
                ]
                texts = []
                for pdf_file in pdf_names:
                    try:
                        raw = extract_text_from_pdf(os.path.join(model_dir, pdf_file))
                        if raw:
                            texts.append(preprocess_text(raw))
                    except Exception as e:
                        # One unreadable PDF must not stop the others.
                        print(f"Error processing PDF {pdf_file}: {str(e)}")
                        continue

                if texts:
                    collected[(maker_name, model_name)] = "\n".join(texts)
    except Exception as e:
        print(f"Error processing PDF folder: {str(e)}")
    return collected

def extract_text_from_pdf(pdf_path):
    """Return the text of all pages of one PDF, joined by newlines.

    Pages whose extraction fails are reported and skipped; pages with no text
    are ignored. If the file cannot be read at all, the error is printed and
    an empty string is returned.
    """
    try:
        page_texts = []
        for page in PdfReader(pdf_path).pages:
            try:
                content = page.extract_text()
                if content:
                    page_texts.append(content)
            except Exception as e:
                print(f"Error extracting text from page in {pdf_path}: {str(e)}")
                continue
        return "\n".join(page_texts)
    except Exception as e:
        print(f"Error reading PDF {pdf_path}: {str(e)}")
        return ""

def preprocess_text(text):
    """Clean extracted PDF text and translate it to English when needed.

    Steps, in order: collapse whitespace, re-join words hyphenated across a
    break, normalize spacing, remove "Page N of M" markers and every
    `digits/digits` token, then translate if the text is not detected as
    English. Falsy input yields "". If any step raises, the error is printed
    and the text as processed up to that point is returned.
    """
    if not text:
        return ""
    try:
        # Whitespace collapse first, then undo end-of-line hyphenation.
        for pattern, replacement in ((r'\s+', ' '), (r'(\w+)-\s+(\w+)', r'\1\2')):
            text = re.sub(pattern, replacement, text)
        text = ' '.join(text.replace('\n', ' ').strip().split())
        # Strip page markers and N/M tokens.
        for noise_pattern in (r'Page \d+ of \d+', r'\d+/\d+'):
            text = re.sub(noise_pattern, '', text)
        if is_english(text):
            return text
        text = translate_to_english(text)
        return text
    except Exception as e:
        print(f"Text preprocessing error: {str(e)}")
        return text

def validate_pdf_structure(folder_path):
    """Check that `folder_path` follows the `<manufacturer>/<model>/*.pdf` layout.

    Returns:
        False when the folder is missing, contains no manufacturer
        directories, or any model directory holds no PDF file (a warning is
        printed for each such directory); True otherwise.
    """
    if not os.path.exists(folder_path):
        print(f"Error: PDF folder path {folder_path} does not exist")
        return False

    every_model_has_pdf = True
    maker_total = 0
    for maker in os.listdir(folder_path):
        maker_dir = os.path.join(folder_path, maker)
        if os.path.isdir(maker_dir):
            maker_total += 1
            for model_name in os.listdir(maker_dir):
                model_dir = os.path.join(maker_dir, model_name)
                if os.path.isdir(model_dir):
                    has_pdf = any(
                        entry.lower().endswith('.pdf')
                        for entry in os.listdir(model_dir)
                    )
                    if not has_pdf:
                        print(f"Warning: No PDFs found in {maker}/{model_name}/")
                        every_model_has_pdf = False

    if maker_total == 0:
        print("Error: No manufacturer folders found")
        return False
    print(f"Found {maker_total} manufacturers with PDF data")
    return every_model_has_pdf

def create_vector_store(pdf_data):
    """Build a FAISS vector store from the per-machine PDF text.

    Args:
        pdf_data: dict mapping `(manufacturer, model)` to that machine's text.

    Returns:
        A FAISS store whose entries are text chunks, each tagged with
        `{"manufacturer": ..., "model": ...}` metadata.

    Raises:
        ValueError: If `pdf_data` yields no chunks. FAISS cannot build an
            index from zero texts, so this fails early with a clear message
            and before the embedding model is loaded.
    """
    # Chunking and embedding settings come from the notebook-level constants
    # so they are configured in one place.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP
    )
    documents = []
    metadatas = []
    for (manufacturer, model), text in pdf_data.items():
        chunks = text_splitter.split_text(text)
        documents.extend(chunks)
        # A fresh dict per chunk: `[d] * n` would make every chunk share one
        # mutable metadata object.
        metadatas.extend(
            {"manufacturer": manufacturer, "model": model} for _ in chunks
        )

    if not documents:
        raise ValueError("No PDF text available to index; cannot build the vector store.")

    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
    return FAISS.from_texts(documents, embeddings, metadatas=metadatas)

def _parse_int_code(value):
    """Return `value` as an int when it denotes a whole number (e.g. "5" or "5.0"), else None."""
    if value is None:
        return None
    try:
        number = float(value)
    except (TypeError, ValueError):
        return None
    # NaN and infinities are not integers, so they fall through to None.
    return int(number) if number.is_integer() else None


def map_status_codes(row, manufacturer_mapping, model_mapping):
    """Translate the numeric codes of one input row into display text.

    Args:
        row: Input row; read via `row.get(...)` for 'ManufacturerId',
            'MachineModelId', 'MachineTypeId', 'MachineConditionId',
            'MachineAvailabilityId', 'Location', 'Year' and 'Description'.
        manufacturer_mapping: DataFrame with 'Id' and 'Name' columns.
        model_mapping: DataFrame with 'MachineModelId' and 'ModelName' columns.

    Returns:
        dict with the same keys as the row fields above, holding the resolved
        names and labels, or None when the manufacturer or model code is
        missing or cannot be resolved, or on any unexpected error. Optional
        fields that are missing, unparsable, or not present in the mapping
        tables become 'Unknown' ('No description available' for the
        description).
    """
    try:
        mfg_code = clean_value(row.get('ManufacturerId'))
        model_code = clean_value(row.get('MachineModelId'))
        machine_type = clean_value(row.get('MachineTypeId'))
        condition = clean_value(row.get('MachineConditionId'))
        availability = clean_value(row.get('MachineAvailabilityId'))
        location = clean_value(row.get('Location'))
        year = clean_value(row.get('Year'))
        description = clean_value(row.get('Description'))
        if not all([mfg_code, model_code]):
            print(f"Missing required codes for row: {row.name}")
            return None

        try:
            mfg_id = _parse_int_code(mfg_code)
            if mfg_id is None:
                raise ValueError(mfg_code)
            manufacturer = manufacturer_mapping.loc[
                manufacturer_mapping['Id'] == mfg_id,
                'Name'
            ].iloc[0]
        except (IndexError, ValueError):
            print(f"Invalid manufacturer code: {mfg_code}")
            return None

        try:
            model_id = _parse_int_code(model_code)
            if model_id is None:
                raise ValueError(model_code)
            model = model_mapping.loc[
                model_mapping['MachineModelId'] == model_id,
                'ModelName'
            ].iloc[0]
        except (IndexError, ValueError):
            print(f"Invalid model code: {model_code}")
            return None

        def label_for(mapping, code):
            # Missing, unparsable and unmapped codes all read as 'Unknown'
            # instead of leaking None into the prompt or dropping the row.
            return mapping.get(_parse_int_code(code)) or 'Unknown'

        # Render whole-number years without a trailing ".0".
        year_number = _parse_int_code(year)
        year_text = str(year_number) if year_number is not None else (year or 'Unknown')

        mapped_data = {
            'ManufacturerId': manufacturer,
            'MachineModelId': model,
            'MachineTypeId': label_for(MACHINE_TYPE_MAPPING, machine_type),
            'MachineConditionId': label_for(CONDITION_MAPPING, condition),
            'MachineAvailabilityId': label_for(AVAILABILITY_MAPPING, availability),
            'Location': location or 'Unknown',
            'Year': year_text,
            'Description': description or 'No description available'
        }
        return mapped_data
    except Exception as e:
        print(f"Error mapping status codes: {str(e)}")
        return None

def get_relevant_context(vector_store, manufacturer, model, k=3):
    """Retrieve de-duplicated context sentences for a machine from the vector store.

    Args:
        vector_store: Object exposing `similarity_search(query, k=...)` that
            returns documents with a `page_content` attribute.
        manufacturer: Manufacturer name used in the query.
        model: Model name used in the query.
        k: Number of documents to retrieve.

    Returns:
        The unique sentences of the retrieved documents, in retrieval order,
        joined by single spaces. Sentences are split only where sentence
        punctuation is followed by whitespace, so decimal values such as
        "3.5" stay intact.
    """
    query = f"{manufacturer} {model} technical specifications and features"
    docs = vector_store.similarity_search(query, k=k)
    seen = set()
    sentences = []
    for doc in docs:
        for fragment in re.split(r'(?<=[.!?])\s+', doc.page_content):
            sentence = fragment.strip()
            # Compare without trailing punctuation so "X." and a chunk-final
            # "X" count as the same sentence.
            key = sentence.rstrip(' .!?')
            if not key or key in seen:
                continue
            seen.add(key)
            if not sentence.endswith(('.', '!', '?')):
                sentence += '.'  # keep fragments separated when re-joined
            sentences.append(sentence)
    return " ".join(sentences)

# Prompt skeleton: placeholders are filled from the mapped row fields plus the
# retrieved `context` text.
template = (
    "The {MachineConditionId} {MachineTypeId} machine, "
    "Model {MachineModelId} manufactured by {ManufacturerId}, "
    "is located in {Location}. "
    "This {Year} model is currently {MachineAvailabilityId} available. "
    "{context}"
)


def create_prompt(row, manufacturer_mapping, model_mapping, vector_store, user_description=None):
    """
    Build the generation prompt for one input row.

    The row's codes are mapped to text, context for the resolved manufacturer
    and model is retrieved from the vector store, and both are substituted
    into the module-level `template`.

    Args:
        row (pd.Series): Input data row
        manufacturer_mapping (pd.DataFrame): Manufacturer mapping data
        model_mapping (pd.DataFrame): Model mapping data
        vector_store: FAISS vector store
        user_description (str, optional): Extra text appended under an
            "Additional Details:" heading when truthy

    Returns:
        str: The prompt, or None if the row cannot be mapped or any step raises
    """
    try:
        fields = map_status_codes(row, manufacturer_mapping, model_mapping)
        if not fields:
            return None

        retrieved = get_relevant_context(
            vector_store,
            fields['ManufacturerId'],
            fields['MachineModelId']
        )

        # `location` and `description` are passed alongside the mapped fields;
        # str.format ignores keyword arguments the template does not reference.
        base_prompt = template.format(
            **fields,
            context=retrieved,
            location=fields['Location'],
            description=fields['Description']
        )

        extra = f"\n\nAdditional Details:\n{user_description}" if user_description else ""
        return base_prompt + extra

    except Exception as e:
        print(f"Error creating prompt: {str(e)}")
        return None

def process_excel(input_file, output_file, manufacturer_mapping_file, model_mapping_file, pdf_folder, user_description=None):
    """Generate a description for every row of the input workbook and save the result.

    Args:
        input_file: Excel file with one machine per row.
        output_file: Excel file to write.
        manufacturer_mapping_file: CSV file with the manufacturer mapping.
        model_mapping_file: Excel file with the model mapping.
        pdf_folder: Root folder of the per-machine PDFs.
        user_description: Optional extra text forwarded to `create_prompt`.

    Each row gets 'Generated_Description', 'Source' ('PDF' or 'Non-PDF') and
    'Confidence_Score' (`CONFIDENCE_PDF` or `CONFIDENCE_LLM`). Rows for which
    no prompt can be built keep the empty defaults. Any exception is printed
    rather than raised.

    NOTE(review): the PDF lookup key is built from the row's raw
    'ManufacturerId' / 'MachineModelId' values, while `pdf_data` is keyed by
    folder names - confirm the folders are named after those ids.
    """
    try:
        manufacturer_mapping = pd.read_csv(manufacturer_mapping_file)
        model_mapping = pd.read_excel(model_mapping_file)
        df = pd.read_excel(input_file)
        model, tokenizer = load_model()
        pdf_data = parse_pdf_and_create_embeddings(pdf_folder)
        vector_store = create_vector_store(pdf_data)
        df['Generated_Description'] = ''
        df['Source'] = ''
        df['Confidence_Score'] = 0.0
        for idx, row in tqdm(df.iterrows(), total=len(df)):
            pdf_key = (
                clean_value(row.get('ManufacturerId')),
                clean_value(row.get('MachineModelId')),
            )
            if pdf_key in pdf_data:
                description = generate_description(model, tokenizer, pdf_data[pdf_key])
                source, confidence = 'PDF', CONFIDENCE_PDF
            else:
                prompt = create_prompt(row, manufacturer_mapping, model_mapping, vector_store, user_description)
                if not prompt:
                    continue  # row could not be mapped; leave the defaults
                description = generate_description(model, tokenizer, prompt)
                source, confidence = 'Non-PDF', CONFIDENCE_LLM
            df.at[idx, 'Generated_Description'] = description
            df.at[idx, 'Source'] = source
            df.at[idx, 'Confidence_Score'] = confidence

        wanted_columns = ['ManufacturerId', 'MachineModelId', 'MachineTypeId', 'MachineConditionId',
                          'MachineAvailabilityId', 'Location', 'Year', 'Description',
                          'Generated_Description', 'Source', 'Confidence_Score']
        # Select only columns that exist: a KeyError here would discard every
        # description generated above.
        missing_columns = [name for name in wanted_columns if name not in df.columns]
        if missing_columns:
            print(f"Warning: input is missing columns {missing_columns}; they are omitted from the output")
        output_df = df[[name for name in wanted_columns if name in df.columns]]
        output_df.to_excel(output_file, index=False)
        print(f"Processing complete. Results saved to {output_file}")
    except Exception as e:
        print(f"Error processing Excel files: {str(e)}")

def clean_generated_text(text):
    """Tidy raw model output into a final description or an "Error: ..." message.

    Removes two known instruction passages, collapses whitespace, puts a
    single space between sentence punctuation and a following capital letter,
    and makes sure the text ends with '.', '!' or '?'. Non-English text is
    translated. Results with fewer than three words, or any failure while
    cleaning, produce an error string instead of a description.
    """
    if not text:
        return "Error: Empty generated text."
    try:
        instruction_patterns = (
            r'You are a technical writer.*?\n',
            r'Create a detailed, professional description.*?Response must be in English only.',
        )
        for pattern in instruction_patterns:
            text = re.sub(pattern, '', text, flags=re.DOTALL)

        text = re.sub(r'\s+', ' ', text).strip()
        text = re.sub(r'([.!?])\s*([A-Z])', r'\1 \2', text)
        if not text.endswith(('.', '!', '?')):
            text = text.rstrip() + '.'

        if is_english(text):
            if len(text.split()) < 3:
                return "Error: Generated description too short."
            return text

        # Not English: accept the translation only if it has at least three words.
        translated = translate_to_english(text)
        if translated and len(translated.split()) >= 3:
            return translated
        return "Error: Unable to generate valid English description."
    except Exception as e:
        print(f"Text cleaning error: {str(e)}")
        return "Error: Text cleaning failed."

def _sample_and_clean(model, tokenizer, prompt, max_length,
                      temperature, repetition_penalty, no_repeat_ngram_size):
    """Tokenize `prompt`, sample one sequence from `model`, decode it and run `clean_generated_text`."""
    device = model.device
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True,
                       max_length=max_length).to(device)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=500,
            temperature=temperature,
            top_p=0.9,
            repetition_penalty=repetition_penalty,
            no_repeat_ngram_size=no_repeat_ngram_size,
            do_sample=True,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id,
        )
    # NOTE(review): the whole returned sequence is decoded; for a causal LM
    # this presumably includes the prompt tokens - confirm that is intended.
    raw_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return clean_generated_text(raw_text)


def generate_description(model, tokenizer, prompt, max_length=2048):
    """Generate a cleaned description for `prompt`.

    Args:
        model: Causal language model exposing `.device` and `.generate`.
        tokenizer: Matching tokenizer.
        prompt: Text to condition on; truncated to `max_length` tokens.
        max_length: Maximum number of prompt tokens.

    Returns:
        The cleaned text. If it has fewer than 50 words, the result of
        `generate_description_fallback` is returned instead. Returns
        "Insufficient data for description generation." when any argument is
        falsy and "Error generating description." if generation raises.
    """
    try:
        if not all([prompt, model, tokenizer]):
            return "Insufficient data for description generation."
        cleaned_text = _sample_and_clean(
            model, tokenizer, prompt, max_length,
            temperature=0.7, repetition_penalty=1.2, no_repeat_ngram_size=3,
        )
        if len(cleaned_text.split()) < 50:
            # Too short: retry once with the looser fallback settings.
            cleaned_text = generate_description_fallback(model, tokenizer, prompt, max_length)
        return cleaned_text
    except Exception as e:
        print(f"Error in description generation: {str(e)}")
        return "Error generating description."


def generate_description_fallback(model, tokenizer, prompt, max_length=2048):
    """Second generation attempt, used when the first result is too short.

    Same pipeline as `generate_description` but with a higher temperature
    (0.8), a lower repetition penalty (1.1) and a smaller no-repeat n-gram
    size (2). Returns "Error generating description." if generation raises.
    """
    try:
        return _sample_and_clean(
            model, tokenizer, prompt, max_length,
            temperature=0.8, repetition_penalty=1.1, no_repeat_ngram_size=2,
        )
    except Exception as e:
        print(f"Error in description generation fallback: {str(e)}")
        return "Error generating description."



In [6]:
pip install faiss-gpu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [14]:
# Base directory for every input and output file. Set the MACHINE_DATA_DIR
# environment variable to run the notebook on another machine; the default
# keeps the original location.
DATA_DIR = os.environ.get("MACHINE_DATA_DIR", "/home/hbs/Downloads")

input_file = f"{DATA_DIR}/MachineTbl_Data.xlsx"
output_file = f"{DATA_DIR}/descriptions.xlsx"
pdf_folder = f"{DATA_DIR}/PDFs"
manufacturer_mapping_file = f"{DATA_DIR}/manufacturer_mapping.csv"
model_mapping_file = f"{DATA_DIR}/model_mapping.xlsx"

# Free-text details appended to every prompt built by create_prompt.
user_description = "This is a high-performance CNC machine with advanced controls and precision tooling. It can handle a wide range of materials and produce parts with exceptional accuracy."
        

In [None]:
# Run the full pipeline with the paths and description configured above.
process_excel(
    input_file=input_file,
    output_file=output_file,
    manufacturer_mapping_file=manufacturer_mapping_file,
    model_mapping_file=model_mapping_file,
    pdf_folder=pdf_folder,
    user_description=user_description,
)