# In [None]:
import os
import json
import time
import openai
from tqdm import tqdm
import sys
import subprocess

# Input and output paths. Both are placeholders that must be filled in before
# running; they are empty strings (rather than left blank) so the module parses.
INPUT_FILE = ""  # input file location (.docx or .txt)
OUTPUT_FILE = ""  # output file location (.docx or .txt)

# DeepSeek API key - set directly here or via environment variable
DEEPSEEK_API_KEY = ""  # Enter your API key here

def _read_docx_paragraphs_fallback(file_path):
    """Return the non-empty paragraph texts of a .docx using only the stdlib.

    Used when python-docx is unavailable. One entry is produced per ``w:p``
    element so the result has the same granularity as the python-docx path.
    """
    import zipfile
    import xml.etree.ElementTree as ET

    w_ns = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
    para_tag = f'{{{w_ns}}}p'
    text_tag = f'{{{w_ns}}}t'

    with zipfile.ZipFile(file_path) as zip_ref:
        if 'word/document.xml' not in zip_ref.namelist():
            raise Exception("document.xml not found in docx file")
        root = ET.fromstring(zip_ref.read('word/document.xml'))

    paragraphs = []
    consumed = set()  # ids of w:t nodes already emitted
    for para in root.iter(para_tag):
        pieces = []
        for node in para.iter(text_tag):
            # A paragraph nested inside another one is visited twice by
            # iter(); emit each text node only once (with its outermost
            # paragraph) so no text is duplicated.
            if id(node) in consumed:
                continue
            consumed.add(id(node))
            if node.text:
                pieces.append(node.text)
        text = ''.join(pieces).strip()
        if text:
            paragraphs.append(text)
    return paragraphs


def _read_txt_lines(file_path):
    """Return the stripped, non-empty lines of a text file.

    Encodings are tried in order. 'utf-8-sig' comes first so that a leading
    byte-order mark is dropped instead of ending up in the first sentence.
    """
    encodings = ['utf-8-sig', 'gbk', 'gb2312', 'latin-1']
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as f:
                lines = [line.strip() for line in f if line.strip()]
        except UnicodeDecodeError:
            continue
        print(f"Successfully read file using {encoding} encoding")
        return lines
    raise Exception(f"Failed to read file with encodings: {', '.join(encodings)}")


def read_sentences_from_file(file_path):
    """
    Read sentences from a file. Supports .docx and .txt formats.

    For .docx, each non-empty paragraph becomes one entry (via python-docx
    when importable, otherwise via a stdlib XML fallback). For .txt, each
    non-empty line becomes one entry.

    Args:
        file_path (str): Path to the file containing sentences

    Returns:
        list: List of sentences

    Note:
        Any failure is reported on stdout and terminates the process with
        ``sys.exit(1)``; no exception is propagated to the caller.
    """
    file_ext = ''
    try:
        file_ext = os.path.splitext(file_path)[1].lower()

        # Handle .docx files
        if file_ext == '.docx':
            try:
                import docx
                doc = docx.Document(file_path)
                stripped = (para.text.strip() for para in doc.paragraphs)
                sentences = [text for text in stripped if text]
            except ImportError:
                print("Warning: python-docx not installed. Trying alternative method to read docx...")
                try:
                    sentences = _read_docx_paragraphs_fallback(file_path)
                except Exception as ex:
                    raise Exception(f"Failed to parse docx file: {str(ex)}") from ex

        # Handle .txt files
        elif file_ext == '.txt':
            sentences = _read_txt_lines(file_path)
        else:
            raise Exception(f"Unsupported file format: {file_ext}. Use .docx or .txt files.")

        print(f"Read {len(sentences)} sentences from {file_path}")
        return sentences
    except Exception as e:
        print(f"Error reading file: {str(e)}")
        # The install hint is only relevant when a .docx was being read.
        if file_ext == '.docx':
            print("Make sure you have installed python-docx: pip install python-docx")
        sys.exit(1)

def _write_sentences_as_text(sentences, path):
    """Write one sentence per line to *path* as UTF-8 text."""
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(f"{sentence}\n" for sentence in sentences)


def save_sentences_to_file(sentences, output_file):
    """
    Save sentences to a file, supporting .docx and .txt formats.

    A ``.docx`` target is written with python-docx; if that package cannot be
    imported, a ``.txt`` file with the same base name is written instead. Any
    other extension is written as plain UTF-8 text, one sentence per line.
    Missing parent directories are created.

    If saving fails for any reason, the error is printed and the sentences
    are written to ``valid_sentences_backup_<unix time>.txt`` in the current
    directory as a best-effort fallback; no exception is propagated.

    Args:
        sentences (list): Sentences to save
        output_file (str): Output file path
    """
    try:
        output_dir = os.path.dirname(output_file)
        # dirname is '' for a bare filename; os.makedirs('') would raise and
        # push every such save into the backup path below.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        base_name, file_ext = os.path.splitext(output_file)

        if file_ext.lower() == '.docx':
            try:
                import docx
                doc = docx.Document()
                doc.add_heading('UHPC Related Sentences', 0)
                for sentence in sentences:
                    doc.add_paragraph(sentence)
                doc.save(output_file)
                print(f"Saved {len(sentences)} valid sentences to {output_file} (DOCX format)")
            except ImportError:
                print("Warning: python-docx not installed. Saving as TXT instead.")
                # Swap only the real extension (any letter case), not every
                # '.docx' substring that may appear in the path.
                txt_output = base_name + '.txt'
                _write_sentences_as_text(sentences, txt_output)
                print(f"Saved {len(sentences)} valid sentences to {txt_output} (TXT format)")
        else:
            _write_sentences_as_text(sentences, output_file)
            print(f"Saved {len(sentences)} valid sentences to {output_file}")
    except Exception as e:
        print(f"Error saving file: {str(e)}")
        try:
            backup_file = f"valid_sentences_backup_{int(time.time())}.txt"
            _write_sentences_as_text(sentences, backup_file)
            print(f"Sentences saved to backup file: {backup_file}")
        except Exception as ex:
            print(f"Backup save failed: {str(ex)}")

def _find_result_array(result):
    """Locate the per-sentence evaluations inside a decoded model response.

    Non-dict values are returned unchanged (the caller checks for a list).
    For a dict, the known wrapper keys are tried first, then a dict keyed by
    digits / ``sentence_*``, then a single bare evaluation object.

    Raises:
        LookupError: if *result* is a dict in none of the recognised shapes.
    """
    if not isinstance(result, dict):
        return result
    for wrapper_key in ("evaluations", "sentences", "results", "data"):
        if wrapper_key in result:
            return result[wrapper_key]
    keys = list(result.keys())
    if keys and (keys[0].isdigit() or keys[0].startswith('sentence_')):
        return [result[k] for k in keys]
    if "original_sentence" in result and "is_kg_worthy" in result:
        return [result]
    raise LookupError(keys)


def _is_marked_worthy(flag):
    """Interpret an ``is_kg_worthy`` value; a string such as "false" is not truthy."""
    if isinstance(flag, str):
        return flag.strip().lower() in ("true", "yes", "1")
    return bool(flag)


def gpt_judge_sentences(sentences):
    """
    Filter sentences to identify those relevant to UHPC and suitable for a knowledge graph.

    Sentences are sent to the DeepSeek chat API in batches of 10. Each batch
    is attempted up to 3 times; a batch whose responses never parse into a
    list of evaluations is skipped. Every fifth batch index (after the
    first) the accumulated results are also written to
    ``valid_sentences_intermediate_<batch index>.txt``.

    The API key is taken from ``DEEPSEEK_API_KEY`` or, if that is empty, from
    the ``DEEPSEEK_API_KEY`` environment variable; the process exits with
    status 1 if neither is set or the client cannot be created.

    Args:
        sentences (list): Sentences to evaluate

    Returns:
        list: Filtered list of sentences. Entries are the ``original_sentence``
        values echoed back by the model, not looked up in the input list.
    """
    valid_sentences = []

    try:
        api_key = DEEPSEEK_API_KEY or os.environ.get("DEEPSEEK_API_KEY")
        if not api_key:
            print("Error: DeepSeek API key not set. Please set DEEPSEEK_API_KEY in code or environment.")
            sys.exit(1)

        client = openai.OpenAI(
            api_key=api_key,
            base_url="https://api.deepseek.com/v1"
        )
    except Exception as e:
        print(f"Error initializing DeepSeek client: {str(e)}")
        sys.exit(1)

    batch_size = 10
    max_retries = 3

    # The template must contain the literal {batched_sentences} marker that is
    # substituted below, and must spell out the JSON shape the parser reads
    # ("original_sentence" / "is_kg_worthy"); JSON mode is requested from the
    # API, so the prompt also has to mention JSON explicitly.
    prompt_template = """
You are an AI assistant responsible for evaluating whether extracted sentences from research papers 
are relevant to Ultra-High Performance Concrete (UHPC) and contain valuable scientific knowledge for 
a knowledge graph (KG). Your task is to assess each sentence strictly based on the following criteria.
Instruction:
Evaluate each sentence to determine whether it contains precise and useful scientific knowledge about ultra-high performance concrete suitable for inclusion in a knowledge graph.
Evaluation criteria:
1.	Knowledge relevance: Sentence must express meaningful UHPC-related technical knowledge, such as properties, composition, performance, test methods, values, or scientific relationships.
2.	Specificity and quantifiability: Given preference to sentences with quantitative data. Use qualitative statements only if they are scientifically significant and precise.
3.	Domain relevance: Sentences must be explicitly or contextually related to UHPC (e.g., high strength, steel fibers, silica fume, advanced curing).
4.	Standalone information: Discard sentences that refer only to tables, figures, or sections without stating core information.
5.	Informational clarity: Exclude vague, general, or purely rhetorical/introductory statements lacking concrete facts.
6.	Self-contained meaning: Sentence should be understandable on its own without requiring extended surrounding context.
7.	Grammatical integrity: Sentence must be complete and grammatically correct.

Output format:
Return a single JSON object with one key "evaluations" whose value is an array containing exactly one entry per input sentence, in the same order:
{"evaluations": [{"original_sentence": "<the sentence, copied verbatim>", "is_kg_worthy": true}]}
Set "is_kg_worthy" to true only if the sentence satisfies the criteria above; otherwise set it to false.

Sentences to evaluate:
{batched_sentences}
"""
    total_batches = (len(sentences) + batch_size - 1) // batch_size
    for i in tqdm(range(0, len(sentences), batch_size), total=total_batches, desc="Evaluating Sentences"):
        batch = sentences[i:i + batch_size]
        batch_no = i // batch_size + 1
        batched_text = "\n".join(f"{j+1}. {s}" for j, s in enumerate(batch))
        prompt = prompt_template.replace("{batched_sentences}", batched_text)

        for _attempt in range(max_retries):
            try:
                print(f"\nProcessing batch {batch_no}/{total_batches} ({len(batch)} sentences)...")

                response = client.chat.completions.create(
                    model="deepseek-chat",
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0.3,
                    max_tokens=8000,
                    response_format={"type": "json_object"}
                )

                gpt_output = response.choices[0].message.content.strip()
                print(f"Batch {batch_no}/{total_batches} processed...")

                # Tolerate a Markdown code fence around the JSON payload.
                cleaned_output = gpt_output.replace("```json", "").replace("```", "").strip()
                try:
                    result = json.loads(cleaned_output)
                except json.JSONDecodeError as e:
                    print(f"JSON decode error: {e}\nOutput:\n{gpt_output[:200]}...")
                    continue

                try:
                    result_array = _find_result_array(result)
                except LookupError:
                    print(f"Unrecognized JSON structure, keys: {list(result)[:5]}...")
                    print(f"Raw API output:\n{cleaned_output[:300]}...")
                    time.sleep(1)
                    continue

                if not isinstance(result_array, list):
                    print(f"Unexpected result type: {type(result_array).__name__}")
                    continue

                for item in result_array:
                    if not isinstance(item, dict) or not _is_marked_worthy(item.get("is_kg_worthy")):
                        continue
                    try:
                        valid_sentences.append(item["original_sentence"])
                    except KeyError as ke:
                        print(f"Key error: {ke}, item: {str(item)[:50]}...")
                break
            except Exception as e:
                # Network/API errors and anything unexpected while handling
                # the response: back off briefly, then use the next attempt.
                print(f"Processing exception: {str(e)}")
                time.sleep(2)
        else:
            # Reached only when no attempt ended in `break`.
            print(f"Batch {batch_no}/{total_batches} failed, skipping...")

        if (i // batch_size) % 5 == 0 and i > 0:
            temp_output_file = f"valid_sentences_intermediate_{i//batch_size}.txt"
            save_sentences_to_file(valid_sentences, temp_output_file)
            print(f"Intermediate results saved to {temp_output_file}")

    print(f"Total valid sentences found: {len(valid_sentences)}")
    return valid_sentences

def check_install_package(package_name, import_name=None):
    """
    Check and install missing Python package.

    The name used with ``pip install`` is not always the name used with
    ``import`` (e.g. the ``python-docx`` distribution provides the ``docx``
    module), so the import check uses *import_name* when given, then a small
    table of known aliases, and only then *package_name* itself.

    Args:
        package_name (str): Package name as passed to ``pip install``
        import_name (str, optional): Module name to import when it differs
            from *package_name*

    Returns:
        bool: True if the module is importable or pip reported a successful
        install, False if the install attempt failed.
    """
    import importlib

    known_import_names = {"python-docx": "docx"}
    module_name = import_name or known_import_names.get(package_name, package_name)

    try:
        importlib.import_module(module_name)
        return True
    except ImportError:
        print(f"Installing required package: {package_name}...")
        try:
            subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
            # Let the import system notice files added while the process runs.
            importlib.invalidate_caches()
            print(f"{package_name} installed successfully!")
            return True
        except Exception as e:
            print(f"Failed to install {package_name}: {str(e)}")
            return False

def main():
    """
    Run the whole pipeline: load sentences from INPUT_FILE, filter them with
    the model, write the survivors to OUTPUT_FILE, and print summary stats.

    Returns early (after printing an error) when the input file is missing
    or yields no sentences.
    """
    for banner_line in (
        "=== UHPC Sentence Processing Program ===",
        f"Input file: {INPUT_FILE}",
        f"Output file: {OUTPUT_FILE}",
        "========================================",
    ):
        print(banner_line)

    if not os.path.exists(INPUT_FILE):
        print(f"Error: Input file '{INPUT_FILE}' not found.")
        return

    # DOCX input needs an extra dependency; try to make it available first.
    _, extension = os.path.splitext(INPUT_FILE)
    if extension.lower() == '.docx':
        print("DOCX file detected, checking dependencies...")
        check_install_package("python-docx")

    sentences = read_sentences_from_file(INPUT_FILE)
    if not sentences:
        print("Error: No sentences read from file. Please check file content.")
        return

    # Time only the model-evaluation stage.
    started_at = time.time()
    valid_sentences = gpt_judge_sentences(sentences)
    processing_time = time.time() - started_at

    save_sentences_to_file(valid_sentences, OUTPUT_FILE)

    total_count = len(sentences)
    valid_count = len(valid_sentences)

    print("\n============= Processing Complete =============")
    print(f"Total sentences: {total_count}")
    print(f"Valid sentences: {valid_count}")
    print(f"Discarded sentences: {total_count - valid_count}")
    print(f"Validity ratio: {valid_count / total_count * 100:.2f}%")
    print(f"Processing time: {processing_time:.2f} sec ({processing_time / 60:.2f} min)")
    print("===============================================")

if __name__ == "__main__":
    # Script entry point. Ctrl-C and unexpected errors are reported rather
    # than left to propagate; the closing message is printed in every case.
    try:
        main()
    except KeyboardInterrupt:
        print("\nProgram interrupted by user")
    except Exception as exc:
        import traceback

        print(f"Program error: {exc!s}")
        traceback.print_exc()
    finally:
        print("Program ended")
