## Import Libraries

In [1]:
import torch
import pandas as pd
import os
from pathlib import Path
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    pipeline, 
    BitsAndBytesConfig
)
import logging
import TextAnalysis as TA
from langchain import LLMChain, HuggingFacePipeline, PromptTemplate

# Model Configuration

In [None]:
def setup_logging():
    """Set up file + console logging and hand back this module's logger.

    Calls ``logging.basicConfig`` with level INFO, a timestamped record
    format, and two handlers: a ``FileHandler`` writing to
    ``model_processing.log`` and a default ``StreamHandler``.

    Returns:
        logging.Logger: the logger named after this module.
    """
    record_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

    # Handlers are created in the same order as they are registered.
    file_handler = logging.FileHandler('model_processing.log')
    console_handler = logging.StreamHandler()

    logging.basicConfig(
        level=logging.INFO,
        format=record_format,
        handlers=[file_handler, console_handler],
    )
    return logging.getLogger(__name__)

def setup_model(model_path="/nobackup/ielhaime/models/casualLM/Llama-3.1-8B"):
    """Load a 4-bit quantized causal LM and wrap it in a summarisation LLMChain.

    The tokenizer and model are loaded from ``model_path``, placed in a
    Hugging Face ``text-generation`` pipeline, and exposed through LangChain
    with a prompt that asks for a summary of the supplied text.

    Args:
        model_path: Value passed to ``from_pretrained`` for both the
            tokenizer and the model (a local folder or a Hub repo id).

    Returns:
        LLMChain: chain whose prompt has a single input variable, ``text``.

    Raises:
        Exception: any failure during setup is logged with its traceback
            and re-raised unchanged.
    """
    logger = logging.getLogger(__name__)
    logger.info(f"Initializing model from path: {model_path}")

    try:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16
        )
        logger.info("BitsAndBytesConfig initialized successfully")

        logger.info("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        # Reuse EOS as the padding token so batching/generation has a pad id.
        tokenizer.pad_token = tokenizer.eos_token
        logger.info("Tokenizer loaded successfully")

        logger.info("Loading model...")
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            quantization_config=bnb_config,
            device_map="auto"
        )
        logger.info("Model loaded successfully")

        logger.info("Setting up LangChain pipeline...")
        # The model is loaded with AutoModelForCausalLM (decoder-only), so the
        # matching pipeline task is "text-generation". The "summarization"
        # task targets encoder-decoder (seq2seq) models.
        generation_pipeline = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
            device_map="auto",
            max_new_tokens=4096,
            do_sample=True,
            top_k=10,
            temperature=0.5,
            num_return_sequences=1,
            eos_token_id=tokenizer.eos_token_id
        )
        # NOTE(review): LangChain does not appear to forward ``model_kwargs``
        # to an already-built pipeline, so the temperature is also set on the
        # pipeline itself above; confirm against the installed LangChain
        # version.
        llm = HuggingFacePipeline(
            pipeline=generation_pipeline,
            model_kwargs={'temperature': 0.5}
        )
        logger.info("LangChain pipeline setup complete")

        template = """
                    Write a summary of the following text delimited by triple backticks.
                    Return your response which covers the key points of the text.
                    ```{text}```
                    SUMMARY:
                 """
        prompt = PromptTemplate(template=template, input_variables=["text"])
        llm_chain = LLMChain(prompt=prompt, llm=llm)
        logger.info("LLM chain created successfully")

        return llm_chain

    except Exception as e:
        logger.error(f"Error in setup_model: {str(e)}", exc_info=True)
        raise

#  Text Processing Functions

In [3]:
def process_summaries(df, stopwords):
    """Add top word / bigram / trigram results for every summary in ``df``.

    Each row's text is read from the ``Summary`` column (a missing column is
    treated as an empty summary). Non-empty summaries are analysed with the
    ``TextAnalysis`` helpers; empty or NaN summaries get ten ``('', 0)``
    placeholder pairs per column.

    Args:
        df: pandas DataFrame holding the summaries. It is modified in place.
        stopwords: passed through unchanged as the second argument of
            ``TA.topwords``, ``TA.topbigrams`` and ``TA.toptrigrams``.

    Returns:
        The same ``df`` object with three object-dtype columns added or
        replaced: ``Top_Words``, ``Top_Bigrams`` and ``Top_Trigrams``. A row
        whose analysis raised an exception holds ``None`` in all three; the
        error is logged and processing continues with the next row.
    """
    logger = logging.getLogger(__name__)
    logger.info(f"Processing summaries for {len(df)} entries")

    columns = ('Top_Words', 'Top_Bigrams', 'Top_Trigrams')
    # Results are collected per column and assigned once at the end. Writing
    # a list into a single cell with ``df.at`` on a column that does not
    # exist yet can be rejected by pandas, and label-based cell writes are
    # ambiguous when the index contains duplicates.
    collected = {name: [] for name in columns}

    processed_count = 0
    error_count = 0

    for idx, row in df.iterrows():
        # Placeholder kept if anything below raises for this row.
        row_values = dict.fromkeys(columns)
        try:
            summary = row.get('Summary', '')
            logger.debug(f"Processing summary for index {idx}")

            if pd.isna(summary) or summary == '':
                logger.warning(f"Empty or NaN summary found at index {idx}")
                # A fresh list per column so cells never share one object.
                row_values = {name: [('', 0)] * 10 for name in columns}
            else:
                logger.debug(f"Calculating n-grams for summary at index {idx}")
                row_values = {
                    'Top_Words': TA.topwords(summary, stopwords),
                    'Top_Bigrams': TA.topbigrams(summary, stopwords),
                    'Top_Trigrams': TA.toptrigrams(summary, stopwords),
                }

            processed_count += 1
            if processed_count % 10 == 0:
                logger.info(f"Processed {processed_count} summaries")

        except Exception as e:
            logger.error(f"Error processing summary at index {idx}: {str(e)}", exc_info=True)
            error_count += 1

        for name in columns:
            collected[name].append(row_values[name])

    for name in columns:
        # dtype=object keeps each cell as the original list; reusing df.index
        # makes the assignment positional even with duplicate labels.
        df[name] = pd.Series(collected[name], index=df.index, dtype=object)

    logger.info(f"Summary processing complete. Processed: {processed_count}, Errors: {error_count}")
    return df

# File Processing Functions

In [4]:
def read_text_file(file_path):
    """Return the text of ``file_path``, decoding as UTF-8 then latin-1.

    UTF-8 is tried first. If that raises ``UnicodeDecodeError`` the file is
    read again as latin-1. Any other failure, or a failure during the
    latin-1 attempt, is logged with its traceback and ``None`` is returned.

    Args:
        file_path: path of the file to read.

    Returns:
        The file contents as ``str``, or ``None`` if it could not be read.
    """
    log = logging.getLogger(__name__)
    log.info(f"Attempting to read file: {file_path}")

    def _slurp(encoding):
        # Read the whole file with the requested text encoding.
        with open(file_path, 'r', encoding=encoding) as handle:
            return handle.read()

    try:
        text = _slurp('utf-8')
        log.info(f"Successfully read file with UTF-8 encoding: {file_path}")
        return text
    except UnicodeDecodeError:
        log.warning(f"UTF-8 decode failed for {file_path}, attempting with latin-1")
        # The fallback stays inside this handler so a logged failure keeps
        # the original decode error as its exception context.
        try:
            text = _slurp('latin-1')
            log.info(f"Successfully read file with latin-1 encoding: {file_path}")
            return text
        except Exception:
            log.error(f"Failed to read file with latin-1 encoding: {file_path}", exc_info=True)
            return None
    except Exception as exc:
        log.error(f"Error reading file {file_path}: {str(exc)}", exc_info=True)
        return None

def process_directory(llm_chain, directory_path='content'):
    """Summarise every ``.txt`` file in a directory with ``llm_chain``.

    The directory is created (including missing parents) if it does not
    exist. Files are processed in sorted name order so the resulting rows
    are deterministic. Each file is read with ``read_text_file``; unreadable
    files count as errors, empty or whitespace-only files are skipped, and
    a failure while summarising one file does not stop the others.

    Args:
        llm_chain: object whose ``run(text)`` returns the summary string.
        directory_path: directory scanned (non-recursively) for ``.txt``
            files.

    Returns:
        pandas.DataFrame with columns ``file_name`` and ``Summary``, one row
        per successfully summarised file, or ``None`` when no summary was
        produced.
    """
    logger = logging.getLogger(__name__)
    logger.info(f"Processing directory: {directory_path}")

    # parents=True so a nested path such as "data/content" can be created.
    Path(directory_path).mkdir(parents=True, exist_ok=True)
    # os.listdir returns entries in arbitrary order; sort for stable output.
    txt_files = sorted(f for f in os.listdir(directory_path) if f.endswith('.txt'))

    if not txt_files:
        logger.warning(f"No text files found in {directory_path}")
        return None

    logger.info(f"Found {len(txt_files)} text files to process")
    summaries = []
    processed_count = 0
    error_count = 0
    skipped_count = 0

    for file_name in txt_files:
        file_path = os.path.join(directory_path, file_name)
        logger.info(f"Processing file: {file_name}")

        content = read_text_file(file_path)
        if content is None:
            # read_text_file returns None only when the read itself failed.
            logger.error(f"Could not read content from {file_name}")
            error_count += 1
            continue
        if not content.strip():
            # Nothing to summarise; this is not a read failure.
            logger.warning(f"Skipping empty file: {file_name}")
            skipped_count += 1
            continue

        try:
            logger.debug(f"Generating summary for {file_name}")
            summary = llm_chain.run(content)
            summaries.append({
                'file_name': file_name,
                'Summary': summary
            })
            processed_count += 1
            logger.info(f"Successfully processed {file_name}")
        except Exception as e:
            logger.error(f"Error processing {file_name}: {str(e)}", exc_info=True)
            error_count += 1

    logger.info(
        f"Directory processing complete. Processed: {processed_count}, "
        f"Errors: {error_count}, Skipped (empty): {skipped_count}"
    )
    return pd.DataFrame(summaries) if summaries else None

# Main Execution

In [5]:
if __name__ == "__main__":
    # Script entry point: configure logging, load the model, summarise the
    # files in the default content directory, add n-gram columns, and write
    # the result to CSV.
    logger = setup_logging()
    logger.info("Starting text processing application")

    try:
        logger.info("Setting up model...")
        llm_chain = setup_model()

        logger.info("Processing directory...")
        df = process_directory(llm_chain)

        if df is not None:
            logger.info("Processing summaries...")
            stopwords_path = "stopwords.txt"
            df = process_summaries(df, stopwords_path)

            output_path = './LLM/summaries/meta_summaries.csv'
            # to_csv does not create missing directories. Create the target
            # folder first so the generated summaries are not lost at the
            # final step.
            Path(output_path).parent.mkdir(parents=True, exist_ok=True)
            logger.info(f"Saving results to {output_path}")
            df.to_csv(output_path, index=False)

            logger.info(f"Successfully processed {len(df)} files")
            logger.debug("\nFirst few entries:")
            print(df.head())
        else:
            logger.warning("No files were processed")

    except Exception:
        # Log the full traceback once at the top level, then propagate.
        logger.error("Fatal error in main execution", exc_info=True)
        raise

    logger.info("Application completed successfully")

2024-11-17 19:09:35,360 - __main__ - INFO - Starting text processing application
2024-11-17 19:09:35,361 - __main__ - INFO - Setting up model...
2024-11-17 19:09:35,362 - __main__ - INFO - Initializing model from path: /nobackup/ielhaime/models/casualLM/Llama-3.1-8B
2024-11-17 19:09:35,364 - __main__ - INFO - BitsAndBytesConfig initialized successfully
2024-11-17 19:09:35,365 - __main__ - INFO - Loading tokenizer...
2024-11-17 19:09:35,366 - __main__ - ERROR - Error in setup_model: Incorrect path_or_model_id: '/nobackup/ielhaime/models/casualLM/Llama-3.1-8B'. Please provide either the path to a local folder or the repo_id of a model on the Hub.
Traceback (most recent call last):
  File "c:\Users\admin\AppData\Local\Programs\Python\Python312\Lib\site-packages\transformers\utils\hub.py", line 403, in cached_file
    resolved_file = hf_hub_download(
                    ^^^^^^^^^^^^^^^^
  File "c:\Users\admin\AppData\Local\Programs\Python\Python312\Lib\site-packages\huggingface_hub\utils\_

OSError: Incorrect path_or_model_id: '/nobackup/ielhaime/models/casualLM/Llama-3.1-8B'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

# DEPRECATED BELOW

## Import Libraries

In [None]:
# import torch
# import pandas as pd
# import os
# from pathlib import Path
# from transformers import (
#     AutoTokenizer, 
#     AutoModelForCausalLM, 
#     pipeline, 
#     BitsAndBytesConfig
# )
# import logging
# import TextAnalysis as TA

## Choose Meta Llama Model

In [None]:
# model_id = "meta-llama/Llama-3.1-8B"

## Set the Precision and Load the Tokenizer and Model from Backup

In [None]:
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16
# )

# tokenizer = AutoTokenizer.from_pretrained("/nobackup/ielhaime/models/")
# tokenizer.pad_token = tokenizer.eos_token
# model = AutoModelForCausalLM.from_pretrained(
#     "/nobackup/ielhaime/models/",
#     quantization_config=bnb_config,
#     device_map="auto")


## Set Up the Text Gen Pipeline and Set Size of Response

In [None]:
# text_generator = pipeline(
#     "text_generation",
#     tokenizer=tokenizer,
#     model=model,
#     max_new_tokens=512
#     )

## Summary Function

In [None]:
# def get_summary(document):
#     response = text_generator(document)
#     gen_text = response[0]['generated_text']
#     return gen_text

# Stopwords Load

In [None]:
# def load_stopwords(filepath='stopwords.txt'):
#     """
#     Load stopwords from file or create default if doesn't exist.
#     Returns set of stopwords.
#     """
#     default_stopwords = {
#         'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
#         'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
#         'to', 'was', 'were', 'will', 'with', 'the', 'this', 'but', 'they',
#         'have', 'had', 'what', 'when', 'where', 'who', 'which', 'why', 'how'
#     }
    
#     if not os.path.exists(filepath):
#         logging.info(f"Stopwords file not found at {filepath}. Creating default stopwords file.")
#         with open(filepath, 'w', encoding='utf-8') as f:
#             f.write('\n'.join(sorted(default_stopwords)))
#         return default_stopwords
    
#     with open(filepath, 'r', encoding='utf-8') as f:
#         stopwords = {word.strip() for word in f.readlines() if word.strip()}
    
#     logging.info(f"Loaded {len(stopwords)} stopwords from {filepath}")
#     return stopwords

# # Create/load stopwords
# stopwords = load_stopwords()

# N-Grams

In [None]:
# def process_summaries(df, stopwords_path):
#     """Process summaries and calculate n-grams using TextAnalysis module."""
#     # Load stopwords at the start of processing
#     stopwords = stopwords_path
    
#     results = {
#         'Summaries Top 10 Words': [],
#         'Summaries Top 10 Bigrams': [],
#         'Summaries Top 10 Trigrams': []
#     }
    
#     for _, row in df.iterrows():
#         summaries = row.get('Summary', '')
#         if pd.isna(summaries) or summaries == '':
#             empty_result = [('', 0)] * 10
#             for key in results:
#                 results[key].append(empty_result)
#             continue
            
#         results['Summaries Top 10 Words'].append(TA.topwords(summaries, stopwords))
#         results['Summaries Top 10 Bigrams'].append(TA.topbigrams(summaries, stopwords))
#         results['Summaries Top 10 Trigrams'].append(TA.toptrigrams(summaries, stopwords))
    
#     # Add results to DataFrame
#     for key in results:
#         df[key] = results[key]
    
#     return df

## Get Summaries

In [None]:
# import os
# import pandas as pd
# from pathlib import Path

# def get_summary(document):
#     """Generate a summary for a given document text."""
#     # response = text_generator(document)
#     # gen_text = response[0]['generated_text']
#     # return gen_text
#     return document

# def read_text_file(file_path):
#     """Read and return the contents of a text file."""
#     try:
#         with open(file_path, 'r', encoding='utf-8') as file:
#             return file.read()
#     except UnicodeDecodeError:
#         # Try a different encoding if UTF-8 fails
#         with open(file_path, 'r', encoding='latin-1') as file:
#             return file.read()
#     except Exception as e:
#         print(f"Error reading file {file_path}: {str(e)}")
#         return None

# def process_directory(directory_path='content', output_file='summaries.csv'):
#     """
#     Process all text files in the specified directory and save summaries to CSV using pandas.
    
#     Args:
#         directory_path (str): Path to the directory containing text files
#         output_file (str): Name of the output CSV file
    
#     Returns:
#         pandas.DataFrame: DataFrame containing the summaries, or None if no files were processed
#     """
#     # Create content directory if it doesn't exist
#     Path(directory_path).mkdir(exist_ok=True)
    
#     # Get all .txt files in the directory
#     txt_files = [f for f in os.listdir(directory_path) if f.endswith('.txt')]
    
#     if not txt_files:
#         print(f"No text files found in {directory_path}")
#         return None
    
#     # Process each file and store results
#     summaries = []
#     for file_name in txt_files:
#         file_path = os.path.join(directory_path, file_name)
#         print(f"Processing {file_name}...")
        
#         # Read the file content
#         content = read_text_file(file_path)
#         if content is None:
#             continue
            
#         try:
#             # Generate summary
#             summary = get_summary(content)
#             summaries.append({
#                 'file_name': file_name,
#                 'Summary': summary
#             })
#         except Exception as e:
#             print(f"Error processing {file_name}: {str(e)}")
#             continue
    
#     # Create DataFrame from summaries
#     if summaries:
#         try:
#             # Convert list of dictionaries to DataFrame
#             df = pd.DataFrame(summaries)
            
#             # Save to CSV
#             df.to_csv(output_file, index=False, encoding='utf-8')
#             print(f"Summaries saved to {output_file}")
            
#             return df
#         except Exception as e:
#             print(f"Error creating DataFrame or saving to CSV: {str(e)}")
#             return None
#     else:
#         print("No summaries were generated")
#         return None

# def save_dataframe(df, filename='summaries.csv'):
#     """
#     Save the DataFrame as a CSV file.
    
#     Args:
#         df (pandas.DataFrame): DataFrame to save
#         filename (str): Name of the output CSV file
#     """
#     try:
#         df.to_csv(filename, index=False, encoding='utf-8')
#         print(f"Successfully saved DataFrame to {filename}")
#     except Exception as e:
#         print(f"Error saving DataFrame to CSV: {str(e)}")

# if __name__ == "__main__":
#     # Process the directory and get the DataFrame
#     df = process_directory()
    
#     if df is not None:
#         # Display summary information
#         print("\nSummary of processed files:")
#         print(f"Total files processed: {len(df)}")
#         process_summaries(df, "stopwords.txt")
#         print("\nFirst few entries:")
#         print(df.head())
        
#         # Save DataFrame to CSV
#         save_dataframe(df, 'summaries.csv')