## Import Libraries

In [33]:
import torch
import pandas as pd
import os
from pathlib import Path
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    pipeline, 
    BitsAndBytesConfig
)
import logging
import TextAnalysis as TA

# Model Configuration

In [34]:
def setup_model(model_path="/nobackup/ielhaime/models/"):
    """Build a text-generation pipeline backed by a 4-bit quantized causal LM.

    Args:
        model_path: Location handed to ``from_pretrained`` for both the
            tokenizer and the model.

    Returns:
        A ``transformers`` pipeline configured with ``max_new_tokens=512``.
    """
    # NF4 4-bit weights with double quantization; compute runs in bfloat16.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16
    )

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    # Reuse the end-of-sequence token for padding.
    tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        quantization_config=bnb_config,
        device_map="auto"
    )

    # The task identifier is hyphenated; "text_generation" (underscore) is not
    # a registered pipeline task and makes pipeline() fail.
    return pipeline(
        "text-generation",
        tokenizer=tokenizer,
        model=model,
        max_new_tokens=512
    )

#  Text Processing Functions

In [None]:
def process_summaries(df, stopwords):
    """Add n-gram columns computed from each row's ``Summary`` text.

    Three columns are added to ``df`` in place: ``Top_Words``,
    ``Top_Bigrams`` and ``Top_Trigrams``. Rows whose summary is missing or
    empty receive a placeholder list of ten ``('', 0)`` tuples.

    Args:
        df: DataFrame of summaries; the text is read from its ``Summary``
            column (a missing column is treated as empty summaries).
        stopwords: Forwarded unchanged to ``TA.topwords``, ``TA.topbigrams``
            and ``TA.toptrigrams``.

    Returns:
        The same DataFrame object, with the three columns added.
    """
    column_names = ('Top_Words', 'Top_Bigrams', 'Top_Trigrams')
    collected = {name: [] for name in column_names}

    for _, row in df.iterrows():
        summary = row.get('Summary', '')
        if pd.isna(summary) or summary == '':
            # Give every cell its own placeholder list so a later in-place
            # edit of one cell cannot leak into the others.
            for name in column_names:
                collected[name].append([('', 0)] * 10)
            continue

        collected['Top_Words'].append(TA.topwords(summary, stopwords))
        collected['Top_Bigrams'].append(TA.topbigrams(summary, stopwords))
        collected['Top_Trigrams'].append(TA.toptrigrams(summary, stopwords))

    # Assign whole columns at once. Writing a list into a single cell with
    # ``df.at`` is treated by pandas as an iterable to broadcast, which fails
    # for columns that do not exist yet, and addressing rows by label would
    # hit several rows if the index had duplicates.
    for name, values in collected.items():
        df[name] = pd.Series(values, index=df.index, dtype=object)

    return df


# File Processing Functions

In [42]:
def read_text_file(file_path):
    """Return the text of ``file_path``, decoding as UTF-8 then Latin-1.

    The file is first read as UTF-8. If that raises ``UnicodeDecodeError``
    it is re-read as Latin-1. Any other error during the UTF-8 attempt is
    logged and ``None`` is returned.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as source:
            text = source.read()
    except UnicodeDecodeError:
        # Not valid UTF-8: fall through to the Latin-1 attempt below.
        pass
    except Exception as e:
        logging.error(f"Error reading {file_path}: {e}")
        return None
    else:
        return text

    with open(file_path, 'r', encoding='latin-1') as source:
        return source.read()

def process_directory(text_generator, directory_path='content'):
    """Summarise every ``.txt`` file in a directory.

    The directory (including any missing parents) is created if it does not
    exist. Files are processed in sorted name order so the resulting rows
    are deterministic. Files that cannot be read or are empty are skipped,
    and a failure while generating one summary is logged without aborting
    the rest.

    Args:
        text_generator: Callable invoked with a file's text; its result is
            indexed as ``result[0]['generated_text']``.
        directory_path: Directory to scan for ``.txt`` files.

    Returns:
        A DataFrame with ``file_name`` and ``Summary`` columns, or ``None``
        when there are no text files or no summary could be produced.
    """
    # parents=True so a nested path such as "data/content" works on first run.
    Path(directory_path).mkdir(parents=True, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sort for stable output.
    txt_files = sorted(
        f for f in os.listdir(directory_path) if f.endswith('.txt')
    )

    if not txt_files:
        logging.warning(f"No text files found in {directory_path}")
        return None

    summaries = []
    for file_name in txt_files:
        content = read_text_file(os.path.join(directory_path, file_name))
        if not content:
            # Unreadable (None) or empty file: nothing to summarise.
            continue

        try:
            summary = text_generator(content)[0]['generated_text']
        except Exception as e:
            # Keep going: one bad document should not discard the others.
            logging.error(f"Error processing {file_name}: {e}")
            continue

        summaries.append({
            'file_name': file_name,
            'Summary': summary
        })

    return pd.DataFrame(summaries) if summaries else None


# Main Execution

In [None]:
if __name__ == "__main__":
    # Initialize logging
    logging.basicConfig(level=logging.INFO)

    # Load the model once, then summarise the text files in the default
    # input directory.
    text_generator = setup_model()
    df = process_directory(text_generator)

    if df is not None:
        # NOTE(review): the stopwords *path* is forwarded as-is to the
        # TextAnalysis helpers; confirm they expect a path rather than a
        # collection of words.
        stopwords_path = "stopwords.txt"
        df = process_summaries(df, stopwords_path)

        # to_csv does not create missing directories; make sure the target
        # folder exists so results are not lost after the model run.
        output_path = Path('./LLM/summaries/meta_summaries.csv')
        output_path.parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(output_path, index=False)

        # Display summary
        logging.info(f"Processed {len(df)} files")
        logging.info("\nFirst few entries:")
        print(df.head())

# DEPRECATED BELOW

## Import Libraries

In [None]:
# import torch
# import pandas as pd
# import os
# from pathlib import Path
# from transformers import (
#     AutoTokenizer, 
#     AutoModelForCausalLM, 
#     pipeline, 
#     BitsAndBytesConfig
# )
# import logging
# import TextAnalysis as TA

## Choose Meta Llama Model

In [5]:
# model_id = "meta-llama/Llama-3.1-8B"

## Set the Precision and Load the Tokenizer and Model from Backup

In [None]:
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16
# )

# tokenizer = AutoTokenizer.from_pretrained("/nobackup/ielhaime/models/")
# tokenizer.pad_token = tokenizer.eos_token
# model = AutoModelForCausalLM.from_pretrained(
#     "/nobackup/ielhaime/models/",
#     quantization_config=bnb_config,
#     device_map="auto")


## Setup the Text Gen Pipeline and Set Size of Response

In [None]:
# text_generator = pipeline(
#     "text_generation",
#     tokenizer=tokenizer,
#     model=model,
#     max_new_tokens=512
#     )

## Summary Function

In [None]:
# def get_summary(document):
#     response = text_generator(document)
#     gen_text = response[0]['generated_text']
#     return gen_text

# Stopwords Load

In [21]:
# def load_stopwords(filepath='stopwords.txt'):
#     """
#     Load stopwords from file or create default if doesn't exist.
#     Returns set of stopwords.
#     """
#     default_stopwords = {
#         'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
#         'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
#         'to', 'was', 'were', 'will', 'with', 'the', 'this', 'but', 'they',
#         'have', 'had', 'what', 'when', 'where', 'who', 'which', 'why', 'how'
#     }
    
#     if not os.path.exists(filepath):
#         logging.info(f"Stopwords file not found at {filepath}. Creating default stopwords file.")
#         with open(filepath, 'w', encoding='utf-8') as f:
#             f.write('\n'.join(sorted(default_stopwords)))
#         return default_stopwords
    
#     with open(filepath, 'r', encoding='utf-8') as f:
#         stopwords = {word.strip() for word in f.readlines() if word.strip()}
    
#     logging.info(f"Loaded {len(stopwords)} stopwords from {filepath}")
#     return stopwords

# # Create/load stopwords
# stopwords = load_stopwords()

# N-Grams

In [31]:
# def process_summaries(df, stopwords_path):
#     """Process summaries and calculate n-grams using TextAnalysis module."""
#     # Load stopwords at the start of processing
#     stopwords = stopwords_path
    
#     results = {
#         'Summaries Top 10 Words': [],
#         'Summaries Top 10 Bigrams': [],
#         'Summaries Top 10 Trigrams': []
#     }
    
#     for _, row in df.iterrows():
#         summaries = row.get('Summary', '')
#         if pd.isna(summaries) or summaries == '':
#             empty_result = [('', 0)] * 10
#             for key in results:
#                 results[key].append(empty_result)
#             continue
            
#         results['Summaries Top 10 Words'].append(TA.topwords(summaries, stopwords))
#         results['Summaries Top 10 Bigrams'].append(TA.topbigrams(summaries, stopwords))
#         results['Summaries Top 10 Trigrams'].append(TA.toptrigrams(summaries, stopwords))
    
#     # Add results to DataFrame
#     for key in results:
#         df[key] = results[key]
    
#     return df

## Get Summaries

In [None]:
# import os
# import pandas as pd
# from pathlib import Path

# def get_summary(document):
#     """Generate a summary for a given document text."""
#     # response = text_generator(document)
#     # gen_text = response[0]['generated_text']
#     # return gen_text
#     return document

# def read_text_file(file_path):
#     """Read and return the contents of a text file."""
#     try:
#         with open(file_path, 'r', encoding='utf-8') as file:
#             return file.read()
#     except UnicodeDecodeError:
#         # Try a different encoding if UTF-8 fails
#         with open(file_path, 'r', encoding='latin-1') as file:
#             return file.read()
#     except Exception as e:
#         print(f"Error reading file {file_path}: {str(e)}")
#         return None

# def process_directory(directory_path='content', output_file='summaries.csv'):
#     """
#     Process all text files in the specified directory and save summaries to CSV using pandas.
    
#     Args:
#         directory_path (str): Path to the directory containing text files
#         output_file (str): Name of the output CSV file
    
#     Returns:
#         pandas.DataFrame: DataFrame containing the summaries, or None if no files were processed
#     """
#     # Create content directory if it doesn't exist
#     Path(directory_path).mkdir(exist_ok=True)
    
#     # Get all .txt files in the directory
#     txt_files = [f for f in os.listdir(directory_path) if f.endswith('.txt')]
    
#     if not txt_files:
#         print(f"No text files found in {directory_path}")
#         return None
    
#     # Process each file and store results
#     summaries = []
#     for file_name in txt_files:
#         file_path = os.path.join(directory_path, file_name)
#         print(f"Processing {file_name}...")
        
#         # Read the file content
#         content = read_text_file(file_path)
#         if content is None:
#             continue
            
#         try:
#             # Generate summary
#             summary = get_summary(content)
#             summaries.append({
#                 'file_name': file_name,
#                 'Summary': summary
#             })
#         except Exception as e:
#             print(f"Error processing {file_name}: {str(e)}")
#             continue
    
#     # Create DataFrame from summaries
#     if summaries:
#         try:
#             # Convert list of dictionaries to DataFrame
#             df = pd.DataFrame(summaries)
            
#             # Save to CSV
#             df.to_csv(output_file, index=False, encoding='utf-8')
#             print(f"Summaries saved to {output_file}")
            
#             return df
#         except Exception as e:
#             print(f"Error creating DataFrame or saving to CSV: {str(e)}")
#             return None
#     else:
#         print("No summaries were generated")
#         return None

# def save_dataframe(df, filename='summaries.csv'):
#     """
#     Save the DataFrame as a CSV file.
    
#     Args:
#         df (pandas.DataFrame): DataFrame to save
#         filename (str): Name of the output CSV file
#     """
#     try:
#         df.to_csv(filename, index=False, encoding='utf-8')
#         print(f"Successfully saved DataFrame to {filename}")
#     except Exception as e:
#         print(f"Error saving DataFrame to CSV: {str(e)}")

# if __name__ == "__main__":
#     # Process the directory and get the DataFrame
#     df = process_directory()
    
#     if df is not None:
#         # Display summary information
#         print("\nSummary of processed files:")
#         print(f"Total files processed: {len(df)}")
#         process_summaries(df, "stopwords.txt")
#         print("\nFirst few entries:")
#         print(df.head())
        
#         # Save DataFrame to CSV
#         save_dataframe(df, 'summaries.csv')