## Import Libraries

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig

## Choose Meta Llama Model

In [5]:
# Identifier of the chosen Llama model (presumably a Hugging Face Hub repo id).
# NOTE(review): the model/tokenizer loading code in this notebook reads from a
# local directory and does not reference this variable -- confirm it is needed.
model_id = "meta-llama/Llama-3.1-8B"

## Set the Precision (4-bit Quantization) and Load the Tokenizer and Model from Local Storage

In [None]:
# Quantization settings handed to the model loader: 4-bit weights using the
# "nf4" quantization type with double quantization enabled, and bfloat16 as
# the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Single source of truth for the local checkpoint directory, so the tokenizer
# and the model are always loaded from the same place.
MODEL_DIR = "/nobackup/ielhaime/models/"

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
# Reuse the end-of-sequence token for padding (presumably because the
# checkpoint's tokenizer ships without a dedicated pad token -- TODO confirm).
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_DIR,
    quantization_config=bnb_config,
    device_map="auto",
)


## Set Up the Text Generation Pipeline and Set the Size of the Response

In [None]:
# Build the generation pipeline around the already-loaded model and tokenizer.
# The task identifier is hyphenated ("text-generation"); the underscore
# spelling is not a registered task name and makes pipeline() raise.
text_generator = pipeline(
    "text-generation",
    tokenizer=tokenizer,
    model=model,
    max_new_tokens=512,  # upper bound on newly generated tokens per call
)

## Summary Function

In [None]:
def get_summary(document):
    """Run a document through the text-generation pipeline.

    NOTE(review): the document is sent as the prompt unchanged; no
    summarization instruction is added here.

    Args:
        document (str): Text passed to the module-level ``text_generator``.

    Returns:
        str: The newly generated text only, with surrounding whitespace
        removed.
    """
    # By default the text-generation pipeline returns the prompt followed by
    # the continuation, which would make every "summary" start with the whole
    # input document. Ask for the continuation alone.
    response = text_generator(document, return_full_text=False)
    return response[0]['generated_text'].strip()

## Get Summaries

In [18]:
import os
import pandas as pd
from pathlib import Path

def get_summary(document):
    """Generate a summary for a given document text.

    NOTE(review): the document is sent to the pipeline as the prompt
    unchanged; no summarization instruction is added here.

    Args:
        document (str): Text passed to the module-level ``text_generator``.

    Returns:
        str: The newly generated text only, with surrounding whitespace
        removed.
    """
    # By default the text-generation pipeline returns the prompt followed by
    # the continuation, which would make every "summary" start with the whole
    # input document. Ask for the continuation alone.
    response = text_generator(document, return_full_text=False)
    return response[0]['generated_text'].strip()

def read_text_file(file_path):
    """Read and return the contents of a text file.

    The file is decoded as UTF-8 first; if that fails with a decoding error,
    it is re-read as Latin-1.

    Args:
        file_path: Path of the file to read.

    Returns:
        str | None: The file contents, or None if the file could not be read
        (the error is printed).
    """
    for encoding in ('utf-8', 'latin-1'):
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                return file.read()
        except UnicodeDecodeError:
            # Wrong guess for this file; try the next encoding.
            continue
        except Exception as e:
            # Best-effort contract: callers treat None as "skip this file".
            # Handling the error inside the loop also covers a failure during
            # the fallback read, not only during the first attempt.
            print(f"Error reading file {file_path}: {str(e)}")
            return None

    # Not expected to be reached (Latin-1 maps every byte), but keeps the
    # "None on failure" contract explicit.
    print(f"Error reading file {file_path}: unable to decode contents")
    return None

def process_directory(directory_path='content', output_file='summaries.csv'):
    """
    Summarize every ``.txt`` file in a directory and save the results to CSV using pandas.

    The directory is created (including missing parent directories) if it
    does not exist. Files are processed in sorted name order so the CSV row
    order is reproducible. Files that cannot be read, or whose summarization
    raises, are reported and skipped.

    Args:
        directory_path (str): Path to the directory containing text files
        output_file (str): Name of the output CSV file

    Returns:
        pandas.DataFrame: DataFrame with ``file_name`` and ``summary`` columns,
        or None if no files were processed or the CSV could not be written
    """
    directory = Path(directory_path)
    # parents=True so a nested, not-yet-existing path does not fail.
    directory.mkdir(parents=True, exist_ok=True)

    # Regular files only: a sub-directory whose name happens to end in ".txt"
    # must not be handed to the reader. Sorted because directory listing
    # order is arbitrary.
    txt_files = sorted(
        entry.name
        for entry in directory.iterdir()
        if entry.is_file() and entry.name.endswith('.txt')
    )

    if not txt_files:
        print(f"No text files found in {directory_path}")
        return None

    # Process each file and store results
    summaries = []
    for file_name in txt_files:
        file_path = os.path.join(directory_path, file_name)
        print(f"Processing {file_name}...")

        content = read_text_file(file_path)
        if content is None:
            # read_text_file already reported the problem.
            continue

        try:
            summary = get_summary(content)
        except Exception as e:
            # One failing document must not abort the whole batch.
            print(f"Error processing {file_name}: {str(e)}")
            continue

        summaries.append({
            'file_name': file_name,
            'summary': summary,
        })

    if not summaries:
        print("No summaries were generated")
        return None

    try:
        df = pd.DataFrame(summaries)
        df.to_csv(output_file, index=False, encoding='utf-8')
    except Exception as e:
        print(f"Error creating DataFrame or saving to CSV: {str(e)}")
        return None

    print(f"Summaries saved to {output_file}")
    return df

def save_dataframe(df, filename='summaries.csv'):
    """
    Write a DataFrame to disk as a UTF-8 CSV file without the index column.

    Any exception raised while writing is caught and reported on stdout
    instead of being propagated.

    Args:
        df (pandas.DataFrame): DataFrame to save
        filename (str): Name of the output CSV file
    """
    try:
        df.to_csv(filename, index=False, encoding='utf-8')
    except Exception as exc:
        print(f"Error saving DataFrame to CSV: {str(exc)}")
    else:
        print(f"Successfully saved DataFrame to {filename}")

if __name__ == "__main__":
    # Summarize everything in the default directory; None means that no
    # DataFrame was produced.
    df = process_directory()

    if df is not None:
        # Short report: header, row count, then a preview of the first rows.
        print("\n".join((
            "\nSummary of processed files:",
            f"Total files processed: {len(df)}",
            "\nFirst few entries:",
        )))
        print(df.head())

        # Persist the results under the default output name.
        save_dataframe(df, 'summaries.csv')

Processing 2410.11851v1.txt...
Processing 2410.17135v1.txt...
Processing 2410.17142v1.txt...
Processing 2410.17163v1.txt...
Processing 2410.17169v1.txt...
Processing 2410.17176v1.txt...
Processing 2410.17178v1.txt...
Processing 2410.17187v1.txt...
Processing 2410.17189v1.txt...
Processing 2410.17232v1.txt...
Processing 2410.17252v1.txt...
Summaries saved to summaries.csv

Summary of processed files:
Total files processed: 11

First few entries:
          file_name  summary
0  2410.11851v1.txt  Summary
1  2410.17135v1.txt  Summary
2  2410.17142v1.txt  Summary
3  2410.17163v1.txt  Summary
4  2410.17169v1.txt  Summary
Successfully saved DataFrame to summaries.csv
