# In [None]:
import os
import glob
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
import textstat
import pandas as pd

# Fetch the NLTK data this script relies on:
#   - 'punkt' / 'punkt_tab': sentence/word tokenizer models used by
#     word_tokenize and sent_tokenize. Newer NLTK releases load the tokenizer
#     from 'punkt_tab' instead of the pickled 'punkt' package, so both are
#     requested to keep the script working across NLTK versions.
#   - 'vader_lexicon': lexicon required by SentimentIntensityAnalyzer.
for resource in ('punkt', 'punkt_tab', 'vader_lexicon'):
    nltk.download(resource)

# Initialize sentiment analyzer (shared by calculate_sentiment below)
sia = SentimentIntensityAnalyzer()

# Function to calculate Lexical Diversity
def calculate_lexical_diversity(text):
    """Return the type-token ratio of *text*.

    The ratio is the number of distinct tokens divided by the total number of
    tokens, using the tokens produced by ``word_tokenize``. No case folding is
    applied here, so differently-cased spellings count as distinct tokens.

    Args:
        text: The text to analyze.

    Returns:
        A float in ``[0.0, 1.0]``; ``0.0`` when *text* is empty or contains
        only whitespace (there are no tokens to compare).
    """
    # Blank input has no tokens: skip the tokenizer and avoid dividing by zero.
    if not text.strip():
        return 0.0

    words = word_tokenize(text)
    # Defensive: never divide by zero even if the tokenizer yields nothing.
    return len(set(words)) / len(words) if words else 0.0

# Function to perform Sentiment Analysis
def calculate_sentiment(text):
    """Score *text* with the module-level ``SentimentIntensityAnalyzer``.

    Returns:
        The mapping produced by ``sia.polarity_scores``; callers in this file
        read its ``'pos'``, ``'neu'``, ``'neg'`` and ``'compound'`` entries.
    """
    polarity = sia.polarity_scores(text)
    return polarity

# Function to calculate Readability Scores using textstat
def calculate_readability_scores(text):
    """Return the Flesch-Kincaid grade for *text*.

    The value is whatever ``textstat.flesch_kincaid_grade`` reports for the
    given string; no post-processing is applied.
    """
    grade_level = textstat.flesch_kincaid_grade(text)
    return grade_level

# Function to analyze Sentence Length and Structure
def analyze_sentence_structure(text):
    """Measure sentence lengths (in tokens) for *text*.

    The text is split into sentences with ``sent_tokenize`` and each sentence
    is tokenized with ``word_tokenize``; a sentence's length is its token
    count.

    Args:
        text: The text to analyze.

    Returns:
        A ``(avg_length, lengths)`` tuple where ``lengths`` is the list of
        per-sentence token counts in document order and ``avg_length`` is
        their arithmetic mean. For empty or whitespace-only text the result is
        ``(0.0, [])``.
    """
    # Blank input has no sentences: skip tokenizing and avoid dividing by zero.
    if not text.strip():
        return 0.0, []

    sentences = sent_tokenize(text)
    lengths = [len(word_tokenize(sent)) for sent in sentences]
    # Defensive: never divide by zero even if no sentences were detected.
    if not lengths:
        return 0.0, []

    avg_length = sum(lengths) / len(lengths)
    return avg_length, lengths

# Main function to process files and calculate metrics
def process_files_and_calculate_metrics(folder, num_files):
    """Compute text metrics for the first *num_files* ``.txt`` files in *folder*.

    Files are taken in sorted path order. For each file the readability grade,
    average sentence length, lexical diversity and sentiment scores are
    computed with the helper functions defined in this module.

    Args:
        folder: Directory to search (non-recursively) for ``*.txt`` files.
        num_files: Maximum number of files to process. Values of zero or less
            process no files.

    Returns:
        A ``pandas.DataFrame`` with one row per processed file and the columns
        ``file``, ``readability_score``, ``sentence_length_avg``,
        ``lexical_diversity``, ``sentiment_positive``, ``sentiment_neutral``,
        ``sentiment_negative`` and ``sentiment_compound``. The frame is empty
        (no columns) when no file was processed.
    """
    results = []

    files = sorted(glob.glob(os.path.join(folder, "*.txt")))

    # Clamp at zero so a negative limit selects nothing instead of being
    # interpreted as a from-the-end slice bound.
    for file in files[:max(num_files, 0)]:
        # Decode explicitly as UTF-8: the platform default encoding (e.g.
        # cp1252 on Windows) mis-decodes or rejects UTF-8 punctuation.
        with open(file, 'r', encoding='utf-8') as f:
            text = f.read()

        readability_score = calculate_readability_scores(text)
        # Only the average is reported; the per-sentence lengths are not used.
        sentence_length_avg, _ = analyze_sentence_structure(text)
        lexical_diversity = calculate_lexical_diversity(text)
        sentiment = calculate_sentiment(text)

        results.append({
            "file": os.path.basename(file),
            "readability_score": readability_score,
            "sentence_length_avg": sentence_length_avg,
            "lexical_diversity": lexical_diversity,
            "sentiment_positive": sentiment['pos'],
            "sentiment_neutral": sentiment['neu'],
            "sentiment_negative": sentiment['neg'],
            "sentiment_compound": sentiment['compound'],
        })

    return pd.DataFrame(results)

# Define folders and number of files to process
folders = [
    'Written Blog Posts/3.5T Blog',
    'Written Blog Posts/4o Blog Raw Text',
    'Written Blog Posts/4o Blog',
    'summaries'
]
num_files = 100  # Maximum number of files to process per folder

# Process each folder and save results.
# Each folder's metrics are written to a CSV in the current working directory;
# spaces and slashes in the folder path are replaced with underscores to form
# the file name.
results_by_folder = {}
for folder in folders:
    results_df = process_files_and_calculate_metrics(folder, num_files)
    results_by_folder[folder] = results_df
    output_file = f'{folder.replace(" ", "_").replace("/", "_")}_analysis_results.csv'
    results_df.to_csv(output_file, index=False)
    print(f"Results for {folder} saved to {output_file}")

# Display results for the first folder as an example.
# After the loop `results_df` refers to the LAST folder processed, so the
# first folder's frame is looked up explicitly.
if folders:
    print(results_by_folder[folders[0]])
