In [None]:
!pip install nltk scikit-learn gensim pandas transformers textstat

In [None]:
import os
import glob
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim import corpora, models
from transformers import BertTokenizer, BertModel
import torch
import textstat
import pandas as pd

# Tokenizer models used by word_tokenize and sent_tokenize.
nltk.download('punkt')
# NLTK 3.9 and later load the tokenizer from the 'punkt_tab' resource
# instead, so fetch it too and keep the notebook working on either version.
nltk.download('punkt_tab')

# Function to calculate Exact Match Ratio
def calculate_exact_match_ratio(summary_text, blog_text):
    """Return the share of unique summary tokens that also occur in the blog.

    Both texts are tokenized with ``word_tokenize`` and reduced to sets, so
    the comparison is case-sensitive and ignores how often a token repeats.

    Args:
        summary_text: Text whose unique tokens form the denominator.
        blog_text: Text the summary tokens are looked up in.

    Returns:
        A float in [0, 1]; 0.0 when ``summary_text`` yields no tokens.
    """
    summary_words = set(word_tokenize(summary_text))
    if not summary_words:
        # An empty summary would otherwise divide by zero.
        return 0.0
    blog_words = set(word_tokenize(blog_text))
    return len(summary_words & blog_words) / len(summary_words)

# Function to calculate Cosine Similarity
def calculate_cosine_similarity(summary_text, blog_text):
    """Return the TF-IDF cosine similarity between the two texts.

    A fresh ``TfidfVectorizer`` is fitted on just this pair of documents, so
    the term weights depend only on these two texts.
    """
    tfidf_matrix = TfidfVectorizer().fit_transform([summary_text, blog_text])
    dense_rows = tfidf_matrix.toarray()
    pairwise = cosine_similarity(dense_rows)
    # Row 0 is the first text, column 1 the second.
    return pairwise[0, 1]

# Function for Topic Modeling
def calculate_topic_overlap(summary_text, blog_text):
    """Return the fraction of LDA topics shared by the two texts.

    A 5-topic LDA model is trained on just this pair of documents; the topic
    ids reported for each document are then compared with
    ``compute_topic_overlap``.

    Args:
        summary_text: First document.
        blog_text: Second document.

    Returns:
        The overlap ratio computed by ``compute_topic_overlap``.
    """
    # Tokenize each document once and reuse the tokens for both the
    # dictionary and the bag-of-words corpus.
    tokenized = [word_tokenize(doc) for doc in (summary_text, blog_text)]
    dictionary = corpora.Dictionary(tokenized)
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized]
    # A fixed seed keeps the reported metric reproducible between runs.
    lda = models.LdaModel(corpus, num_topics=5, id2word=dictionary, random_state=0)
    topics_summary = lda.get_document_topics(corpus[0])
    topics_blog = lda.get_document_topics(corpus[1])
    return compute_topic_overlap(topics_summary, topics_blog)

# Helper function to compute topic overlap
def compute_topic_overlap(topics_summary, topics_blog):
    """Return the share of topic ids common to both topic lists.

    Args:
        topics_summary: Iterable of sequences whose first element is a topic
            id (e.g. ``(topic_id, weight)`` pairs); only that id is read.
        topics_blog: Same structure for the second document.

    Returns:
        Number of shared ids divided by the larger id count of the two
        sides, or 0.0 when neither side has any topic.
    """
    summary_topics = {topic[0] for topic in topics_summary}
    blog_topics = {topic[0] for topic in topics_blog}
    larger = max(len(summary_topics), len(blog_topics))
    if larger == 0:
        # No topics on either side: avoid dividing by zero.
        return 0.0
    return len(summary_topics & blog_topics) / larger

# Function to extract and compare Key Phrases
def calculate_key_phrase_overlap(summary_text, blog_text):
    """Return the share of summary key phrases that also appear in the blog.

    Key phrases come from ``extract_key_phrases``.

    Args:
        summary_text: Text whose phrases form the denominator.
        blog_text: Text the summary phrases are looked up in.

    Returns:
        A float in [0, 1]; 0.0 when ``summary_text`` yields no phrases.
    """
    summary_phrases = set(extract_key_phrases(summary_text))
    if not summary_phrases:
        # Nothing to match against: avoid dividing by zero.
        return 0.0
    blog_phrases = set(extract_key_phrases(blog_text))
    return len(summary_phrases & blog_phrases) / len(summary_phrases)

# Helper function to extract key phrases (simple implementation)
def extract_key_phrases(text):
    """Return the distinct tokens of ``text`` as a list (order not guaranteed).

    Simplified stand-in for key-phrase extraction: every unique token
    produced by ``word_tokenize`` counts as a "phrase".
    """
    unique_tokens = set(word_tokenize(text))
    return list(unique_tokens)

# Function for Semantic Similarity using BERT
_BERT_CACHE = {}


def _load_bert(model_name='bert-base-uncased'):
    """Return a cached ``(tokenizer, model)`` pair, loading it on first use."""
    if model_name not in _BERT_CACHE:
        _BERT_CACHE[model_name] = (
            BertTokenizer.from_pretrained(model_name),
            BertModel.from_pretrained(model_name),
        )
    return _BERT_CACHE[model_name]


def calculate_semantic_similarity(summary_text, blog_text):
    """Return the cosine similarity of mean-pooled BERT embeddings.

    Each text is encoded with ``bert-base-uncased`` (input truncated to 512
    tokens) and the last hidden state is averaged over the token dimension.
    The tokenizer and model are loaded once and reused, instead of being
    reloaded for every pair of texts.

    Args:
        summary_text: First text.
        blog_text: Second text.

    Returns:
        The cosine similarity of the two embeddings, or 0 if either
        embedding does not have a leading dimension of 1.
    """
    tokenizer, model = _load_bert()

    def embed_text(text):
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            outputs = model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).detach().numpy()

    summary_embedding = embed_text(summary_text)
    blog_embedding = embed_text(blog_text)

    print(f"Summary embedding shape: {summary_embedding.shape}")
    print(f"Blog embedding shape: {blog_embedding.shape}")

    if summary_embedding.shape[0] == 1 and blog_embedding.shape[0] == 1:
        return cosine_similarity(summary_embedding, blog_embedding)[0, 0]
    return 0  # Return 0 similarity if embeddings are not as expected

# Function to calculate Readability Scores using textstat
def calculate_readability_scores(text):
    """Return the Flesch-Kincaid grade that ``textstat`` reports for ``text``."""
    grade_level = textstat.flesch_kincaid_grade(text)
    return grade_level

# Function to analyze Sentence Length and Structure
def analyze_sentence_structure(text):
    """Return the mean sentence length and the individual sentence lengths.

    Sentences come from ``sent_tokenize``; a sentence's length is its number
    of ``word_tokenize`` tokens.

    Args:
        text: Text to analyze.

    Returns:
        ``(avg_length, lengths)``; ``(0.0, [])`` when no sentence is found.
    """
    lengths = [len(word_tokenize(sent)) for sent in sent_tokenize(text)]
    if not lengths:
        # Empty text has no sentences: avoid dividing by zero.
        return 0.0, []
    return sum(lengths) / len(lengths), lengths

# Main function to process files and calculate metrics
def process_files_and_calculate_metrics(summary_folder, blog_folder):
    """Compute all comparison metrics for each summary/blog file pair.

    The ``*.txt`` files of both folders are sorted by name and paired by
    position, so the two folders must list their files in matching order.

    Args:
        summary_folder: Folder whose ``*.txt`` files are read as summaries.
        blog_folder: Folder whose ``*.txt`` files are read as blog posts.

    Returns:
        A DataFrame with one row per pair; it is also written to
        ``comparison_results.csv`` in the working directory.
    """
    summary_files = sorted(glob.glob(os.path.join(summary_folder, "*.txt")))
    blog_files = sorted(glob.glob(os.path.join(blog_folder, "*.txt")))

    if len(summary_files) != len(blog_files):
        # zip() stops at the shorter list; report it instead of silently
        # dropping the unpaired files.
        print(f"Warning: {len(summary_files)} summary files but {len(blog_files)} blog files; "
              "unpaired files are skipped.")

    results = []
    for summary_file, blog_file in zip(summary_files, blog_files):
        # Decode explicitly as UTF-8 rather than the platform default.
        with open(summary_file, 'r', encoding='utf-8') as sf:
            summary_text = sf.read()
        with open(blog_file, 'r', encoding='utf-8') as bf:
            blog_text = bf.read()

        results.append({
            "summary_file": os.path.basename(summary_file),
            "blog_file": os.path.basename(blog_file),
            "exact_match_ratio": calculate_exact_match_ratio(summary_text, blog_text),
            "cosine_similarity": calculate_cosine_similarity(summary_text, blog_text),
            "topic_overlap": calculate_topic_overlap(summary_text, blog_text),
            "key_phrase_overlap": calculate_key_phrase_overlap(summary_text, blog_text),
            "semantic_similarity": calculate_semantic_similarity(summary_text, blog_text),
            "readability_summary": calculate_readability_scores(summary_text),
            "readability_blog": calculate_readability_scores(blog_text),
            # Only the average sentence length is reported.
            "sentence_length_summary": analyze_sentence_structure(summary_text)[0],
            "sentence_length_blog": analyze_sentence_structure(blog_text)[0],
        })

    df = pd.DataFrame(results)
    df.to_csv('comparison_results.csv', index=False)
    return df

# Compare the texts in 'summaries' with the posts in
# 'Written Blog Posts/3.5T Blog'. Both paths are relative to the working
# directory.
summary_folder = 'summaries'
blog_folder = 'Written Blog Posts/3.5T Blog'
results_df_sum_to_35 = process_files_and_calculate_metrics(summary_folder, blog_folder)

# Print the resulting metrics table.
print(results_df_sum_to_35)


In [None]:
# Save this comparison under its own name; create the output folder first
# so to_csv does not fail when the directory is missing.
os.makedirs('Individual Comparative CSVs', exist_ok=True)
results_df_sum_to_35.to_csv('Individual Comparative CSVs/comparison_results_sum_to_3.5.csv', index=False)

In [None]:
import os
import glob
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim import corpora, models
from transformers import BertTokenizer, BertModel
import torch
import textstat
import pandas as pd

# Tokenizer models used by word_tokenize and sent_tokenize.
nltk.download('punkt')
# NLTK 3.9 and later load the tokenizer from the 'punkt_tab' resource
# instead, so fetch it too and keep the notebook working on either version.
nltk.download('punkt_tab')

# Function to calculate Exact Match Ratio
def calculate_exact_match_ratio(raw_text, blog_text):
    """Return the share of unique raw-text tokens that also occur in the blog.

    Both texts are tokenized with ``word_tokenize`` and reduced to sets, so
    the comparison is case-sensitive and ignores how often a token repeats.

    Args:
        raw_text: Text whose unique tokens form the denominator.
        blog_text: Text the raw-text tokens are looked up in.

    Returns:
        A float in [0, 1]; 0.0 when ``raw_text`` yields no tokens.
    """
    raw_words = set(word_tokenize(raw_text))
    if not raw_words:
        # An empty raw text would otherwise divide by zero.
        return 0.0
    blog_words = set(word_tokenize(blog_text))
    return len(raw_words & blog_words) / len(raw_words)

# Function to calculate Cosine Similarity
def calculate_cosine_similarity(raw_text, blog_text):
    """Return the TF-IDF cosine similarity between the two texts.

    A fresh ``TfidfVectorizer`` is fitted on just this pair of documents, so
    the term weights depend only on these two texts.
    """
    tfidf_matrix = TfidfVectorizer().fit_transform([raw_text, blog_text])
    dense_rows = tfidf_matrix.toarray()
    pairwise = cosine_similarity(dense_rows)
    # Row 0 is the first text, column 1 the second.
    return pairwise[0, 1]

# Function for Topic Modeling
def calculate_topic_overlap(raw_text, blog_text):
    """Return the fraction of LDA topics shared by the two texts.

    A 5-topic LDA model is trained on just this pair of documents; the topic
    ids reported for each document are then compared with
    ``compute_topic_overlap``.

    Args:
        raw_text: First document.
        blog_text: Second document.

    Returns:
        The overlap ratio computed by ``compute_topic_overlap``.
    """
    # Tokenize each document once and reuse the tokens for both the
    # dictionary and the bag-of-words corpus.
    tokenized = [word_tokenize(doc) for doc in (raw_text, blog_text)]
    dictionary = corpora.Dictionary(tokenized)
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized]
    # A fixed seed keeps the reported metric reproducible between runs.
    lda = models.LdaModel(corpus, num_topics=5, id2word=dictionary, random_state=0)
    topics_raw = lda.get_document_topics(corpus[0])
    topics_blog = lda.get_document_topics(corpus[1])
    return compute_topic_overlap(topics_raw, topics_blog)

# Helper function to compute topic overlap
def compute_topic_overlap(topics_raw, topics_blog):
    """Return the share of topic ids common to both topic lists.

    Args:
        topics_raw: Iterable of sequences whose first element is a topic id
            (e.g. ``(topic_id, weight)`` pairs); only that id is read.
        topics_blog: Same structure for the second document.

    Returns:
        Number of shared ids divided by the larger id count of the two
        sides, or 0.0 when neither side has any topic.
    """
    raw_topics = {topic[0] for topic in topics_raw}
    blog_topics = {topic[0] for topic in topics_blog}
    larger = max(len(raw_topics), len(blog_topics))
    if larger == 0:
        # No topics on either side: avoid dividing by zero.
        return 0.0
    return len(raw_topics & blog_topics) / larger

# Function to extract and compare Key Phrases
def calculate_key_phrase_overlap(raw_text, blog_text):
    """Return the share of raw-text key phrases that also appear in the blog.

    Key phrases come from ``extract_key_phrases``.

    Args:
        raw_text: Text whose phrases form the denominator.
        blog_text: Text the raw-text phrases are looked up in.

    Returns:
        A float in [0, 1]; 0.0 when ``raw_text`` yields no phrases.
    """
    raw_phrases = set(extract_key_phrases(raw_text))
    if not raw_phrases:
        # Nothing to match against: avoid dividing by zero.
        return 0.0
    blog_phrases = set(extract_key_phrases(blog_text))
    return len(raw_phrases & blog_phrases) / len(raw_phrases)

# Helper function to extract key phrases (simple implementation)
def extract_key_phrases(text):
    """Return the distinct tokens of ``text`` as a list (order not guaranteed).

    Simplified stand-in for key-phrase extraction: every unique token
    produced by ``word_tokenize`` counts as a "phrase".
    """
    unique_tokens = set(word_tokenize(text))
    return list(unique_tokens)

# Function for Semantic Similarity using BERT
_BERT_CACHE = {}


def _load_bert(model_name='bert-base-uncased'):
    """Return a cached ``(tokenizer, model)`` pair, loading it on first use."""
    if model_name not in _BERT_CACHE:
        _BERT_CACHE[model_name] = (
            BertTokenizer.from_pretrained(model_name),
            BertModel.from_pretrained(model_name),
        )
    return _BERT_CACHE[model_name]


def calculate_semantic_similarity(raw_text, blog_text):
    """Return the cosine similarity of mean-pooled BERT embeddings.

    Each text is encoded with ``bert-base-uncased`` (input truncated to 512
    tokens) and the last hidden state is averaged over the token dimension.
    The tokenizer and model are loaded once and reused, instead of being
    reloaded for every pair of texts.

    Args:
        raw_text: First text.
        blog_text: Second text.

    Returns:
        The cosine similarity of the two embeddings, or 0 if either
        embedding does not have a leading dimension of 1.
    """
    tokenizer, model = _load_bert()

    def embed_text(text):
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            outputs = model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).detach().numpy()

    raw_embedding = embed_text(raw_text)
    blog_embedding = embed_text(blog_text)

    print(f"Raw text embedding shape: {raw_embedding.shape}")
    print(f"Blog embedding shape: {blog_embedding.shape}")

    if raw_embedding.shape[0] == 1 and blog_embedding.shape[0] == 1:
        return cosine_similarity(raw_embedding, blog_embedding)[0, 0]
    return 0  # Return 0 similarity if embeddings are not as expected

# Function to calculate Readability Scores using textstat
def calculate_readability_scores(text):
    """Return the Flesch-Kincaid grade that ``textstat`` reports for ``text``."""
    grade_level = textstat.flesch_kincaid_grade(text)
    return grade_level

# Function to analyze Sentence Length and Structure
def analyze_sentence_structure(text):
    """Return the mean sentence length and the individual sentence lengths.

    Sentences come from ``sent_tokenize``; a sentence's length is its number
    of ``word_tokenize`` tokens.

    Args:
        text: Text to analyze.

    Returns:
        ``(avg_length, lengths)``; ``(0.0, [])`` when no sentence is found.
    """
    lengths = [len(word_tokenize(sent)) for sent in sent_tokenize(text)]
    if not lengths:
        # Empty text has no sentences: avoid dividing by zero.
        return 0.0, []
    return sum(lengths) / len(lengths), lengths

# Main function to process files and calculate metrics
def process_files_and_calculate_metrics(raw_folder, blog_folder, num_files):
    """Compute all comparison metrics for up to ``num_files`` file pairs.

    The ``*.txt`` files of both folders are sorted by name and paired by
    position, so the two folders must list their files in matching order.

    Args:
        raw_folder: Folder whose ``*.txt`` files are read as raw texts.
        blog_folder: Folder whose ``*.txt`` files are read as blog posts.
        num_files: Maximum number of pairs to process.

    Returns:
        A DataFrame with one row per pair; it is also written to
        ``comparison_results.csv`` in the working directory.
    """
    raw_files = sorted(glob.glob(os.path.join(raw_folder, "*.txt")))
    blog_files = sorted(glob.glob(os.path.join(blog_folder, "*.txt")))

    # Indexing up to num_files raised IndexError when a folder held fewer
    # files; process the pairs that exist and report the shortfall instead.
    available = min(len(raw_files), len(blog_files))
    pair_count = max(0, min(num_files, available))
    if pair_count < num_files:
        print(f"Warning: requested {num_files} file pairs but only {available} are available.")

    results = []
    for raw_file, blog_file in zip(raw_files[:pair_count], blog_files[:pair_count]):
        # Decode explicitly as UTF-8 rather than the platform default.
        with open(raw_file, 'r', encoding='utf-8') as rf:
            raw_text = rf.read()
        with open(blog_file, 'r', encoding='utf-8') as bf:
            blog_text = bf.read()

        results.append({
            "raw_file": os.path.basename(raw_file),
            "blog_file": os.path.basename(blog_file),
            "exact_match_ratio": calculate_exact_match_ratio(raw_text, blog_text),
            "cosine_similarity": calculate_cosine_similarity(raw_text, blog_text),
            "topic_overlap": calculate_topic_overlap(raw_text, blog_text),
            "key_phrase_overlap": calculate_key_phrase_overlap(raw_text, blog_text),
            "semantic_similarity": calculate_semantic_similarity(raw_text, blog_text),
            "readability_raw": calculate_readability_scores(raw_text),
            "readability_blog": calculate_readability_scores(blog_text),
            # Only the average sentence length is reported.
            "sentence_length_raw": analyze_sentence_structure(raw_text)[0],
            "sentence_length_blog": analyze_sentence_structure(blog_text)[0],
        })

    df = pd.DataFrame(results)
    df.to_csv('comparison_results.csv', index=False)
    return df

# Compare the texts in 'raw_texts' with the posts in
# 'Written Blog Posts/4o Blog Raw Text'. Both paths are relative to the
# working directory.
raw_folder = 'raw_texts'
blog_folder = 'Written Blog Posts/4o Blog Raw Text'
num_files = 100  # Number of file pairs to process

results_df_raw_to_4o = process_files_and_calculate_metrics(raw_folder, blog_folder, num_files)

# Print the resulting metrics table.
print(results_df_raw_to_4o)


In [None]:
# Save this comparison under its own name; create the output folder first
# so to_csv does not fail when the directory is missing.
os.makedirs('Individual Comparative CSVs', exist_ok=True)
results_df_raw_to_4o.to_csv('Individual Comparative CSVs/comparison_results_raw_to_4o.csv', index=False)

In [None]:
import os
import glob
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim import corpora, models
from transformers import BertTokenizer, BertModel
import torch
import textstat
import pandas as pd

# Tokenizer models used by word_tokenize and sent_tokenize.
nltk.download('punkt')
# NLTK 3.9 and later load the tokenizer from the 'punkt_tab' resource
# instead, so fetch it too and keep the notebook working on either version.
nltk.download('punkt_tab')

# Function to calculate Exact Match Ratio
def calculate_exact_match_ratio(summary_text, raw_text):
    """Return the share of unique summary tokens that also occur in the raw text.

    Both texts are tokenized with ``word_tokenize`` and reduced to sets, so
    the comparison is case-sensitive and ignores how often a token repeats.

    Args:
        summary_text: Text whose unique tokens form the denominator.
        raw_text: Text the summary tokens are looked up in.

    Returns:
        A float in [0, 1]; 0.0 when ``summary_text`` yields no tokens.
    """
    summary_words = set(word_tokenize(summary_text))
    if not summary_words:
        # An empty summary would otherwise divide by zero.
        return 0.0
    raw_words = set(word_tokenize(raw_text))
    return len(summary_words & raw_words) / len(summary_words)

# Function to calculate Cosine Similarity
def calculate_cosine_similarity(summary_text, raw_text):
    """Return the TF-IDF cosine similarity between the two texts.

    A fresh ``TfidfVectorizer`` is fitted on just this pair of documents, so
    the term weights depend only on these two texts.
    """
    tfidf_matrix = TfidfVectorizer().fit_transform([summary_text, raw_text])
    dense_rows = tfidf_matrix.toarray()
    pairwise = cosine_similarity(dense_rows)
    # Row 0 is the first text, column 1 the second.
    return pairwise[0, 1]

# Function for Topic Modeling
def calculate_topic_overlap(summary_text, raw_text):
    """Return the fraction of LDA topics shared by the two texts.

    A 5-topic LDA model is trained on just this pair of documents; the topic
    ids reported for each document are then compared with
    ``compute_topic_overlap``.

    Args:
        summary_text: First document.
        raw_text: Second document.

    Returns:
        The overlap ratio computed by ``compute_topic_overlap``.
    """
    # Tokenize each document once and reuse the tokens for both the
    # dictionary and the bag-of-words corpus.
    tokenized = [word_tokenize(doc) for doc in (summary_text, raw_text)]
    dictionary = corpora.Dictionary(tokenized)
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized]
    # A fixed seed keeps the reported metric reproducible between runs.
    lda = models.LdaModel(corpus, num_topics=5, id2word=dictionary, random_state=0)
    topics_summary = lda.get_document_topics(corpus[0])
    topics_raw = lda.get_document_topics(corpus[1])
    return compute_topic_overlap(topics_summary, topics_raw)

# Helper function to compute topic overlap
def compute_topic_overlap(topics_summary, topics_raw):
    """Return the share of topic ids common to both topic lists.

    Args:
        topics_summary: Iterable of sequences whose first element is a topic
            id (e.g. ``(topic_id, weight)`` pairs); only that id is read.
        topics_raw: Same structure for the second document.

    Returns:
        Number of shared ids divided by the larger id count of the two
        sides, or 0.0 when neither side has any topic.
    """
    summary_topics = {topic[0] for topic in topics_summary}
    raw_topics = {topic[0] for topic in topics_raw}
    larger = max(len(summary_topics), len(raw_topics))
    if larger == 0:
        # No topics on either side: avoid dividing by zero.
        return 0.0
    return len(summary_topics & raw_topics) / larger

# Function to extract and compare Key Phrases
def calculate_key_phrase_overlap(summary_text, raw_text):
    """Return the share of summary key phrases that also appear in the raw text.

    Key phrases come from ``extract_key_phrases``.

    Args:
        summary_text: Text whose phrases form the denominator.
        raw_text: Text the summary phrases are looked up in.

    Returns:
        A float in [0, 1]; 0.0 when ``summary_text`` yields no phrases.
    """
    summary_phrases = set(extract_key_phrases(summary_text))
    if not summary_phrases:
        # Nothing to match against: avoid dividing by zero.
        return 0.0
    raw_phrases = set(extract_key_phrases(raw_text))
    return len(summary_phrases & raw_phrases) / len(summary_phrases)

# Helper function to extract key phrases (simple implementation)
def extract_key_phrases(text):
    """Return the distinct tokens of ``text`` as a list (order not guaranteed).

    Simplified stand-in for key-phrase extraction: every unique token
    produced by ``word_tokenize`` counts as a "phrase".
    """
    unique_tokens = set(word_tokenize(text))
    return list(unique_tokens)

# Function for Semantic Similarity using BERT
_BERT_CACHE = {}


def _load_bert(model_name='bert-base-uncased'):
    """Return a cached ``(tokenizer, model)`` pair, loading it on first use."""
    if model_name not in _BERT_CACHE:
        _BERT_CACHE[model_name] = (
            BertTokenizer.from_pretrained(model_name),
            BertModel.from_pretrained(model_name),
        )
    return _BERT_CACHE[model_name]


def calculate_semantic_similarity(summary_text, raw_text):
    """Return the cosine similarity of mean-pooled BERT embeddings.

    Each text is encoded with ``bert-base-uncased`` (input truncated to 512
    tokens) and the last hidden state is averaged over the token dimension.
    The tokenizer and model are loaded once and reused, instead of being
    reloaded for every pair of texts.

    Args:
        summary_text: First text.
        raw_text: Second text.

    Returns:
        The cosine similarity of the two embeddings, or 0 if either
        embedding does not have a leading dimension of 1.
    """
    tokenizer, model = _load_bert()

    def embed_text(text):
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            outputs = model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).detach().numpy()

    summary_embedding = embed_text(summary_text)
    raw_embedding = embed_text(raw_text)

    print(f"Summary embedding shape: {summary_embedding.shape}")
    print(f"Raw text embedding shape: {raw_embedding.shape}")

    if summary_embedding.shape[0] == 1 and raw_embedding.shape[0] == 1:
        return cosine_similarity(summary_embedding, raw_embedding)[0, 0]
    return 0  # Return 0 similarity if embeddings are not as expected

# Function to calculate Readability Scores using textstat
def calculate_readability_scores(text):
    """Return the Flesch-Kincaid grade that ``textstat`` reports for ``text``."""
    grade_level = textstat.flesch_kincaid_grade(text)
    return grade_level

# Function to analyze Sentence Length and Structure
def analyze_sentence_structure(text):
    """Return the mean sentence length and the individual sentence lengths.

    Sentences come from ``sent_tokenize``; a sentence's length is its number
    of ``word_tokenize`` tokens.

    Args:
        text: Text to analyze.

    Returns:
        ``(avg_length, lengths)``; ``(0.0, [])`` when no sentence is found.
    """
    lengths = [len(word_tokenize(sent)) for sent in sent_tokenize(text)]
    if not lengths:
        # Empty text has no sentences: avoid dividing by zero.
        return 0.0, []
    return sum(lengths) / len(lengths), lengths

# Main function to process files and calculate metrics
def process_files_and_calculate_metrics(summary_folder, raw_folder, num_files):
    """Compute all comparison metrics for up to ``num_files`` file pairs.

    The ``*.txt`` files of both folders are sorted by name and paired by
    position, so the two folders must list their files in matching order.

    Args:
        summary_folder: Folder whose ``*.txt`` files are read as summaries.
        raw_folder: Folder whose ``*.txt`` files are read as raw texts.
        num_files: Maximum number of pairs to process.

    Returns:
        A DataFrame with one row per pair; it is also written to
        ``comparison_results.csv`` in the working directory.
    """
    summary_files = sorted(glob.glob(os.path.join(summary_folder, "*.txt")))
    raw_files = sorted(glob.glob(os.path.join(raw_folder, "*.txt")))

    # Indexing up to num_files raised IndexError when a folder held fewer
    # files; process the pairs that exist and report the shortfall instead.
    available = min(len(summary_files), len(raw_files))
    pair_count = max(0, min(num_files, available))
    if pair_count < num_files:
        print(f"Warning: requested {num_files} file pairs but only {available} are available.")

    results = []
    for summary_file, raw_file in zip(summary_files[:pair_count], raw_files[:pair_count]):
        # Decode explicitly as UTF-8 rather than the platform default.
        with open(summary_file, 'r', encoding='utf-8') as sf:
            summary_text = sf.read()
        with open(raw_file, 'r', encoding='utf-8') as rf:
            raw_text = rf.read()

        results.append({
            "summary_file": os.path.basename(summary_file),
            "raw_file": os.path.basename(raw_file),
            "exact_match_ratio": calculate_exact_match_ratio(summary_text, raw_text),
            "cosine_similarity": calculate_cosine_similarity(summary_text, raw_text),
            "topic_overlap": calculate_topic_overlap(summary_text, raw_text),
            "key_phrase_overlap": calculate_key_phrase_overlap(summary_text, raw_text),
            "semantic_similarity": calculate_semantic_similarity(summary_text, raw_text),
            "readability_summary": calculate_readability_scores(summary_text),
            "readability_raw": calculate_readability_scores(raw_text),
            # Only the average sentence length is reported.
            "sentence_length_summary": analyze_sentence_structure(summary_text)[0],
            "sentence_length_raw": analyze_sentence_structure(raw_text)[0],
        })

    df = pd.DataFrame(results)
    df.to_csv('comparison_results.csv', index=False)
    return df

# Compare the posts in 'Written Blog Posts/4o Blog' (passed as the summary
# side) with those in 'Written Blog Posts/4o Blog Raw Text'. Both paths are
# relative to the working directory.
summary_folder = 'Written Blog Posts/4o Blog'
raw_folder = 'Written Blog Posts/4o Blog Raw Text'
num_files = 100  # Number of file pairs to process

results_df_4o_Sum_to_4o_Raw = process_files_and_calculate_metrics(summary_folder, raw_folder, num_files)

# Print the resulting metrics table.
print(results_df_4o_Sum_to_4o_Raw)


In [None]:
# Save this comparison under its own name; create the output folder first
# so to_csv does not fail when the directory is missing.
os.makedirs('Individual Comparative CSVs', exist_ok=True)
results_df_4o_Sum_to_4o_Raw.to_csv('Individual Comparative CSVs/comparison_results_4o_Sum_to_4o_Raw.csv', index=False)

In [None]:
import os
import glob
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim import corpora, models
from transformers import BertTokenizer, BertModel
import torch
import textstat
import pandas as pd

# Tokenizer models used by word_tokenize and sent_tokenize.
nltk.download('punkt')
# NLTK 3.9 and later load the tokenizer from the 'punkt_tab' resource
# instead, so fetch it too and keep the notebook working on either version.
nltk.download('punkt_tab')

# Function to calculate Exact Match Ratio
def calculate_exact_match_ratio(summary_text, blog_text):
    """Return the share of unique summary tokens that also occur in the blog.

    Both texts are tokenized with ``word_tokenize`` and reduced to sets, so
    the comparison is case-sensitive and ignores how often a token repeats.

    Args:
        summary_text: Text whose unique tokens form the denominator.
        blog_text: Text the summary tokens are looked up in.

    Returns:
        A float in [0, 1]; 0.0 when ``summary_text`` yields no tokens.
    """
    summary_words = set(word_tokenize(summary_text))
    if not summary_words:
        # An empty summary would otherwise divide by zero.
        return 0.0
    blog_words = set(word_tokenize(blog_text))
    return len(summary_words & blog_words) / len(summary_words)

# Function to calculate Cosine Similarity
def calculate_cosine_similarity(summary_text, blog_text):
    """Return the TF-IDF cosine similarity between the two texts.

    A fresh ``TfidfVectorizer`` is fitted on just this pair of documents, so
    the term weights depend only on these two texts.
    """
    tfidf_matrix = TfidfVectorizer().fit_transform([summary_text, blog_text])
    dense_rows = tfidf_matrix.toarray()
    pairwise = cosine_similarity(dense_rows)
    # Row 0 is the first text, column 1 the second.
    return pairwise[0, 1]

# Function for Topic Modeling
def calculate_topic_overlap(summary_text, blog_text):
    """Return the fraction of LDA topics shared by the two texts.

    A 5-topic LDA model is trained on just this pair of documents; the topic
    ids reported for each document are then compared with
    ``compute_topic_overlap``.

    Args:
        summary_text: First document.
        blog_text: Second document.

    Returns:
        The overlap ratio computed by ``compute_topic_overlap``.
    """
    # Tokenize each document once and reuse the tokens for both the
    # dictionary and the bag-of-words corpus.
    tokenized = [word_tokenize(doc) for doc in (summary_text, blog_text)]
    dictionary = corpora.Dictionary(tokenized)
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized]
    # A fixed seed keeps the reported metric reproducible between runs.
    lda = models.LdaModel(corpus, num_topics=5, id2word=dictionary, random_state=0)
    topics_summary = lda.get_document_topics(corpus[0])
    topics_blog = lda.get_document_topics(corpus[1])
    return compute_topic_overlap(topics_summary, topics_blog)

# Helper function to compute topic overlap
def compute_topic_overlap(topics_summary, topics_blog):
    """Return the share of topic ids common to both topic lists.

    Args:
        topics_summary: Iterable of sequences whose first element is a topic
            id (e.g. ``(topic_id, weight)`` pairs); only that id is read.
        topics_blog: Same structure for the second document.

    Returns:
        Number of shared ids divided by the larger id count of the two
        sides, or 0.0 when neither side has any topic.
    """
    summary_topics = {topic[0] for topic in topics_summary}
    blog_topics = {topic[0] for topic in topics_blog}
    larger = max(len(summary_topics), len(blog_topics))
    if larger == 0:
        # No topics on either side: avoid dividing by zero.
        return 0.0
    return len(summary_topics & blog_topics) / larger

# Function to extract and compare Key Phrases
def calculate_key_phrase_overlap(summary_text, blog_text):
    """Return the share of summary key phrases that also appear in the blog.

    Key phrases come from ``extract_key_phrases``.

    Args:
        summary_text: Text whose phrases form the denominator.
        blog_text: Text the summary phrases are looked up in.

    Returns:
        A float in [0, 1]; 0.0 when ``summary_text`` yields no phrases.
    """
    summary_phrases = set(extract_key_phrases(summary_text))
    if not summary_phrases:
        # Nothing to match against: avoid dividing by zero.
        return 0.0
    blog_phrases = set(extract_key_phrases(blog_text))
    return len(summary_phrases & blog_phrases) / len(summary_phrases)

# Helper function to extract key phrases (simple implementation)
def extract_key_phrases(text):
    """Return the distinct tokens of ``text`` as a list (order not guaranteed).

    Simplified stand-in for key-phrase extraction: every unique token
    produced by ``word_tokenize`` counts as a "phrase".
    """
    unique_tokens = set(word_tokenize(text))
    return list(unique_tokens)

# Function for Semantic Similarity using BERT
_BERT_CACHE = {}


def _load_bert(model_name='bert-base-uncased'):
    """Return a cached ``(tokenizer, model)`` pair, loading it on first use."""
    if model_name not in _BERT_CACHE:
        _BERT_CACHE[model_name] = (
            BertTokenizer.from_pretrained(model_name),
            BertModel.from_pretrained(model_name),
        )
    return _BERT_CACHE[model_name]


def calculate_semantic_similarity(summary_text, blog_text):
    """Return the cosine similarity of mean-pooled BERT embeddings.

    Each text is encoded with ``bert-base-uncased`` (input truncated to 512
    tokens) and the last hidden state is averaged over the token dimension.
    The tokenizer and model are loaded once and reused, instead of being
    reloaded for every pair of texts.

    Args:
        summary_text: First text.
        blog_text: Second text.

    Returns:
        The cosine similarity of the two embeddings, or 0 if either
        embedding does not have a leading dimension of 1.
    """
    tokenizer, model = _load_bert()

    def embed_text(text):
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            outputs = model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).detach().numpy()

    summary_embedding = embed_text(summary_text)
    blog_embedding = embed_text(blog_text)

    print(f"Summary embedding shape: {summary_embedding.shape}")
    print(f"Blog text embedding shape: {blog_embedding.shape}")

    if summary_embedding.shape[0] == 1 and blog_embedding.shape[0] == 1:
        return cosine_similarity(summary_embedding, blog_embedding)[0, 0]
    return 0  # Return 0 similarity if embeddings are not as expected

# Function to calculate Readability Scores using textstat
def calculate_readability_scores(text):
    """Return the Flesch-Kincaid grade that ``textstat`` reports for ``text``."""
    grade_level = textstat.flesch_kincaid_grade(text)
    return grade_level

# Function to analyze Sentence Length and Structure
def analyze_sentence_structure(text):
    """Return the mean sentence length and the individual sentence lengths.

    Sentences come from ``sent_tokenize``; a sentence's length is its number
    of ``word_tokenize`` tokens.

    Args:
        text: Text to analyze.

    Returns:
        ``(avg_length, lengths)``; ``(0.0, [])`` when no sentence is found.
    """
    lengths = [len(word_tokenize(sent)) for sent in sent_tokenize(text)]
    if not lengths:
        # Empty text has no sentences: avoid dividing by zero.
        return 0.0, []
    return sum(lengths) / len(lengths), lengths

# Main function to process files and calculate metrics
def process_files_and_calculate_metrics(summary_folder, blog_folder, num_files):
    """Compute all comparison metrics for up to ``num_files`` file pairs.

    The ``*.txt`` files of both folders are sorted by name and paired by
    position, so the two folders must list their files in matching order.

    Args:
        summary_folder: Folder whose ``*.txt`` files are read as summaries.
        blog_folder: Folder whose ``*.txt`` files are read as blog posts.
        num_files: Maximum number of pairs to process.

    Returns:
        A DataFrame with one row per pair; it is also written to
        ``comparison_results.csv`` in the working directory.
    """
    summary_files = sorted(glob.glob(os.path.join(summary_folder, "*.txt")))
    blog_files = sorted(glob.glob(os.path.join(blog_folder, "*.txt")))

    # Indexing up to num_files raised IndexError when a folder held fewer
    # files; process the pairs that exist and report the shortfall instead.
    available = min(len(summary_files), len(blog_files))
    pair_count = max(0, min(num_files, available))
    if pair_count < num_files:
        print(f"Warning: requested {num_files} file pairs but only {available} are available.")

    results = []
    for summary_file, blog_file in zip(summary_files[:pair_count], blog_files[:pair_count]):
        # Decode explicitly as UTF-8 rather than the platform default.
        with open(summary_file, 'r', encoding='utf-8') as sf:
            summary_text = sf.read()
        with open(blog_file, 'r', encoding='utf-8') as bf:
            blog_text = bf.read()

        results.append({
            "summary_file": os.path.basename(summary_file),
            "blog_file": os.path.basename(blog_file),
            "exact_match_ratio": calculate_exact_match_ratio(summary_text, blog_text),
            "cosine_similarity": calculate_cosine_similarity(summary_text, blog_text),
            "topic_overlap": calculate_topic_overlap(summary_text, blog_text),
            "key_phrase_overlap": calculate_key_phrase_overlap(summary_text, blog_text),
            "semantic_similarity": calculate_semantic_similarity(summary_text, blog_text),
            "readability_summary": calculate_readability_scores(summary_text),
            "readability_blog": calculate_readability_scores(blog_text),
            # Only the average sentence length is reported.
            "sentence_length_summary": analyze_sentence_structure(summary_text)[0],
            "sentence_length_blog": analyze_sentence_structure(blog_text)[0],
        })

    df = pd.DataFrame(results)
    df.to_csv('comparison_results.csv', index=False)
    return df

# Compare the texts in 'summaries' with the posts in
# 'Written Blog Posts/4o Blog'. Both paths are relative to the working
# directory.
summary_folder = 'summaries'
blog_folder = 'Written Blog Posts/4o Blog'
num_files = 100  # Number of file pairs to process

results_df_sum_to_4o_blog = process_files_and_calculate_metrics(summary_folder, blog_folder, num_files)

# Print the resulting metrics table.
print(results_df_sum_to_4o_blog)


In [None]:
# Save this comparison under its own name; create the output folder first
# so to_csv does not fail when the directory is missing.
os.makedirs('Individual Comparative CSVs', exist_ok=True)
results_df_sum_to_4o_blog.to_csv('Individual Comparative CSVs/comparison_results_sum_to_4o_blog.csv', index=False)

In [None]:
import os
import glob
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim import corpora, models
from transformers import BertTokenizer, BertModel
import torch
import textstat
import pandas as pd

# Tokenizer models used by word_tokenize and sent_tokenize.
nltk.download('punkt')
# NLTK 3.9 and later load the tokenizer from the 'punkt_tab' resource
# instead, so fetch it too and keep the notebook working on either version.
nltk.download('punkt_tab')

# Function to calculate Exact Match Ratio
def calculate_exact_match_ratio(blog_text_35t, blog_text_4o):
    """Return the share of unique tokens of the first post found in the second.

    Both texts are tokenized with ``word_tokenize`` and reduced to sets, so
    the comparison is case-sensitive and ignores how often a token repeats.

    Args:
        blog_text_35t: Text whose unique tokens form the denominator.
        blog_text_4o: Text those tokens are looked up in.

    Returns:
        A float in [0, 1]; 0.0 when ``blog_text_35t`` yields no tokens.
    """
    blog_words_35t = set(word_tokenize(blog_text_35t))
    if not blog_words_35t:
        # An empty first text would otherwise divide by zero.
        return 0.0
    blog_words_4o = set(word_tokenize(blog_text_4o))
    return len(blog_words_35t & blog_words_4o) / len(blog_words_35t)

# Function to calculate Cosine Similarity
def calculate_cosine_similarity(blog_text_35t, blog_text_4o):
    """Return the TF-IDF cosine similarity between the two texts.

    A fresh ``TfidfVectorizer`` is fitted on just this pair of documents, so
    the term weights depend only on these two texts.
    """
    tfidf_matrix = TfidfVectorizer().fit_transform([blog_text_35t, blog_text_4o])
    dense_rows = tfidf_matrix.toarray()
    pairwise = cosine_similarity(dense_rows)
    # Row 0 is the first text, column 1 the second.
    return pairwise[0, 1]

# Function for Topic Modeling
def calculate_topic_overlap(blog_text_35t, blog_text_4o):
    """Return the fraction of LDA topics shared by the two texts.

    A 5-topic LDA model is trained on just this pair of documents; the topic
    ids reported for each document are then compared with
    ``compute_topic_overlap``.

    Args:
        blog_text_35t: First document.
        blog_text_4o: Second document.

    Returns:
        The overlap ratio computed by ``compute_topic_overlap``.
    """
    # Tokenize each document once and reuse the tokens for both the
    # dictionary and the bag-of-words corpus.
    tokenized = [word_tokenize(doc) for doc in (blog_text_35t, blog_text_4o)]
    dictionary = corpora.Dictionary(tokenized)
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized]
    # A fixed seed keeps the reported metric reproducible between runs.
    lda = models.LdaModel(corpus, num_topics=5, id2word=dictionary, random_state=0)
    topics_blog_35t = lda.get_document_topics(corpus[0])
    topics_blog_4o = lda.get_document_topics(corpus[1])
    return compute_topic_overlap(topics_blog_35t, topics_blog_4o)

# Helper function to compute topic overlap
def compute_topic_overlap(topics_blog_35t, topics_blog_4o):
    """Return the share of topic ids common to both topic lists.

    Args:
        topics_blog_35t: Iterable of sequences whose first element is a
            topic id (e.g. ``(topic_id, weight)`` pairs); only that id is
            read.
        topics_blog_4o: Same structure for the second document.

    Returns:
        Number of shared ids divided by the larger id count of the two
        sides, or 0.0 when neither side has any topic.
    """
    blog_topics_35t = {topic[0] for topic in topics_blog_35t}
    blog_topics_4o = {topic[0] for topic in topics_blog_4o}
    larger = max(len(blog_topics_35t), len(blog_topics_4o))
    if larger == 0:
        # No topics on either side: avoid dividing by zero.
        return 0.0
    return len(blog_topics_35t & blog_topics_4o) / larger

# Function to extract and compare Key Phrases
def calculate_key_phrase_overlap(blog_text_35t, blog_text_4o):
    """Return the share of the first post's key phrases found in the second.

    Key phrases come from ``extract_key_phrases``.

    Args:
        blog_text_35t: Text whose phrases form the denominator.
        blog_text_4o: Text those phrases are looked up in.

    Returns:
        A float in [0, 1]; 0.0 when ``blog_text_35t`` yields no phrases.
    """
    blog_phrases_35t = set(extract_key_phrases(blog_text_35t))
    if not blog_phrases_35t:
        # Nothing to match against: avoid dividing by zero.
        return 0.0
    blog_phrases_4o = set(extract_key_phrases(blog_text_4o))
    return len(blog_phrases_35t & blog_phrases_4o) / len(blog_phrases_35t)

# Helper function to extract key phrases (simple implementation)
def extract_key_phrases(text):
    """Return the distinct tokens of ``text`` as a list (order not guaranteed).

    Simplified stand-in for key-phrase extraction: every unique token
    produced by ``word_tokenize`` counts as a "phrase".
    """
    unique_tokens = set(word_tokenize(text))
    return list(unique_tokens)

# Function for Semantic Similarity using BERT
_BERT_CACHE = {}


def _load_bert(model_name='bert-base-uncased'):
    """Return a cached ``(tokenizer, model)`` pair, loading it on first use."""
    if model_name not in _BERT_CACHE:
        _BERT_CACHE[model_name] = (
            BertTokenizer.from_pretrained(model_name),
            BertModel.from_pretrained(model_name),
        )
    return _BERT_CACHE[model_name]


def calculate_semantic_similarity(blog_text_35t, blog_text_4o):
    """Return the cosine similarity of mean-pooled BERT embeddings.

    Each text is encoded with ``bert-base-uncased`` (input truncated to 512
    tokens) and the last hidden state is averaged over the token dimension.
    The tokenizer and model are loaded once and reused, instead of being
    reloaded for every pair of texts.

    Args:
        blog_text_35t: First text.
        blog_text_4o: Second text.

    Returns:
        The cosine similarity of the two embeddings, or 0 if either
        embedding does not have a leading dimension of 1.
    """
    tokenizer, model = _load_bert()

    def embed_text(text):
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            outputs = model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).detach().numpy()

    blog_embedding_35t = embed_text(blog_text_35t)
    blog_embedding_4o = embed_text(blog_text_4o)

    print(f"3.5t Blog Post embedding shape: {blog_embedding_35t.shape}")
    print(f"4o Blog Post embedding shape: {blog_embedding_4o.shape}")

    if blog_embedding_35t.shape[0] == 1 and blog_embedding_4o.shape[0] == 1:
        return cosine_similarity(blog_embedding_35t, blog_embedding_4o)[0, 0]
    return 0  # Return 0 similarity if embeddings are not as expected

# Function to calculate Readability Scores using textstat
def calculate_readability_scores(text):
    """Return the Flesch-Kincaid grade that ``textstat`` reports for ``text``."""
    grade_level = textstat.flesch_kincaid_grade(text)
    return grade_level

# Function to analyze Sentence Length and Structure
def analyze_sentence_structure(text):
    """Return the mean sentence length and the individual sentence lengths.

    Sentences come from ``sent_tokenize``; a sentence's length is its number
    of ``word_tokenize`` tokens.

    Args:
        text: Text to analyze.

    Returns:
        ``(avg_length, lengths)``; ``(0.0, [])`` when no sentence is found.
    """
    lengths = [len(word_tokenize(sent)) for sent in sent_tokenize(text)]
    if not lengths:
        # Empty text has no sentences: avoid dividing by zero.
        return 0.0, []
    return sum(lengths) / len(lengths), lengths

# Main function to process files and calculate metrics
def process_files_and_calculate_metrics(blog_folder_35t, blog_folder_4o, num_files):
    """Compute all comparison metrics for up to ``num_files`` file pairs.

    The ``*.txt`` files of both folders are sorted by name and paired by
    position, so the two folders must list their files in matching order.

    Args:
        blog_folder_35t: Folder whose ``*.txt`` files form the first side.
        blog_folder_4o: Folder whose ``*.txt`` files form the second side.
        num_files: Maximum number of pairs to process.

    Returns:
        A DataFrame with one row per pair; it is also written to
        ``comparison_results.csv`` in the working directory.
    """
    blog_files_35t = sorted(glob.glob(os.path.join(blog_folder_35t, "*.txt")))
    blog_files_4o = sorted(glob.glob(os.path.join(blog_folder_4o, "*.txt")))

    # Indexing up to num_files raised IndexError when a folder held fewer
    # files; process the pairs that exist and report the shortfall instead.
    available = min(len(blog_files_35t), len(blog_files_4o))
    pair_count = max(0, min(num_files, available))
    if pair_count < num_files:
        print(f"Warning: requested {num_files} file pairs but only {available} are available.")

    results = []
    for blog_file_35t, blog_file_4o in zip(blog_files_35t[:pair_count], blog_files_4o[:pair_count]):
        # Decode explicitly as UTF-8 rather than the platform default.
        with open(blog_file_35t, 'r', encoding='utf-8') as bf35t:
            blog_text_35t = bf35t.read()
        with open(blog_file_4o, 'r', encoding='utf-8') as bf4o:
            blog_text_4o = bf4o.read()

        results.append({
            "blog_file_35t": os.path.basename(blog_file_35t),
            "blog_file_4o": os.path.basename(blog_file_4o),
            "exact_match_ratio": calculate_exact_match_ratio(blog_text_35t, blog_text_4o),
            "cosine_similarity": calculate_cosine_similarity(blog_text_35t, blog_text_4o),
            "topic_overlap": calculate_topic_overlap(blog_text_35t, blog_text_4o),
            "key_phrase_overlap": calculate_key_phrase_overlap(blog_text_35t, blog_text_4o),
            "semantic_similarity": calculate_semantic_similarity(blog_text_35t, blog_text_4o),
            "readability_35t": calculate_readability_scores(blog_text_35t),
            "readability_4o": calculate_readability_scores(blog_text_4o),
            # Only the average sentence length is reported.
            "sentence_length_35t": analyze_sentence_structure(blog_text_35t)[0],
            "sentence_length_4o": analyze_sentence_structure(blog_text_4o)[0],
        })

    df = pd.DataFrame(results)
    df.to_csv('comparison_results.csv', index=False)
    return df

# Compare the posts in 'Written Blog Posts/3.5T Blog' with those in
# 'Written Blog Posts/4o Blog'. Both paths are relative to the working
# directory.
blog_folder_35t = 'Written Blog Posts/3.5T Blog'
blog_folder_4o = 'Written Blog Posts/4o Blog'
num_files = 100  # Number of file pairs to process

results_df_35blog_sum_to_4oblog_sum = process_files_and_calculate_metrics(blog_folder_35t, blog_folder_4o, num_files)

# Print the resulting metrics table.
print(results_df_35blog_sum_to_4oblog_sum)


In [None]:
# Save this comparison under its own name; create the output folder first
# so to_csv does not fail when the directory is missing.
os.makedirs('Individual Comparative CSVs', exist_ok=True)
results_df_35blog_sum_to_4oblog_sum.to_csv('Individual Comparative CSVs/comparison_results_35blog_sum_to_4oblog_sum.csv', index=False)

In [None]:
import os
import glob
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim import corpora, models
from transformers import BertTokenizer, BertModel
import torch
import textstat
import pandas as pd

# Tokenizer models used by word_tokenize and sent_tokenize.
nltk.download('punkt')
# NLTK 3.9 and later load the tokenizer from the 'punkt_tab' resource
# instead, so fetch it too and keep the notebook working on either version.
nltk.download('punkt_tab')

# Function to calculate Exact Match Ratio
def calculate_exact_match_ratio(summary_text, raw_text):
    """Return the share of unique summary tokens that also occur in the raw text.

    Both texts are tokenized with ``word_tokenize`` and reduced to sets, so
    the comparison is case-sensitive and ignores how often a token repeats.

    Args:
        summary_text: Text whose unique tokens form the denominator.
        raw_text: Text the summary tokens are looked up in.

    Returns:
        A float in [0, 1]; 0.0 when ``summary_text`` yields no tokens.
    """
    summary_words = set(word_tokenize(summary_text))
    if not summary_words:
        # An empty summary would otherwise divide by zero.
        return 0.0
    raw_words = set(word_tokenize(raw_text))
    return len(summary_words & raw_words) / len(summary_words)

# Function to calculate Cosine Similarity
def calculate_cosine_similarity(summary_text, raw_text):
    """Return the TF-IDF cosine similarity between the two texts.

    A fresh ``TfidfVectorizer`` is fitted on just this pair of documents, so
    the term weights depend only on these two texts.
    """
    tfidf_matrix = TfidfVectorizer().fit_transform([summary_text, raw_text])
    dense_rows = tfidf_matrix.toarray()
    pairwise = cosine_similarity(dense_rows)
    # Row 0 is the first text, column 1 the second.
    return pairwise[0, 1]

# Function for Topic Modeling
def calculate_topic_overlap(summary_text, raw_text):
    """Return the fraction of LDA topics shared by the two texts.

    A 5-topic LDA model is trained on just this pair of documents; the topic
    ids reported for each document are then compared with
    ``compute_topic_overlap``.

    Args:
        summary_text: First document.
        raw_text: Second document.

    Returns:
        The overlap ratio computed by ``compute_topic_overlap``.
    """
    # Tokenize each document once and reuse the tokens for both the
    # dictionary and the bag-of-words corpus.
    tokenized = [word_tokenize(doc) for doc in (summary_text, raw_text)]
    dictionary = corpora.Dictionary(tokenized)
    corpus = [dictionary.doc2bow(tokens) for tokens in tokenized]
    # A fixed seed keeps the reported metric reproducible between runs.
    lda = models.LdaModel(corpus, num_topics=5, id2word=dictionary, random_state=0)
    topics_summary = lda.get_document_topics(corpus[0])
    topics_raw = lda.get_document_topics(corpus[1])
    return compute_topic_overlap(topics_summary, topics_raw)

# Helper function to compute topic overlap
def compute_topic_overlap(topics_summary, topics_raw):
    """Return the fraction of topic ids shared by two topic lists.

    Args:
        topics_summary: Iterable of sequences whose first item is a topic id
            (e.g. ``(topic_id, probability)`` pairs).
        topics_raw: Same structure, for the second document.

    Returns:
        ``len(shared ids) / max(len(ids_a), len(ids_b))`` as a float, or
        ``0.0`` when neither list contains any topic.
    """
    summary_topics = {topic[0] for topic in topics_summary}
    raw_topics = {topic[0] for topic in topics_raw}
    largest = max(len(summary_topics), len(raw_topics))
    if largest == 0:
        # Neither document was assigned a topic: define the overlap as zero
        # instead of raising ZeroDivisionError.
        return 0.0
    return len(summary_topics & raw_topics) / largest

# Function to extract and compare Key Phrases
def calculate_key_phrase_overlap(summary_text, raw_text):
    """Return the share of the summary's key phrases that also occur in the raw text.

    Key phrases come from ``extract_key_phrases`` for both texts.

    Returns:
        A float in ``[0, 1]``; ``0.0`` when the summary yields no key phrases.
    """
    summary_phrases = set(extract_key_phrases(summary_text))
    if not summary_phrases:
        # Nothing to compare; avoid dividing by zero on an empty summary.
        return 0.0
    raw_phrases = set(extract_key_phrases(raw_text))
    return len(summary_phrases & raw_phrases) / len(summary_phrases)

# Helper function to extract key phrases (simple implementation)
def extract_key_phrases(text):
    """Return the distinct tokens of *text* as a list (order is not defined)."""
    # Simplified: every unique token counts as a "key phrase".
    unique_tokens = set(word_tokenize(text))
    return list(unique_tokens)

# Loaded BERT artefacts keyed by checkpoint name, so that repeated calls
# (one per file pair) reuse them instead of reloading the weights.
_BERT_CACHE = {}


def _load_bert(model_name='bert-base-uncased'):
    """Return a cached ``(tokenizer, model)`` pair for *model_name*."""
    if model_name not in _BERT_CACHE:
        tokenizer = BertTokenizer.from_pretrained(model_name)
        model = BertModel.from_pretrained(model_name)
        model.eval()  # inference only
        _BERT_CACHE[model_name] = (tokenizer, model)
    return _BERT_CACHE[model_name]


# Function for Semantic Similarity using BERT
def calculate_semantic_similarity(summary_text, raw_text):
    """Return the cosine similarity of mean-pooled BERT embeddings of two texts.

    Each text is tokenized (truncated to 512 tokens), passed through
    ``bert-base-uncased``, and its last hidden state is averaged over the
    token dimension. The tokenizer and model are loaded once and reused on
    later calls.

    Args:
        summary_text: First text to embed.
        raw_text: Second text to embed.

    Returns:
        The cosine similarity of the two embeddings, or ``0`` if either
        embedding does not have exactly one row.
    """
    tokenizer, model = _load_bert()

    def embed_text(text):
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
        # No gradients are needed for inference; skipping autograd saves memory.
        with torch.no_grad():
            outputs = model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).numpy()

    summary_embedding = embed_text(summary_text)
    raw_embedding = embed_text(raw_text)

    print(f"Summary embedding shape: {summary_embedding.shape}")
    print(f"Raw text embedding shape: {raw_embedding.shape}")

    if summary_embedding.shape[0] == 1 and raw_embedding.shape[0] == 1:
        return cosine_similarity(summary_embedding, raw_embedding)[0, 0]
    return 0  # Return 0 similarity if embeddings are not as expected

# Function to calculate Readability Scores using textstat
def calculate_readability_scores(text):
    """Return the Flesch-Kincaid grade that textstat computes for *text*."""
    grade_level = textstat.flesch_kincaid_grade(text)
    return grade_level

# Function to analyze Sentence Length and Structure
def analyze_sentence_structure(text):
    """Measure sentence lengths (in tokens) of *text*.

    The text is split with ``sent_tokenize`` and each sentence is tokenized
    with ``word_tokenize``.

    Returns:
        A tuple ``(avg_length, lengths)`` where ``lengths`` holds the token
        count of every sentence and ``avg_length`` is their mean. For text
        with no sentences the result is ``(0.0, [])``.
    """
    lengths = [len(word_tokenize(sent)) for sent in sent_tokenize(text)]
    if not lengths:
        # Empty/whitespace-only text: no sentences, so no meaningful average.
        return 0.0, []
    return sum(lengths) / len(lengths), lengths

# Main function to process files and calculate metrics
def process_files_and_calculate_metrics(summary_folder, raw_folder, num_files):
    """Compare summary files with raw-text files and tabulate the metrics.

    The ``*.txt`` files of both folders are sorted by path and paired by
    position (first with first, and so on), so both folders are expected to
    use matching file naming.

    Args:
        summary_folder: Directory containing the summary ``.txt`` files.
        raw_folder: Directory containing the raw-text ``.txt`` files.
        num_files: Maximum number of file pairs to process. If fewer pairs
            exist, only the available ones are processed and a notice is
            printed.

    Returns:
        A ``pandas.DataFrame`` with one row per file pair. The same table is
        also written to ``comparison_results.csv`` in the working directory.
    """
    summary_files = sorted(glob.glob(os.path.join(summary_folder, "*.txt")))
    raw_files = sorted(glob.glob(os.path.join(raw_folder, "*.txt")))

    # Never index past the shorter listing; a negative request means "none".
    limit = max(num_files, 0)
    available = min(len(summary_files), len(raw_files))
    if available < limit:
        print(f"Only {available} file pair(s) available; requested {num_files}.")

    results = []
    for summary_file, raw_file in zip(summary_files[:limit], raw_files[:limit]):
        # Read with an explicit encoding so results do not depend on the
        # platform's default locale.
        with open(summary_file, 'r', encoding='utf-8') as sf:
            summary_text = sf.read()
        with open(raw_file, 'r', encoding='utf-8') as rf:
            raw_text = rf.read()

        sentence_length_summary, _ = analyze_sentence_structure(summary_text)
        sentence_length_raw, _ = analyze_sentence_structure(raw_text)

        results.append({
            "summary_file": os.path.basename(summary_file),
            "raw_file": os.path.basename(raw_file),
            "exact_match_ratio": calculate_exact_match_ratio(summary_text, raw_text),
            "cosine_similarity": calculate_cosine_similarity(summary_text, raw_text),
            "topic_overlap": calculate_topic_overlap(summary_text, raw_text),
            "key_phrase_overlap": calculate_key_phrase_overlap(summary_text, raw_text),
            "semantic_similarity": calculate_semantic_similarity(summary_text, raw_text),
            "readability_summary": calculate_readability_scores(summary_text),
            "readability_raw": calculate_readability_scores(raw_text),
            "sentence_length_summary": sentence_length_summary,
            "sentence_length_raw": sentence_length_raw,
        })

    df = pd.DataFrame(results)
    df.to_csv('comparison_results.csv', index=False)
    return df

# Run the summary-vs-raw-text comparison using the functions defined above in this cell.
summary_folder = 'summaries'
raw_folder = 'raw_texts'
num_files = 100  # Number of file pairs to process

results_df_sum_to_raw = process_files_and_calculate_metrics(summary_folder, raw_folder, num_files)

# Print the resulting DataFrame for a quick visual check
print(results_df_sum_to_raw)


In [None]:
# Persist the summary-vs-raw comparison as CSV, without the index column.
# NOTE(review): assumes the 'Individual Comparative CSVs' directory already exists — confirm.
results_df_sum_to_raw.to_csv('Individual Comparative CSVs/comparison_results_sum_to_raw.csv', index=False)

In [None]:
import os
import glob
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim import corpora, models
from transformers import BertTokenizer, BertModel
import torch
import textstat
import pandas as pd

# Fetch NLTK's 'punkt' resource (presumably the tokenizer data that
# word_tokenize / sent_tokenize rely on — confirm for the installed NLTK version).
nltk.download('punkt')

# Function to calculate Exact Match Ratio
def calculate_exact_match_ratio(raw_text, blog_text):
    """Return the share of distinct raw-text tokens that also occur in the blog text.

    Both texts are tokenized with ``word_tokenize`` and reduced to sets, so
    token frequency and order are ignored and matching is case-sensitive.

    Args:
        raw_text: Text whose vocabulary is being checked.
        blog_text: Text the raw vocabulary is looked up in.

    Returns:
        A float in ``[0, 1]``; ``0.0`` when the raw text contains no tokens.
    """
    raw_words = set(word_tokenize(raw_text))
    if not raw_words:
        # An empty raw text has nothing to match; avoid dividing by zero.
        return 0.0
    blog_words = set(word_tokenize(blog_text))
    return len(raw_words & blog_words) / len(raw_words)

# Function to calculate Cosine Similarity
def calculate_cosine_similarity(raw_text, blog_text):
    """Return the TF-IDF cosine similarity between the two texts.

    A fresh ``TfidfVectorizer`` is fitted on just this pair of documents and
    the (raw, blog) entry of the pairwise similarity matrix is returned.
    """
    tfidf_matrix = TfidfVectorizer().fit_transform([raw_text, blog_text])
    dense_rows = tfidf_matrix.toarray()
    pairwise = cosine_similarity(dense_rows)
    # Row 0 is the raw text, column 1 is the blog text.
    return pairwise[0, 1]

# Function for Topic Modeling
def calculate_topic_overlap(raw_text, blog_text):
    """Fit a 5-topic LDA model on the two texts and score their topic overlap.

    The model is trained on only these two documents; each document's topic
    distribution is then passed to ``compute_topic_overlap``. No
    ``random_state`` is given to ``LdaModel``, so scores may differ between
    runs.
    """
    tokenized_docs = [word_tokenize(doc) for doc in (raw_text, blog_text)]
    vocabulary = corpora.Dictionary(tokenized_docs)
    bags_of_words = [vocabulary.doc2bow(tokens) for tokens in tokenized_docs]
    lda_model = models.LdaModel(bags_of_words, num_topics=5, id2word=vocabulary)
    raw_distribution = lda_model.get_document_topics(bags_of_words[0])
    blog_distribution = lda_model.get_document_topics(bags_of_words[1])
    return compute_topic_overlap(raw_distribution, blog_distribution)

# Helper function to compute topic overlap
def compute_topic_overlap(topics_raw, topics_blog):
    """Return the fraction of topic ids shared by two topic lists.

    Args:
        topics_raw: Iterable of sequences whose first item is a topic id
            (e.g. ``(topic_id, probability)`` pairs).
        topics_blog: Same structure, for the second document.

    Returns:
        ``len(shared ids) / max(len(ids_a), len(ids_b))`` as a float, or
        ``0.0`` when neither list contains any topic.
    """
    raw_topics = {topic[0] for topic in topics_raw}
    blog_topics = {topic[0] for topic in topics_blog}
    largest = max(len(raw_topics), len(blog_topics))
    if largest == 0:
        # Neither document was assigned a topic: define the overlap as zero
        # instead of raising ZeroDivisionError.
        return 0.0
    return len(raw_topics & blog_topics) / largest

# Function to extract and compare Key Phrases
def calculate_key_phrase_overlap(raw_text, blog_text):
    """Return the share of the raw text's key phrases that also occur in the blog text.

    Key phrases come from ``extract_key_phrases`` for both texts.

    Returns:
        A float in ``[0, 1]``; ``0.0`` when the raw text yields no key phrases.
    """
    raw_phrases = set(extract_key_phrases(raw_text))
    if not raw_phrases:
        # Nothing to compare; avoid dividing by zero on an empty raw text.
        return 0.0
    blog_phrases = set(extract_key_phrases(blog_text))
    return len(raw_phrases & blog_phrases) / len(raw_phrases)

# Helper function to extract key phrases (simple implementation)
def extract_key_phrases(text):
    """Return the distinct tokens of *text* as a list (order is not defined)."""
    # Simplified: every unique token counts as a "key phrase".
    unique_tokens = set(word_tokenize(text))
    return list(unique_tokens)

# Loaded BERT artefacts keyed by checkpoint name, so that repeated calls
# (one per file pair) reuse them instead of reloading the weights.
_BERT_CACHE = {}


def _load_bert(model_name='bert-base-uncased'):
    """Return a cached ``(tokenizer, model)`` pair for *model_name*."""
    if model_name not in _BERT_CACHE:
        tokenizer = BertTokenizer.from_pretrained(model_name)
        model = BertModel.from_pretrained(model_name)
        model.eval()  # inference only
        _BERT_CACHE[model_name] = (tokenizer, model)
    return _BERT_CACHE[model_name]


# Function for Semantic Similarity using BERT
def calculate_semantic_similarity(raw_text, blog_text):
    """Return the cosine similarity of mean-pooled BERT embeddings of two texts.

    Each text is tokenized (truncated to 512 tokens), passed through
    ``bert-base-uncased``, and its last hidden state is averaged over the
    token dimension. The tokenizer and model are loaded once and reused on
    later calls.

    Args:
        raw_text: First text to embed.
        blog_text: Second text to embed.

    Returns:
        The cosine similarity of the two embeddings, or ``0`` if either
        embedding does not have exactly one row.
    """
    tokenizer, model = _load_bert()

    def embed_text(text):
        inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
        # No gradients are needed for inference; skipping autograd saves memory.
        with torch.no_grad():
            outputs = model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).numpy()

    raw_embedding = embed_text(raw_text)
    blog_embedding = embed_text(blog_text)

    print(f"Raw text embedding shape: {raw_embedding.shape}")
    print(f"Blog embedding shape: {blog_embedding.shape}")

    if raw_embedding.shape[0] == 1 and blog_embedding.shape[0] == 1:
        return cosine_similarity(raw_embedding, blog_embedding)[0, 0]
    return 0  # Return 0 similarity if embeddings are not as expected

# Function to calculate Readability Scores using textstat
def calculate_readability_scores(text):
    """Return the Flesch-Kincaid grade that textstat computes for *text*."""
    grade_level = textstat.flesch_kincaid_grade(text)
    return grade_level

# Function to analyze Sentence Length and Structure
def analyze_sentence_structure(text):
    """Measure sentence lengths (in tokens) of *text*.

    The text is split with ``sent_tokenize`` and each sentence is tokenized
    with ``word_tokenize``.

    Returns:
        A tuple ``(avg_length, lengths)`` where ``lengths`` holds the token
        count of every sentence and ``avg_length`` is their mean. For text
        with no sentences the result is ``(0.0, [])``.
    """
    lengths = [len(word_tokenize(sent)) for sent in sent_tokenize(text)]
    if not lengths:
        # Empty/whitespace-only text: no sentences, so no meaningful average.
        return 0.0, []
    return sum(lengths) / len(lengths), lengths

# Main function to process files and calculate metrics
def process_files_and_calculate_metrics(raw_folder, blog_folder, num_files):
    """Compare raw-text files with blog files and tabulate the metrics.

    The ``*.txt`` files of both folders are sorted by path and paired by
    position (first with first, and so on), so both folders are expected to
    use matching file naming.

    Args:
        raw_folder: Directory containing the raw-text ``.txt`` files.
        blog_folder: Directory containing the blog ``.txt`` files.
        num_files: Maximum number of file pairs to process. If fewer pairs
            exist, only the available ones are processed and a notice is
            printed.

    Returns:
        A ``pandas.DataFrame`` with one row per file pair. The same table is
        also written to ``comparison_results.csv`` in the working directory.
    """
    raw_files = sorted(glob.glob(os.path.join(raw_folder, "*.txt")))
    blog_files = sorted(glob.glob(os.path.join(blog_folder, "*.txt")))

    # Never index past the shorter listing; a negative request means "none".
    limit = max(num_files, 0)
    available = min(len(raw_files), len(blog_files))
    if available < limit:
        print(f"Only {available} file pair(s) available; requested {num_files}.")

    results = []
    for raw_file, blog_file in zip(raw_files[:limit], blog_files[:limit]):
        # Read with an explicit encoding so results do not depend on the
        # platform's default locale.
        with open(raw_file, 'r', encoding='utf-8') as rf:
            raw_text = rf.read()
        with open(blog_file, 'r', encoding='utf-8') as bf:
            blog_text = bf.read()

        sentence_length_raw, _ = analyze_sentence_structure(raw_text)
        sentence_length_blog, _ = analyze_sentence_structure(blog_text)

        results.append({
            "raw_file": os.path.basename(raw_file),
            "blog_file": os.path.basename(blog_file),
            "exact_match_ratio": calculate_exact_match_ratio(raw_text, blog_text),
            "cosine_similarity": calculate_cosine_similarity(raw_text, blog_text),
            "topic_overlap": calculate_topic_overlap(raw_text, blog_text),
            "key_phrase_overlap": calculate_key_phrase_overlap(raw_text, blog_text),
            "semantic_similarity": calculate_semantic_similarity(raw_text, blog_text),
            "readability_raw": calculate_readability_scores(raw_text),
            "readability_blog": calculate_readability_scores(blog_text),
            "sentence_length_raw": sentence_length_raw,
            "sentence_length_blog": sentence_length_blog,
        })

    df = pd.DataFrame(results)
    df.to_csv('comparison_results.csv', index=False)
    return df

# Run the raw-text vs. blog comparison using the functions defined above in this cell.
# The folder name suggests these are the GPT-4o blog posts — confirm against the data layout.
raw_folder = 'raw_texts'
blog_folder = 'Written Blog Posts/4o Blog'
num_files = 100  # Number of file pairs to process

results_df_raw_to_4o_sum_blog = process_files_and_calculate_metrics(raw_folder, blog_folder, num_files)

# Print the resulting DataFrame for a quick visual check
print(results_df_raw_to_4o_sum_blog)


In [None]:
# Persist the raw-text vs. 4o-blog comparison as CSV, without the index column.
# This must export results_df_raw_to_4o_sum_blog (the frame computed in the
# preceding cell), not the summary-vs-raw frame from the earlier comparison.
results_df_raw_to_4o_sum_blog.to_csv('Individual Comparative CSVs/comparison_results_raw_to_4o_sum_blog.csv', index=False)