In [None]:
from transformers import pipeline, RobertaTokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
import numpy as np
from collections import Counter
from math import sqrt
from google.colab import files
import shutil
from datasets import Dataset

In [None]:
# Authenticate this session with the Hugging Face Hub. login() is called with
# no arguments, so no access token is hard-coded in the notebook.
from huggingface_hub import login

login()

# **Load the DataFrames**

In [None]:
# Read the Java test set without header inference, label its two columns,
# and discard the first row (presumably the file's own header line - confirm).
test_df = (
    pd.read_csv("java_test_code_and_docs.csv", header=None)
    .set_axis(["Code", "Summary"], axis=1)
    .iloc[1:]
)
test_df

In [None]:
# Read the Java training set without header inference and label its two columns.
# NOTE(review): unlike the test set, the first row is kept here - confirm that
# this file really has no header line.
train_df = (
    pd.read_csv("Java_train_data.csv", header=None)
    .set_axis(["Code", "Summary"], axis=1)
)
train_df

# **Preprocessing**

In [None]:
import re

def is_good_summary_text(summary):
    """Decide whether a reference summary is clean enough to keep.

    A summary is rejected when it:
      * is not a string (e.g. ``None`` or the float NaN that pandas uses for
        empty CSV cells),
      * contains "@" or "<", or
      * has fewer than 15 whitespace-separated words.

    Args:
        summary: The candidate summary text.

    Returns:
        bool: True if the summary passes every filter, False otherwise.
    """
    # Missing values arrive as None/NaN; the substring tests below would raise
    # TypeError on them, so treat them as bad summaries instead.
    if not isinstance(summary, str):
        return False
    if "@" in summary or "<" in summary:
        return False
    return len(summary.split()) >= 15

In [None]:
# Flag every training row whose summary passes the quality filter. The "check"
# column stays on train_df, exactly as before, so it can still be inspected.
train_df["check"] = train_df["Summary"].apply(is_good_summary_text)

# Keep positions 320-479 of the rows that passed the filter, then remove the
# helper column. drop() returns a new frame, so nothing is mutated in place on
# a slice of train_df (the original inplace drop did exactly that).
# NOTE(review): 320:480 presumably selects the next batch for incremental
# fine-tuning (java_db_320 -> java_db_480) - confirm.
filtered_train = (
    train_df.loc[train_df["check"].astype(bool)]
    .iloc[320:480]
    .drop(columns=["check"])
)

In [None]:
# Keep only the test rows whose summary passes the quality filter. The boolean
# mask is used directly, so no temporary "check" column has to be added to and
# then dropped in place from a filtered frame; .copy() makes the result an
# independent frame that later cells can modify safely.
test_df = test_df.loc[
    test_df["Summary"].apply(is_good_summary_text).astype(bool)
].copy()

In [None]:
# Display the filtered test frame for a visual check.
test_df

# **Train**

In [None]:
# Initialize the tokenizer
# NOTE(review): "java_db_320" is presumably the checkpoint written by an
# earlier fine-tuning round - confirm it exists before running this cell.
tokenizer = RobertaTokenizer.from_pretrained("java_db_320")

# Convert your DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(filtered_train)

def tokenize_function(examples):
    """Turn a batch of (Code, Summary) rows into model inputs and labels.

    Each code snippet is wrapped in the summarization prompt and tokenized to
    a fixed length of 512; each summary is tokenized to a fixed length of 128
    and becomes the label sequence. Missing codes or summaries are replaced
    by the empty string.

    Args:
        examples: A batch from ``Dataset.map(batched=True)`` with "Code" and
            "Summary" columns.

    Returns:
        The input encodings with an added "labels" entry in which padding
        positions are set to -100.
    """
    codes = [code if code is not None else "" for code in examples["Code"]]
    summaries = [summ if summ is not None else "" for summ in examples["Summary"]]

    summarize_prompts = [
        f"""
        You are a summarization assistant specialized in analyzing Java code. Your task is to generate a concise and accurate summary of the provided Java code.

        Focus on the following:
        1. The main purpose of the class and its role in the application.
        2. Key behaviors and methods implemented in the class.
        3. How the class interacts with other components of the system or game.

        Do not:
        - Repeat inheritance details unnecessarily.
        - Include redundant or inaccurate references.
        - Use overly technical jargon; keep the summary accessible.

        Here is the Java code:
        {code}
        """ for code in codes
    ]

    input_encodings = tokenizer(
        summarize_prompts,
        max_length=512,  # Reduced max length for efficiency
        truncation=True,
        padding="max_length"
    )
    target_encodings = tokenizer(
        summaries,
        max_length=128,  # Reduced max length for summaries
        truncation=True,
        padding="max_length"
    )

    # Summaries are padded to 128 tokens. Mark the padding positions with
    # -100, the index the loss ignores, so the model is not trained (or
    # evaluated) on predicting pad tokens. eval_loss values are therefore not
    # comparable with runs that used the unmasked labels.
    pad_token_id = tokenizer.pad_token_id
    input_encodings["labels"] = [
        [-100 if token_id == pad_token_id else token_id for token_id in label_ids]
        for label_ids in target_encodings["input_ids"]
    ]
    return input_encodings

# Tokenize the dataset; the raw text columns are dropped once encoded.
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["Code", "Summary"])

# Split into train and test sets. The seed is fixed so the same 80/20 partition
# is produced on every run; without it the evaluation subset, and therefore
# eval_loss, changes from run to run.
train_test_split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

In [None]:
# Load the model to fine-tune from the "java_db_320" checkpoint
model = T5ForConditionalGeneration.from_pretrained("java_db_320")

# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy`; confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",           # Output directory
    evaluation_strategy="epoch",     # Evaluate after each epoch
    learning_rate=5e-5,              # Learning rate
    per_device_train_batch_size=4,   # Batch size for training
    per_device_eval_batch_size=4,    # Batch size for evaluation
    num_train_epochs=10,              # Number of epochs
    save_strategy="epoch",           # Save model after every epoch
    logging_dir="./logs",            # Directory for logs
    logging_steps=10,                # Log every 10 steps
    save_total_limit=2,              # Keep at most two checkpoints on disk
    # Mixed precision only when a CUDA GPU is present; a hard-coded True makes
    # this cell fail on CPU-only runtimes.
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=True,  # Load best model at the end
    metric_for_best_model="eval_loss",  # Use eval_loss to track best model
    greater_is_better=False,  # Lower loss is better
    seed=42,  # For reproducibility
)

In [None]:
# Initialize the Trainer
# The held-out split (test_dataset) serves as the evaluation set that drives
# the per-epoch eval_loss configured in training_args.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer
)
# Train the model
trainer.train()

In [None]:
# Save the model and tokenizer
# Both are written to the same "java_db_480" directory so the pair can later be
# reloaded together with from_pretrained("java_db_480").
model.save_pretrained("java_db_480")
tokenizer.save_pretrained("java_db_480")

In [None]:
# Zip the directory
# Creates "java_db_480.zip" from the contents of the "java_db_480" directory.
shutil.make_archive("java_db_480", "zip", "java_db_480")

# **Trained Model**

In [None]:
# Initialize the tokenizer and model for Java code summarization
# NOTE(review): this loads "java_db_320", not the "java_db_480" checkpoint the
# training section saves - confirm which checkpoint is meant for inference.
tokenizer_java = RobertaTokenizer.from_pretrained("java_db_320")
model_java = T5ForConditionalGeneration.from_pretrained("java_db_320")

def summarize_t5_java(code):
    """Generate a natural-language summary for one Java code snippet.

    The snippet is appended to the summarization prompt, tokenized with
    ``tokenizer_java`` and decoded from ``model_java`` with beam search.

    Args:
        code (str): Java source code to summarize.

    Returns:
        str: The decoded summary with special tokens removed.
    """
    # NOTE(review): this prompt is indented differently from the one built in
    # tokenize_function and appends the code without the trailing newline, so
    # the inference text is not byte-identical to the training text - confirm
    # whether that is intended.
    summarize_prompt = """
    You are a summarization assistant specialized in analyzing Java code. Your task is to generate a concise and accurate summary of the provided Java code.

    Focus on the following:
    1. The main purpose of the class and its role in the application.
    2. Key behaviors and methods implemented in the class.
    3. How the class interacts with other components of the system or game.

    Do not:
    - Repeat inheritance details unnecessarily.
    - Include redundant or inaccurate references.
    - Use overly technical jargon; keep the summary accessible.

    Here is the Java code:
    """ + code

    # Truncate to the same 512-token budget used for the training inputs rather
    # than relying on whatever limit the tokenizer config carries, and place
    # the ids on the model's device so the call also works after
    # model_java.to("cuda").
    input_ids = tokenizer_java.encode(
        summarize_prompt,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    ).to(model_java.device)

    # Beam-search decoding. The former temperature/top_p arguments were removed:
    # do_sample is never enabled in this call, and those settings only apply
    # when sampling.
    summary_ids = model_java.generate(
        input_ids,
        min_length=50,       # Ensures a meaningful minimum content length
        max_length=1024,     # High value to allow long summaries if needed
        num_beams=5,         # Improves summarization quality
        length_penalty=1.5,  # Values above 1 favor longer beam hypotheses
        repetition_penalty=1.2, # Reduces repetitive phrases
        early_stopping=True  # Stop once num_beams finished candidates exist
    )
    summary = tokenizer_java.decode(summary_ids[0], skip_special_tokens=True)

    return summary


# **Creating the test DataFrame**

In [None]:
# Load the previously filtered Java test set and discard its stored
# "Summary_t5" column.
df_test = pd.read_csv("filtered_test_java.csv").drop(columns=["Summary_t5"])
df_test

# **Cleaning repetitive text in summaries**

In [None]:
from sentence_transformers import SentenceTransformer, util
import torch
import re

# Load SBERT model
# Bound to its own name so it does not overwrite `model`, which the training
# cells use for the T5 network that is fine-tuned and saved.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

def remove_semantic_duplicates(text, similarity_threshold=0.8):
    """Drop sentences that are near-duplicates of an earlier kept sentence.

    The text is split into sentences after '.', '!' or '?'. Sentences are
    visited in order; one is kept only if its cosine similarity to every
    already-kept sentence is at most ``similarity_threshold``. The first
    sentence is therefore always kept.

    Args:
        text (str): Text to de-duplicate.
        similarity_threshold (float): Similarity above which a sentence counts
            as a repeat of an earlier one.

    Returns:
        str: The kept sentences, in original order, joined by single spaces.
    """
    # Split the text into sentences using regex to handle various punctuation marks
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())

    # A single sentence has nothing to be compared against; skip the encoder.
    if len(sentences) == 1:
        return sentences[0]

    # Encode once and compute the full similarity matrix in a single call
    # instead of issuing one cos_sim call per sentence pair.
    embeddings = sbert_model.encode(sentences, convert_to_tensor=True)
    similarity = util.cos_sim(embeddings, embeddings).tolist()

    kept_indices = []
    for idx in range(len(sentences)):
        is_repeat = any(
            similarity[idx][kept] > similarity_threshold for kept in kept_indices
        )
        if not is_repeat:
            kept_indices.append(idx)

    return ' '.join(sentences[idx] for idx in kept_indices)


# **summarize code**

## **Java Test**

In [None]:

# Load the code-only CSV; from here on test_df refers to this frame.
test_df = pd.read_csv("only_codes.csv")

# Summarize each snippet, then strip semantically repeated sentences.
test_df['Summaries'] = (
    test_df["func_code_string"]
    .apply(summarize_t5_java)
    .apply(remove_semantic_duplicates)
)
test_df

In [None]:
from google.colab import files

# Write the summarized frame to CSV (without the index column) and hand it to
# Colab's files.download.
test_df.to_csv("CodeT5_with_summaries.csv", index=False)
files.download("CodeT5_with_summaries.csv")

## *Overall semantic similarity*

In [None]:
from sentence_transformers import SentenceTransformer, util

# SentenceTransformer instances keyed by model name, so repeated calls (for
# example one call per DataFrame row) load each model only once.
_SENTENCE_MODEL_CACHE = {}

def calculate_overall_semantic_similarity(reference_texts, generated_texts, model_name='all-MiniLM-L6-v2'):
    """
    Calculate the overall semantic similarity score between two sets of texts.

    Every reference text is compared with every generated text; the result is
    the mean of that full similarity matrix, rounded to four decimals.

    Args:
        reference_texts (list): List of reference texts (gold standard summaries).
        generated_texts (list): List of generated texts (model outputs).
        model_name (str): Name of the SentenceTransformer model to use (default: 'all-MiniLM-L6-v2').

    Returns:
        float: Overall semantic similarity score (average of all pairwise scores).

    Raises:
        ValueError: If either input is not a list, or if either list is empty.
    """
    if not isinstance(reference_texts, list) or not isinstance(generated_texts, list):
        raise ValueError("Both inputs must be lists of strings.")
    # An empty list would otherwise end in a division by zero further down.
    if not reference_texts or not generated_texts:
        raise ValueError("Both inputs must contain at least one text.")

    # Load the model once per name and reuse it on later calls.
    model = _SENTENCE_MODEL_CACHE.get(model_name)
    if model is None:
        model = SentenceTransformer(model_name)
        _SENTENCE_MODEL_CACHE[model_name] = model

    # Encode the texts
    embeddings1 = model.encode(reference_texts, convert_to_tensor=True)
    embeddings2 = model.encode(generated_texts, convert_to_tensor=True)

    # Compute pairwise semantic similarity
    similarities = util.cos_sim(embeddings1, embeddings2)

    # Calculate the average similarity score
    total_score = similarities.sum().item()
    count = similarities.numel()
    overall_similarity = round(total_score / count, 4)

    return overall_similarity





In [None]:
# NOTE(review): `filtered_test` is not created in any cell shown in this
# notebook - confirm where it comes from before relying on Run All.
def _summary_pair_similarity(row):
    """Score one row's reference summary against its generated summary."""
    reference = [row['Summary']]      # single string wrapped in a list
    generated = [row['Summary_t5']]   # single string wrapped in a list
    return calculate_overall_semantic_similarity(
        reference,
        generated,
        model_name='all-MiniLM-L6-v2'
    )

filtered_test['Similarity_Score'] = filtered_test.apply(_summary_pair_similarity, axis=1)

In [None]:
# Average of the per-row similarity scores across the whole frame.
filtered_test['Similarity_Score'].mean()

In [None]:
# Display the frame including the new Similarity_Score column.
filtered_test

In [None]:
# Replace the reference summaries with the generated ones: remove the reference
# and score columns, then expose 'Summary_t5' under the name 'Summary'.
# This rebinds filtered_test, so the cell is not meant to be run twice.
filtered_test = (
    filtered_test
    .drop(columns=['Summary', 'Similarity_Score'])
    .rename(columns={'Summary_t5': 'Summary'})
)

In [None]:
# Display the frame after the drop and rename.
filtered_test

In [None]:
# Persist the frame to CSV without the index column.
filtered_test.to_csv("filtered_test_java_500_rows_new_mode.csv", index=False)


In [None]:
# Load a CSV export from disk and display it.
# NOTE(review): the "(1)" in the file name looks like a browser re-download
# suffix - confirm this is the intended file.
df=pd.read_csv("filtered_test_java_100_rows (1).csv")
df

In [None]:
# Expose the 'Summary_t5' column under the name 'Summary'.
df = df.rename(columns={'Summary_t5': 'Summary'})