In [8]:
from transformers import AutoTokenizer, LlamaForCausalLM, PretrainedConfig, TrainingArguments, Trainer
import pandas as pd
import torch
from huggingface_hub import login, hf_hub_download
import json
from tqdm import tqdm

In [3]:
# Load the dataframe
final_df = pd.read_pickle('final_dataset_clust.pkl')

In [4]:
login("your key")
model_name = "meta-llama/Llama-3.2-3B-Instruct"

# Download the config.json file and load it
config_path = hf_hub_download(repo_id=model_name, filename="config.json")
with open(config_path, 'r') as f:
    config_data = json.load(f)

# Modify config.json and save it
config_data["rope_scaling"] = {"factor": 8.0, "type": "dynamic"}
with open("fixed_config.json", 'w') as f:
    json.dump(config_data, f, indent=4)
    
# Load the modified config.json
config = PretrainedConfig.from_json_file("fixed_config.json")

# Load the model and tokenizer
model = LlamaForCausalLM.from_pretrained(model_name, config=config)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
""" # Filtering and processing data by cluster
def generate_cluster_reviews(df, cluster_name):
    cluster_df = df[df['cluster'] == cluster_name]
    # Use only relevant fields for summarization
    reviews = cluster_df[['review', 'rating', 'sentiment', 'sentiment_score', 'name']]
    return reviews """

# Filtering and processing data by cluster [ONLY 10 SAMPLES]
def generate_cluster_reviews(df, cluster_name):
    cluster_df = df[df['cluster'] == cluster_name].sample(n=10, random_state=42)
    # Use only relevant fields for summarization
    reviews = cluster_df[['review', 'rating', 'sentiment', 'sentiment_score', 'name']]
    return reviews

# Format the input prompt for summarization
def format_prompt(reviews):
    prompt = """
        The following are customer reviews of various products in the category {category_name}. Based on these reviews, generate a detailed summary that provides:
        - The Top 3 products and key differences between them. When should a consumer choose one or another?
        - Top complaints for each of those products
        - The worst product in the category and why customers should avoid it.
        Reviews:
    """
    prompt += "\n".join(f"Product Name: {row['name']}, Review: {row['review']}, Rating: {row['rating']}, Sentiment: {row['sentiment']}, Score: {row['sentiment_score']}" for _, row in reviews.iterrows())
    return prompt

In [14]:
# Generate article using the Llama model
def chunk_text(text, max_length):
    """Splits text into chunks of max_length tokens"""
    tokens = tokenizer.tokenize(text)
    for i in range(0, len(tokens), max_length):
        yield tokenizer.convert_tokens_to_string(tokens[i:i+max_length])

def generate_summary_from_chunks(model, tokenizer, prompt, max_length=2048):
    summaries = []
    for chunk in tqdm(chunk_text(prompt, max_length), desc="Generating summaries", unit="chunk"):
        inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=max_length)
        outputs = model.generate(**inputs, max_new_tokens=512, num_beams=5, early_stopping=True)
        summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
        summaries.append(summary)
    return " ".join(summaries)

# Sample data for quicker processing
sampled_df = final_df.sample(frac=0.1, random_state=42)  # Sample 10% of the data

# Iterate through clusters and generate summaries
cluster_names = sampled_df['cluster'].unique()
summary_results = {}
for cluster_name in tqdm(cluster_names, desc="Processing clusters", unit="cluster"):
    cluster_reviews = generate_cluster_reviews(sampled_df, cluster_name)
    prompt = format_prompt(cluster_reviews)
    summary = generate_summary_from_chunks(model, tokenizer, prompt)
    summary_results[cluster_name] = summary
    print(f"Summary for {cluster_name}:\n{summary}\n")

Processing clusters:   0%|          | 0/4 [00:00<?, ?cluster/s]
Generating summaries: 0chunk [00:00, ?chunk/s]

[ASetting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


: 

: 

: 

In [None]:
# Optional: Save summaries to a file
with open('cluster_summaries.txt', 'w') as f:
    for cluster, summary in summary_results.items():
        f.write(f"Summary for {cluster}:\n{summary}\n\n")