In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file below the Kaggle input directory, one full path per line.
for root, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(root, name) for name in names]
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/project-multi/test_random_400-2.csv
/kaggle/input/project-multi/test_top_cosine_200-2.csv
/kaggle/input/project-multi/test_top_rougeL_200-2.csv
/kaggle/input/project-multi/train_data.csv
/kaggle/input/project-multi/test_random_600-2.csv


In [2]:
!pip install -q -U transformers accelerate datasets peft bitsandbytes

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m78.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.9/374.9 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.8/494.8 kB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m504.9/504.9 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m561.5/561.5 kB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m


In [3]:
import os

from huggingface_hub import login

# Never store an access token in the notebook itself: read it from the HF_TOKEN
# environment variable (e.g. populated from a notebook secret) instead.
# NOTE(review): when HF_TOKEN is unset this passes token=None, which presumably
# makes login() fall back to its interactive prompt - confirm for this environment.
login(token=os.environ.get("HF_TOKEN"))

In [None]:

import gc
import random
import os, json, torch
import pandas as pd
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence


# When this torch build exposes `torch.backends.cuda`, turn the flash and
# memory-efficient scaled-dot-product-attention kernels off and leave only the
# math kernel enabled.
_cuda_backends = getattr(torch.backends, "cuda", None)
if _cuda_backends is not None:
    _cuda_backends.enable_flash_sdp(False)
    _cuda_backends.enable_mem_efficient_sdp(False)
    _cuda_backends.enable_math_sdp(True)


# Locations of the training split and of the test split to generate predictions for.
TRAIN_CSV = "/kaggle/input/project-multi/train_data.csv"
TEST_CSV  = "/kaggle/input/project-multi/test_random_400-2.csv"

# Fail fast, with a readable message, if either file is missing.
for _label, _csv_path in (("TRAIN_CSV", TRAIN_CSV), ("TEST_CSV", TEST_CSV)):
    assert os.path.exists(_csv_path), f"Missing {_label}: {_csv_path}"

train_df, test_df = pd.read_csv(TRAIN_CSV), pd.read_csv(TEST_CSV)

# Input column: prefer the cleaned HTML column, fall back to the raw HTML column.
INPUT_COL = next(
    (col for col in ("description_html_clean", "description_html") if col in train_df.columns),
    None,
)
assert INPUT_COL is not None, "Train CSV needs 'description_html_clean' or 'description_html'"

# Target column holding the reference short description.
TARGET_COL = "description_short"
assert TARGET_COL in train_df.columns, "Train CSV needs 'description_short'"

def build_prompt(description_html: str) -> str:
    """Return the instruction prompt for one app description.

    The prompt is the fixed editor instruction, followed by the description
    under an "App Description HTML:" header, followed by the expected answer
    format ("Short Description: ...") and a trailing blank line.

    Args:
        description_html: the app description to embed in the prompt.

    Returns:
        The complete prompt string.
    """
    instruction = (
        "You are an expert app store editor. "
        "Given the following app description in HTML format, summarize it in 2-3 sentences, "
        "with a concise, engaging short description (max 80 characters) suitable for an app store listing. "
    )
    description_section = "App Description HTML:\n{}\n".format(description_html)
    answer_format = "Format your response as:\nShort Description: <your short description>\n\n"
    return "".join((instruction, description_section, answer_format))

# Build the (prompt, response) training pairs.
# Rows with a missing input or target are skipped: str(NaN) would otherwise put
# the literal text "nan" into the prompt or, worse, into the training label.
_usable_rows = train_df.dropna(subset=[INPUT_COL, TARGET_COL])
records = []
for html, target in zip(_usable_rows[INPUT_COL], _usable_rows[TARGET_COL]):
    target = str(target).strip()
    if not target:
        continue  # a blank target gives the model nothing to learn from
    records.append({"prompt": build_prompt(str(html)), "response": target})

assert records, "No usable training rows (every row has a missing input or target)"

# Deterministic shuffle, then a 90/10 train/dev split.
random.seed(42)
random.shuffle(records)
split_idx = max(1, int(0.9 * len(records))) if len(records) > 1 else 1
train_items = records[:split_idx]
# With a single record there is nothing left for dev, so that record is reused.
dev_items   = records[split_idx:] if split_idx < len(records) else records[:1]


class PromptDataset(Dataset):
    """Causal-LM fine-tuning dataset built from prompt/response pairs.

    Every item is laid out as ``[BOS] + prompt + response + EOS``. Only the
    response (including EOS) carries labels; BOS and prompt positions are
    masked with -100.

    Args:
        items: sequence of dicts with string entries "prompt" and "response".
        tokenizer: callable returning a mapping with "input_ids"; it must
            expose ``eos_token`` and may expose ``bos_token_id``.
        max_len: maximum number of tokens in one example.
    """

    def __init__(self, items, tokenizer, max_len=2048):
        self.items = items
        self.tok = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.items)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return input_ids / labels / attention_mask tensors."""
        ex = self.items[idx]

        # Both texts are tokenized without special tokens, so BOS is added by hand.
        # Without it the training sequences would start differently from sequences
        # produced by a default tokenizer call, which does add special tokens.
        bos_id = getattr(self.tok, "bos_token_id", None)
        prefix_ids = [] if bos_id is None else [bos_id]

        # The response (+ EOS) is what is learned, so it gets its space first.
        response_ids = self.tok(
            ex["response"] + self.tok.eos_token,
            add_special_tokens=False, truncation=True, max_length=self.max_len,
        )["input_ids"]
        response_ids = list(response_ids)[: max(self.max_len - len(prefix_ids), 0)]

        # The prompt receives whatever budget is left and is cut on the right only,
        # so the instruction at its start is never chopped off.
        prompt_ids = self.tok(
            ex["prompt"],
            add_special_tokens=False, truncation=True, max_length=self.max_len,
        )["input_ids"]
        prompt_budget = self.max_len - len(prefix_ids) - len(response_ids)
        prompt_ids = list(prompt_ids)[: max(prompt_budget, 0)]

        input_ids = prefix_ids + prompt_ids + response_ids
        # -100 marks the positions excluded from the loss (BOS and prompt).
        labels = [-100] * (len(prefix_ids) + len(prompt_ids)) + response_ids

        return {
            "input_ids": torch.tensor(input_ids, dtype=torch.long),
            "labels": torch.tensor(labels, dtype=torch.long),
            "attention_mask": torch.ones(len(input_ids), dtype=torch.long),
        }


def collate_fn(batch):
    """Pad a list of dataset items to a common length and stack them.

    ``input_ids`` are padded with the pad id of the module-level ``tokenizer``,
    ``labels`` with -100 and ``attention_mask`` with 0.

    Args:
        batch: list of dicts holding 1-D "input_ids", "labels" and
            "attention_mask" tensors.

    Returns:
        Dict with the same three keys, each a batch-first padded tensor.
    """
    fill_values = {
        "input_ids": tokenizer.pad_token_id,
        "labels": -100,
        "attention_mask": 0,
    }
    return {
        key: pad_sequence(
            [example[key] for example in batch],
            batch_first=True,
            padding_value=fill,
        )
        for key, fill in fill_values.items()
    }


# Hugging Face model id of the checkpoint that is fine-tuned below.
MODEL_NAME = "google/gemma-2-2b-it"

# Choose the compute dtype from the hardware: bfloat16 when a CUDA device is
# available and device 0 reports a compute-capability major version >= 8,
# float16 otherwise.
gpu_ok = torch.cuda.is_available()
bf16_ok = gpu_ok and (torch.cuda.get_device_capability(0)[0] >= 8)
compute_dtype = torch.bfloat16 if bf16_ok else torch.float16
# Device index used for loading: the current CUDA device, or 0 when no GPU is visible.
device_index = torch.cuda.current_device() if gpu_ok else 0
# NOTE(review): the "" key presumably maps the whole model onto that one device - confirm.
device_map = {"": device_index}

# 4-bit NF4 quantization with double quantization; compute dtype is `compute_dtype`.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)

# Tokenizer: reuse EOS as the pad token if none is defined, and pad on the right.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load the quantized base model with the settings chosen above.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    torch_dtype=compute_dtype,
    device_map=device_map,
    attn_implementation="eager",  # request the eager attention implementation
)

# Run the PEFT k-bit preparation helper on the quantized base model before adapters are added.
base_model = prepare_model_for_kbit_training(base_model)

# LoRA adapter settings: rank 16, alpha 32, dropout 0.1, no bias training,
# applied to the listed attention and MLP projection modules.
lora_config = LoraConfig(
    r=16, lora_alpha=32, lora_dropout=0.1, bias="none", task_type="CAUSAL_LM",
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
)
# Wrap the base model with the LoRA adapters; `model` is what gets trained.
model = get_peft_model(base_model, lora_config)
# NOTE(review): cache use is switched off, presumably because gradient
# checkpointing is enabled in the training arguments - confirm.
model.config.use_cache = False

# Tokenized views of the train/dev splits, each capped at 2048 tokens per example.
train_ds = PromptDataset(train_items, tokenizer, max_len=2048)
dev_ds   = PromptDataset(dev_items, tokenizer, max_len=2048)

# Training hyper-parameters. Checkpoints are written under ./gemma2_adapter.
args = TrainingArguments(
    output_dir="./gemma2_adapter",
    per_device_train_batch_size=1,     # one example per forward/backward pass
    gradient_accumulation_steps=16,    # 1 x 16 = 16 examples per optimizer step (per device)
    num_train_epochs=3,                # three passes over train_ds
    learning_rate=1e-4,
    fp16=not bf16_ok,                  # exactly one of fp16 / bf16 is enabled,
    bf16=bf16_ok,                      # following the hardware check done earlier
    logging_steps=10,
    save_strategy="epoch",
    optim="paged_adamw_8bit",
    gradient_checkpointing=True,
    report_to="none",
)

# NOTE(review): dev_ds is handed to the Trainer, but no evaluation schedule is
# configured in `args`, so it is presumably only used if evaluation is triggered
# explicitly - confirm.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=dev_ds,               # held-out split
    data_collator=collate_fn,          # custom padding collator defined in this notebook
)

# Fine-tune, then save the trained PEFT model and the tokenizer to ./gemma2_adapter.
trainer.train()
os.makedirs("./gemma2_adapter", exist_ok=True)
model.save_pretrained("./gemma2_adapter")
tokenizer.save_pretrained("./gemma2_adapter")

# Release the training objects before a fresh copy of the model is loaded for
# inference. `base_model` has to be deleted as well: `model` was created by
# wrapping that same object, so leaving the name bound would keep the weights
# referenced and the memory could not be reclaimed.
del trainer, model, base_model
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()


# Load a fresh quantized base model using the same settings as for training.
base_for_infer = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    attn_implementation="eager",
    device_map=device_map,
    torch_dtype=compute_dtype,
    quantization_config=bnb_config,
)
# Attach the adapter saved under ./gemma2_adapter and switch to evaluation mode.
model_inf = PeftModel.from_pretrained(base_for_infer, "./gemma2_adapter")
model_inf.eval()

# Test input column: prefer the cleaned HTML column, fall back to the raw HTML column.
TEST_INPUT_COL = next(
    (col for col in ("description_html_clean", "description_html") if col in test_df.columns),
    None,
)
assert TEST_INPUT_COL is not None, "Test CSV needs 'description_html_clean' or 'description_html'"

@torch.inference_mode()
def generate_short(description_html: str, max_new_tokens: int = 64) -> str:
    """Generate a short description for one app description with greedy decoding.

    Args:
        description_html: the app description inserted into the prompt.
        max_new_tokens: upper bound on the number of generated tokens.

    Returns:
        The decoded continuation (prompt tokens removed, special tokens
        skipped), stripped of surrounding whitespace.
    """
    prompt = build_prompt(description_html)
    inputs = tokenizer(prompt, return_tensors="pt").to(model_inf.device)
    # Greedy decoding only. Sampling parameters (temperature, top_p) are not
    # passed: they have no effect when do_sample=False, and a temperature of
    # 0.0 is not a valid sampling temperature in the first place.
    out = model_inf.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        eos_token_id=tokenizer.eos_token_id,
    )

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_len = inputs["input_ids"].shape[1]
    generated_tokens = out[0][prompt_len:]
    text = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    return text.strip()

# Generate one short description per test row, in row order.
test_descriptions = test_df[TEST_INPUT_COL].astype(str).tolist()
predicted_shorts = []
for description in test_descriptions:
    predicted_shorts.append(generate_short(str(description)))
test_df["pred_short_description"] = predicted_shorts

# Write the test frame, including the new prediction column, to disk.
OUT_CSV = "predictions.csv"
test_df.to_csv(OUT_CSV, index=False)
print(f"Done. Saved: {OUT_CSV}")

2025-08-26 16:58:53.375791: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756227533.750038      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756227533.856072      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

Step,Training Loss


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

train_texts = train_df[INPUT_COL].astype(str).tolist()
test_texts  = test_df[TEST_INPUT_COL].astype(str).tolist()

# Unique texts on each side and the unique texts they share.
train_set = set(train_texts)
test_set  = set(test_texts)
exact_overlap = train_set.intersection(test_set)

# The report below is per test *sample*, so count every test row whose text
# occurs in train. len(exact_overlap) counts unique texts only and would
# under-report whenever the test set contains duplicated descriptions.
exact_overlap_count = sum(text in train_set for text in test_texts)
# Guard against an empty test set instead of dividing by zero.
exact_overlap_ratio = exact_overlap_count / len(test_texts) if test_texts else 0.0

# Printed (in Vietnamese): number of test samples, number of test samples that
# exactly match a train text, and the exact-overlap ratio.
print(f"Số lượng test samples: {len(test_texts)}")
print(f"Số lượng test trùng EXACT với train: {exact_overlap_count}")
print(f"Tỉ lệ exact overlap: {exact_overlap_ratio:.2%}")


# Fit a single TF-IDF vocabulary (capped at 5000 features) on train and test
# texts together, then transform both sets with it so they share one vector space.
vectorizer = TfidfVectorizer(max_features=5000).fit(train_texts + test_texts)
train_vecs = vectorizer.transform(train_texts)
test_vecs  = vectorizer.transform(test_texts)

threshold = 0.8  # a test text whose best cosine similarity reaches this value is counted
similar_count = 0
# For each test vector, compute its cosine similarity to every train vector and
# count the test text when the highest similarity is at or above the threshold.
# NOTE(review): `i` is not used inside the loop, and `similar_count` is not
# reported within the visible part of this cell - check the rest of the cell.
for i, test_vec in enumerate(test_vecs):
    sims = cosine_similarity(test_vec, train_vecs).flatten()
    if sims.max() >= threshold:
        similar_count += 1
