In [None]:
# ✅ 1. 패키지 설치 및 초기 모델 로딩
!pip install transformers datasets sentencepiece nltk
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

model_name = "google/pegasus-large"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

In [None]:
# ✅ 2. 데이터 로딩
!wget https://huggingface.co/datasets/gfissore/arxiv-abstracts-2021/resolve/main/arxiv-abstracts.jsonl.gz
import gzip
import shutil
import json

with gzip.open('arxiv-abstracts.jsonl.gz', 'rb') as f_in:
    with open('arxiv-abstracts.jsonl', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

filtered_papers = []

with open('arxiv-abstracts.jsonl', 'r') as f:
    for line in f:
        data = json.loads(line)
        categories = data.get('categories', [])
        if any('cs.' in category for category in categories):
            filtered_papers.append({
                "title": data["title"].strip(),
                "abstract": data["abstract"].strip()
            })

print(f"총 {len(filtered_papers)}개의 Computer Science 논문이 로드됨.")


In [None]:
# ✅ 3. 사전학습 데이터 생성
from transformers import PegasusTokenizer
from tqdm import tqdm
import torch
import random
import os

# 드라이브 마운트
from google.colab import drive
drive.mount('/content/drive')

# PEGASUS tokenizer 로드
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-large")

# 문장 분리 + GSG-style 전처리
def simple_sentence_split(text):
    sentences = [s.strip() + '.' for s in text.split('.') if len(s.strip()) > 0]
    return sentences

def gap_sentence_preparation_simple(text, num_sentences_to_mask=3):
    sentences = simple_sentence_split(text)
    if len(sentences) <= num_sentences_to_mask:
        return None
    selected = sorted(random.sample(range(len(sentences)), num_sentences_to_mask))
    target = " ".join([sentences[i] for i in selected])
    source = " ".join([sentences[i] for i in range(len(sentences)) if i not in selected])
    return source, target

# 배치 토크나이징
def tokenize_in_batches(text_list, batch_size=128, max_length=512, is_target=False):
    all_input_ids = []
    all_attention_masks = []

    for i in tqdm(range(0, len(text_list), batch_size)):
        batch = text_list[i:i+batch_size]
        if is_target:
            with tokenizer.as_target_tokenizer():
                tokenized = tokenizer(batch, truncation=True, padding="max_length",
                                      max_length=max_length, return_tensors="pt")
        else:
            tokenized = tokenizer(batch, truncation=True, padding="max_length",
                                  max_length=max_length, return_tensors="pt")

        all_input_ids.append(tokenized["input_ids"])
        all_attention_masks.append(tokenized["attention_mask"])

    return {
        "input_ids": torch.cat(all_input_ids),
        "attention_mask": torch.cat(all_attention_masks)
    }

# 전체 GSG + 토크나이징 + 드라이브 저장까지
def process_and_save_chunks(all_papers, chunk_size=10000, output_dir="/content/drive/MyDrive/pegasus_datas"):
    os.makedirs(output_dir, exist_ok=True)
    chunk_id = 0
    train_pairs = []
    train_titles = []

    for paper in all_papers:
        combined_text = paper["title"] + ". " + paper["abstract"]
        result = gap_sentence_preparation_simple(combined_text)
        if result:
            train_pairs.append(result)
            train_titles.append(paper["title"])  # ✅ 같이 저장

        if len(train_pairs) == chunk_size:
            print(f"\n🧩 Chunk {chunk_id+1} - GSG 쌍 {len(train_pairs)}개 처리 중")

            sources = [src for src, _ in train_pairs]
            targets = [tgt for _, tgt in train_pairs]

            print("🔄 Source 토크나이징 중...")
            source_inputs = tokenize_in_batches(sources, max_length=512)

            print("🔄 Target (labels) 토크나이징 중...")
            target_inputs = tokenize_in_batches(targets, max_length=128, is_target=True)

            tokenized_inputs = {
                "input_ids": source_inputs["input_ids"],
                "attention_mask": source_inputs["attention_mask"],
                "labels": target_inputs["input_ids"],
                "titles": train_titles  # ✅ 진짜 title 추가 저장
            }

            save_path = os.path.join(output_dir, f"chunk_{chunk_id:03d}.pt")
            torch.save(tokenized_inputs, save_path)
            print(f"✅ 저장 완료: {save_path}")

            # ✅ 초기화
            train_pairs = []
            train_titles = []
            chunk_id += 1

    print("\n🎉 모든 chunk 처리 및 Google Drive 저장 완료!")



process_and_save_chunks(filtered_papers, chunk_size=10000)

In [None]:
# ✅ 4. Pegasus 추가 pretraining
import os
import torch
import gc
from transformers import PegasusForConditionalGeneration, Trainer, TrainingArguments
from torch.utils.data import Dataset

# 드라이브 마운트
from google.colab import drive
drive.mount('/content/drive')

#  Huggingface 로깅 비활성화
os.environ["WANDB_DISABLED"] = "true"

# GSG Dataset 정의
class GSGDataset(Dataset):
    def __init__(self, data):
        self.input_ids = data["input_ids"]
        self.attention_mask = data["attention_mask"]
        self.labels = data["labels"]

    def __len__(self):
        return self.input_ids.size(0)

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels": self.labels[idx]
        }

# A100 기준 학습 인자
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/pegasus_ckpt",
    per_device_train_batch_size=8,          # A100에선 8도 여유로움
    gradient_accumulation_steps=4,          # effective batch size = 32
    num_train_epochs=1,
    logging_steps=10,
    save_steps=999999,
    save_total_limit=1,
    fp16=True,                              # A100에서 fp16 완벽 지원
    report_to="none"
)

# chunk 디렉토리
chunk_dir = "/content/drive/MyDrive/pegasus_chunks"
chunk_files = sorted([f for f in os.listdir(chunk_dir) if f.endswith(".pt") and "part" not in f])

# 0번 ~ 9번 chunk (총 100,000개 샘플) 학습
for chunk_id, chunk_file in enumerate(chunk_files[:10]):
    print(f"\n🚀 [Chunk {chunk_id+1}/10] 학습 시작: {chunk_file}")

    # 불러오기
    chunk_path = os.path.join(chunk_dir, chunk_file)
    data = torch.load(chunk_path)
    dataset = GSGDataset(data)

    # 모델
    model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-large")
    trainer = Trainer(model=model, args=training_args, train_dataset=dataset)

    # 학습
    trainer.train()

    # 저장
    model_path = f"/content/drive/MyDrive/pegasus_ckpt/{chunk_file.replace('.pt', '')}"
    model.save_pretrained(model_path)
    print(f"✅ 모델 저장 완료: {model_path}")

    # 정리
    del data
    del dataset
    del model
    del trainer
    gc.collect()

In [None]:
# ✅ 5. Pegasus 요약문 생성
import os
import json
import torch
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

# 드라이브 마운트
from google.colab import drive
drive.mount('/content/drive')

# 모델 로드
model_path = "/content/drive/MyDrive/pegasus_ckpt/chunk_009"
model = PegasusForConditionalGeneration.from_pretrained(model_path, local_files_only=True).cuda()
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-large")
model.eval()

# chunk 경로
chunk_dir = "/content/drive/MyDrive/pegasus_datas"
chunk_files = sorted([
    f for f in os.listdir(chunk_dir)
    if f.endswith(".pt") and "part" not in f
])[10:20]

# 저장 경로
os.makedirs("/content/drive/MyDrive/pegasus_outputs", exist_ok=True)
output_jsonl = "/content/drive/MyDrive/pegasus_outputs/title_summary_100k.jsonl"

# 추론 설정
batch_size = 32
num_beams = 2

with open(output_jsonl, "w", encoding="utf-8") as f_out:
    for chunk_id, chunk_file in enumerate(chunk_files):
        print(f"\n📦 [Chunk {chunk_id+10}] 요약문 생성 시작: {chunk_file}")
        chunk_path = os.path.join(chunk_dir, chunk_file)
        data = torch.load(chunk_path)

        input_ids = data["input_ids"].cuda()
        attention_mask = data["attention_mask"].cuda()
        titles = data["titles"]  # ✅ 진짜 타이틀 사용

        for i in range(0, len(input_ids), batch_size):
            input_batch = input_ids[i:i+batch_size]
            attn_batch = attention_mask[i:i+batch_size]
            titles_batch = titles[i:i+batch_size]

            with torch.no_grad():
                summary_ids = model.generate(
                    input_batch,
                    attention_mask=attn_batch,
                    max_length=128,
                    num_beams=num_beams,
                    no_repeat_ngram_size=3,   # ✅ 반복 방지
                    early_stopping=True
                )

            summaries = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)

            for title, summary in zip(titles_batch, summaries):
                f_out.write(json.dumps({
                    "title": title,
                    "summary": summary
                }, ensure_ascii=False) + "\n")

        print(f"✅ {chunk_file} 생성 및 저장 완료")


In [None]:
# ✅ 6. SimCSE 임베딩 벡터 생성 (별도의 추가 학습 없이 사용)
import os
import json
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

# 드라이브 마운트
from google.colab import drive
drive.mount('/content/drive')

# 모델 로드 (SimCSE base version)
model_name = "princeton-nlp/sup-simcse-bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).cuda()
model.eval()

# 요약문 로드
input_path = "/content/drive/MyDrive/pegasus_outputs/title_summary_100k.jsonl"
titles = []
summaries = []

with open(input_path, "r", encoding="utf-8") as f:
    for line in f:
        obj = json.loads(line)
        titles.append(obj["title"])
        summaries.append(obj["summary"])

# 임베딩 함수 (batch 단위 처리)
def get_embeddings(texts, batch_size=32):
    all_embeddings = []

    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i+batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=128).to("cuda")

        with torch.no_grad():
            outputs = model(**inputs, return_dict=True)
            embeddings = outputs.pooler_output  # [CLS] 임베딩 (SimCSE에서는 이게 기준)
            all_embeddings.append(embeddings.cpu())

    return torch.cat(all_embeddings, dim=0)  # (N, 768)

# 임베딩 생성
print("🔄 SimCSE 임베딩 생성 중...")
summary_embeddings = get_embeddings(summaries)  # shape: (100000, 768)

# 저장
save_path = "/content/drive/MyDrive/pegasus_outputs/simcse_summary_embeddings.pt"
torch.save({
    "titles": titles,
    "embeddings": summary_embeddings
}, save_path)

print(f"✅ 저장 완료: {save_path} (shape: {summary_embeddings.shape})")


In [None]:
# ✅ 7. 실제 검색할 초록의 요약문 및 임베딩 생성 함수 정의
import torch
import torch.nn.functional as F
from transformers import PegasusTokenizer, PegasusForConditionalGeneration, AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import json

# 드라이브 마운트
from google.colab import drive
drive.mount('/content/drive')

# 사전 준비: 드라이브 마운트 & 모델 로드
pegasus_path = "/content/drive/MyDrive/pegasus_ckpt/chunk_009"
simcse_path = "/content/drive/MyDrive/pegasus_outputs/simcse_summary_embeddings.pt"

# PEGASUS
pegasus_tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-large")
pegasus_model = PegasusForConditionalGeneration.from_pretrained(pegasus_path, local_files_only=True).cuda()
pegasus_model.eval()

# SimCSE
simcse_tokenizer = AutoTokenizer.from_pretrained("princeton-nlp/sup-simcse-bert-base-uncased")
simcse_model = AutoModel.from_pretrained("princeton-nlp/sup-simcse-bert-base-uncased").cuda()
simcse_model.eval()

# 저장된 임베딩 불러오기
data = torch.load(simcse_path)
all_titles = data["titles"]
all_embeddings = data["embeddings"]  # (100000, 768), CPU 상에 있음

# 함수 1: PEGASUS로 요약
def summarize_abstract(abstract):
    inputs = pegasus_tokenizer(abstract, return_tensors="pt", truncation=True, padding="longest", max_length=512).to("cuda")
    with torch.no_grad():
        summary_ids = pegasus_model.generate(
            **inputs,
            max_length=128,
            num_beams=2,
            no_repeat_ngram_size=3,
            early_stopping=True
        )
    return pegasus_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# 함수 2: SimCSE 임베딩 생성
def get_simcse_embedding(text):
    inputs = simcse_tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128).to("cuda")
    with torch.no_grad():
        outputs = simcse_model(**inputs)
        return outputs.pooler_output[0].cpu()  # (768,)

# 함수 3: 유사도 top-k
def retrieve_top_k(query_vector, candidate_vectors, candidate_titles, top_k=20):
    similarities = F.cosine_similarity(query_vector.unsqueeze(0), candidate_vectors)  # (100000,)
    topk_indices = torch.topk(similarities, k=top_k).indices
    return [(candidate_titles[i], similarities[i].item()) for i in topk_indices]

# 전체 시스템 실행
def find_similar_papers(abstract):
    print("🔄 초록 요약 중...")
    summary = summarize_abstract(abstract)
    print(f"📝 요약문: {summary}")

    print("🔄 임베딩 생성 중...")
    query_vec = get_simcse_embedding(summary)

    print("🔍 유사 논문 검색 중...")
    results = retrieve_top_k(query_vec, all_embeddings, all_titles, top_k=20)

    print("\n📚 가장 유사한 논문 Top-20:")
    for i, (title, score) in enumerate(results, 1):
        print(f"{i:2d}. ({score:.4f}) {title}")


In [2]:
# ✅ 8. 실제 추론
abstract_input = """A conflict-free k-coloring of a graph assigns one of k different colors to some of the vertices such that, for every vertex v, there is a color that is assigned to exactly one vertex among v and v's neighbors. Such colorings have applications in wireless networking, robotics, and geometry, and are well-studied in graph theory. Here we study the natural problem of the conflict-free chromatic number chi_CF(G) (the smallest k for which conflict-free k-colorings exist). We provide results both for closed neighborhoods N[v], for which a vertex v is a member of its neighborhood, and for open neighborhoods N(v), for which vertex v is not a member of its neighborhood.
For closed neighborhoods, we prove the conflict-free variant of the famous Hadwiger Conjecture: If an arbitrary graph G does not contain K_{k+1} as a minor, then chi_CF(G) <= k. For planar graphs, we obtain a tight worst-case bound: three colors are sometimes necessary and always sufficient. We also give a complete characterization of the computational complexity of conflict-free coloring. Deciding whether chi_CF(G)<= 1 is NP-complete for planar graphs G, but polynomial for outerplanar graphs. Furthermore, deciding whether chi_CF(G)<= 2 is NP-complete for planar graphs G, but always true for outerplanar graphs. For the bicriteria problem of minimizing the number of colored vertices subject to a given bound k on the number of colors, we give a full algorithmic characterization in terms of complexity and approximation for outerplanar and planar graphs.
For open neighborhoods, we show that every planar bipartite graph has a conflict-free coloring with at most four colors; on the other hand, we prove that for k in {1,2,3}, it is NP-complete to decide whether a planar bipartite graph has a conflict-free k-coloring. Moreover, we establish that any general} planar graph has a conflict-free coloring with at most eight colors."""
find_similar_papers(abstract_input)

🔄 초록 요약 중...


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


📝 요약문: We show that chi_CF(G)= 1 is NP-complete for planar graphs G, but polynomial for outerplanar graphs. For closed neighborhoods, we show that every planar bipartite graph has a conflict-free coloring with at most four colors; on the other hand, we prove that for k in 1,2,3, it is NPcomplete to decide whether a planar graph with at least four colors has a k-coloring.
🔄 임베딩 생성 중...
🔍 유사 논문 검색 중...

📚 가장 유사한 논문 Top-20:
 1. (0.9937) Conflict-Free Coloring of Planar Graphs
 2. (0.8451) The Parameterized Complexity of Graph Cyclability
 3. (0.8355) Equitable Colorings of $l$-Corona Products of Cubic Graphs
 4. (0.8297) Colouring graphs with constraints on connectivity
 5. (0.8258) Algorithmic Aspects of Regular Graph Covers
 6. (0.8243) On the complete width and edge clique cover problems
 7. (0.8242) Polyhedral studies of vertex coloring problems: The asymmetric
  representatives formulation
 8. (0.8223) A note on $\mathtt{V}$-free $2$-matchings
 9. (0.8208) Rainbow Colouring of Split 