In [None]:
import pandas as pd
import torch
import pickle
from transformers import T5Tokenizer, T5EncoderModel
from tqdm import tqdm

In [None]:
news_df=pd.read_csv("pens_news (2).csv")
news_df

In [None]:
# Load T5-base encoder and tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5EncoderModel.from_pretrained("t5-base")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

In [None]:
def get_embedding(text, tokenizer, model, device):
    """Encode a single text into T5 embedding using mean pooling"""
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=512  # limit for T5-base
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        token_embeddings = outputs.last_hidden_state  # [1, seq_len, hidden_dim]
        sentence_embedding = token_embeddings.mean(dim=1).squeeze().cpu().numpy()
    return sentence_embedding

In [None]:
# Store embeddings
headline_embeddings = {}
newsbody_embeddings = {}

In [9]:
print("Generating headline embeddings...")
for i, row in tqdm(news_df[:5].iterrows(), total=len(news_df[:5])):
    headline_embeddings[row["NewsID"]] = get_embedding(
        str(row["Headline"]), tokenizer, model, device
    )

Generating headline embeddings...


100%|██████████| 5/5 [00:01<00:00,  3.23it/s]


In [10]:
print("Generating news body embeddings...")
for i, row in tqdm(news_df[:5].iterrows(), total=len(news_df[:5])):
    newsbody_embeddings[row["NewsID"]] = get_embedding(
        str(row["NewsBody"]), tokenizer, model, device
    )

Generating news body embeddings...


100%|██████████| 5/5 [00:01<00:00,  3.22it/s]


In [11]:
# Save as pickle
with open("headline_T5.pkl", "wb") as f:
    pickle.dump(headline_embeddings, f)

with open("newsbody_T5.pkl", "wb") as f:
    pickle.dump(newsbody_embeddings, f)

print("✅ Embeddings saved: headline_T5.pkl and newsbody_T5.pkl")

✅ Embeddings saved: headline_T5.pkl and newsbody_T5.pkl


In [12]:
with open("headline_T5.pkl", "rb") as f:
    headline_embeddings = pickle.load(f)

# Load newsbody embeddings
with open("newsbody_T5.pkl", "rb") as f:
    newsbody_embeddings = pickle.load(f)

# Convert to DataFrames
headline_df = pd.DataFrame(list(headline_embeddings.items()), columns=["NewsID", "Headline_Embedding"])
newsbody_df = pd.DataFrame(list(newsbody_embeddings.items()), columns=["NewsID", "NewsBody_Embedding"])




In [14]:
newsbody_df

Unnamed: 0,NewsID,NewsBody_Embedding
0,N10000,"[-0.08127328, 0.019974899, -0.07350032, 0.0337..."
1,N10001,"[-0.16276242, -0.0029857457, 0.010285702, 0.01..."
2,N10002,"[-0.055686325, 0.061170243, -0.31549126, 0.350..."
3,N10003,"[-0.3972332, 0.17620325, -0.16000667, 0.093594..."
4,N10004,"[-0.22680107, -0.033510517, -0.09148933, 0.046..."


In [15]:
import numpy as np

# Check one headline embedding
sample_headline = headline_df["Headline_Embedding"].iloc[0]
print("Type:", type(sample_headline))
print("Shape:", np.array(sample_headline).shape)

# Check one newsbody embedding
sample_newsbody = newsbody_df["NewsBody_Embedding"].iloc[0]
print("Type:", type(sample_newsbody))
print("Shape:", np.array(sample_newsbody).shape)


Type: <class 'numpy.ndarray'>
Shape: (768,)
Type: <class 'numpy.ndarray'>
Shape: (768,)
