In [None]:
pip install torch transformers pandas tqdm

In [None]:
# Import required libraries - torch first!
import torch
from transformers import AutoTokenizer, AutoModel
from torch.nn.functional import normalize
import pandas as pd
import pickle
from tqdm.auto import tqdm
import os

In [None]:
# torch is imported above, so CUDA availability can be queried here:
# use the GPU when torch reports one, otherwise fall back to the CPU
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Using device: {DEVICE}")

# --- Configuration ---
# Input CSV (update with your CSV file path)
CSV_FILE_PATH = "news.csv"
# Pickle files that receive the generated embedding dictionaries
HEADLINE_EMBEDDINGS_FILE = r"C:\Users\admin\Desktop\ImPerSum\Embeddings\headline_embeddings.pkl"
NEWSBODY_EMBEDDINGS_FILE = r"C:\Users\admin\Desktop\ImPerSum\Embeddings\newsbody_embeddings.pkl"
# Number of texts per forward pass; adjust based on your GPU memory
BATCH_SIZE = 32

In [None]:
# Load the E5 model and tokenizer
def load_e5_model(model_name="intfloat/e5-base-v2", device=None):
    """
    Load a tokenizer/model pair with ``from_pretrained`` and move the model to a device.

    Args:
        model_name: Identifier passed to ``AutoTokenizer.from_pretrained`` and
            ``AutoModel.from_pretrained``. Defaults to the checkpoint this
            notebook was written for.
        device: Device the model is moved to. ``None`` (the default) means the
            notebook-level ``DEVICE`` setting.

    Returns:
        Tuple ``(tokenizer, model)``.
    """
    if device is None:
        # Looked up at call time so a re-run of the configuration cell is honoured
        device = DEVICE
    print("Loading E5 model...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)
    return tokenizer, model

tokenizer, model = load_e5_model()

In [None]:
# Function to generate embeddings in batches
def generate_embeddings(texts, tokenizer, model, device=DEVICE, batch_size=BATCH_SIZE):
    """
    Generate embeddings for a list of texts using the E5 model.

    Every text is prefixed with "passage: ", tokenized (padded per batch and
    truncated to 512 tokens), run through the model without gradient
    tracking, mean-pooled with the attention mask and L2-normalized.

    Args:
        texts: List of text strings to embed
        tokenizer: E5 tokenizer
        model: E5 model; it is moved to `device` and switched to eval mode
        device: Device to run the model on
        batch_size: Number of texts to process at once (must be >= 1)

    Returns:
        numpy array of embeddings, one row per input text. For empty `texts`
        an array with zero rows is returned.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Materialise once so any sequence type is sliced and measured the same way
    texts = list(texts)
    if not texts:
        # torch.cat rejects an empty list, so build the empty result directly.
        # The width is taken from the model config when it exposes one.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return torch.empty((0, hidden_size)).numpy()

    # Keep the model on the same device as the inputs it is about to receive
    model = model.to(device)
    model.eval()
    embeddings = []

    # Process in batches
    for i in tqdm(range(0, len(texts), batch_size), desc="Generating embeddings"):
        # Add prefix and tokenize
        batch_texts = ["passage: " + text for text in texts[i:i + batch_size]]
        inputs = tokenizer(
            batch_texts,
            max_length=512,
            padding=True,
            truncation=True,
            return_tensors="pt"
        ).to(device)

        with torch.no_grad():
            outputs = model(**inputs)

            # Mean pooling over the token axis, then scale each row to unit L2 norm
            batch_embeddings = mean_pooling(outputs.last_hidden_state, inputs['attention_mask'])
            batch_embeddings = normalize(batch_embeddings, p=2, dim=1)

        # Collect results on the CPU so they can be converted to numpy at the end
        embeddings.append(batch_embeddings.cpu())

    # Concatenate all batch embeddings
    return torch.cat(embeddings, dim=0).numpy()

def mean_pooling(token_embeddings, attention_mask):
    """
    Average token embeddings over dim 1, weighted by the attention mask.

    The mask gets a trailing axis, is expanded to the shape of
    `token_embeddings` and cast to float. The weighted sum over dim 1 is then
    divided by the summed mask, which is clamped to at least 1e-9 so a fully
    masked row does not divide by zero.
    """
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    weighted_total = (token_embeddings * mask).sum(1)
    mask_total = mask.sum(1).clamp(min=1e-9)
    return weighted_total / mask_total

In [None]:
# Read the news dataset into a DataFrame (read_csv is called with its default delimiter).
# NOTE(review): CSV_FILE_PATH from the configuration cell is not used here.
news_csv_path = r"C:\Users\admin\Desktop\ImPerSum\Datasets\pens_news (2).csv"
news_df = pd.read_csv(news_csv_path)

# Report how many rows were read, then preview the first ones
print(f"Found {len(news_df)} news articles")
news_df.head()


In [None]:
# Clean both text columns: missing values become "" and every entry is cast to str
print("Preprocessing text data...")
for text_column in ('Headline', 'NewsBody'):
    news_df[text_column] = news_df[text_column].fillna("").astype(str)

In [None]:
# Row count again, now that the text columns have been cleaned
article_count = len(news_df)
print(f"Found {article_count} news articles")

In [None]:
# Sanity checks before embedding
headline_list = news_df['Headline'].tolist()
print(len(headline_list))  # Number of input headlines
# print(len(headline_embeddings))  # Number of generated embeddings (only defined once the headline cell has run)
total_rows = news_df.shape[0]
unique_news_ids = news_df['NewsID'].unique()
print(total_rows, len(unique_news_ids))  # Total and unique NewsIDs


In [None]:
# Missing-value diagnostics for the Headline column.
# NOTE: the preprocessing cell replaces NaN with "", so once it has run the first count is 0.
headline_column = news_df['Headline']
print(headline_column.isnull().sum())   # Count of NaN headlines
print((headline_column == '').sum())    # Count of empty strings


In [None]:
# Headline coverage: total rows, rows with text, rows that are blank or missing
print(f"Total rows: {len(news_df)}")
headline_stripped = news_df['Headline'].str.strip()
headline_has_text = news_df['Headline'].notnull() & headline_stripped.ne('')
print(f"Non-empty headlines: {headline_has_text.sum()}")
headline_is_blank = news_df['Headline'].isnull() | headline_stripped.eq('')
print(f"Empty or null headlines: {headline_is_blank.sum()}")


In [None]:
# Generate headline embeddings
# Create the destination folder before the slow embedding pass, so the results
# are not lost to a FileNotFoundError when the pickle is written.
headline_out_dir = os.path.dirname(HEADLINE_EMBEDDINGS_FILE)
if headline_out_dir:
    os.makedirs(headline_out_dir, exist_ok=True)

print("\nGenerating headline embeddings...")
headline_embeddings = generate_embeddings(
    texts=news_df['Headline'].tolist(),
    tokenizer=tokenizer,
    model=model
)

# Create a dictionary mapping NewsID to headline embeddings
headline_embeddings_dict = dict(zip(news_df['NewsID'], headline_embeddings))

# dict() keeps only the last value for a repeated key, so report any collapse
if len(headline_embeddings_dict) != len(headline_embeddings):
    print(
        f"Warning: {len(headline_embeddings) - len(headline_embeddings_dict)} "
        "headline embeddings were dropped because of duplicate NewsID values"
    )

# Save to pickle file
with open(HEADLINE_EMBEDDINGS_FILE, 'wb') as f:
    pickle.dump(headline_embeddings_dict, f)

print(f"Saved headline embeddings to {HEADLINE_EMBEDDINGS_FILE}")
print(f"Embedding shape: {headline_embeddings.shape}")

In [None]:
# Generate news body embeddings
# Create the destination folder before the slow embedding pass, so the results
# are not lost to a FileNotFoundError when the pickle is written.
newsbody_out_dir = os.path.dirname(NEWSBODY_EMBEDDINGS_FILE)
if newsbody_out_dir:
    os.makedirs(newsbody_out_dir, exist_ok=True)

print("\nGenerating news body embeddings...")
newsbody_embeddings = generate_embeddings(
    texts=news_df['NewsBody'].tolist(),
    tokenizer=tokenizer,
    model=model
)

# Create a dictionary mapping NewsID to news body embeddings
newsbody_embeddings_dict = dict(zip(news_df['NewsID'], newsbody_embeddings))

# dict() keeps only the last value for a repeated key, so report any collapse
if len(newsbody_embeddings_dict) != len(newsbody_embeddings):
    print(
        f"Warning: {len(newsbody_embeddings) - len(newsbody_embeddings_dict)} "
        "news body embeddings were dropped because of duplicate NewsID values"
    )

# Save to pickle file
with open(NEWSBODY_EMBEDDINGS_FILE, 'wb') as f:
    pickle.dump(newsbody_embeddings_dict, f)

print(f"Saved news body embeddings to {NEWSBODY_EMBEDDINGS_FILE}")
print(f"Embedding shape: {newsbody_embeddings.shape}")

In [None]:
# Read the summaries CSV, decoding the file as latin-1, then display the frame
summaries_csv_path = r"C:\Users\admin\Desktop\ImPerSum\Datasets\summaries.csv"
df = pd.read_csv(summaries_csv_path, encoding='latin-1')
df

In [None]:
# Pickle file that receives the summary embeddings dictionary (written by the
# summary embedding cell further down)
SUMM_EMBEDDINGS_FILE = r"C:\Users\admin\Desktop\ImPerSum\Embeddings\summary_embeddings.pkl"

In [None]:
# Summary coverage: total rows, rows with text, rows that are blank or missing.
# The column is normalised on a copy (NaN -> "", everything cast to str) so the
# check also works before the cleaning cell has run and for non-object dtypes.
print(f"Total rows: {len(df)}")
summary_text = df['Summary'].fillna('').astype(str).str.strip()
summary_has_text = summary_text.ne('')
print(f"Non-empty summaries: {summary_has_text.sum()}")
print(f"Empty or null summaries: {(~summary_has_text).sum()}")


In [None]:
# Clean the summaries: convert to string and handle NaN
df['Summary'] = df['Summary'].fillna('').astype(str)

In [None]:
# Generate summary embeddings
# Create the destination folder before the slow embedding pass, so the results
# are not lost to a FileNotFoundError when the pickle is written.
summ_out_dir = os.path.dirname(SUMM_EMBEDDINGS_FILE)
if summ_out_dir:
    os.makedirs(summ_out_dir, exist_ok=True)

print("\nGenerating summaries embeddings...")
summ_embeddings = generate_embeddings(
    texts=df['Summary'].tolist(),
    tokenizer=tokenizer,
    model=model
)

# Create a dictionary mapping SummID to summary embeddings
summ_embeddings_dict = dict(zip(df['SummID'], summ_embeddings))

# dict() keeps only the last value for a repeated key, so report any collapse
if len(summ_embeddings_dict) != len(summ_embeddings):
    print(
        f"Warning: {len(summ_embeddings) - len(summ_embeddings_dict)} "
        "summary embeddings were dropped because of duplicate SummID values"
    )

# Save to pickle file
with open(SUMM_EMBEDDINGS_FILE, 'wb') as f:
    pickle.dump(summ_embeddings_dict, f)

print(f"Saved summary embeddings to {SUMM_EMBEDDINGS_FILE}")
print(f"Embedding shape: {summ_embeddings.shape}")