In [3]:
# Import libraries

import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [5]:
# Load the article sample into a DataFrame and preview the first rows
DATA_PATH = "sample_data.csv"  # CSV is resolved relative to the notebook's working directory
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Class Index,Title,Description,desc_length,clean_desc
0,3,"BBC set for major shake-up, claims newspaper","London - The British Broadcasting Corporation,...",39,london british broadcast corpor world 39 bigge...
1,3,Marsh averts cash crunch,Embattled insurance broker #39;s banks agree t...,24,embattl insur broker 39 bank agre waiv claus m...
2,2,"Jeter, Yankees Look to Take Control (AP)",AP - Derek Jeter turned a season that started ...,23,ap derek jeter turn season start terribl slump...
3,4,Flying the Sun to Safety,When the Genesis capsule comes back to Earth w...,29,genesi capsul come back earth sampl sun helico...
4,3,Stocks Seen Flat as Nortel and Oil Weigh,NEW YORK (Reuters) - U.S. stocks were set to ...,37,new york reuter us stock set open near unchang...


In [7]:
# Load the fine-tuned BERT tokenizer and encoder

MODEL_DIR = 'fine_tuned_bert'  # both the tokenizer and the weights are read from this location
tokenizer = BertTokenizer.from_pretrained(MODEL_DIR)
model = BertModel.from_pretrained(MODEL_DIR)
# Put the model in evaluation mode; as the cell's last expression this also
# displays the module structure.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [9]:
# Get Embedding

def get_embedding(text):
    """Embed text with the fine-tuned BERT encoder using masked mean pooling.

    The text is tokenized (truncated to the tokenizer's maximum length),
    run through the module-level ``model`` without gradient tracking, and the
    last hidden state is averaged over the token axis. Positions whose
    ``attention_mask`` is 0 (padding) are excluded from the average, so the
    result stays correct when several texts of different lengths are passed
    at once. For a single string nothing is padded and the result equals a
    plain mean over all tokens.

    Parameters
    ----------
    text : str or list of str
        One text, or a batch of texts, accepted by the module-level
        ``tokenizer``.

    Returns
    -------
    numpy.ndarray
        A 1-D vector for a single text (or a batch of one); a 2-D array with
        one row per text for larger batches.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    # Broadcast the mask over the hidden dimension so padded positions contribute 0.
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    # clamp guards against division by zero should a row have no attended tokens.
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    # Drop only the batch axis (when it has size 1); .cpu() keeps .numpy() valid
    # if the model has been moved to an accelerator.
    return pooled.squeeze(0).cpu().numpy()

# Embed every article description (one model call per row) and store the
# resulting vectors next to the rows they came from
embeddings = [get_embedding(description) for description in df['Description']]

df['embedding'] = embeddings

In [11]:
# Simulate User Profile

# Row positions of the articles the simulated user has already read
user_read_indices = [0, 2, 5, 10, 20]  # Example indices

# The user profile is the element-wise mean of those articles' embeddings
read_embeddings = df['embedding'].iloc[user_read_indices].to_list()
user_embedding = np.stack(read_embeddings).mean(axis=0)

In [15]:
# Recommend Articles

# Compute similarity
# Stack the per-article vectors into one 2-D matrix so the similarity to the
# user profile is computed in a single vectorised call rather than one
# cosine_similarity call per row.
article_matrix = np.stack(df['embedding'].to_list())
similarities = cosine_similarity([user_embedding], article_matrix)[0]

df['similarity'] = similarities

# Get top 10 recommendations (excluding already-read)
# user_read_indices holds row positions (the profile is built with .iloc), while
# DataFrame.drop removes by index label. Translating positions to labels keeps
# the exclusion correct even if df does not have the default 0..n-1 index.
read_labels = df.index[user_read_indices]
recommendations = (
    df.drop(index=read_labels)
      .sort_values(by='similarity', ascending=False)
      .head(10)
)

# Show recommendations
recommendations[['Title', 'Description', 'similarity']]

Unnamed: 0,Title,Description,similarity
493,"Broncos 31, Raiders 3",Jake Plummer and Reuben Droughns did little wr...,0.85806
173,Bronze possible for Hill,"ATHENS -- Shawn Hill #39;s Olympics are over, ...",0.856977
425,National League Preview from The Sports Network,(Sports Network) - Barry Bonds can equal two h...,0.853722
474,10th World Series Title Beckoning for St. Louis,NEW YORK (Reuters) - The Cardinals may be hea...,0.852506
704,Rick Weaver,The Steelers and Miami Dolphins are in a tough...,0.851357
580,I #39;m not rushing myself: Tendulkar,Nagpur: Batting maestro Sachin Tendulkar today...,0.848712
705,"Garciaparra on mend, expected to return tonight",Chicago Cubs shortstop Nomar Garciaparra expec...,0.846664
274,End of Series will bring a lot of questions,Now what? Was winning the World Series truly t...,0.846251
62,"Jeter, Yankees Look Dashing Once Again",Here it comes. You can feel the rumbling of an...,0.845852
159,"Markets surge on oil drop, election",Wall Street bounded higher for the second stra...,0.845776
