# Dataset transformation and mapping

Using [Microsoft News Recommendation Dataset](https://www.kaggle.com/datasets/arashnic/mind-news-dataset)



In [None]:
!pip install transformers
!pip install torch
!pip install bertopic
!pip install pinecone-client

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from transformers import BertTokenizer, BertModel
import pinecone
import torch


In [3]:
news = pd.read_csv(
    "/content/news.tsv",
    sep="\t",
    names=["itemId","category","subcategory","title","abstract","url","title_entities","abstract_entities"])
print(f"The article data consist in total of {len(news)} number of articles.")
news.head()

The article data consist in total of 51282 number of articles.


Unnamed: 0,itemId,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."


In [5]:
raw_behaviour = pd.read_csv(
    "/content/behaviors.tsv",
    sep="\t",
    names=["impressionId","userId","timestamp","click_history","impressions"])

print(f"The dataset originally consist of {len(raw_behaviour)} number of interactions.")
raw_behaviour.head()

The dataset originally consist of 156965 number of interactions.


Unnamed: 0,impressionId,userId,timestamp,click_history,impressions
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...


In [6]:
# Function to split the impressions and clicks into two seperate lists
def process_impression(impression_list):
    list_of_strings = impression_list.split()
    click = [x.split('-')[0] for x in list_of_strings if x.split('-')[1] == '1']
    non_click = [x.split('-')[0] for x in list_of_strings if x.split('-')[1] == '0']
    return click,non_click

# We can then indexize these two new columns:
raw_behaviour['click'], raw_behaviour['noclicks'] = zip(*raw_behaviour['impressions'].map(process_impression))

# Add timestamps
raw_behaviour['tmstp'] = pd.to_datetime(raw_behaviour['timestamp'])
raw_behaviour['tmstp'].fillna(pd.to_datetime('1970-01-01'), inplace=True)
raw_behaviour['tmstp'] = raw_behaviour['tmstp'].apply(lambda x: x.timestamp())
raw_behaviour['tmstp'] = pd.to_datetime(raw_behaviour['tmstp']).values.astype(np.int64)

# Convert timestamp value to hours since epoch
raw_behaviour['epochhrs'] = pd.to_datetime(raw_behaviour['timestamp']).values.astype(np.int64)/(1e6)/1000/3600
raw_behaviour['epochhrs'] = raw_behaviour['epochhrs'].round()

# If there exists several clicks in one session, expand to new observation
raw_behaviour = raw_behaviour.explode("click").reset_index(drop=True)

raw_behaviour

Unnamed: 0,impressionId,userId,timestamp,click_history,impressions,click,noclicks,tmstp,epochhrs
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0,N55689,[N35729],1573463158,437073.0
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...,N17059,"[N20678, N39317, N58114, N20495, N42977, N2240...",1573582290,437106.0
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...,N23814,"[N50014, N23877, N35389, N49712, N16844, N5968...",1573714908,437143.0
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0,N49685,"[N35729, N33632, N27581]",1573450085,437069.0
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...,N8400,"[N39985, N36050, N16096, N22407, N60408, N6149...",1573575081,437104.0
...,...,...,...,...,...,...,...,...,...
236339,156964,U44625,11/13/2019 2:57:02 PM,N4118 N47297 N3164 N43295 N6056 N38747 N42973 ...,N6219-0 N3663-0 N31147-0 N58363-0 N4107-0 N457...,N50007,"[N6219, N3663, N31147, N58363, N4107, N4573, N...",1573657022,437127.0
236340,156964,U44625,11/13/2019 2:57:02 PM,N4118 N47297 N3164 N43295 N6056 N38747 N42973 ...,N6219-0 N3663-0 N31147-0 N58363-0 N4107-0 N457...,N366,"[N6219, N3663, N31147, N58363, N4107, N4573, N...",1573657022,437127.0
236341,156964,U44625,11/13/2019 2:57:02 PM,N4118 N47297 N3164 N43295 N6056 N38747 N42973 ...,N6219-0 N3663-0 N31147-0 N58363-0 N4107-0 N457...,N18573,"[N6219, N3663, N31147, N58363, N4107, N4573, N...",1573657022,437127.0
236342,156964,U44625,11/13/2019 2:57:02 PM,N4118 N47297 N3164 N43295 N6056 N38747 N42973 ...,N6219-0 N3663-0 N31147-0 N58363-0 N4107-0 N457...,N20630,"[N6219, N3663, N31147, N58363, N4107, N4573, N...",1573657022,437127.0


In [7]:
# Extract the clicks from the previous clicks
click_history = raw_behaviour[["userId","click_history"]].drop_duplicates().dropna()
click_history["click_history"] = click_history.click_history.map(lambda x: x.split())
click_history = click_history.explode("click_history").rename(columns={"click_history":"click"})
click_history["noclicks"] = pd.Series([[] for _ in range(len(click_history.index))])
click_history["epochhrs"] = raw_behaviour.epochhrs.min()

click_history


Unnamed: 0,userId,click,noclicks,epochhrs
0,U13740,N55189,[],437016.0
0,U13740,N42782,[],437016.0
0,U13740,N34694,[],437016.0
0,U13740,N45794,[],437016.0
0,U13740,N18445,[],437016.0
...,...,...,...,...
236327,U66493,N62940,[],437016.0
236327,U66493,N56889,[],437016.0
236334,U72015,N53895,[],437016.0
236334,U72015,N48715,[],437016.0


In [8]:
# concatenate historical clicks with the raw_behaviour
raw_behaviour = pd.concat([raw_behaviour,click_history],axis=0).reset_index(drop=True)
print(f"The dataset after pre-processing consist of {len(raw_behaviour)} number of interactions.")

raw_behaviour

The dataset after pre-processing consist of 1162402 number of interactions.


Unnamed: 0,impressionId,userId,timestamp,click_history,impressions,click,noclicks,tmstp,epochhrs
0,1.0,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0,N55689,[N35729],1.573463e+09,437073.0
1,2.0,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...,N17059,"[N20678, N39317, N58114, N20495, N42977, N2240...",1.573582e+09,437106.0
2,3.0,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...,N23814,"[N50014, N23877, N35389, N49712, N16844, N5968...",1.573715e+09,437143.0
3,4.0,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0,N49685,"[N35729, N33632, N27581]",1.573450e+09,437069.0
4,5.0,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...,N8400,"[N39985, N36050, N16096, N22407, N60408, N6149...",1.573575e+09,437104.0
...,...,...,...,...,...,...,...,...,...
1162397,,U66493,,,,N62940,[],,437016.0
1162398,,U66493,,,,N56889,[],,437016.0
1162399,,U72015,,,,N53895,[],,437016.0
1162400,,U72015,,,,N48715,[],,437016.0


In [9]:
min_click_cutoff = 100
print(f'Number of items that have less than {min_click_cutoff} clicks make up',np.round(np.mean(raw_behaviour.groupby("click").size() < min_click_cutoff)*100,3),'% of the total, and these will be removed.')
# remove items with less clicks than min_click_cutoff
raw_behaviour = raw_behaviour[raw_behaviour.groupby("click")["userId"].transform('size') >= min_click_cutoff].reset_index(drop=True)

Number of items that have less than 100 clicks make up 93.852 % of the total, and these will be removed.


In [10]:
# Get a set with all the unique items
click_set = set(raw_behaviour['click'].unique())
# remove items for impressions that is not avaiable in the click set (the items that we will be training on)
raw_behaviour['noclicks'] = raw_behaviour['noclicks'].apply(lambda impressions: [impression for impression in impressions if impression in click_set])

## Select the columns that we now want to use for further analysis
behaviour = raw_behaviour[['tmstp','epochhrs','userId','click','noclicks']].copy()

print('Number of interactions in the behaviour dataset:', behaviour.shape[0])
print('Number of users in the behaviour dataset:', behaviour.userId.nunique())
print('Number of articles in the behaviour dataset:', behaviour.click.nunique())

behaviour.head()

Number of interactions in the behaviour dataset: 781871
Number of users in the behaviour dataset: 49832
Number of articles in the behaviour dataset: 2451


Unnamed: 0,tmstp,epochhrs,userId,click,noclicks
0,1573463000.0,437073.0,U13740,N55689,[N35729]
1,1573582000.0,437106.0,U91836,N17059,"[N20678, N39317, N58114, N20495, N42977, N1459..."
2,1573715000.0,437143.0,U73700,N23814,"[N23877, N35389, N49712, N16844, N59685, N2344..."
3,1573450000.0,437069.0,U34670,N49685,"[N35729, N33632, N27581]"
4,1573498000.0,437083.0,U19739,N33619,[]


In [11]:
target_behaviour = raw_behaviour[['tmstp','timestamp','epochhrs','userId','click']].copy()
print('Number of interactions in the behaviour dataset:', target_behaviour.shape[0])
print('Number of users in the behaviour dataset:', target_behaviour.userId.nunique())
print('Number of articles in the behaviour dataset:', target_behaviour.click.nunique())

target_behaviour

Number of interactions in the behaviour dataset: 781871
Number of users in the behaviour dataset: 49832
Number of articles in the behaviour dataset: 2451


Unnamed: 0,tmstp,timestamp,epochhrs,userId,click
0,1.573463e+09,11/11/2019 9:05:58 AM,437073.0,U13740,N55689
1,1.573582e+09,11/12/2019 6:11:30 PM,437106.0,U91836,N17059
2,1.573715e+09,11/14/2019 7:01:48 AM,437143.0,U73700,N23814
3,1.573450e+09,11/11/2019 5:28:05 AM,437069.0,U34670,N49685
4,1.573498e+09,11/11/2019 6:52:13 PM,437083.0,U19739,N33619
...,...,...,...,...,...
781866,,,437016.0,U66493,N4255
781867,,,437016.0,U66493,N62940
781868,,,437016.0,U66493,N56889
781869,,,437016.0,U72015,N53895


In [13]:
df_sorted = target_behaviour.sort_values(['userId', 'tmstp'], ascending=[True, False])
top_1000_rows = df_sorted.groupby('userId').head(1000)
top_1000_rows.drop(['epochhrs','timestamp'], axis=1)

top_1000_rows = top_1000_rows.rename(columns={"click":"itemId"})

top_1000_rows

Unnamed: 0,tmstp,timestamp,epochhrs,userId,itemId
731547,,,437016.0,U100,N20121
731548,,,437016.0,U100,N33998
731549,,,437016.0,U100,N45954
731550,,,437016.0,U100,N55743
731551,,,437016.0,U100,N18870
...,...,...,...,...,...
750132,,,437016.0,U9999,N62471
750133,,,437016.0,U9999,N38256
750134,,,437016.0,U9999,N4486
750135,,,437016.0,U9999,N14761


In [14]:
data = pd.merge(top_1000_rows, news, on='itemId')

data

Unnamed: 0,tmstp,timestamp,epochhrs,userId,itemId,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,,,437016.0,U100,N20121,music,musicnews,"Bob Kingsley, Country Radio Legend, Dead at 80","Country radio legend Bob Kingley, the longtime...",https://assets.msn.com/labs/mind/AAIW26m.html,"[{""Label"": ""Bob Kingsley"", ""Type"": ""P"", ""Wikid...","[{""Label"": ""Bob Kingsley"", ""Type"": ""P"", ""Wikid..."
1,,,437016.0,U10034,N20121,music,musicnews,"Bob Kingsley, Country Radio Legend, Dead at 80","Country radio legend Bob Kingley, the longtime...",https://assets.msn.com/labs/mind/AAIW26m.html,"[{""Label"": ""Bob Kingsley"", ""Type"": ""P"", ""Wikid...","[{""Label"": ""Bob Kingsley"", ""Type"": ""P"", ""Wikid..."
2,,,437016.0,U10058,N20121,music,musicnews,"Bob Kingsley, Country Radio Legend, Dead at 80","Country radio legend Bob Kingley, the longtime...",https://assets.msn.com/labs/mind/AAIW26m.html,"[{""Label"": ""Bob Kingsley"", ""Type"": ""P"", ""Wikid...","[{""Label"": ""Bob Kingsley"", ""Type"": ""P"", ""Wikid..."
3,,,437016.0,U10098,N20121,music,musicnews,"Bob Kingsley, Country Radio Legend, Dead at 80","Country radio legend Bob Kingley, the longtime...",https://assets.msn.com/labs/mind/AAIW26m.html,"[{""Label"": ""Bob Kingsley"", ""Type"": ""P"", ""Wikid...","[{""Label"": ""Bob Kingsley"", ""Type"": ""P"", ""Wikid..."
4,,,437016.0,U10149,N20121,music,musicnews,"Bob Kingsley, Country Radio Legend, Dead at 80","Country radio legend Bob Kingley, the longtime...",https://assets.msn.com/labs/mind/AAIW26m.html,"[{""Label"": ""Bob Kingsley"", ""Type"": ""P"", ""Wikid...","[{""Label"": ""Bob Kingsley"", ""Type"": ""P"", ""Wikid..."
...,...,...,...,...,...,...,...,...,...,...,...,...
781866,,,437016.0,U90290,N9647,video,wonder,Scientists freak out over deep sea feast,"A team of researchers stumbled upon a ""whale f...",https://assets.msn.com/labs/mind/AAIXD2w.html,[],[]
781867,,,437016.0,U91426,N9647,video,wonder,Scientists freak out over deep sea feast,"A team of researchers stumbled upon a ""whale f...",https://assets.msn.com/labs/mind/AAIXD2w.html,[],[]
781868,,,437016.0,U91620,N9647,video,wonder,Scientists freak out over deep sea feast,"A team of researchers stumbled upon a ""whale f...",https://assets.msn.com/labs/mind/AAIXD2w.html,[],[]
781869,,,437016.0,U93250,N9647,video,wonder,Scientists freak out over deep sea feast,"A team of researchers stumbled upon a ""whale f...",https://assets.msn.com/labs/mind/AAIXD2w.html,[],[]


In [15]:
data.to_csv("combined.csv")

# Generate embeddings and upload to pinceone

In [None]:
YOUR_API_KEY = ""
YOUR_ENV = ""

pinecone.init(
    api_key=YOUR_API_KEY,
    environment=YOUR_ENV
)

indices = pinecone.list_indexes()
print(indices)
index = pinecone.Index('mindnews')
stats = index.describe_index_stats()
print(stats)
data = pd.read_csv("/content/combined.csv")
combined = data[:1000]

['mindnews']
{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}


In [None]:
index = pinecone.Index('mindnews')
index

<pinecone.index.Index at 0x7f36e3b8add0>

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)
model = model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
text = combined.title.tolist()

tokens = tokenizer(text, padding=True, truncation=True, return_tensors="pt").to(device)
with torch.no_grad():
    outputs = model(**tokens)
    embeddings = outputs.last_hidden_state.mean(dim=1)  # Use average pooling over token embeddings
    print(embeddings)

tensor([[-0.1826, -0.1199,  0.3654,  ..., -0.1021,  0.1346, -0.3270],
        [ 0.2373, -0.1507,  0.4501,  ..., -0.0460,  0.1540, -0.1651],
        [-0.1893, -0.0424,  0.2154,  ..., -0.0758,  0.0500, -0.1756],
        ...,
        [-0.1251, -0.1115,  0.2427,  ..., -0.3053, -0.1087, -0.0497],
        [ 0.0268, -0.3382,  0.4264,  ...,  0.1346,  0.1572, -0.3507],
        [-0.0383, -0.2010,  0.3787,  ..., -0.0722, -0.0581, -0.1697]],
       device='cuda:0')


In [None]:
len(embeddings)

1000

In [None]:
from tqdm.auto import tqdm

batch_size = 200

for i in tqdm(range(0, len(combined), batch_size)):
    # find end of batch
    i_end = min(i+batch_size, len(combined))
    # extract metadata batch
    meta_batch = combined.iloc[i:i_end]
    meta_dict = meta_batch.to_dict(orient="records")
    # concatinate all metadata field except for id and year to form a single string
    title = combined.title[i:i_end]
    # create dense vectors
    dense_embeds = embeddings[i:i_end]
    # create unique IDs
    ids = [str(x) for x in range(i, i_end)]

    upserts = []
    # loop through the data and create dictionaries for uploading documents to pinecone index
    for _id, dense, meta in zip(ids, dense_embeds, meta_dict):
        upserts.append({
            'id': _id,
            'values': [tensor.item() for tensor in dense],
            'metadata': meta
        })

    # upload the documents to the new hybrid index
    index.upsert(upserts)
# print(full)
# show index description after uploading the documents
#index.describe_index_stats()


  0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
#query category
index.query(
    vector=[0] * 768,
    filter={
        "category": {"$eq": "lifestyle music tv sports"},
    },
    top_k=5,
    include_metadata=True
)

{'matches': [{'id': '480',
              'metadata': {'Unnamed: 0': 480.0,
                           'abstract': "Kiss won't be playing Australia and NZ "
                                       "after all. You might say she's scary "
                                       'good at playing dress-up, because '
                                       "Heidi Klum's 2019 Halloween costume is "
                                       'even more impressive than we could '
                                       "have imagined   and that's saying a "
                                       'lot, considering transformative '
                                       'Halloween costumes are kind of her '
                                       'thing. But this year Klum took it up '
                                       'one more notch as she shared her '
                                       'metamorphosis into, Social media had a '
                                       "field day with Baker Mayfie