In [1]:
from pathlib import Path
import ast

import pandas as pd

____

# Reading the files

In [2]:
DATA_DIR_PATH = Path("/mnt/c/Users/jolan/OneDrive - NTNU/First year/Semester 2/Recommender Systems/Group Project/data")

In [3]:
train_folder_name = "MINDsmall_train"
train_behavior_file_path = DATA_DIR_PATH / train_folder_name / "behaviors.tsv"
train_news_file_path = DATA_DIR_PATH / train_folder_name / "news.tsv"
train_entity_embedding_file_path = DATA_DIR_PATH / train_folder_name / "entity_embedding.vec"
train_relation_embedding_file_path = DATA_DIR_PATH / train_folder_name / "relation_embedding.vec"

In [4]:
behaviors_df = pd.read_csv(train_behavior_file_path, sep='\t', header=None)
news_df = pd.read_csv(train_news_file_path, sep='\t', header=None)

In [5]:
behaviors_df.columns = ['Impression ID', 'User ID', 'Time', 'History', 'Impressions']
news_df.columns = ['News ID', 'Category', 'SubCategory', 'Title', 'Abstract', 'URL', 'Title Entities', 'Abstract Entities']

In [6]:
news_df.drop(columns=['URL'], inplace=True)

In [7]:
def read_embedding(file_path: Path) -> dict:
    """ TODO """
    with open(file_path) as f:
        lines = f.readlines()
    embedding = {}
    for line in lines:
        line = line.strip().split()
        embedding[line[0]] = list(map(float, line[1:]))
    return embedding

In [8]:
entity_embedding = read_embedding(train_entity_embedding_file_path)
relation_embedding = read_embedding(train_relation_embedding_file_path)

____

# EDA

In [9]:
print(behaviors_df.columns)
print(news_df.columns)

Index(['Impression ID', 'User ID', 'Time', 'History', 'Impressions'], dtype='object')
Index(['News ID', 'Category', 'SubCategory', 'Title', 'Abstract',
       'Title Entities', 'Abstract Entities'],
      dtype='object')


## Behavior

In [None]:
behaviors_df.head(10)

Unnamed: 0,Impression ID,User ID,Time,History,Impressions
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...
5,6,U19739,11/11/2019 6:52:13 PM,N39074 N14343 N32607 N32320 N22007 N442 N19001...,N21119-1 N53696-0 N33619-1 N25722-0 N2869-0
6,7,U8355,11/11/2019 12:22:09 PM,N8419 N15771 N1431 N5888 N18663 N24123 N22130 ...,N51346-0 N33848-0 N15132-0 N10688-0 N6342-0 N6...
7,8,U46596,11/12/2019 10:29:36 PM,N47438 N20950 N21317 N5469,N7821-0 N24898-0 N12029-0 N13579-0 N42977-0 N3...
8,9,U79199,11/13/2019 10:13:02 AM,N37083 N459 N29499 N38118 N37378 N24691 N27235...,N51048-1 N64094-0 N13907-0 N39010-0
9,10,U53231,11/11/2019 11:28:11 AM,N58936 N15919 N11917 N2153 N55312 N13008 N4142...,N53585-1 N55689-0


In [11]:
behaviors_df['History'] = behaviors_df['History'].str.split()
behaviors_df['Impressions'] = behaviors_df['Impressions'].str.split()

In [12]:
history_df = behaviors_df[['Impression ID', 'History']].explode('History')
impressions_df = behaviors_df[['Impression ID', 'Impressions']].explode('Impressions')
impressions_df['Impressions'] = impressions_df['Impressions'].str.split('-')
impressions_df[['News ID', 'Clicked']] = pd.DataFrame(impressions_df['Impressions'].tolist(), index=impressions_df.index)
impressions_df.drop(columns=['Impressions'], inplace=True)
impressions_df.head(10)

Unnamed: 0,Impression ID,News ID,Clicked
0,1,N55689,1
0,1,N35729,0
1,2,N20678,0
1,2,N39317,0
1,2,N58114,0
1,2,N20495,0
1,2,N42977,0
1,2,N22407,0
1,2,N14592,0
1,2,N17059,1


In [13]:
history_df.head(10)

Unnamed: 0,Impression ID,History
0,1,N55189
0,1,N42782
0,1,N34694
0,1,N45794
0,1,N18445
0,1,N63302
0,1,N10414
0,1,N19347
0,1,N31801
1,2,N31739


## News

In [14]:
news_df.head(10)

Unnamed: 0,News ID,Category,SubCategory,Title,Abstract,Title Entities,Abstract Entities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...","[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",[],"[{""Label"": ""National Basketball Association"", ..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."
5,N2073,sports,football_nfl,Should NFL be able to fine players for critici...,Several fines came down against NFL players fo...,"[{""Label"": ""National Football League"", ""Type"":...","[{""Label"": ""National Football League"", ""Type"":..."
6,N49186,weather,weathertopstories,It's been Orlando's hottest October ever so fa...,There won't be a chill down to your bones this...,"[{""Label"": ""Orlando, Florida"", ""Type"": ""G"", ""W...","[{""Label"": ""Orlando, Florida"", ""Type"": ""G"", ""W..."
7,N59295,news,newsworld,Chile: Three die in supermarket fire amid prot...,Three people have died in a supermarket fire a...,"[{""Label"": ""Chile"", ""Type"": ""G"", ""WikidataId"":...","[{""Label"": ""Santiago"", ""Type"": ""G"", ""WikidataI..."
8,N24510,entertainment,gaming,Best PS5 games: top PlayStation 5 titles to lo...,Every confirmed or expected PS5 game we can't ...,"[{""Label"": ""PlayStation"", ""Type"": ""J"", ""Wikida...",[]
9,N39237,news,newsscienceandtechnology,"How to report weather-related closings, delays","When there are active closings, view them here...",[],"[{""Label"": ""WXII-TV"", ""Type"": ""M"", ""WikidataId..."


In [15]:
news_df['Title Entities'] = news_df['Title Entities'].apply(lambda x: [] if pd.isna(x) else x)
news_df['Abstract Entities'] = news_df['Abstract Entities'].apply(lambda x: [] if pd.isna(x) else x)

In [16]:
news_df['Title Entities'] = news_df['Title Entities'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
news_df['Abstract Entities'] = news_df['Abstract Entities'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

In [17]:
news_df['len Title Entities'] = news_df['Title Entities'].apply(len)
news_df['len Abstract Entities'] = news_df['Abstract Entities'].apply(len)
news_df['sum len Entities'] = news_df['len Title Entities'] + news_df['len Abstract Entities']

In [27]:
news_df.head(10)

Unnamed: 0,News ID,Category,SubCategory,Title,Abstract,Title Entities,Abstract Entities,len Title Entities,len Abstract Entities,sum len Entities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...","[{'Label': 'Prince Philip, Duke of Edinburgh',...",[],3,0,3
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,"[{'Label': 'Adipose tissue', 'Type': 'C', 'Wik...","[{'Label': 'Adipose tissue', 'Type': 'C', 'Wik...",1,1,2
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,[],"[{'Label': 'Ukraine', 'Type': 'G', 'WikidataId...",0,1,1
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",[],"[{'Label': 'National Basketball Association', ...",0,1,1
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...","[{'Label': 'Skin tag', 'Type': 'C', 'WikidataI...","[{'Label': 'Skin tag', 'Type': 'C', 'WikidataI...",1,3,4
5,N2073,sports,football_nfl,Should NFL be able to fine players for critici...,Several fines came down against NFL players fo...,"[{'Label': 'National Football League', 'Type':...","[{'Label': 'National Football League', 'Type':...",1,1,2
6,N49186,weather,weathertopstories,It's been Orlando's hottest October ever so fa...,There won't be a chill down to your bones this...,"[{'Label': 'Orlando, Florida', 'Type': 'G', 'W...","[{'Label': 'Orlando, Florida', 'Type': 'G', 'W...",1,1,2
7,N59295,news,newsworld,Chile: Three die in supermarket fire amid prot...,Three people have died in a supermarket fire a...,"[{'Label': 'Chile', 'Type': 'G', 'WikidataId':...","[{'Label': 'Santiago', 'Type': 'G', 'WikidataI...",1,2,3
8,N24510,entertainment,gaming,Best PS5 games: top PlayStation 5 titles to lo...,Every confirmed or expected PS5 game we can't ...,"[{'Label': 'PlayStation', 'Type': 'J', 'Wikida...",[],1,0,1
9,N39237,news,newsscienceandtechnology,"How to report weather-related closings, delays","When there are active closings, view them here...",[],"[{'Label': 'WXII-TV', 'Type': 'M', 'WikidataId...",0,2,2


In [18]:
title_entities_df = news_df[['News ID', 'Title Entities']].explode('Title Entities')
abstract_entities_df = news_df[['News ID', 'Abstract Entities']].explode('Abstract Entities')

In [19]:
title_entities_df = pd.concat([title_entities_df, title_entities_df['Title Entities'].apply(pd.Series)], axis=1)
abstract_entities_df = pd.concat([abstract_entities_df, abstract_entities_df['Abstract Entities'].apply(pd.Series)], axis=1)

title_entities_df.drop(columns=['Title Entities'], inplace=True)
abstract_entities_df.drop(columns=['Abstract Entities'], inplace=True)

In [20]:
title_entities_df.drop(columns=[0], inplace=True)
abstract_entities_df.drop(columns=[0], inplace=True)

In [21]:
title_entities_df.head(10)

Unnamed: 0,News ID,Label,Type,WikidataId,Confidence,OccurrenceOffsets,SurfaceForms
0,N55528,"Prince Philip, Duke of Edinburgh",P,Q80976,1.0,[48],[Prince Philip]
0,N55528,"Charles, Prince of Wales",P,Q43274,1.0,[28],[Prince Charles]
0,N55528,Elizabeth II,P,Q9682,0.97,[11],[Queen Elizabeth]
1,N19639,Adipose tissue,C,Q193583,1.0,[20],[Belly Fat]
2,N61837,,,,,,
3,N53526,,,,,,
4,N38324,Skin tag,C,Q3179593,1.0,[18],[Skin Tags]
5,N2073,National Football League,O,Q1215884,1.0,[7],[NFL]
6,N49186,"Orlando, Florida",G,Q49233,0.962,[10],[Orlando]
7,N59295,Chile,G,Q298,0.988,[0],[Chile]


In [22]:
abstract_entities_df.head(10)

Unnamed: 0,News ID,Label,Type,WikidataId,Confidence,OccurrenceOffsets,SurfaceForms
0,N55528,,,,,,
1,N19639,Adipose tissue,C,Q193583,1.0,[97],[belly fat]
2,N61837,Ukraine,G,Q212,0.946,[87],[Ukraine]
3,N53526,National Basketball Association,O,Q155223,1.0,[40],[NBA]
4,N38324,Skin tag,C,Q3179593,1.0,[105],[Skin Tags]
4,N38324,Dermatology,C,Q171171,1.0,[131],[Dermatologist]
4,N38324,Reader's Digest,M,Q371820,0.999,[163],[Reader's Digest]
5,N2073,National Football League,O,Q1215884,1.0,[32],[NFL]
6,N49186,"Orlando, Florida",G,Q49233,0.962,[60],[Orlando]
7,N59295,Santiago,G,Q2887,0.995,[125],[Santiago]


In [23]:
news_df['sum len Entities'].max()

np.int64(32)

In [24]:
print(news_df['SubCategory'].nunique())
print(len(news_df))

264
51282


## Entity Embedding

In [25]:
entity_embedding.keys()

dict_keys(['Q41', 'Q1860', 'Q39631', 'Q30', 'Q60', 'Q847017', 'Q183', 'Q2736', 'Q21198', 'Q131524', 'Q12788174', 'Q142', 'Q298', 'Q2887', 'Q1321', 'Q155', 'Q15180', 'Q408', 'Q177220', 'Q38', 'Q752297', 'Q29', 'Q2807', 'Q214317', 'Q336286', 'Q1362411', 'Q37226', 'Q213', 'Q8678', 'Q90', 'Q11424', 'Q25089', 'Q19877770', 'Q160432', 'Q228943', 'Q83287', 'Q1384', 'Q2256', 'Q145', 'Q62', 'Q5083', 'Q1085', 'Q175621', 'Q178194', 'Q174', 'Q128581', 'Q337585', 'Q33', 'Q193592', 'Q1757', 'Q9617', 'Q19453', 'Q9592', 'Q611644', 'Q250867', 'Q10884', 'Q31', 'Q8420', 'Q924', 'Q641', 'Q490', 'Q12152', 'Q1461', 'Q928', 'Q275482', 'Q1781', 'Q797', 'Q860626', 'Q1146335', 'Q126399', 'Q639669', 'Q131578', 'Q85', 'Q79', 'Q282722', 'Q467', 'Q202466', 'Q144', 'Q1420', 'Q1064904', 'Q8074', 'Q43801', 'Q20', 'Q280658', 'Q8341', 'Q2155167', 'Q1166', 'Q237', 'Q96', 'Q134556', 'Q4967196', 'Q9212', 'Q1037', 'Q159', 'Q203059', 'Q12439', 'Q49088', 'Q174710', 'Q113603', 'Q640506', 'Q16', 'Q839078', 'Q8646', 'Q1976985', '

## Relation Embedding

In [26]:
relation_embedding.keys()

dict_keys(['P31', 'P21', 'P106', 'P735', 'P108', 'P101', 'P69', 'P27', 'P19', 'P1412', 'P1343', 'P20', 'P509', 'P1196', 'P734', 'P17', 'P641', 'P463', 'P131', 'P159', 'P39', 'P3373', 'P551', 'P793', 'P2094', 'P1344', 'P1303', 'P512', 'P84', 'P466', 'P462', 'P361', 'P3032', 'P413', 'P1532', 'P118', 'P54', 'P366', 'P527', 'P119', 'P495', 'P58', 'P57', 'P161', 'P272', 'P364', 'P162', 'P1040', 'P1657', 'P750', 'P840', 'P344', 'P136', 'P1552', 'P2758', 'P5150', 'P115', 'P276', 'P710', 'P156', 'P5008', 'P102', 'P1433', 'P50', 'P921', 'P2860', 'P407', 'P195', 'P170', 'P180', 'P1598', 'P140', 'P186', 'P3602', 'P137', 'P4552', 'P915', 'P86', 'P1411', 'P166', 'P2554', 'P5970', 'P26', 'P144', 'P6216', 'P197', 'P421', 'P127', 'P25', 'P22', 'P97', 'P53', 'P1290', 'P1001', 'P179', 'P155', 'P674', 'P1884', 'P2522', 'P264', 'P175', 'P410', 'P241', 'P607', 'P676', 'P1889', 'P1142', 'P6886', 'P279', 'P6379', 'P6275', 'P2416', 'P1441', 'P4584', 'P1080', 'P404', 'P400', 'P123', 'P178', 'P937', 'P40', 'P13