In [37]:
from sentence_transformers import SentenceTransformer, util
from scipy import spatial
import pandas as pd
# An English sentence and its Norwegian translation, used as a quick sanity
# check that the multilingual model places them close together.
sentences = [
    "This is a Norwegian boy",
    "Dette er en norsk gutt",
]

# Load the sentence-embedding model and encode both sentences in one call.
model = SentenceTransformer('NbAiLab/nb-sbert-base')
embeddings = model.encode(sentences)

# Last expression of the cell: display the embedding matrix.
embeddings

array([[ 1.4281031 ,  0.39518782,  0.06262632, ...,  1.1759279 ,
         0.59693366, -0.12693629],
       [ 1.4975808 , -0.52666426, -0.03449104, ...,  1.3204104 ,
         0.85494673,  0.14108105]], dtype=float32)

In [38]:
# Name the two sentence vectors once so both similarity computations
# visibly operate on the same inputs.
vec_english = embeddings[0]
vec_norwegian = embeddings[1]

# Cosine similarity via the sentence-transformers helper
cosine_scores = util.cos_sim(vec_english, vec_norwegian)
print(cosine_scores)

# Cross-check with SciPy: cosine *distance* is 1 - similarity, so invert it
scipy_cosine_scores = 1 - spatial.distance.cosine(vec_english, vec_norwegian)
print(scipy_cosine_scores)

tensor([[0.8250]])
0.8250487446784973


In [39]:
from keybert import KeyBERT
from bertopic import BERTopic
# Keyword extractor that reuses the current `model` object as its backend
# (the SentenceTransformer, when cells are run top to bottom).
kw_model = KeyBERT(model=model)
# NOTE(review): `model` is rebound here from the embedding model to the
# BERTopic instance. Statement order matters (kw_model must be created first),
# and re-running this cell without re-running the embedding cell would pass
# the BERTopic object itself as `embedding_model`.
model = BERTopic(embedding_model=model, verbose=True)

In [40]:
# from sklearn.datasets import fetch_20newsgroups

# docs = fetch_20newsgroups(subset='all',  remove=('headers', 'footers', 'quotes'))['data']
# docs = docs[:1000]

In [41]:
# Load the NER-annotated articles and drop every row with a missing value.
# The index is then renumbered to 0..n-1: dropna() keeps the original labels,
# so without the reset the frame has gaps and any later code that addresses
# rows by range(len(...)) would hit wrong or missing labels.
# Building the frame in one chain (no inplace=True) also makes the cell
# idempotent on re-run.
df = (
    pd.read_csv('Data_NER.csv')
    .dropna()
    .reset_index(drop=True)
)

In [42]:
# Collect every subtitle into a plain Python list; these are the documents
# handed to the topic model.
docs = df['Subtitle'].tolist()

In [43]:
# Fit the topic model on all subtitles. Per the recorded log this embeds the
# documents, reduces dimensionality and clusters them; the embedding step
# alone took roughly 20 minutes in the recorded run.
model.fit(docs)

Batches: 100%|██████████| 517/517 [19:43<00:00,  2.29s/it]
2022-12-07 17:16:12,756 - BERTopic - Transformed documents to Embeddings
2022-12-07 17:16:34,703 - BERTopic - Reduced dimensionality
2022-12-07 17:16:38,028 - BERTopic - Clustered reduced embeddings


<bertopic._bertopic.BERTopic at 0x1d39272b730>

In [47]:
# Render BERTopic's topic visualization.
# NOTE(review): the recorded run failed with
# "Mime type rendering requires nbformat>=4.2.0 but it is not installed" --
# an environment problem, not a code problem. Install/upgrade nbformat in the
# kernel's environment and restart the kernel before re-running this cell.
model.visualize_topics()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [84]:
# Summary table of the fitted topics (columns shown in the output:
# Topic, Count, Name).
topic_info = model.get_topic_info()

# Lower-case the topic names first so the substring match on 'haaland'
# is case-insensitive, then keep only the matching rows.
topic_names_lower = topic_info['Name'].str.lower()
topic_info[topic_names_lower.str.contains('haaland')]

Unnamed: 0,Topic,Count,Name
6,5,263,5_haaland_erling_braut_city
161,160,10,160_sverige_rangers_sang_haaland


In [85]:
# Display the complete topic table produced by get_topic_info().
topic_info

Unnamed: 0,Topic,Count,Name
0,-1,5507,-1_på_en_og_er
1,0,3020,0_ukraina_russiske_russland_putin
2,1,431,1_hun_da_fikk_legene
3,2,429,2_dronning_prins_prinsesse_elizabeth
4,3,278,3_drept_mann_siktet_kvinne
...,...,...,...
159,158,10,158_ekskona_grunlegger_amazon_skodesigner
160,159,10,159_janne_andersson_straffe_sveriges
161,160,10,160_sverige_rangers_sang_haaland
162,161,10,161_kvinnehelse_kvinner_gjerrig_selvsagt


In [52]:
# Smoke test: predict topics for an English phrase and its Norwegian
# counterpart with the fitted model.
# NOTE(review): `asd` is a throwaway name; it is read again by a later cell.
asd = model.transform(['hello world', 'hallo verden'])

Batches: 100%|██████████| 1/1 [00:00<00:00, 20.41it/s]
2022-12-07 17:36:33,586 - BERTopic - Reduced dimensionality
2022-12-07 17:36:33,588 - BERTopic - Predicted clusters


In [59]:
# Show the transform result for the two test phrases: a pair of
# (predicted topic ids, probabilities), as seen in the recorded output.
asd

([18, 7], array([1.        , 0.71400195]))

In [54]:
# Inspect the (term, weight) pairs that describe topic 7 -- the topic that
# 'hallo verden' was assigned to in the recorded transform output.
model.get_topic(7)

[('jeg', 0.06034656787040656),
 ('meg', 0.04140010768785461),
 ('ikke', 0.020729475707607532),
 ('dette', 0.01938208132402451),
 ('hva', 0.015338045145589568),
 ('det', 0.014283259285607794),
 ('hadde', 0.01293093760318717),
 ('veldig', 0.012576391548908291),
 ('min', 0.012536958699112668),
 ('si', 0.012053349978694037)]

In [60]:
# Predict a topic id and probability for every document in `docs`.
# The recorded run re-embedded all documents (517 batches, ~20 minutes),
# so this cell is expensive to re-run.
model_results = model.transform(docs)

Batches: 100%|██████████| 517/517 [19:39<00:00,  2.28s/it]
2022-12-07 18:02:03,006 - BERTopic - Reduced dimensionality
2022-12-07 18:02:03,580 - BERTopic - Predicted clusters


In [71]:
# Attach the predicted topic id and probability to each row of `df`.
#
# `model_results` is the (topic ids, probabilities) pair returned by
# model.transform(docs); entry k belongs to the k-th document, i.e. the k-th
# row of `df`.
#
# The columns are assigned as whole sequences instead of element by element:
#   * assigning a plain list is positional, so it stays correct even when the
#     frame's index is not 0..n-1 (e.g. after dropna());
#   * it avoids chained assignment (df[col][i] = ...), which raises
#     SettingWithCopyWarning and is not guaranteed to write into `df`;
#   * the probability column is created as float from the start rather than
#     as an int column that floats are later written into.
topic_ids = model_results[0]
topic_probs = model_results[1]

# Fail loudly if predictions and rows got out of sync (for example because
# `df` was modified after `docs` was built) instead of silently mislabeling.
if len(topic_ids) != len(df) or len(topic_probs) != len(df):
    raise ValueError(
        f"model_results has {len(topic_ids)} topics / {len(topic_probs)} "
        f"probabilities but df has {len(df)} rows"
    )

df['BERT_topic_num'] = [int(topic_id) for topic_id in topic_ids]
df['BERT_topic_prob'] = [float(topic_prob) for topic_prob in topic_probs]

In [72]:
# Display the frame to eyeball the BERT_topic_num / BERT_topic_prob columns
# (pandas truncates the rendering of long frames).
df

Unnamed: 0.1,Unnamed: 0,Link,Headline,Category,Subtitle,Keywords,subtitle_entities,keywords_entities,BERT_topic_num,BERT_topic_prob
0,0,https://www.dagbladet.no/nyheter/kaotisk-nytta...,kaotisk nyttarsnatt ute av kontroll,nyheter,Det har vært en hektisk nyttårsnatt for politi...,nyheter,{},{},-1,0.000000
1,1,https://www.dagbladet.no/tema/fra-singel-til-d...,fra singel til damemagnet,tema,Ekspertenes beste råd til hvordan du blir attr...,"kjæreste,dating,singel",{},{},13,0.364456
2,2,https://www.dagbladet.no/tema/strom-nytt-fra-j...,strom nytt fra januar,tema,"Nye boligregler, bedre jobbpensjon og dyrere f...","metadz,økonomi,pensjon,skatt",{},{},-1,0.000000
3,3,https://www.dagbladet.no/tema/anders-kvitt-mag...,anders kvitt magefettet i superfart 1,tema,"- Hvis folk bare visste hvor enkelt det er, ha...","magefett,vektnedgang,kosthold,styrketrening,",{'Anders Muren': 'PER'},{},8,1.000000
4,4,https://www.dagbladet.no/nyheter/taus-om-overg...,taus om overgrepsdom,nyheter,Trump ønsket «henne alt godt» i fjor sommer. N...,"donald trump,ghislaine maxwell,utenriks,nyhete...","{'Trump': 'PER', 'Ghislaine Maxwell': 'PER', '...","{'donald trump': 'PER', 'usa': 'LOC', 'jeffrey...",-1,0.000000
...,...,...,...,...,...,...,...,...,...,...
16529,17417,https://www.dagbladet.no/sport/hjerteskjaerend...,hjerteskjaerende melding,sport,Spania og Tyskland spilte 1-1. Det betyr at sp...,"qatar-vm,sport","{'Spania': 'ORG', 'Tyskland': 'ORG', 'Qatar - ...",{},-1,0.000000
16530,17418,https://www.dagbladet.no/nyheter/venter-stromh...,venter stromhopp i vinter,nyheter,"Lavere temperaturer og lite vind i Europa, kan...","strømpriser,været,nyheter,innenriks",{'Europa': 'LOC'},{},48,0.958357
16531,17419,https://www.dagbladet.no/kjendis/betatt-av-hol...,betatt av hollywood stjerne,kjendis,Ifølge ei ny bok var kong Charles litt forelsk...,"forelsket,kjendis,kong charles,barbra streisand",{'Charles': 'PER'},{},2,1.000000
16532,17420,https://www.dagbladet.no/tema/fra-sexlei-til-v...,fra sexlei til vanvittig tent,tema,Kvinnen ville ikke ha sex lenger. Så gjorde ma...,"xavierx,kåt,sex,",{},{},-1,0.000000


In [86]:
# Persist the topic-annotated frame; index=False keeps the row index out of
# the written CSV.
df.to_csv('Data_NER_BERT.csv', index=False)