In [1]:
from sentence_transformers import SentenceTransformer, util
from scipy import spatial
import pandas as pd
sentences = ["This is a Norwegian boy", "Dette er en norsk gutt"]

model = SentenceTransformer('NbAiLab/nb-sbert-base')
embeddings = model.encode(sentences)
embeddings

  from .autonotebook import tqdm as notebook_tqdm


array([[ 1.4281031 ,  0.39518782,  0.06262632, ...,  1.1759279 ,
         0.59693366, -0.12693629],
       [ 1.4975808 , -0.52666426, -0.03449104, ...,  1.3204104 ,
         0.85494673,  0.14108105]], dtype=float32)

In [2]:
# Compute cosine-similarities with sentence transformers
cosine_scores = util.cos_sim(embeddings[0],embeddings[1])
print(cosine_scores)

# Compute cosine-similarities with SciPy
scipy_cosine_scores = 1 - spatial.distance.cosine(embeddings[0],embeddings[1])
print(scipy_cosine_scores)

tensor([[0.8250]])
0.8250487446784973


In [3]:
from keybert import KeyBERT
from bertopic import BERTopic
model = BERTopic(embedding_model=model, verbose=True)

In [4]:
df = pd.read_csv('Data_NER.csv')
df.dropna(inplace=True)

In [5]:
# Create a list with all docs from df.Subtitle
docs = df.Subtitle.tolist()

In [6]:
model.fit(docs)

Batches: 100%|██████████| 517/517 [20:07<00:00,  2.34s/it]
2022-12-08 20:38:01,922 - BERTopic - Transformed documents to Embeddings
2022-12-08 20:38:23,830 - BERTopic - Reduced dimensionality
2022-12-08 20:38:25,360 - BERTopic - Clustered reduced embeddings


<bertopic._bertopic.BERTopic at 0x2777c50fa60>

In [7]:
model.save('topicModel')

In [8]:
model.visualize_topics()

In [11]:
topic_info

Unnamed: 0,Topic,Count,Name
0,-1,8203,-1_og_det_ukraina_har
1,0,320,0_drept_mann_politiet_kvinne
2,1,265,1_haaland_erling_braut_city
3,2,239,2_fhi_nakstad_helsedirektør_covid
4,3,223,3_jeg_meg_dette_ikke
...,...,...,...
199,198,10,198_demokrati_makt_statsoverhode_statsledernes
200,199,10,199_beskjed_utenriksministeren_rahmon_tadsjiki...
201,200,10,200_miniminister_ukas_denne_problemene
202,201,10,201_dronningen_kongefamilien_vinke_oppmøte


In [54]:
model.get_topic(7)

[('jeg', 0.06034656787040656),
 ('meg', 0.04140010768785461),
 ('ikke', 0.020729475707607532),
 ('dette', 0.01938208132402451),
 ('hva', 0.015338045145589568),
 ('det', 0.014283259285607794),
 ('hadde', 0.01293093760318717),
 ('veldig', 0.012576391548908291),
 ('min', 0.012536958699112668),
 ('si', 0.012053349978694037)]

In [60]:
model_results = model.transform(docs)

Batches: 100%|██████████| 517/517 [19:39<00:00,  2.28s/it]
2022-12-07 18:02:03,006 - BERTopic - Reduced dimensionality
2022-12-07 18:02:03,580 - BERTopic - Predicted clusters


In [71]:
df['BERT_topic_num'] = -1
df['BERT_topic_prob'] = 0
for i in range(len(model_results[0])):
    group = model_results[0][i]
    prob = model_results[1][i]
    df['BERT_topic_num'][i] = group
    df['BERT_topic_prob'][i] = prob

In [86]:
df.to_csv('Data_NER_BERT.csv', index=False)

In [13]:
kw_model = KeyBERT(model=model)
keywords = kw_model.extract_keywords(docs, keyphrase_ngram_range=(1, 1), stop_words='english', top_n=5)

Downloading: 100%|██████████| 968/968 [00:00<00:00, 905kB/s]
Downloading: 100%|██████████| 190/190 [00:00<00:00, 95.1kB/s]
Downloading: 100%|██████████| 3.79k/3.79k [00:00<00:00, 1.26MB/s]
Downloading: 100%|██████████| 645/645 [00:00<00:00, 646kB/s]
Downloading: 100%|██████████| 122/122 [00:00<00:00, 122kB/s]
Downloading: 100%|██████████| 471M/471M [00:12<00:00, 37.1MB/s] 
Downloading: 100%|██████████| 53.0/53.0 [00:00<00:00, 53.1kB/s]
Downloading: 100%|██████████| 5.07M/5.07M [00:00<00:00, 8.55MB/s]
Downloading: 100%|██████████| 239/239 [00:00<00:00, 239kB/s]
Downloading: 100%|██████████| 9.08M/9.08M [00:00<00:00, 13.2MB/s]
Downloading: 100%|██████████| 480/480 [00:00<00:00, 240kB/s]
Downloading: 100%|██████████| 14.8M/14.8M [00:01<00:00, 13.4MB/s]
Downloading: 100%|██████████| 229/229 [00:00<00:00, 114kB/s]


In [21]:
# Write the keywords to a file called subtitles_keywords.txt
with open('subtitles_keywords.txt', 'w', encoding='utf8') as f:
    f.write(str(keywords))


In [22]:
keywords

[[('nyttårsnatt', 0.6744),
  ('politiet', 0.5614),
  ('hektisk', 0.3224),
  ('landet', 0.2671),
  ('flere', 0.1755)],
 [('datingmarkedet', 0.6793),
  ('attraktiv', 0.4779),
  ('råd', 0.2773),
  ('ekspertenes', 0.1502),
  ('beste', 0.1493)],
 [('boligregler', 0.5257),
  ('endringer', 0.4224),
  ('dyrere', 0.3517),
  ('bedre', 0.3048),
  ('nye', 0.2647)],
 [('magefettet', 0.5365),
  ('overflødige', 0.3743),
  ('hvordan', 0.3652),
  ('enkelt', 0.3598),
  ('raskt', 0.3527)],
 [('maxwell', 0.4318),
  ('trump', 0.414),
  ('presidenten', 0.3146),
  ('unge', 0.1916),
  ('dømt', 0.1801)],
 [('skiforbundet', 0.4334),
  ('surrealistisk', 0.3826),
  ('hoppernes', 0.3595),
  ('hoppuka', 0.3501),
  ('fjor', 0.3393)],
 [('fansen', 0.625),
  ('jared', 0.3909),
  ('måpe', 0.3485),
  ('får', 0.2524),
  ('til', 0.2409)],
 [('norske', 0.5267),
  ('kjendiser', 0.3557),
  ('paraidrett', 0.3315),
  ('risiko', 0.299),
  ('kjendis', 0.292)],
 [('pandemien', 0.5961),
  ('coronatilfellene', 0.3124),
  ('forsker'

In [23]:
df['BERT_keywords'] = ''
for i in range(len(keywords)):
    current_keywords = keywords[i]
    df['BERT_keywords'][i] = [tup[0] for tup in current_keywords]

In [49]:
bert_kw = df.BERT_keywords.tolist()

In [82]:
df['BERT_kw_topics'] = ''
k = 0
for i in range(k, len(bert_kw)):
    try:
        kw = bert_kw[i]
        if len(kw) == 1:
            kw = kw[0]
        topic_nums, topic_probs = model.find_topics(kw, top_n=5)
        df['BERT_kw_topics'][i] = topic_nums
    except Exception as e:
        print(f"Cause: {kw}")
        raise e

In [84]:
df.head()

Unnamed: 0.1,Unnamed: 0,Link,Headline,Category,Subtitle,Keywords,subtitle_entities,keywords_entities,BERT_keywords,BERT_kw_topics
0,0,https://www.dagbladet.no/nyheter/kaotisk-nytta...,kaotisk nyttarsnatt ute av kontroll,nyheter,Det har vært en hektisk nyttårsnatt for politi...,nyheter,{},{},"[nyttårsnatt, politiet, hektisk, landet, flere]","[16, 90, 116, 92, 146]"
1,1,https://www.dagbladet.no/tema/fra-singel-til-d...,fra singel til damemagnet,tema,Ekspertenes beste råd til hvordan du blir attr...,"kjæreste,dating,singel",{},{},"[datingmarkedet, attraktiv, råd, ekspertenes, ...","[188, 7, 95, 65, 88]"
2,2,https://www.dagbladet.no/tema/strom-nytt-fra-j...,strom nytt fra januar,tema,"Nye boligregler, bedre jobbpensjon og dyrere f...","metadz,økonomi,pensjon,skatt",{},{},"[boligregler, endringer, dyrere, bedre, nye]","[154, 175, 126, 196, 48]"
3,3,https://www.dagbladet.no/tema/anders-kvitt-mag...,anders kvitt magefettet i superfart 1,tema,"- Hvis folk bare visste hvor enkelt det er, ha...","magefett,vektnedgang,kosthold,styrketrening,",{'Anders Muren': 'PER'},{},"[magefettet, overflødige, hvordan, enkelt, raskt]","[93, 42, 134, 179, 189]"
4,4,https://www.dagbladet.no/nyheter/taus-om-overg...,taus om overgrepsdom,nyheter,Trump ønsket «henne alt godt» i fjor sommer. N...,"donald trump,ghislaine maxwell,utenriks,nyhete...","{'Trump': 'PER', 'Ghislaine Maxwell': 'PER', '...","{'donald trump': 'PER', 'usa': 'LOC', 'jeffrey...","[maxwell, trump, presidenten, unge, dømt]","[4, 70, 35, 174, 21]"


In [87]:
bert_df_kw = df.BERT_keywords
bert_kw_topics = df.BERT_kw_topics
bert_df_kw

0          [nyttårsnatt, politiet, hektisk, landet, flere]
1        [datingmarkedet, attraktiv, råd, ekspertenes, ...
2             [boligregler, endringer, dyrere, bedre, nye]
3        [magefettet, overflødige, hvordan, enkelt, raskt]
4                [maxwell, trump, presidenten, unge, dømt]
                               ...                        
16529    [spania, qatar, spanias, spenningen, hjerteskj...
16530    [strømpriser, temperaturer, klimaforsker, vint...
16531            [stjerner, charles, bok, kong, forelsket]
16532              [sexologen, magien, sex, skjedde, mens]
16533           [plutselig, sjokk, inferno, kvalme, linda]
Name: BERT_keywords, Length: 16534, dtype: object

In [89]:
df2 = pd.read_csv('Data_NER_BERT.csv')
df2['BERT_keywords'] = bert_df_kw
df2['BERT_kw_topics'] = bert_kw_topics

In [91]:
df2.to_csv('Preprocessed_Data.csv', index=False)

In [93]:
df2.head()

Unnamed: 0.1,Unnamed: 0,Link,Headline,Category,Subtitle,Keywords,subtitle_entities,keywords_entities,BERT_topic_num,BERT_topic_prob,BERT_keywords,BERT_kw_topics
0,0,https://www.dagbladet.no/nyheter/kaotisk-nytta...,kaotisk nyttarsnatt ute av kontroll,nyheter,Det har vært en hektisk nyttårsnatt for politi...,nyheter,{},{},-1,0.0,"[nyttårsnatt, politiet, hektisk, landet, flere]","[16, 90, 116, 92, 146]"
1,1,https://www.dagbladet.no/tema/fra-singel-til-d...,fra singel til damemagnet,tema,Ekspertenes beste råd til hvordan du blir attr...,"kjæreste,dating,singel",{},{},13,0.364456,"[datingmarkedet, attraktiv, råd, ekspertenes, ...","[188, 7, 95, 65, 88]"
2,2,https://www.dagbladet.no/tema/strom-nytt-fra-j...,strom nytt fra januar,tema,"Nye boligregler, bedre jobbpensjon og dyrere f...","metadz,økonomi,pensjon,skatt",{},{},-1,0.0,"[boligregler, endringer, dyrere, bedre, nye]","[154, 175, 126, 196, 48]"
3,3,https://www.dagbladet.no/tema/anders-kvitt-mag...,anders kvitt magefettet i superfart 1,tema,"- Hvis folk bare visste hvor enkelt det er, ha...","magefett,vektnedgang,kosthold,styrketrening,",{'Anders Muren': 'PER'},{},8,1.0,"[magefettet, overflødige, hvordan, enkelt, raskt]","[93, 42, 134, 179, 189]"
4,4,https://www.dagbladet.no/nyheter/taus-om-overg...,taus om overgrepsdom,nyheter,Trump ønsket «henne alt godt» i fjor sommer. N...,"donald trump,ghislaine maxwell,utenriks,nyhete...","{'Trump': 'PER', 'Ghislaine Maxwell': 'PER', '...","{'donald trump': 'PER', 'usa': 'LOC', 'jeffrey...",-1,0.0,"[maxwell, trump, presidenten, unge, dømt]","[4, 70, 35, 174, 21]"
