In [None]:
!nvidia-smi -L

GPU 0: Tesla T4 (UUID: GPU-69b64100-f4fd-a76b-5cc0-baf6c733e6a6)


In [None]:
import numpy as np 
import pandas as pd
import string

In [None]:
pd.set_option('display.max_colwidth', None)

## Training data processing

In [None]:
ner_dataset = pd.read_csv('/content/ner_dataset.csv', 
    encoding='latin1')

In [None]:
ner_dataset['Sentence #'] = ner_dataset['Sentence #'].str.replace('Sentence:', '')
ner_dataset = ner_dataset.fillna(method='ffill')

In [None]:
ner_dataset['Sentence #'] = ner_dataset['Sentence #'].astype(int)

#### Create the `sentences_df`

In [None]:
sentences_df = ner_dataset.groupby('Sentence #', as_index=False)['Word'].apply(lambda x: x.str.cat(sep=' '))
sentences_df = sentences_df.rename(columns={'Word': 'Sentences'})

Inspect Sentence 8411. It only contains the word "The". 

In [None]:
sentences_df.iloc[8411]

Sentence #    8412
Sentences      The
Name: 8411, dtype: object

In [None]:
sentences_df = sentences_df.drop(labels=[8411], axis=0)
sentences_df = sentences_df.reset_index()
sentences_df = sentences_df.drop(columns='index')

### Sentences processing for LDA 

In [None]:
import nltk
from nltk.corpus import stopwords 
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

In [None]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
def _lemmatize_words(sentence):
    wordnet_map = {'N':wordnet.NOUN, 'V':wordnet.VERB, 'J':wordnet.ADJ, 'R':wordnet.ADV}
    pos_tagged_text = nltk.pos_tag(sentence.split())
    return ' '.join([WordNetLemmatizer().lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN))
                    for word, pos in pos_tagged_text])
    
def lda_sent_process(text):
    text = text.lower()  
    PUNCT_TO_REMOVE = string.punctuation
    text = text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))
    STOPWORDS = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in STOPWORDS])
    text = _lemmatize_words(text)
    return text.split()

In [None]:
sentences_df['lda_sents'] = sentences_df['Sentences'].apply(lambda x: lda_sent_process(x))

In [None]:
sentences_df

Unnamed: 0,Sentence #,Sentences,lda_sents
0,1,Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .,"[thousand, demonstrator, march, london, protest, war, iraq, demand, withdrawal, british, troop, country]"
1,2,"Families of soldiers killed in the conflict joined the protesters who carried banners with such slogans as "" Bush Number One Terrorist "" and "" Stop the Bombings . ""","[family, soldier, kill, conflict, join, protester, carry, banner, slogan, bush, number, one, terrorist, stop, bombing]"
2,3,They marched from the Houses of Parliament to a rally in Hyde Park .,"[march, house, parliament, rally, hyde, park]"
3,4,"Police put the number of marchers at 10,000 while organizers claimed it was 1,00,000 .","[police, put, number, marcher, 10000, organizer, claim, 100000]"
4,5,The protest comes on the eve of the annual conference of Britain 's ruling Labor Party in the southern English seaside resort of Brighton .,"[protest, come, eve, annual, conference, britain, rule, labor, party, southern, english, seaside, resort, brighton]"
...,...,...,...
23155,23157,Tehran insists its nuclear program is peaceful .,"[tehran, insists, nuclear, program, peaceful]"
23156,23158,A Vatican spokesman says Pope Benedict is willing to meet with victims of pedophile priests .,"[vatican, spokesman, say, pope, benedict, willing, meet, victim, pedophile, priest]"
23157,23159,Reverend Federico Lombardi made the statement to Vatican Radio on Friday .,"[reverend, federico, lombardi, make, statement, vatican, radio, friday]"
23158,23160,Lombardi said the church must also cooperate with police and judicial authorities because it is the only way to regain trust .,"[lombardi, say, church, must, also, cooperate, police, judicial, authority, way, regain, trust]"


## LDA Model (gensim)

In [None]:
from gensim.corpora.dictionary import Dictionary 
from gensim import models 
import re

#### Model trainsing

In [None]:
dct = Dictionary(sentences_df['lda_sents'])

In [None]:
corpus = [dct.doc2bow(sentence) for sentence in sentences_df['lda_sents']]
lda = models.LdaModel(corpus, num_topics=20)



In [None]:
topics = lda.print_topics()

In [None]:
topics 

[(0,
  '0.050*"19" + 0.047*"250" + 0.035*"96" + 0.033*"221" + 0.022*"37" + 0.019*"252" + 0.018*"494" + 0.018*"150" + 0.017*"170" + 0.015*"430"'),
 (1,
  '0.015*"96" + 0.015*"10" + 0.012*"275" + 0.011*"1153" + 0.011*"2053" + 0.011*"489" + 0.008*"1422" + 0.008*"912" + 0.008*"132" + 0.008*"1"'),
 (2,
  '0.041*"165" + 0.028*"96" + 0.024*"1216" + 0.017*"646" + 0.017*"120" + 0.016*"134" + 0.011*"14" + 0.011*"1535" + 0.010*"27" + 0.009*"758"'),
 (3,
  '0.015*"68" + 0.013*"96" + 0.012*"212" + 0.010*"42" + 0.009*"76" + 0.009*"2" + 0.008*"1430" + 0.008*"0" + 0.008*"2662" + 0.007*"2845"'),
 (4,
  '0.011*"134" + 0.011*"96" + 0.011*"427" + 0.009*"1298" + 0.009*"646" + 0.008*"2268" + 0.008*"1844" + 0.007*"275" + 0.007*"356" + 0.007*"715"'),
 (5,
  '0.035*"96" + 0.018*"2109" + 0.010*"275" + 0.010*"95" + 0.009*"248" + 0.009*"1748" + 0.008*"846" + 0.008*"376" + 0.008*"2973" + 0.008*"347"'),
 (6,
  '0.016*"96" + 0.014*"191" + 0.012*"1213" + 0.011*"410" + 0.010*"500" + 0.010*"643" + 0.010*"2029" + 0.009*

In [None]:
for topic in topics:
  key_indices = re.findall(r'"(.*?)"', topic[1])
  key_words = [dct[int(idx)] for idx in key_indices]
  print(f'Topic {topic[0]}: ', key_words)

Topic 0:  ['kill', 'people', 'say', 'least', 'police', 'wound', 'bomb', 'two', 'attack', 'three']
Topic 1:  ['say', 'war', 'election', 'result', 'constitution', 'market', 'crime', 'express', 'new', 'country']
Topic 2:  ['state', 'say', 'united', 'mr', 'u', 'president', 'bush', 'chavez', 'house', 'storm']
Topic 3:  ['day', 'say', 'clear', 'come', 'second', 'demand', 'release', 'british', 'policeman', 'christian']
Topic 4:  ['president', 'say', 'minister', 'japan', 'mr', 'shiite', 'visit', 'election', 'call', 'comment']
Topic 5:  ['say', 'officer', 'election', 'official', 'last', 'approve', 'test', 'measure', 'earthquake', 'vote']
Topic 6:  ['say', 'million', 'indian', 'give', 'dollar', 'announce', 'kashmir', 'billion', 'pay', 'estimate']
Topic 7:  ['right', 'group', 'say', 'two', 'human', 'peace', 'darfur', 'ago', 'hong', 'government']
Topic 8:  ['thousand', 'minister', 'last', 'government', 'say', 'prime', 'voa', 'rule', 'german', 'put']
Topic 9:  ['say', 'iran', 'nuclear', 'weapon', '

#### Inference example 

In [None]:
def get_topics(new_text, lda_model, dct): 
  '''
  new_text: str
  lda_model: load from lda.pkl
  dct: load from dct.pkl
  '''
  new_text_doc = lda_sent_process(new_text)
  topics = lda_model[dct.doc2bow(new_text_doc)]
  for topic in topics: 
    print(f'Topic {topic[0]} with probability {topic[1]}')


In [None]:
# copy from NYT
new_sents = 'As Midterms Near, Biden Faces a Nation as Polarized as Ever'

In [None]:
print(get_topics(new_sents, lda, dct))

Topic 1 with probability 0.4993284046649933
Topic 10 with probability 0.26306474208831787
Topic 14 with probability 0.131356880068779
None


### Pickle the LDA data 

In [None]:
import pickle

In [None]:
## pickle the dictionary 
with open('dct.pkl', 'wb') as pickle_dict: 
  pickle.dump(dct, pickle_dict)

In [None]:
## pickle the LDA model 
with open('lda.pkl', 'wb') as pickle_lda:
  pickle.dump(lda, pickle_lda)

## KNN with SentenceTransformer

### Train Sentence transformer


In [None]:
!pip install -U sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 5.2 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 39.9 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 56.9 MB/s 
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 71.2 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 62.3 MB/s 
Building wheels for collected 

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
model = SentenceTransformer('all-MiniLM-L6-v2')

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
embeddings = model.encode(sentences_df['Sentences'])

### KNN with sentence embeddings 

In [None]:
from sklearn.neighbors import NearestNeighbors 
nbrs = NearestNeighbors(n_neighbors=3, algorithm='ball_tree').fit(embeddings)

### Inference example

In [None]:
def get_near_sent(text, emb_model, knn_model):
  embedding = emb_model.encode([text])
  _, index = knn_model.kneighbors(embedding)
  for idx in range(index.shape[1]):
    print(sentences_df['Sentences'][index[0,idx]])


Test on a sentence in the dataset. 

In [None]:
get_near_sent(sentences_df['Sentences'][0], emb_model=model, knn_model=nbrs)

Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .
The London march came ahead of anti-war protests today in other cities , including Rome , Paris , and Madrid .
A group of Iraqi lawmakers loyal to Shi'ite cleric Moqtada al-Sadr has staged a protest against a proposal allowing British troops to remain in the country .


Test on a new sentences: 

In [None]:
new_sentences = 'North Korea says launches were simulated attack, as South recovers missile parts'

In [120]:
get_near_sent(new_sentences, emb_model=model, knn_model=nbrs)

The U.S. and its allies in Asia have said the recent rocket launch was a test of a ballistic missile , but North Korea denies the claim , saying it sent a satellite into space .
North Korea this week launched seven missiles , including a long-range Taepodong-2 believed to be able to reach the United States .
However , a White House spokesman , Scott McClellan , said the launches were similar to previous tests carried out by North Korea .


In [122]:
get_near_sent('Woman shot and killed near 49th and Miami, Omaha police investigating', model, nbrs)

Several people were injured , and one woman died of a gunshot wound .
Unidentified gunmen on Mexico 's Gulf coast have shot to death the news director of one of the most influential newspapers in Veracruz , the second shooting attack on Mexican journalists in one week .
A woman who was nearby was wounded in the attack .


### Pickle embedding model and KNN model

In [None]:
with open('emb_model.pkl', 'wb') as pickle_emb:
  pickle.dump(model, pickle_emb)

with open('knn_modle.pkl', 'wb') as pickle_knn:
  pickle.dump(nbrs, pickle_knn)

## KMeans with SentenceTransformers (not used)

In [None]:
#from sklearn.pipeline import Pipeline 
#from sklearn.cluster import KMeans 

#from sklearn.metrics import silhouette_score

In [None]:
#for n_cluster in range(2,50):
#    kmeans_model = KMeans(n_clusters=n_cluster, random_state=1).fit(embeddings)
#    labels = kmeans_model.labels_
#    print(f'n_clusters = {n_cluster}: Silhouette Coefficient: {silhouette_score(embeddings, labels)}')

Execution restuls: 

n_clusters = 2: Silhouette Coefficient: 0.027680527418851852
n_clusters = 3: Silhouette Coefficient: 0.026121865957975388
n_clusters = 4: Silhouette Coefficient: 0.02658974751830101
n_clusters = 5: Silhouette Coefficient: 0.025382913649082184
n_clusters = 6: Silhouette Coefficient: 0.024761341512203217
n_clusters = 7: Silhouette Coefficient: 0.019972721114754677
n_clusters = 8: Silhouette Coefficient: 0.02104909159243107
n_clusters = 9: Silhouette Coefficient: 0.022642113268375397
n_clusters = 10: Silhouette Coefficient: 0.023155178874731064
n_clusters = 11: Silhouette Coefficient: 0.022205878049135208
n_clusters = 12: Silhouette Coefficient: 0.023076286539435387
n_clusters = 13: Silhouette Coefficient: 0.02041112817823887
n_clusters = 14: Silhouette Coefficient: 0.02360903099179268
n_clusters = 15: Silhouette Coefficient: 0.023625940084457397
n_clusters = 16: Silhouette Coefficient: 0.024519361555576324
n_clusters = 17: Silhouette Coefficient: 0.023255495354533195
n_clusters = 18: Silhouette Coefficient: 0.025912733748555183
n_clusters = 19: Silhouette Coefficient: 0.024510184302926064
n_clusters = 20: Silhouette Coefficient: 0.026332970708608627
n_clusters = 21: Silhouette Coefficient: 0.026626840233802795
n_clusters = 22: Silhouette Coefficient: 0.02656748704612255
n_clusters = 23: Silhouette Coefficient: 0.025318237021565437
n_clusters = 24: Silhouette Coefficient: 0.02624185010790825
n_clusters = 25: Silhouette Coefficient: 0.026188340038061142
n_clusters = 26: Silhouette Coefficient: 0.02609947882592678
n_clusters = 27: Silhouette Coefficient: 0.02687591314315796
n_clusters = 28: Silhouette Coefficient: 0.026777690276503563
n_clusters = 29: Silhouette Coefficient: 0.027936046943068504
n_clusters = 30: Silhouette Coefficient: 0.02727353386580944
n_clusters = 31: Silhouette Coefficient: 0.028382878750562668
n_clusters = 32: Silhouette Coefficient: 0.02829943224787712
n_clusters = 33: Silhouette Coefficient: 0.028072169050574303
n_clusters = 34: Silhouette Coefficient: 0.02787160314619541
n_clusters = 35: Silhouette Coefficient: 0.02733534574508667