In [1]:
#pip install -U sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer, util

In [None]:
model = SentenceTransformer('all-MiniLM-L6-v2')


In [None]:
sentences = ['the cat sits outside', 'the new moview is awesome', 'the new movie is really great', 'the dog bark on stangers']

In [None]:
embeddings = model.encode(sentences=sentences, convert_to_tensor=True)

In [None]:
for sent,embed in zip(sentences, embeddings):
  print("Sentence:", sent)
  print("Len(Embeddings:", len(embed))
  # print("Embeddings: ", embed)

Sentence: the cat sits outside
Len(Embeddings: 384
Sentence: the new moview is awesome
Len(Embeddings: 384
Sentence: the new movie is really great
Len(Embeddings: 384
Sentence: the dog bark on stangers
Len(Embeddings: 384


In [None]:
cosine_scores = util.cos_sim(embeddings, embeddings)

In [None]:
cosine_scores

tensor([[ 1.0000, -0.0295, -0.0258,  0.1578],
        [-0.0295,  1.0000,  0.8336,  0.0933],
        [-0.0258,  0.8336,  1.0000,  0.1001],
        [ 0.1578,  0.0933,  0.1001,  1.0000]])

In [None]:
sentences

['the cat sits outside',
 'the new moview is awesome',
 'the new movie is really great',
 'the dog bark on stangers']

In [None]:
paraphrases = util.paraphrase_mining(model, sentences)

In [None]:
for sim in paraphrases[0:10]:
  score, i, j = sim
  print(sentences[i], "<>" ,sentences[j], " --> ",score)

the new moview is awesome <> the new movie is really great  -->  0.8336373567581177
the cat sits outside <> the dog bark on stangers  -->  0.15776793658733368
the new movie is really great <> the dog bark on stangers  -->  0.10014551877975464
the new moview is awesome <> the dog bark on stangers  -->  0.0932723879814148
the cat sits outside <> the new movie is really great  -->  -0.025751961395144463
the cat sits outside <> the new moview is awesome  -->  -0.02949126996099949


In [None]:
#%pip install -U sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer, util
import torch

In [None]:
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

In [None]:
import requests

In [None]:
response = requests.get('https://raw.githubusercontent.com/laxmimerit/machine-learning-dataset/master/text-dataset-for-machine-learning/sbert-corpus.txt')
corpus = response.text.split('\r\n')

response = requests.get('https://raw.githubusercontent.com/laxmimerit/machine-learning-dataset/master/text-dataset-for-machine-learning/sbert-queries.txt')
queries = response.text.split('\r\n')

In [None]:
print(corpus)

['A man is eating food.', 'A man is eating a piece of bread.', 'A man is eating pasta.', 'The girl is carrying a baby.', 'The baby is carried by the woman', 'A man is riding a horse.', 'A man is riding a white horse on an enclosed ground.', 'A monkey is playing drums.', 'Someone in a gorilla costume is playing a set of drums.', 'A cheetah is running behind its prey.', 'A cheetah chases prey on across a field.']


In [None]:
print(queries)

['A man is eating pasta.', 'Someone in a gorilla costume is playing a set of drums.', 'A cheetah chases prey on across a field.']


In [None]:
corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
queries_embeddings = model.encode(queries, convert_to_tensor=True)

In [None]:
# corpus_embeddings[0]

In [None]:
# lets normalize vectors for fast calculation
corpus_embeddings = util.normalize_embeddings(corpus_embeddings)
queries_embeddings = util.normalize_embeddings(queries_embeddings)

In [None]:
len(corpus_embeddings[0])

384

In [None]:
hits = util.semantic_search(queries_embeddings, corpus_embeddings, score_function=util.dot_score, top_k=3)

In [None]:
hits

[[{'corpus_id': 2, 'score': 0.9999997615814209},
  {'corpus_id': 0, 'score': 0.8384664058685303},
  {'corpus_id': 1, 'score': 0.7468275427818298}],
 [{'corpus_id': 8, 'score': 1.0},
  {'corpus_id': 7, 'score': 0.7612731456756592},
  {'corpus_id': 3, 'score': 0.38152894377708435}],
 [{'corpus_id': 10, 'score': 0.9999997019767761},
  {'corpus_id': 9, 'score': 0.8703992962837219},
  {'corpus_id': 6, 'score': 0.3741169273853302}]]

In [None]:
for query, hit in zip(queries, hits):
  for q_hit in hit:
    id = q_hit['corpus_id']
    score = q_hit['score']

    print(query, "<>", corpus[id], "-->", score)

  print()
  

A man is eating pasta. <> A man is eating pasta. --> 0.9999997615814209
A man is eating pasta. <> A man is eating food. --> 0.8384664058685303
A man is eating pasta. <> A man is eating a piece of bread. --> 0.7468275427818298

Someone in a gorilla costume is playing a set of drums. <> Someone in a gorilla costume is playing a set of drums. --> 1.0
Someone in a gorilla costume is playing a set of drums. <> A monkey is playing drums. --> 0.7612731456756592
Someone in a gorilla costume is playing a set of drums. <> The girl is carrying a baby. --> 0.38152894377708435

A cheetah chases prey on across a field. <> A cheetah chases prey on across a field. --> 0.9999997019767761
A cheetah chases prey on across a field. <> A cheetah is running behind its prey. --> 0.8703992962837219
A cheetah chases prey on across a field. <> A man is riding a white horse on an enclosed ground. --> 0.3741169273853302



In [None]:
# K-mean Clustering

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

In [None]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
import requests
response = requests.get('https://raw.githubusercontent.com/laxmimerit/machine-learning-dataset/master/text-dataset-for-machine-learning/sbert-corpus.txt')
corpus = response.text.split('\r\n')

In [None]:
len(corpus), print(corpus)

['A man is eating food.', 'A man is eating a piece of bread.', 'A man is eating pasta.', 'The girl is carrying a baby.', 'The baby is carried by the woman', 'A man is riding a horse.', 'A man is riding a white horse on an enclosed ground.', 'A monkey is playing drums.', 'Someone in a gorilla costume is playing a set of drums.', 'A cheetah is running behind its prey.', 'A cheetah chases prey on across a field.']


(11, None)

In [None]:
corpus_embeddings = model.encode(corpus)

In [None]:
num_clusters = 5
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

In [None]:
cluster_assignment

array([1, 1, 1, 3, 3, 2, 2, 4, 4, 0, 0], dtype=int32)

In [None]:
clustered_sentences = [[] for i in range(num_clusters)]
clustered_sentences

[[], [], [], [], []]

In [None]:
for sentence_id, cluster_id in enumerate(cluster_assignment):
  clustered_sentences[cluster_id].append(corpus[sentence_id])

In [None]:
for i, cluster in enumerate(clustered_sentences):
  print("Cluster ", i+1)
  print(cluster)
  print()

Cluster  1
['A cheetah is running behind its prey.', 'A cheetah chases prey on across a field.']

Cluster  2
['A man is eating food.', 'A man is eating a piece of bread.', 'A man is eating pasta.']

Cluster  3
['A man is riding a horse.', 'A man is riding a white horse on an enclosed ground.']

Cluster  4
['The girl is carrying a baby.', 'The baby is carried by the woman']

Cluster  5
['A monkey is playing drums.', 'Someone in a gorilla costume is playing a set of drums.']



In [2]:
# Agglomerative Clustering

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
import numpy as np


In [None]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
import requests
response = requests.get('https://raw.githubusercontent.com/laxmimerit/machine-learning-dataset/master/text-dataset-for-machine-learning/sbert-corpus.txt')
corpus = response.text.split('\r\n')

In [None]:
print(corpus)

['A man is eating food.', 'A man is eating a piece of bread.', 'A man is eating pasta.', 'The girl is carrying a baby.', 'The baby is carried by the woman', 'A man is riding a horse.', 'A man is riding a white horse on an enclosed ground.', 'A monkey is playing drums.', 'Someone in a gorilla costume is playing a set of drums.', 'A cheetah is running behind its prey.', 'A cheetah chases prey on across a field.']


In [None]:
corous_embeddings = model.encode(corpus)

In [None]:
corpus_embeddings = corpus_embeddings/np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)

In [None]:
clustering_model = AgglomerativeClustering(n_clusters=None, distance_threshold=1.5)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

In [None]:
cluster_assignment

array([0, 0, 0, 4, 4, 1, 1, 2, 2, 3, 3])

In [None]:
np.unique(cluster_assignment)

array([0, 1, 2, 3, 4])

In [None]:
num_clusters = len(np.unique(cluster_assignment))
clustered_sentences = [[] for i in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
  clustered_sentences[cluster_id].append(corpus[sentence_id])


for i, cluster in enumerate(clustered_sentences):
  print("Cluster ", i+1)
  print(cluster)
  print()


Cluster  1
['A man is eating food.', 'A man is eating a piece of bread.', 'A man is eating pasta.']

Cluster  2
['A man is riding a horse.', 'A man is riding a white horse on an enclosed ground.']

Cluster  3
['A monkey is playing drums.', 'Someone in a gorilla costume is playing a set of drums.']

Cluster  4
['A cheetah is running behind its prey.', 'A cheetah chases prey on across a field.']

Cluster  5
['The girl is carrying a baby.', 'The baby is carried by the woman']



In [None]:
# Fast Clustering

In [None]:
from sentence_transformers import SentenceTransformer, util

In [None]:
import pandas as pd
import time

In [None]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
df = pd.read_csv('http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv', sep='\t')
df.shape

(404290, 6)

In [None]:
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [None]:
sentences = df['question1'].tolist()[:1000]
len(sentences)

1000

In [None]:
corpus_embeddings = model.encode(sentences, batch_size=64, show_progress_bar=True)

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

In [None]:
clusters = util.community_detection(corpus_embeddings, min_community_size=5, threshold=0.5)

In [None]:
for i,cluster in enumerate(clusters):
  print("\nCluster {}, #{} Questions".format(i+1, len(cluster)))
  for id in cluster[0:3]:
    print("\t", sentences[id])
  print("\t", "...")


Cluster 1, #10 Questions
	 Which are the best Hollywood thriller movies?
	 What are the most underrated and overrated movies you've seen?
	 What are the best films that take place in one room?
	 ...

Cluster 2, #9 Questions
	 What are your views on Modi governments decision to demonetize 500 and 1000 rupee notes? How will this affect economy?
	 What's your opinion about the decision on removal of 500 and 1000 rupees currency notes?
	 How will Indian GDP be affected from banning 500 and 1000 rupees notes?
	 ...

Cluster 3, #8 Questions
	 What is best way to make money online?
	 How can I make money through the Internet?
	 What are the easy ways to earn money online?
	 ...

Cluster 4, #7 Questions
	 What are the most important things for living a good life?
	 What is most important in life - money or values?
	 What is the best lesson in life?
	 ...

Cluster 5, #6 Questions
	 What is our stance against Pakistan?
	 What is the reason Pakistan supports terrorism?
	 If there will be a war b