In [1]:
import torch

In [2]:
torch.cuda.get_device_name()

'GeForce RTX 3090'

In [3]:
from sentence_transformers import SentenceTransformer, util
import os
import csv
import time


# Model for computing sentence embeddings. We use one trained for similar questions detection
model = SentenceTransformer('all-MiniLM-L6-v2')

# We donwload the Quora Duplicate Questions Dataset (https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs)
# and find similar question in it
url = "http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
dataset_path = "quora_duplicate_questions.tsv"
max_corpus_size = 50000 # We limit our corpus to only the first 50k questions


# Check if the dataset exists. If not, download and extract
# Download dataset if needed
if not os.path.exists(dataset_path):
    print("Download dataset")
    util.http_get(url, dataset_path)

# Get all unique sentences from the file
corpus_sentences = set()
with open(dataset_path, encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
    for row in reader:
        corpus_sentences.add(row['question1'])
        corpus_sentences.add(row['question2'])
        if len(corpus_sentences) >= max_corpus_size:
            break

corpus_sentences = list(corpus_sentences)
print("Encode the corpus. This might take a while")
corpus_embeddings = model.encode(corpus_sentences, batch_size=64, show_progress_bar=True, convert_to_tensor=True)


print("Start clustering")
start_time = time.time()

#Two parameters to tune:
#min_cluster_size: Only consider cluster that have at least 25 elements
#threshold: Consider sentence pairs with a cosine-similarity larger than threshold as similar
clusters = util.community_detection(corpus_embeddings, min_community_size=25, threshold=0.75)

print("Clustering done after {:.2f} sec".format(time.time() - start_time))

#Print for all clusters the top 3 and bottom 3 elements
for i, cluster in enumerate(clusters):
    print("\nCluster {}, #{} Elements ".format(i+1, len(cluster)))
    for sentence_id in cluster[0:3]:
        print("\t", corpus_sentences[sentence_id])
    print("\t", "...")
    for sentence_id in cluster[-3:]:
        print("\t", corpus_sentences[sentence_id])

Encode the corpus. This might take a while


Batches:   0%|          | 0/782 [00:00<?, ?it/s]

Start clustering
Clustering done after 4.06 sec

Cluster 1, #103 Elements 
	 How can I improve my spoken English?
	 How will I improve my spoken English?
	 What should I do to improve my spoken English?
	 ...
	 How can I increase my knowledge in English language?
	 How do I improve my English writing and speaking skills?
	 What should I do to speak English fluently and not face any problem with vocabulary?

Cluster 2, #86 Elements 
	 How can one make money online?
	 How could I make money online?
	 How do I to make money online?
	 ...
	 How can an apprentice programmer make money online?
	 How do I earned big money even online without investment?
	 What are the ways to make money working from home?

Cluster 3, #82 Elements 
	 What are the economic implications of banning 500 and 1000 rupee notes?
	 What will be the implications of banning 500 and 1000 rupees currency notes on Indian economy?
	 How will the ban of 1000 and 500 rupee notes affect the Indian economy?
	 ...
	 What are your

In [6]:
import spacy
nlp = spacy.load('en_core_web_sm')

text = 'My first birthday was great. My 2. was even better.'
sentences = [i for i in nlp(text).sents]

In [5]:
sentences

[My first birthday was great., My 2. was even better.]

In [7]:
text = '''
But I just I'm working on it for a while and that's. You know, it's kind of like just how positive or negative statements are working at transcripts so. Mahoning. Select signals from the Q&A section section. So what's the analyst sentiment? So when they're asking their questions, how negative or positive they are? That seems to be pretty indicative. You know the the general. Corporate ones kind of work, but it's mostly the change one. So if analysts become more negative, then that's pretty useful. Or I think that even the CEO one change over the year is also useful. And today, will we be interested in talking to you? Is that we're we're getting at the topic modeling, and so that is where we're actually tagging different sentences. In the transcript on what topics they are related to. So you could say in our case, really, for DLC would be are they talking about debt paydown? So the degree of that occurring in the transcripts? Or in other filings or whatnot, but also the sentiment related to that. So what are they talking about? Debt paydown is a positive way, or a negative way, and to that provides a signals. So, debt paydown tomorrow you could also think like M&A is like obviously a topic and transcripts sometimes. No hate dividend policy or share repurchase policy. But you know, I think you're probably even better expert than I am on all the different types of categories that we'd be interested in exploring. I think MJ has like 10 of them, so maybe I'm ju just show the different and we we kind of source these from the CFA institute's like they have. I mean, we just pulled because they what we need. We need text because we need text that we can use that will, especially our correlation. So what's the correlation between this text and the senses? And so that tells us, you know if it's really related, then they're talking about that topic. You know what? Before we get off with this. I'm not the just don't know what your ship bringing up. It's really interesting to me that that's not what you're what you're finding from endless sentiment is more powerful than the CEO sentiment and just two things. I'll become my head now that might be might be helpful. Just talking out there and throw it away if it's not helpful, but if there's a. And and management, if they say divergent in that sentiment. But I don't know if he's looking at analyst sentiment, standalone or or. Just comparing it to the the management as well. But you know, man was always gonna be the dog. Gonna be positive, right? Yeah it was all of a sudden bring it back to somewhere to where they really want to focus. That's always instructed, right? Because it's something that speaks to management intentions versus what really matters. 
'''

In [8]:
sentences = [i for i in text.split(". ") if len(i.split(" ")) > 3]

In [8]:
sentences[0]

'\nBeing leadership meeting yesterday, I think that would be the the corporate updates, so I already mentioned the FTE'

In [9]:
# Get all unique sentences from the file

corpus_sentences = sentences[:]
print("Encode the corpus. This might take a while")
corpus_embeddings = model.encode(corpus_sentences, batch_size=64, show_progress_bar=True, convert_to_tensor=True)


print("Start clustering")
start_time = time.time()

#Two parameters to tune:
#min_cluster_size: Only consider cluster that have at least 25 elements
#threshold: Consider sentence pairs with a cosine-similarity larger than threshold as similar
clusters = util.community_detection(corpus_embeddings, min_community_size=6,  threshold=0.75)

print("Clustering done after {:.2f} sec".format(time.time() - start_time))

#Print for all clusters the top 3 and bottom 3 elements
for i, cluster in enumerate(clusters):
    print("\nCluster {}, #{} Elements ".format(i+1, len(cluster)))
    for sentence_id in cluster[0:3]:
        print("\t", corpus_sentences[sentence_id])
    print("\t", "...")
    for sentence_id in cluster[-3:]:
        print("\t", corpus_sentences[sentence_id])

Encode the corpus. This might take a while


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Start clustering
Clustering done after 0.00 sec


In [10]:
corpus_embeddings

tensor([[-0.0837, -0.0670,  0.0436,  ..., -0.0422,  0.0232,  0.0802],
        [-0.0796,  0.0007, -0.0557,  ...,  0.0113,  0.0807,  0.0460],
        [ 0.0203,  0.0548,  0.0305,  ...,  0.0633,  0.0602,  0.0833],
        ...,
        [-0.0045, -0.0902, -0.0298,  ..., -0.0204, -0.0385, -0.0455],
        [-0.0700, -0.0020,  0.0966,  ..., -0.1205,  0.0096,  0.0213],
        [-0.0098, -0.0005, -0.0096,  ...,  0.0853, -0.0511,  0.0523]],
       device='cuda:0')

In [9]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

embedder = SentenceTransformer('all-MiniLM-L6-v2')


corpus_embeddings = embedder.encode(sentences)

# Perform kmean clustering
num_clusters = 5
clustering_model = KMeans(n_clusters=num_clusters)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

clustered_sentences = [[] for i in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append(sentences[sentence_id])

for i, cluster in enumerate(clustered_sentences):
    print("Cluster ", i+1)
    print(cluster)
    print("")

Cluster  1
["So what's the analyst sentiment? So when they're asking their questions, how negative or positive they are? That seems to be pretty indicative", "Corporate ones kind of work, but it's mostly the change one", "So if analysts become more negative, then that's pretty useful", 'Or I think that even the CEO one change over the year is also useful', 'No hate dividend policy or share repurchase policy', "It's really interesting to me that that's not what you're what you're finding from endless sentiment is more powerful than the CEO sentiment and just two things", 'And and management, if they say divergent in that sentiment', "But I don't know if he's looking at analyst sentiment, standalone or or", 'Just comparing it to the the management as well', 'Gonna be positive, right? Yeah it was all of a sudden bring it back to somewhere to where they really want to focus', "That's always instructed, right? Because it's something that speaks to management intentions versus what really ma

In [44]:
clustered_sentences

[['A man is eating food.', 'The girl is carrying a baby.'],
 ['A man is eating a piece of bread.'],
 ['A man is eating pasta.', 'A monkey is playing drums.'],
 ['The baby is carried by the woman',
  'A man is riding a white horse on an enclosed ground.'],
 ['A man is riding a horse.',
  'Someone in a gorilla costume is playing a set of drums.',
  'A cheetah is running behind its prey.',
  'A cheetah chases prey on across a field.']]