In [1]:
import os
import pandas as pd
from sqlalchemy import create_engine
import chromadb
from dotenv import load_dotenv

In [2]:
load_dotenv()

True

In [3]:
def get_data(database_url):
    engine = create_engine(database_url)
    query = "SELECT * FROM reddit_usernames_comments"
    df = pd.read_sql(query, engine)
    return df

In [5]:
database_url = os.getenv('DATABASE_URL')
df = get_data(database_url)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3276 entries, 0 to 3275
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   username  3276 non-null   object
 1   comments  3276 non-null   object
dtypes: object(2)
memory usage: 51.3+ KB


In [6]:
list(df[df['username']=='--solaris--']['comments'])

['I think a lot of other people made some good points so I’m not going to repeat a lot of it. It’s never too late to consider vet med as a career and undergrad is a great opportunity to push yourself and make yourself competitive.\n\nI will play devils advocate though. I am a second year resident and I’m burnt out already. Undergrad is 4 years, vet school is another 4, and residency is 3. If you want to do aquatic medicine then it will likely be another 2 years of rotating and specialty internships before residency. That’s upwards of 13 years of training ahead of you. Vet Med can be rewarding, but it is hard and long to get to the end of your training. I’m not saying this to discourage you, but simply to make sure you know what you would be commuting to. You have to be driven and always keep your eye on the prize. Be prepared for long nights, long weeks, and long years.|As someone who knows people looking for path jobs in diagnostics/industry, this information is not current. Most diag

In [None]:
df['username'][0]

'LoveAGoodTwist'

In [7]:
df['comments'] = df['comments'].str.split('|', n=1).str[0]

In [None]:
# Check if username column has duplicates
df['username'].duplicated().any()

False

In [8]:
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
model = SentenceTransformer('all-MiniLM-L6-v2')



In [10]:
embeddings = model.encode(df['comments'].tolist(), show_progress_bar=True)

Batches: 100%|██████████| 103/103 [00:13<00:00,  7.78it/s]


In [11]:
df['embeddings'] = list(embeddings)

In [12]:
df['embeddings']

0       [0.010265142, 0.021804111, 0.09015161, 0.03777...
1       [0.025500813, 0.055280138, 0.056986548, 0.0642...
2       [0.035866946, 0.06707864, 0.050283547, -0.0314...
3       [0.031657185, -0.07286959, 0.031879846, 0.0158...
4       [-0.009881736, -0.023968354, -0.0071312725, 0....
                              ...                        
3271    [0.06685015, -0.11729141, 0.01155566, -0.01828...
3272    [0.023961065, -0.009860428, 0.00067358464, -0....
3273    [-0.021498615, -0.031337075, 0.020569805, -0.0...
3274    [-0.037924305, -0.0030589858, -0.011220716, -0...
3275    [0.0053146393, 0.0147564905, 0.08191258, 0.011...
Name: embeddings, Length: 3276, dtype: object

In [13]:
from sklearn.cluster import KMeans

In [14]:
# Number of clusters
num_clusters = 7

# Initialize KMeans
kmeans = KMeans(n_clusters=num_clusters, random_state=42)

# Fit the model on the embeddings
kmeans.fit(embeddings)

# Predict the cluster labels
cluster_labels = kmeans.labels_

# Add the cluster labels to your DataFrame
df['cluster'] = cluster_labels

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [15]:
df

Unnamed: 0,username,comments,embeddings,cluster
0,LoveAGoodTwist,"Female, Kentucky. 4 years out. Work equine on...","[0.010265142, 0.021804111, 0.09015161, 0.03777...",1
1,wahznooski,"As a woman of reproductive age, fuck Texas","[0.025500813, 0.055280138, 0.056986548, 0.0642...",3
2,Churro_The_fish_Girl,what makes you want to become a vet?,"[0.035866946, 0.06707864, 0.050283547, -0.0314...",4
3,abarthch,"I see of course there are changing variables, ...","[0.031657185, -0.07286959, 0.031879846, 0.0158...",6
4,VoodooKing,I have 412+ and faced issues because wireguard...,"[-0.009881736, -0.023968354, -0.0071312725, 0....",6
...,...,...,...,...
3271,B1u3Chips_,I’m looking into applying for veterinary nursi...,"[0.06685015, -0.11729141, 0.01155566, -0.01828...",4
3272,Daktari2018,Good for you for sticking to standards of care...,"[0.023961065, -0.009860428, 0.00067358464, -0....",0
3273,Sheepb1,"Yes feel free to ask someone to double check, ...","[-0.021498615, -0.031337075, 0.020569805, -0.0...",0
3274,Elyrath,"Same! Helps massively. Errors can still occur,...","[-0.037924305, -0.0030589858, -0.011220716, -0...",2


In [None]:
pd.set_option('display.width', None)

In [16]:
comments_0 = df[df['cluster']==0] # vet doctors

In [17]:
comments_0['comments'].to_csv('0.csv', index=False)

In [18]:
comments_1 = df[df['cluster']==1] # other
comments_1['comments'].to_csv('1.csv', index=False)

In [19]:
comments_2 = df[df['cluster']==2] # other
comments_2['comments'].to_csv('2.csv', index=False)

In [20]:
comments_3 = df[df['cluster']==3] # other
comments_3['comments'].to_csv('3.csv', index=False)

In [None]:
comments['comments'][13]

'But does it mean that something is wrong with my setup?|But does it mean that something is wrong with my setup?|But does it mean that something is wrong with my setup?|But does it mean that something is wrong with my setup?|But does it mean that something is wrong with my setup?|But does it mean that something is wrong with my setup?|But does it mean that something is wrong with my setup?|But does it mean that something is wrong with my setup?|But does it mean that something is wrong with my setup?|But does it mean that something is wrong with my setup?|But does it mean that something is wrong with my setup?|But does it mean that something is wrong with my setup?|But does it mean that something is wrong with my setup?|But does it mean that something is wrong with my setup?|But does it mean that something is wrong with my setup?|But does it mean that something is wrong with my setup?|But does it mean that something is wrong with my setup?|But does it mean that something is wrong with m