In [1]:
from ctransformers import AutoModelForCausalLM

# Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
llm = AutoModelForCausalLM.from_pretrained(
    model_path_or_repo_id="/mnt/DATA/THESIS/playground/models/mistral-7b-instruct-v0.1.Q5_K_M.gguf",
    model_file="./models/mistral-7b-instruct-v0.1.Q5_K_M.gguf",
    model_type="mistral", 
    gpu_layers=30)


In [2]:
import os
import re

# Define a function to preprocess text
def preprocess_text(text):
    
    text = text.lower()
    # Split the text into sentences
    sentences = re.split(r'[.!?]', text)
    
    for i,sentence in enumerate(sentences):
        sentence = sentence.split("|")[-1]
        sentence = re.sub(r'[^a-zA-Z\s]', '', sentence)
        sentences[i] = sentence.split("|")[-1]
    
    return sentences

# Load the dataset from a folder
dataset_folder = "documents/topic_model_dataset"
sentences = []

# Iterate over files in the dataset folder
for filename in os.listdir(dataset_folder):
    if filename.endswith(".txt"):
        with open(os.path.join(dataset_folder, filename), "r", encoding="utf-8") as file:
            text = file.read()
            sentences.extend(preprocess_text(text))



print(f'length of documents: {len(sentences)}')
print(f"first sentences:\n", "\n".join(sentences[:5]))

length of documents: 107599
first sentences:
 planning to hire a personal trainer
 read these  tips first httpow
rt annamedaris any dads out their who struggled w depression or anxiety after their kid was born
 lets talk
americas problem with diabetes in one map httpow


In [3]:
from pipeline.embeddings.basic_embeddings import Embedding

embed_model = Embedding(model_name="thenlper/gte-small")

embeddings = []

for sentence in sentences[:10]:
    embeddings.append(embed_model.encode({"text":sentence, "source":"test"})["embeddings"])

print(embeddings[:5])

[[[-0.05373017489910126, 0.008957922458648682, 0.03548308089375496, -0.06724701076745987, -0.03307387977838516, 0.01764615625143051, 0.08244088292121887, 0.04874323308467865, 0.03191882371902466, 0.012899615801870823, 0.05522066727280617, -0.1409163922071457, -0.0005719410837627947, 0.011659265495836735, -0.03939105197787285, 0.020896051079034805, -0.025060387328267097, 0.05184738337993622, -0.045636240392923355, 0.0654531940817833, 0.06609681248664856, -0.03381873294711113, -0.05752822011709213, -0.039953384548425674, 0.04631269350647926, 0.05753389745950699, -0.02352314628660679, -0.01644165627658367, -0.03750652074813843, -0.15327997505664825, -0.0404517762362957, -0.02026621624827385, 0.02911100909113884, -0.027495305985212326, -0.02361663058400154, -0.010767251253128052, -0.01067839190363884, 0.04418688267469406, 0.0037337562534958124, 0.08392883837223053, 0.005817645229399204, -0.0008909117896109819, -0.043435242027044296, -0.04243021458387375, 0.013179684057831764, -0.0487609580

In [8]:
print(embeddings[0])

[[-0.05373017489910126, 0.008957922458648682, 0.03548308089375496, -0.06724701076745987, -0.03307387977838516, 0.01764615625143051, 0.08244088292121887, 0.04874323308467865, 0.03191882371902466, 0.012899615801870823, 0.05522066727280617, -0.1409163922071457, -0.0005719410837627947, 0.011659265495836735, -0.03939105197787285, 0.020896051079034805, -0.025060387328267097, 0.05184738337993622, -0.045636240392923355, 0.0654531940817833, 0.06609681248664856, -0.03381873294711113, -0.05752822011709213, -0.039953384548425674, 0.04631269350647926, 0.05753389745950699, -0.02352314628660679, -0.01644165627658367, -0.03750652074813843, -0.15327997505664825, -0.0404517762362957, -0.02026621624827385, 0.02911100909113884, -0.027495305985212326, -0.02361663058400154, -0.010767251253128052, -0.01067839190363884, 0.04418688267469406, 0.0037337562534958124, 0.08392883837223053, 0.005817645229399204, -0.0008909117896109819, -0.043435242027044296, -0.04243021458387375, 0.013179684057831764, -0.04876095801

In [10]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import umap



for i in embeddings[0]:
    print(len(i)) 
embeddings_m = []
for i in embeddings:
    for j in i:
        embeddings_m.append(j)
        
scores = []
for k in range(5,6):
    kmeans = KMeans(n_clusters=k, random_state=42).fit(embeddings_m)
    score = silhouette_score(embeddings_m, kmeans.labels_)
    scores.append(score)

best_k = scores.index(max(scores)) + 5 
documents = KMeans(n_clusters=best_k, random_state=42).fit(embeddings_m)

384
384
384
384
384
384
384
384
384
384
384
384
384
384
384
384
384
384
384
384
384
384
384
384
384
384
384
384
384
384
384
384
384
384
384


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [11]:
best_k

5

In [12]:
documents.__dict__

{'n_clusters': 5,
 'init': 'k-means++',
 'max_iter': 300,
 'tol': 0.0001,
 'n_init': 'warn',
 'verbose': 0,
 'random_state': 42,
 'copy_x': True,
 'algorithm': 'lloyd',
 'n_features_in_': 384,
 '_tol': 3.72367878354634e-08,
 '_n_init': 10,
 '_algorithm': 'lloyd',
 '_n_threads': 6,
 'cluster_centers_': array([[-0.04505173, -0.03900701,  0.03350275, ...,  0.00207929,
          0.02345373,  0.03160206],
        [-0.03098073, -0.0119556 ,  0.04310571, ..., -0.02368924,
          0.03633795,  0.02386844],
        [-0.05539044, -0.00795   ,  0.086402  , ..., -0.0294363 ,
          0.0562095 ,  0.04495492],
        [-0.00987482,  0.02902576,  0.04259352, ..., -0.03112824,
          0.03667639,  0.05444451],
        [-0.05390283, -0.00513945,  0.03749872, ..., -0.03520564,
          0.02635146,  0.01609629]]),
 '_n_features_out': 5,
 'labels_': array([1, 0, 1, 4, 4, 1, 4, 1, 1, 3, 1, 1, 1, 1, 0, 2, 1, 1, 1, 1, 2, 0,
        1, 1, 4, 1, 0, 1, 3, 0, 1, 1, 4, 2, 0, 1, 0, 2, 1, 1, 1, 3, 1, 2,
    

TypeError: only integer scalar arrays can be converted to a scalar index

In [None]:
# k * vocab
X_per_cluster = model.transform(documents)
# D * vocab
X_origin = self.vectorizer_model.transform(origin_documents)

if self.word_select_method == 'tfidf_idfi':
    socres = TFIDF_IDFi(X_per_cluster, X_origin, documents).socre()
elif self.word_select_method == 'tfidf_tfi':
    socres = TFIDF_TFi(X_per_cluster, X_origin, all_documents).socre()
elif self.word_select_method == 'tfi':
    socres = TFi(X_per_cluster).socre()
elif self.word_select_method == 'tfidfi':
    socres = TFIDFi(X_per_cluster).socre()

In [None]:
import pipeline.pseudo_oracle.oracle.SentenceClassiier as p
p

In [None]:
1 3 4 5 6 7 8 9 10 13 14 15 35 37 39 41 42 47 50 53 62 82 83 118 138