In [10]:
! pip install -U sentence-transformers
! pip install lbl2vec



In [11]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import AgglomerativeClustering
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

def train_model(data_path):
    """Cluster the unlabelled texts of a CSV file and name the clusters.

    Reads ``data_path`` with pandas, takes its ``text`` column, fits a TF-IDF
    vectorizer on it, encodes every text with the
    ``bert-base-nli-mean-tokens`` sentence-transformer model and groups the
    embeddings with agglomerative clustering.

    Rows whose ``text`` value is missing are skipped, so ``embeddings`` and
    ``cluster_labels`` are aligned with the remaining texts, not necessarily
    with the rows of the CSV file.

    Args:
        data_path: Path (or anything ``pandas.read_csv`` accepts) of a CSV
            file that contains a ``text`` column.

    Returns:
        A tuple ``(vectorizer, model, embeddings, cluster_labels,
        cluster_types)``: the fitted ``TfidfVectorizer``, the loaded
        ``SentenceTransformer``, the embeddings returned by ``model.encode``,
        the cluster id assigned to each embedding, and a dict mapping each
        cluster id to a human-readable type name.

    Raises:
        KeyError: If the CSV file has no ``text`` column.
        ValueError: If there are fewer usable texts than clusters.
    """
    # Define appropriate types for each cluster (adjust as needed).
    # NOTE: these names are fixed placeholders; nothing in this function
    # checks that a cluster's content matches its name.
    cluster_types = {
        0: 'News',
        1: 'Customer Reviews',
        2: 'Social Media',
        3: 'Technical Discussions',
        4: 'Blogs/Opinions',
    }
    # Derive the cluster count from the mapping so every label produced by
    # the clustering is guaranteed to have a name.
    n_clusters = len(cluster_types)

    # Load the unlabelled data. Missing values are dropped and the rest is
    # converted to a plain list of strings, because both the TF-IDF
    # vectorizer and the sentence encoder operate on string documents.
    data = pd.read_csv(data_path)
    text_samples = data['text'].dropna().astype(str).tolist()

    # Fail early, before the (expensive) transformer model is loaded, when
    # the clustering could not produce the requested number of clusters.
    if len(text_samples) < max(n_clusters, 2):
        raise ValueError(
            f"Need at least {max(n_clusters, 2)} non-empty text samples to "
            f"form {n_clusters} clusters, got {len(text_samples)}."
        )

    # Fit the TF-IDF vectorizer. Only the fitted vectorizer is part of the
    # return value, so the transformed matrix is not materialised.
    vectorizer = TfidfVectorizer().fit(text_samples)

    # Encode text samples using a pre-trained transformer-based model
    model = SentenceTransformer('bert-base-nli-mean-tokens')
    embeddings = model.encode(text_samples)

    # Perform hierarchical clustering on the text embeddings
    clustering = AgglomerativeClustering(n_clusters=n_clusters)
    cluster_labels = clustering.fit_predict(embeddings)

    return vectorizer, model, embeddings, cluster_labels, cluster_types

def classify_text(text, vectorizer, model, embeddings, cluster_labels, cluster_types):
    """Return the cluster type name of the stored sample closest to ``text``.

    The input is encoded with ``model``, compared by cosine similarity with
    every row of ``embeddings``, and the cluster of the single most similar
    stored sample decides the result.

    Args:
        text: The text to classify.
        vectorizer: Accepted for signature compatibility; not used here.
        model: Object whose ``encode`` method turns a list of texts into
            embeddings.
        embeddings: Embeddings of the reference samples.
        cluster_labels: Cluster id of each reference sample, indexed like
            ``embeddings``.
        cluster_types: Mapping from cluster id to type name.

    Returns:
        The entry of ``cluster_types`` for the nearest sample's cluster.
    """
    # The encoder is called with a one-element batch.
    query_vector = model.encode([text])

    # Row 0 holds the similarity of the query to every reference embedding.
    scores = cosine_similarity(query_vector, embeddings)[0]

    # Nearest-neighbour lookup: position of the best-matching sample, then
    # that sample's cluster id, then the cluster's name.
    nearest_sample_idx = np.argmax(scores)
    return cluster_types[cluster_labels[nearest_sample_idx]]
