In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.semi_supervised import LabelPropagation
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import numpy as np


In [59]:
labeled_data = [("This is a document about IBM", "IBM"), 
                ("This is a news article", "News"),
                ("Another document about sports", "Sports"),
                ("A text sample about politics", "Politics"),
                ("A document discussing music", "Music")]

unlabeled_data = ["This document discusses IBM", 
                  "Another document about News", 
                  "A short text sample"]

In [60]:
all_data = [text for text, _ in labeled_data] + unlabeled_data
texts, labels = zip(*labeled_data)
vectorizer = TfidfVectorizer(max_features=500)
 
features_dense = features.toarray()
all_labels = sorted(set(labels))
label_distributions = np.zeros((len(texts), len(all_labels)))
for i, label in enumerate(labels):
    label_distributions[i, all_labels.index(label)] = 1


In [61]:
X_train, X_test, y_train, y_test = train_test_split(features_dense[:len(texts)], labels, test_size=0.2, random_state=40)
y_train_indices = np.array([all_labels.index(label) for label in y_train])
semi_clf = LabelPropagation()
semi_clf.fit(X_train, y_train_indices) 
predictions = semi_clf.predict(X_test)
accuracy = accuracy_score(np.array([all_labels.index(label) for label in y_test]), predictions)
precision = precision_score(np.array([all_labels.index(label) for label in y_test]), predictions, average='weighted', labels=np.unique(predictions))
recall = recall_score(np.array([all_labels.index(label) for label in y_test]), predictions, average='weighted', labels=np.unique(predictions))


In [63]:
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")

Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
