In [4]:
import numpy as np
from scipy import sparse
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Step 1: Sample dataset
# Each entry pairs a review snippet with its sentiment label
# (1 = Positive, 0 = Negative); order is preserved in both output lists.
_labeled_samples = [
    ("I loved this movie", 1),
    ("What a fantastic film", 1),
    ("Absolutely great experience", 1),
    ("This movie was terrible", 0),
    ("I hated the acting", 0),
    ("Worst movie ever", 0),
    ("A masterpiece of cinema", 1),
    ("Horrible direction", 0),
    ("Great plot and story", 1),
    ("Boring and slow", 0),
]
texts = [snippet for snippet, _ in _labeled_samples]
labels = [sentiment for _, sentiment in _labeled_samples]

# Step 2: Manual selection to balance classes
# Hand-picked seed set: two positive and two negative reviews, each stored
# next to its label (1 = Positive, 0 = Negative).
_seed_pairs = (
    ("I loved this movie", 1),
    ("Absolutely great experience", 1),
    ("Worst movie ever", 0),
    ("I hated the acting", 0),
)
# Split the pairs into parallel lists of texts and labels.
X_labeled_texts, y_labeled = (list(column) for column in zip(*_seed_pairs))

# Use remaining as unlabeled
# The trailing comments record the true sentiment for the reader only;
# the code below never sees these labels.
X_unlabeled_texts = [
    "What a fantastic film",    # Positive
    "This movie was terrible",  # Negative
    "A masterpiece of cinema",  # Positive
    "Horrible direction",       # Negative
    "Great plot and story",     # Positive
    "Boring and slow",          # Negative
]


# Step 3: Define two views (BOW and TF-IDF)
# View 1 is raw token counts, view 2 is TF-IDF weights. Both vectorizers are
# fitted on the labeled seed texts only and then applied to each text pool.
vectorizer1 = CountVectorizer()
vectorizer2 = TfidfVectorizer()
vectorizer1.fit(X_labeled_texts)
vectorizer2.fit(X_labeled_texts)

# Labeled feature matrices, one per view.
X1_labeled = vectorizer1.transform(X_labeled_texts)
X2_labeled = vectorizer2.transform(X_labeled_texts)

# Unlabeled pool projected into the same two feature spaces.
X1_unlabeled = vectorizer1.transform(X_unlabeled_texts)
X2_unlabeled = vectorizer2.transform(X_unlabeled_texts)

# Step 4: Initialize models
# One independent multinomial Naive Bayes classifier per view
# (model1 for the count view, model2 for the TF-IDF view).
model1, model2 = MultinomialNB(), MultinomialNB()

# Step 5: Co-training loop
# Each round fits one classifier per view on the current labeled set, scores
# the unlabeled pool with both, and promotes every sample on which the two
# classifiers predict the same class with probability above
# CONFIDENCE_THRESHOLD. Promoted samples move from the unlabeled pool to the
# labeled set (with the agreed pseudo-label) in BOTH views.
CONFIDENCE_THRESHOLD = 0.9  # minimum class probability required from each model
MAX_ROUNDS = 2              # upper bound on co-training iterations

for round_num in range(MAX_ROUNDS):
    print(f"\n🔁 Round {round_num+1}")

    model1.fit(X1_labeled, y_labeled)
    model2.fit(X2_labeled, y_labeled)

    # Stop once the pool is used up: there is nothing left to score, and
    # scikit-learn estimators reject a zero-row input.
    if X1_unlabeled.shape[0] == 0:
        print("🚫 No unlabeled samples left.")
        break

    probs1 = model1.predict_proba(X1_unlabeled)
    probs2 = model2.predict_proba(X2_unlabeled)

    # Row-wise mask: both models are confident and pick the same class column.
    # Both models are trained on the same y_labeled, so equal column indices
    # mean equal class labels.
    same_class = probs1.argmax(axis=1) == probs2.argmax(axis=1)
    confident = (
        same_class
        & (probs1.max(axis=1) > CONFIDENCE_THRESHOLD)
        & (probs2.max(axis=1) > CONFIDENCE_THRESHOLD)
    )
    confident_idx = np.flatnonzero(confident)

    if confident_idx.size == 0:
        print("🚫 No confident samples this round.")
        break

    # Add confident predictions to labeled set
    X1_new = X1_unlabeled[confident_idx]
    X2_new = X2_unlabeled[confident_idx]
    y_new = model1.predict(X1_new)  # model2 agrees on these rows by construction

    # Each view receives its OWN features for the new rows; only the
    # pseudo-labels are shared between views. Stacking with scipy keeps the
    # matrices sparse, so they have the same type in every round.
    X1_labeled = sparse.vstack([X1_labeled, X1_new], format="csr")
    X2_labeled = sparse.vstack([X2_labeled, X2_new], format="csr")
    y_labeled = np.concatenate([y_labeled, y_new])

    # Remove used samples
    remaining_idx = np.flatnonzero(~confident)
    X1_unlabeled = X1_unlabeled[remaining_idx]
    X2_unlabeled = X2_unlabeled[remaining_idx]

    print(f"✅ Added {len(confident_idx)} new samples.")
else:
    # The loop ran out of rounds right after growing the labeled set, so the
    # models have not yet seen the last batch of pseudo-labels. Refit them so
    # later predictions use everything that was labeled.
    model1.fit(X1_labeled, y_labeled)
    model2.fit(X2_labeled, y_labeled)

# Step 6: Evaluate
# Classify a handful of review snippets with the count-view model (model1)
# and print a human-readable sentiment for each.
test_texts = [
    "Absolutely great experience",
    "So bad and boring",
    "Brilliantly directed",
    "Poor screenplay",
]
X1_test = vectorizer1.transform(test_texts)
y_pred = model1.predict(X1_test)

print("\n🧪 Predictions on new data:")
for sample, predicted in zip(test_texts, y_pred):
    # Label 1 is Positive; anything else is reported as Negative.
    if predicted == 1:
        sentiment = "Positive"
    else:
        sentiment = "Negative"
    print(f"{sample} => {sentiment}")



🔁 Round 1
🚫 No confident samples this round.

🧪 Predictions on new data:
Absolutely great experience => Positive
So bad and boring => Negative
Brilliantly directed => Negative
Poor screenplay => Negative
