In [1]:
# Block 1: Import libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, clear_output
import joblib
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

Libraries imported successfully!


In [2]:
# Load the labeled comments that the initial model is trained on.
# NOTE(review): latin1 decoding never fails, so a UTF-8 file would load silently
# as mojibake - confirm the real encoding of the CSV.
labeled_df = pd.read_csv('labeled_comments.csv', encoding="latin1")
labeled_df.head()

Unnamed: 0,video_id,author,comment,label
0,g_fkq6WHcOs,@DesiCahaya-c7r,Gambar kebaretnya boongan ternyata,0
1,g_fkq6WHcOs,@CelineElviraNatalia,"Buat pemakai biasa sama saja iphone 16 promax,...",0
2,g_fkq6WHcOs,@borisdominggo,Case poco&quot; ??,1
3,g_fkq6WHcOs,@noname-jb4rz,Kaum miFans dan kaum munafik iri dengki lg kum...,0
4,g_fkq6WHcOs,@SIAPA-e8b,Saya mah nunggu S26 Ultra aja,0


In [3]:
# Load the comments that have no label yet; later cells split them into a
# verification set and a pseudo-labeling set.
# NOTE(review): same latin1 caveat as for the labeled file - confirm the encoding.
unlabeled_df = pd.read_csv('unlabeled_data.csv', encoding="latin1")
unlabeled_df.head()


Unnamed: 0,video_id,author,comment,Unnamed: 3
0,O2xb1lVqUv4,@rsa2147,"<a href=""https://www.youtube.com/watch?v=O2xb1...",
1,O2xb1lVqUv4,@AntingWulanKuliner,Ngapain beli iphon yg harga nya menjajah??????...,
2,O2xb1lVqUv4,@GateofNusantara,Hp China udh bagus2 dan tahan lama. Kamera jug...,
3,O2xb1lVqUv4,@sabilurrosyidtowilliam7163,Kalo Kualitasnya Semua Brand Android dari segi...,
4,O2xb1lVqUv4,@nodengki,Brand HP paling worth it yg pernah sy pinang.<...,


In [4]:
# Share of labeled vs. unlabeled comments, relative to both frames combined.
total = len(labeled_df) + len(unlabeled_df)
print(f"Labeled percentage: {len(labeled_df)/total*100:.1f}%")
print(f"Unlabeled percentage: {len(unlabeled_df)/total*100:.1f}%")

# Class balance of the labeled set, ordered by label value.
print(labeled_df['label'].value_counts().sort_index())

Labeled percentage: 26.4%
Unlabeled percentage: 73.6%
label
0    381
1    575
2    136
Name: count, dtype: int64


In [5]:
# Features and targets for the initial model; missing comments become empty strings.
X = labeled_df['comment'].fillna('')
y = labeled_df['label']

# TF-IDF over unigrams and bigrams, capped at 2000 features.
vectorizer = TfidfVectorizer(
  max_features=2000,
  ngram_range=(1, 2),
  min_df=2,
  max_df=0.95,
  stop_words=None
)

X_vec = vectorizer.fit_transform(X)

# Train SVM
base_svm = SVC(
  kernel='linear',
  probability=False,
  random_state=42,
  class_weight='balanced'
)

# The calibration folds are stratified, so the fold count is limited by the
# size of the rarest class, not by the number of rows.
calibration_folds = int(min(5, y.value_counts().min()))

# Sigmoid calibration wraps the SVM so that predict_proba is available.
model = CalibratedClassifierCV(
  base_svm,
  method='sigmoid',
  cv=calibration_folds
)

model.fit(X_vec, y)

print("Initial SVM model trained successfully!")
print(f"Training data: {len(labeled_df)} comments")


# These figures are computed on the same rows the model was fitted on, so they
# describe the fit and are not an estimate of accuracy on unseen comments.
train_predictions = model.predict(X_vec)
train_accuracy = (train_predictions == y).mean()
print(f"Training accuracy: {train_accuracy:.3f}")

print(classification_report(y, train_predictions))

Initial SVM model trained successfully!
Training data: 1092 comments
Training accuracy: 0.867
              precision    recall  f1-score   support

           0       0.85      0.94      0.89       381
           1       0.88      0.92      0.90       575
           2       0.91      0.44      0.59       136

    accuracy                           0.87      1092
   macro avg       0.88      0.77      0.79      1092
weighted avg       0.87      0.87      0.86      1092



In [6]:
# Block 1: Split Unlabeled Data into 70% for Verification and 30% for Pseudo-labeling
def split_unlabeled_data(unlabeled_df, verification_ratio=0.7, pseudo_ratio=0.3):
    """
    Split unlabeled data into a verification set and a pseudo-labeling set.

    Rows are assigned by position, so the two parts are always disjoint and
    together cover every input row, even when index labels repeat. The
    verification rows are drawn at random with a fixed seed (42); the
    pseudo-labeling set is everything else, in its original order.

    Args:
        unlabeled_df: DataFrame of comments to split.
        verification_ratio: Share of rows for the verification set.
        pseudo_ratio: Share of rows for the pseudo-labeling set. Only used for
            normalisation and reporting; the pseudo set always receives the
            rows that were not sampled for verification.

    Returns:
        Tuple ``(verification_set, pseudo_set)``.

    Raises:
        ValueError: If a ratio is negative or both ratios are zero.

    Side effects:
        Writes 'unlabeled_verification.csv' and 'unlabeled_pseudo.csv' to the
        current working directory.
    """
    print("Splitting unlabeled data into verification and pseudo-labeling sets...")
    print(f"Total unlabeled data: {len(unlabeled_df)} comments")

    if verification_ratio < 0 or pseudo_ratio < 0:
        raise ValueError("verification_ratio and pseudo_ratio must be non-negative")

    # Verify ratios sum to 1.0
    total_ratio = verification_ratio + pseudo_ratio
    if total_ratio <= 0:
        raise ValueError("verification_ratio and pseudo_ratio must not both be zero")
    if abs(total_ratio - 1.0) > 0.01:
        print(f"Warning: Ratios sum to {total_ratio}, normalizing to 1.0")
        verification_ratio = verification_ratio / total_ratio
        pseudo_ratio = pseudo_ratio / total_ratio

    # Calculate sizes (clamped: ratios inside the 0.01 tolerance may still exceed 1.0)
    n_rows = len(unlabeled_df)
    verification_size = min(n_rows, int(n_rows * verification_ratio))
    pseudo_size = n_rows - verification_size

    print(f"Verification set (Active Learning): {verification_size} comments ({verification_ratio*100}%)")
    print(f"Pseudo-labeling set: {pseudo_size} comments ({pseudo_ratio*100}%)")

    # Split the data by row position rather than by index label: dropping by
    # label would also remove unsampled rows that share a label with a sampled one.
    picked_positions = (
        pd.Series(np.arange(n_rows))
        .sample(n=verification_size, random_state=42)
        .to_numpy()
    )
    in_verification = np.zeros(n_rows, dtype=bool)
    in_verification[picked_positions] = True

    verification_set = unlabeled_df.iloc[picked_positions]
    pseudo_set = unlabeled_df.iloc[~in_verification]

    # Verify the split
    print(f"\nSplit verification:")
    print(f"Verification set actual: {len(verification_set)}")
    print(f"Pseudo set actual: {len(pseudo_set)}")
    print(f"Total after split: {len(verification_set) + len(pseudo_set)}")

    # Save the splits
    verification_set.to_csv('unlabeled_verification.csv', index=False)
    pseudo_set.to_csv('unlabeled_pseudo.csv', index=False)

    print(f"\nSplit completed and saved:")
    print(f"  - unlabeled_verification.csv")
    print(f"  - unlabeled_pseudo.csv")

    return verification_set, pseudo_set

# Split the unlabeled data: 70% goes to manual verification, the remaining 30%
# is kept for pseudo-labeling.
verification_set, pseudo_set = split_unlabeled_data(unlabeled_df, verification_ratio=0.7, pseudo_ratio=0.3)

Splitting unlabeled data into verification and pseudo-labeling sets...
Total unlabeled data: 3047 comments
Verification set (Active Learning): 2132 comments (70.0%)
Pseudo-labeling set: 915 comments (30.0%)

Split verification:
Verification set actual: 2132
Pseudo set actual: 915
Total after split: 3047

Split completed and saved:
  - unlabeled_verification.csv
  - unlabeled_pseudo.csv


In [8]:
def get_sentiment_name(label):
    """Return the sentiment name for a class label (0, 1 or 2), or 'Unknown' for anything else."""
    names_by_label = dict(zip((0, 1, 2), ('Negative', 'Neutral', 'Positive')))
    if label in names_by_label:
        return names_by_label[label]
    return 'Unknown'

# Smoke test: print the name for every mapped label plus one label that has no mapping.
print("Testing get_sentiment_name function:")
print(f"0 -> {get_sentiment_name(0)}")
print(f"1 -> {get_sentiment_name(1)}")
print(f"2 -> {get_sentiment_name(2)}")
print(f"5 -> {get_sentiment_name(5)}")  # Should return 'Unknown'

Testing get_sentiment_name function:
0 -> Negative
1 -> Neutral
2 -> Positive
5 -> Unknown


In [9]:
# Block 2: Apply Model to 70% Verification Set
def apply_model_to_verification_set(model, vectorizer, verification_set):
    """
    Score the verification set with a fitted model and save the predictions.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``
            (and, if available, ``classes_``).
        vectorizer: Fitted vectorizer whose ``transform`` output the model accepts.
        verification_set: DataFrame with a 'comment' column.

    Returns:
        A copy of ``verification_set`` with the added columns 'model_label',
        'model_confidence', 'prob_negative', 'prob_neutral', 'prob_positive'
        and 'confidence_level'.

    Side effects:
        Writes 'model_predictions_70percent_verification.csv' to the current
        working directory and prints summary statistics.
    """
    print("Applying model to 70% verification set...")
    print(f"Verification set size: {len(verification_set)} comments")

    # Prepare features
    X_verification = verification_set['comment'].fillna('')

    # Transform using the fitted vectorizer
    X_verification_vec = vectorizer.transform(X_verification)

    # Get predictions and probabilities
    predictions = model.predict(X_verification_vec)
    probabilities = model.predict_proba(X_verification_vec)
    max_probs = np.max(probabilities, axis=1)

    # Add predictions to dataframe
    predicted_verification = verification_set.copy()
    predicted_verification['model_label'] = predictions
    predicted_verification['model_confidence'] = max_probs

    # predict_proba columns are ordered like model.classes_, so look each class
    # up by label rather than by position. A class the model never saw gets
    # probability 0.0 instead of raising IndexError or shifting the columns.
    known_classes = list(getattr(model, 'classes_', range(probabilities.shape[1])))
    column_of_class = {cls: position for position, cls in enumerate(known_classes)}
    for class_label, column_name in ((0, 'prob_negative'), (1, 'prob_neutral'), (2, 'prob_positive')):
        position = column_of_class.get(class_label)
        predicted_verification[column_name] = (
            probabilities[:, position] if position is not None else 0.0
        )

    # Categorize by confidence levels. Bins are left-closed, so the last edge
    # is infinity: with an upper edge of 1.0 a confidence of exactly 1.0 would
    # fall outside every bin and become NaN.
    predicted_verification['confidence_level'] = pd.cut(
        max_probs,
        bins=[0, 0.6, 0.8, 0.9, np.inf],
        labels=['low', 'medium', 'high', 'very_high'],
        right=False
    )

    # Display prediction statistics
    print("\nModel Prediction Statistics for Verification Set:")
    print(f"Total predictions: {len(predicted_verification)}")

    print("\nConfidence Distribution:")
    conf_counts = predicted_verification['confidence_level'].value_counts()
    for level, count in conf_counts.items():
        percentage = count / len(predicted_verification) * 100
        print(f"  {level}: {count} comments ({percentage:.1f}%)")

    print("\nPredicted Label Distribution:")
    label_counts = predicted_verification['model_label'].value_counts().sort_index()
    for label, count in label_counts.items():
        sentiment = get_sentiment_name(label)
        percentage = count / len(predicted_verification) * 100
        print(f"  {sentiment}: {count} comments ({percentage:.1f}%)")

    # Save predictions for verification
    predicted_verification.to_csv('model_predictions_70percent_verification.csv', index=False)
    print(f"\n✓ Predictions saved to 'model_predictions_70percent_verification.csv'")

    return predicted_verification

# Apply model to 70% verification set
# (uses the initial `model` and `vectorizer` fitted on labeled_df in the training cell)
predicted_verification_df = apply_model_to_verification_set(model, vectorizer, verification_set)

Applying model to 70% verification set...
Verification set size: 2132 comments

Model Prediction Statistics for Verification Set:
Total predictions: 2132

Confidence Distribution:
  low: 1096 comments (51.4%)
  medium: 1036 comments (48.6%)
  high: 0 comments (0.0%)
  very_high: 0 comments (0.0%)

Predicted Label Distribution:
  Negative: 1010 comments (47.4%)
  Neutral: 1024 comments (48.0%)
  Positive: 98 comments (4.6%)

✓ Predictions saved to 'model_predictions_70percent_verification.csv'


In [10]:
# Block 3: Prepare Smart Verification Queue for 70% Data
def prepare_verification_queue(predicted_verification_df):
    """
    Order the scored comments so the least confident predictions come first.

    Args:
        predicted_verification_df: DataFrame with 'model_confidence' and
            'confidence_level' columns.

    Returns:
        A copy of the input with an added 'uncertainty' column
        (1 - model_confidence), sorted by descending uncertainty. Rows with
        equal uncertainty keep their input order.

    Side effects:
        Writes 'manual_verification.csv' to the current working directory and
        prints summary statistics.
    """
    print("Preparing smart verification queue for 70% verification set...")

    # Calculate uncertainty scores (1 - confidence)
    verification_queue = predicted_verification_df.copy()
    verification_queue['uncertainty'] = 1 - verification_queue['model_confidence']

    # Sort by uncertainty (most uncertain first). A stable sort keeps ties in
    # input order, so re-running the cell reproduces the same queue.
    verification_queue = verification_queue.sort_values(
        'uncertainty', ascending=False, kind='stable'
    )

    print("Verification queue prepared with uncertainty sampling:")
    print(f"Total comments in queue: {len(verification_queue)}")
    if len(verification_queue) > 0:
        print(f"Most uncertain confidence: {verification_queue['model_confidence'].iloc[0]:.3f}")
        print(f"Least uncertain confidence: {verification_queue['model_confidence'].iloc[-1]:.3f}")
        print(f"Average confidence: {verification_queue['model_confidence'].mean():.3f}")
        print(f"Average uncertainty: {verification_queue['uncertainty'].mean():.3f}")
    else:
        # .iloc[0] on an empty frame would raise IndexError.
        print("Queue is empty - no confidence statistics to report.")

    # Display queue composition
    print("\nQueue composition by confidence level:")
    queue_conf_counts = verification_queue['confidence_level'].value_counts()
    for level, count in queue_conf_counts.items():
        percentage = count / len(verification_queue) * 100
        print(f"  {level}: {count} comments ({percentage:.1f}%)")

    # Save the verification queue
    verification_queue.to_csv('manual_verification.csv', index=False)
    print(f"\n✓ Verification queue saved to 'manual_verification.csv'")

    return verification_queue

# Prepare verification queue
# The queue is reloaded from 'manual_verification.csv' (the file name that
# prepare_verification_queue() writes to), so this cell needs that file to exist.
# Uncomment the next line to rebuild the queue from predicted_verification_df instead.
# verification_queue = prepare_verification_queue(predicted_verification_df)
verification_queue = pd.read_csv('manual_verification.csv')

In [None]:
# Block 4: Colab-Compatible Verification for 70% Data
def verify_data(verification_queue, batch_size=50, total_to_verify=None):
    """
    Interactively collect human labels for the comments in a verification queue.

    Every comment is printed with the model's prediction and the user answers
    through ``input()``: Enter accepts the prediction, 0/1/2 sets the label
    explicitly and 's' skips the comment. After each batch the rows verified so
    far are written to 'verification_70percent_batch_<n>.csv'.

    Args:
        verification_queue: DataFrame with the columns 'comment',
            'confidence_level', 'model_confidence', 'model_label',
            'prob_negative', 'prob_neutral' and 'prob_positive'. Index labels
            are used to write the answers back, so they should be unique.
        batch_size: Number of comments presented per batch (must be >= 1).
        total_to_verify: Maximum number of queue rows to present; None means
            the whole queue.

    Returns:
        Tuple ``(verified_df, total_verified, total_corrections)``.
        ``verified_df`` is a copy of the presented rows with the columns
        'human_label', 'verified' and 'verification_notes' added. The counters
        include a batch that was cut short by an interruption.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    print("Starting verification")

    if total_to_verify is None:
        total_to_verify = len(verification_queue)
    else:
        total_to_verify = min(total_to_verify, len(verification_queue))

    print(f"Total comments to verify: {total_to_verify}")
    print(f"Batch size: {batch_size}")

    # Create working copy
    verified_df = verification_queue.head(total_to_verify).copy()
    verified_df['human_label'] = np.nan
    verified_df['verified'] = False
    verified_df['verification_notes'] = ''

    total_verified = 0
    total_corrections = 0
    batch_number = 1

    # Process in batches
    for start_idx in range(0, len(verified_df), batch_size):
        end_idx = min(start_idx + batch_size, len(verified_df))
        current_batch = verified_df.iloc[start_idx:end_idx]

        print(f"\n{'='*60}")
        print(f"BATCH {batch_number} - Comments {start_idx + 1} to {end_idx}")
        print(f"Total target: {total_to_verify} comments")
        print(f"{'='*60}")

        batch_corrections = 0
        interrupted = False

        for i, (idx, row) in enumerate(current_batch.iterrows()):
            global_index = start_idx + i + 1

            print(f"\n" + "="*80)
            print(f"COMMENT {global_index}/{total_to_verify} (Batch {batch_number}.{i+1})")
            print("="*80)

            # Display comment information
            print(f"CONFIDENCE: {row['confidence_level']} ({row['model_confidence']:.3f})")
            print("-"*80)
            print(f"COMMENT:\n{row['comment']}")
            print("-"*80)

            # Display model prediction with probabilities
            print("MODEL PREDICTION:")
            print(f"  Label: {row['model_label']} ({get_sentiment_name(row['model_label'])})")
            print(f"  Confidence: {row['model_confidence']:.3f}")
            print(f"  Probabilities: Neg={row['prob_negative']:.3f}, Neu={row['prob_neutral']:.3f}, Pos={row['prob_positive']:.3f}")
            print("-"*80)

            # Verification options
            print("VERIFICATION OPTIONS:")
            print("0: Negative")
            print("1: Neutral")
            print("2: Positive")
            print("Enter: Accept model prediction")
            print("s: Skip this comment")
            print("="*80)

            while True:
                try:
                    choice = input("Enter your choice: ").strip().lower()
                except (KeyboardInterrupt, EOFError):
                    # EOFError means stdin is closed (non-interactive run);
                    # prompting again would loop forever, so stop instead.
                    print("\nVerification interrupted by user")
                    interrupted = True
                    break

                if choice == '':
                    # Accept model prediction
                    verified_df.at[idx, 'human_label'] = row['model_label']
                    verified_df.at[idx, 'verified'] = True
                    print("✓ Accepted model prediction")
                    break

                if choice in ('0', '1', '2'):
                    human_label = int(choice)
                    verified_df.at[idx, 'human_label'] = human_label
                    verified_df.at[idx, 'verified'] = True

                    if human_label != row['model_label']:
                        batch_corrections += 1
                        print(f"✓ Corrected from {row['model_label']} to {human_label}")
                    else:
                        print("✓ Confirmed model prediction")
                    break

                if choice == 's':
                    print("Skipped this comment")
                    break

                print("Invalid choice! Please enter 0, 1, 2, Enter, or 's'")

            if interrupted:
                break

        # Update totals after the batch. Count from verified_df itself: the
        # current_batch slice was taken before the edits and is not guaranteed
        # to reflect them (it never does with pandas copy-on-write).
        batch_verified = int(verified_df['verified'].iloc[start_idx:end_idx].sum())
        total_verified += batch_verified
        total_corrections += batch_corrections

        if interrupted:
            return verified_df, total_verified, total_corrections

        overall_accuracy = 1 - (total_corrections / total_verified) if total_verified > 0 else 1.0

        print(f"\n✓ Batch {batch_number} completed!")
        print(f"  Verified in this batch: {batch_verified}")
        print(f"  Corrections in this batch: {batch_corrections}")
        print(f"  Batch accuracy: {1 - (batch_corrections/batch_verified) if batch_verified > 0 else 1.0:.1%}")
        print(f"  Total verified: {total_verified}/{total_to_verify}")
        print(f"  Overall accuracy: {overall_accuracy:.1%}")

        # Save progress after each batch
        verified_so_far = verified_df[verified_df['verified'] == True]
        if len(verified_so_far) > 0:
            verified_so_far.to_csv(f'verification_70percent_batch_{batch_number}.csv', index=False)
            print(f"  Progress saved: 'verification_70percent_batch_{batch_number}.csv'")

        batch_number += 1

        # Ask whether to continue only while unseen comments are left; skipped
        # comments are not shown again, so they must not trigger another prompt.
        if end_idx < len(verified_df):
            remaining = len(verified_df) - end_idx
            try:
                cont = input(f"\n{remaining} comments remaining. Continue to next batch? (y/n): ").strip().lower()
            except (KeyboardInterrupt, EOFError):
                cont = 'n'
            if cont != 'y':
                print("Verification paused by user.")
                break
        elif total_verified >= total_to_verify:
            print(f"\n✓ Target of {total_to_verify} comments reached!")

    print(f"\n VERIFICATION COMPLETED!")
    print(f"Total verified: {total_verified}/{total_to_verify}")
    print(f"Total corrections: {total_corrections}")
    print(f"Final model accuracy: {1 - (total_corrections/total_verified) if total_verified > 0 else 1.0:.1%}")

    return verified_df, total_verified, total_corrections

# Start verification
# Runs the interactive loop over the whole queue in batches of 50; the call
# blocks on input() until every batch is answered or the user stops.
print("Beginning verification of unlabeled data...")
verified_df, total_verified, total_corrections = verify_data(
    verification_queue,
    batch_size=50,
    total_to_verify=len(verification_queue)  # Verify all 70%
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Enter your choice: 
✓ Accepted model prediction

COMMENT 1487/2132 (Batch 30.37)
CONFIDENCE: medium (0.670)
--------------------------------------------------------------------------------
COMMENT:
Design poco, warna kayak infinix yg warna orange ini??<br>Paling aman warna hitam.<br>Atau putih mewah lumayan lah.<br>Tapiii pas dikasih case sangat hp china 2 jutaan??<br>Fix user 17 pro bakal ini bakal alergi sama case ??
--------------------------------------------------------------------------------
MODEL PREDICTION:
  Label: 0 (Negative)
  Confidence: 0.670
  Probabilities: Neg=0.670, Neu=0.152, Pos=0.178
--------------------------------------------------------------------------------
VERIFICATION OPTIONS:
0: Negative
1: Neutral
2: Positive
Enter: Accept model prediction
s: Skip this comment
Enter your choice: 
✓ Accepted model prediction

COMMENT 1488/2132 (Batch 30.38)
CONFIDENCE: medium (0.670)
------------------------

In [11]:
# Block 4: Colab-Compatible Verification for 70% Data WITH RESUME FUNCTIONALITY
def verify_data(verification_queue, batch_size=50, total_to_verify=None, resume_from_batch=None):
    """
    Interactively collect human labels for a verification queue, with resume support.

    Every comment is printed with the model's prediction and the user answers
    through ``input()``: Enter accepts the prediction, 0/1/2 sets the label
    explicitly and 's' skips the comment. After each batch the progress is
    written to 'verification_70percent_batch_<n>.csv'.

    When ``resume_from_batch`` is given, the file of the previous batch is
    loaded and every queue row whose comment text appears in it is removed
    before verification starts. Progress files written afterwards contain the
    previously loaded rows plus the rows verified in this call, so each file
    stays a complete record and can itself be resumed from.

    Args:
        verification_queue: DataFrame with the columns 'comment',
            'confidence_level', 'model_confidence', 'model_label',
            'prob_negative', 'prob_neutral' and 'prob_positive'. Index labels
            are used to write the answers back, so they should be unique.
        batch_size: Number of comments presented per batch (must be >= 1).
        total_to_verify: Maximum number of (remaining) queue rows to present;
            None means all of them.
        resume_from_batch: Batch number to continue with, or None to start at
            batch 1. Progress is read from the file of batch
            ``resume_from_batch - 1``.

    Returns:
        Tuple ``(verified_df, total_verified, total_corrections)`` describing
        the rows presented in THIS call only (earlier sessions live in the
        progress CSV). ``verified_df`` has the columns 'human_label',
        'verified' and 'verification_notes' added. The counters include a
        batch that was cut short by an interruption.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    print("Starting verification")

    # === NEW: RESUME FUNCTIONALITY ===
    previous_progress = None
    if resume_from_batch is not None:
        print(f"🔄 RESUMING FROM BATCH {resume_from_batch}")
        # Try to load previous progress
        try:
            previous_file = f'verification_70percent_batch_{resume_from_batch-1}.csv'
            previous_progress = pd.read_csv(previous_file)
            print(f"📁 Loaded previous progress: {len(previous_progress)} verified comments")

            # Get already verified comments
            # NOTE(review): rows are matched on comment text alone, so distinct
            # rows with identical text are treated as already verified.
            verified_comments = set(previous_progress['comment'].astype(str))

            # Filter out already verified comments
            verification_queue = verification_queue[
                ~verification_queue['comment'].astype(str).isin(verified_comments)
            ]
            print(f"📊 Remaining comments to verify: {len(verification_queue)}")

        except FileNotFoundError:
            print(f"❌ Could not find batch {resume_from_batch-1} file. Starting from beginning.")
    # === END NEW ===

    if total_to_verify is None:
        total_to_verify = len(verification_queue)
    else:
        total_to_verify = min(total_to_verify, len(verification_queue))

    print(f"Total comments to verify: {total_to_verify}")
    print(f"Batch size: {batch_size}")

    # Create working copy
    verified_df = verification_queue.head(total_to_verify).copy()
    verified_df['human_label'] = np.nan
    verified_df['verified'] = False
    verified_df['verification_notes'] = ''

    total_verified = 0
    total_corrections = 0

    # === NEW: SET STARTING BATCH NUMBER ===
    if resume_from_batch is not None:
        batch_number = resume_from_batch
        print(f"🎯 Starting from batch {batch_number}")
    else:
        batch_number = 1
    # === END NEW ===

    # Process in batches
    for start_idx in range(0, len(verified_df), batch_size):
        end_idx = min(start_idx + batch_size, len(verified_df))
        current_batch = verified_df.iloc[start_idx:end_idx]

        print(f"\n{'='*60}")
        print(f"BATCH {batch_number} - Comments {start_idx + 1} to {end_idx}")
        print(f"Total target: {total_to_verify} comments")
        print(f"{'='*60}")

        batch_corrections = 0
        interrupted = False

        for i, (idx, row) in enumerate(current_batch.iterrows()):
            global_index = start_idx + i + 1

            print(f"\n" + "="*80)
            print(f"COMMENT {global_index}/{total_to_verify} (Batch {batch_number}.{i+1})")
            print("="*80)

            # Display comment information
            print(f"CONFIDENCE: {row['confidence_level']} ({row['model_confidence']:.3f})")
            print("-"*80)
            print(f"COMMENT:\n{row['comment']}")
            print("-"*80)

            # Display model prediction with probabilities
            print("MODEL PREDICTION:")
            print(f"  Label: {row['model_label']} ({get_sentiment_name(row['model_label'])})")
            print(f"  Confidence: {row['model_confidence']:.3f}")
            print(f"  Probabilities: Neg={row['prob_negative']:.3f}, Neu={row['prob_neutral']:.3f}, Pos={row['prob_positive']:.3f}")
            print("-"*80)

            # Verification options
            print("VERIFICATION OPTIONS:")
            print("0: Negative")
            print("1: Neutral")
            print("2: Positive")
            print("Enter: Accept model prediction")
            print("s: Skip this comment")
            print("="*80)

            while True:
                try:
                    choice = input("Enter your choice: ").strip().lower()
                except (KeyboardInterrupt, EOFError):
                    # EOFError means stdin is closed (non-interactive run);
                    # prompting again would loop forever, so stop instead.
                    print("\nVerification interrupted by user")
                    interrupted = True
                    break

                if choice == '':
                    # Accept model prediction
                    verified_df.at[idx, 'human_label'] = row['model_label']
                    verified_df.at[idx, 'verified'] = True
                    print("✓ Accepted model prediction")
                    break

                if choice in ('0', '1', '2'):
                    human_label = int(choice)
                    verified_df.at[idx, 'human_label'] = human_label
                    verified_df.at[idx, 'verified'] = True

                    if human_label != row['model_label']:
                        batch_corrections += 1
                        print(f"✓ Corrected from {row['model_label']} to {human_label}")
                    else:
                        print("✓ Confirmed model prediction")
                    break

                if choice == 's':
                    print("Skipped this comment")
                    break

                print("Invalid choice! Please enter 0, 1, 2, Enter, or 's'")

            if interrupted:
                break

        # Update totals after the batch. Count from verified_df itself: the
        # current_batch slice was taken before the edits and is not guaranteed
        # to reflect them (it never does with pandas copy-on-write).
        batch_verified = int(verified_df['verified'].iloc[start_idx:end_idx].sum())
        total_verified += batch_verified
        total_corrections += batch_corrections

        if interrupted:
            return verified_df, total_verified, total_corrections

        overall_accuracy = 1 - (total_corrections / total_verified) if total_verified > 0 else 1.0

        print(f"\n✓ Batch {batch_number} completed!")
        print(f"  Verified in this batch: {batch_verified}")
        print(f"  Corrections in this batch: {batch_corrections}")
        print(f"  Batch accuracy: {1 - (batch_corrections/batch_verified) if batch_verified > 0 else 1.0:.1%}")
        print(f"  Total verified: {total_verified}/{total_to_verify}")
        print(f"  Overall accuracy: {overall_accuracy:.1%}")

        # Save progress after each batch. When resuming, prepend the rows
        # loaded from the previous file: otherwise this file would hold only
        # the current session and a later resume would re-queue older work.
        verified_so_far = verified_df[verified_df['verified'] == True]
        if len(verified_so_far) > 0:
            if previous_progress is not None:
                progress_to_save = pd.concat([previous_progress, verified_so_far], ignore_index=True)
            else:
                progress_to_save = verified_so_far
            progress_to_save.to_csv(f'verification_70percent_batch_{batch_number}.csv', index=False)
            print(f"  Progress saved: 'verification_70percent_batch_{batch_number}.csv'")

        batch_number += 1

        # Ask whether to continue only while unseen comments are left; skipped
        # comments are not shown again, so they must not trigger another prompt.
        if end_idx < len(verified_df):
            remaining = len(verified_df) - end_idx
            try:
                cont = input(f"\n{remaining} comments remaining. Continue to next batch? (y/n): ").strip().lower()
            except (KeyboardInterrupt, EOFError):
                cont = 'n'
            if cont != 'y':
                print("Verification paused by user.")
                break
        elif total_verified >= total_to_verify:
            print(f"\n✓ Target of {total_to_verify} comments reached!")

    print(f"\n VERIFICATION COMPLETED!")
    print(f"Total verified: {total_verified}/{total_to_verify}")
    print(f"Total corrections: {total_corrections}")
    print(f"Final model accuracy: {1 - (total_corrections/total_verified) if total_verified > 0 else 1.0:.1%}")

    return verified_df, total_verified, total_corrections

# === NEW: FUNCTION TO CHECK PROGRESS ===
def check_verification_progress():
    """
    Check which batch files exist and suggest where to resume.

    Looks for 'verification_70percent_batch_<n>.csv' files in the current
    working directory.

    Returns:
        The number of the next batch to verify (highest batch number found
        plus one), or None when no usable batch file exists.
    """
    import glob
    batch_files = glob.glob('verification_70percent_batch_*.csv')

    if not batch_files:
        print("❌ No verification progress found. Start from batch 1.")
        return None

    batch_numbers = []
    for file in batch_files:
        try:
            batch_numbers.append(int(file.split('_')[-1].split('.')[0]))
        except ValueError:
            # The glob also matches names with a non-numeric suffix (for
            # example '..._batch_final.csv'); ignore only those.
            continue

    if not batch_numbers:
        print("❌ No valid batch files found.")
        return None

    latest_batch = max(batch_numbers)
    print(f"📊 Found progress up to batch {latest_batch}")
    print(f"💡 To resume, use: resume_from_batch={latest_batch + 1}")
    return latest_batch + 1

# === CHECK PROGRESS FIRST ===
print("Checking existing verification progress...")
next_batch = check_verification_progress()

# Announce the mode, then make a single verify_data call; resume_from_batch is
# only passed along when earlier batch files were found.
if next_batch is None:
    # START FRESH
    print("\n🚀 STARTING NEW VERIFICATION FROM BATCH 1")
    resume_options = {}
else:
    # RESUME from where you left off
    print(f"\n🔄 RESUMING VERIFICATION FROM BATCH {next_batch}")
    resume_options = {'resume_from_batch': next_batch}

verified_df, total_verified, total_corrections = verify_data(
    verification_queue,
    batch_size=50,
    total_to_verify=len(verification_queue),
    **resume_options
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Enter your choice: 
✓ Accepted model prediction

COMMENT 219/432 (Batch 39.19)
CONFIDENCE: medium (0.712)
--------------------------------------------------------------------------------
COMMENT:
.IP 17 base model Terbaik selanjutnya kah setelah IP 13 base model
--------------------------------------------------------------------------------
MODEL PREDICTION:
  Label: 1 (Neutral)
  Confidence: 0.712
  Probabilities: Neg=0.153, Neu=0.712, Pos=0.135
--------------------------------------------------------------------------------
VERIFICATION OPTIONS:
0: Negative
1: Neutral
2: Positive
Enter: Accept model prediction
s: Skip this comment
Enter your choice: 
✓ Accepted model prediction

COMMENT 220/432 (Batch 39.20)
CONFIDENCE: medium (0.712)
--------------------------------------------------------------------------------
COMMENT:
bahkan, bang david tidak mengetahui spek hpnya sendiri??
----------------------------------------

In [23]:
# Block 5: Combine with Original Data and Update Model
def combine_and_update_model(original_labeled_df, verified_70percent_df, total_comments=4418):
    """
    Combine the original labeled data with the newly verified data and retrain.

    Args:
        original_labeled_df: DataFrame with the columns 'video_id', 'author',
            'comment' and 'label'.
        verified_70percent_df: DataFrame with the columns 'video_id',
            'author', 'comment', 'human_label' and 'verified'. Only rows with
            ``verified == True`` and a non-missing 'human_label' are used.
        total_comments: Size of the full corpus, used only for the printed
            coverage percentage. Defaults to 4418, the value that used to be
            hard-coded here.
            NOTE(review): the notebook's own outputs report 1092 labeled and
            3047 unlabeled comments (4139 in total) - confirm which total is
            intended.

    Returns:
        Tuple ``(combined_df, model_updated, vectorizer_updated)``.

    Side effects:
        Writes 'combined_labeled_data.csv' to the current working directory
        and prints summary statistics.
    """
    print("Combining original 30% data with verified 70% data...")

    # Prepare original labeled data
    original_clean = original_labeled_df[['video_id', 'author', 'comment', 'label']].copy()
    original_clean['label_source'] = 'manual_initial_30percent'

    # Prepare newly verified 70% data
    verified_clean = verified_70percent_df[verified_70percent_df['verified'] == True]
    verified_clean = verified_clean[['video_id', 'author', 'comment', 'human_label']].copy()
    verified_clean = verified_clean.rename(columns={'human_label': 'label'})

    # A row flagged as verified but lacking a label cannot be trained on.
    missing_labels = int(verified_clean['label'].isna().sum())
    if missing_labels > 0:
        print(f"Warning: dropping {missing_labels} verified rows without a label")
        verified_clean = verified_clean.dropna(subset=['label'])

    verified_clean['label_source'] = 'manual_verified_done'

    # Combine datasets
    combined_100percent_df = pd.concat([original_clean, verified_clean], ignore_index=True)

    print(f"Combined dataset created:")
    print(f"  Original manual labels (30%): {len(original_clean)}")
    print(f"  Newly verified labels (70%): {len(verified_clean)}")
    print(f"  Total: {len(combined_100percent_df)} comments")
    if total_comments:
        print(f"  Coverage: {len(combined_100percent_df)/total_comments*100:.1f}% of total data")

    # Analyze the combined dataset
    print(f"\nCOMBINED DATASET ANALYSIS:")
    print("=" * 40)

    # Label distribution
    label_dist = combined_100percent_df['label'].value_counts().sort_index()
    print("Label Distribution:")
    for label, count in label_dist.items():
        sentiment = get_sentiment_name(label)
        percentage = count / len(combined_100percent_df) * 100
        print(f"  {sentiment}: {count} ({percentage:.1f}%)")

    # Source distribution
    source_dist = combined_100percent_df['label_source'].value_counts()
    print(f"\nLabel Sources:")
    for source, count in source_dist.items():
        percentage = count / len(combined_100percent_df) * 100
        print(f"  {source}: {count} ({percentage:.1f}%)")

    # Save combined dataset
    combined_100percent_df.to_csv('combined_labeled_data.csv', index=False)
    print(f"\nCombined dataset saved as 'combined_labeled_data.csv'")

    # Retrain model on combined data
    print("\nRetraining model on 100% labeled data...")
    model_updated, vectorizer_updated = retrain_model_on_combined_data(combined_100percent_df)

    return combined_100percent_df, model_updated, vectorizer_updated

def retrain_model_on_combined_data(combined_df):
    """
    Fit a new TF-IDF vectorizer and a calibrated linear SVM on the combined data.

    Args:
        combined_df: DataFrame with 'comment' and 'label' columns. Rows
            without a label are ignored; missing comments are treated as
            empty strings.

    Returns:
        Tuple ``(model_updated, vectorizer_updated)``.

    Raises:
        ValueError: If no labeled row is left, or if a class has fewer than
            two examples (calibration needs at least two stratified folds).
    """
    # Prepare features and labels
    labeled_rows = combined_df.dropna(subset=['label'])
    if len(labeled_rows) == 0:
        raise ValueError("combined_df contains no labeled rows to train on")

    X_combined = labeled_rows['comment'].fillna('')
    y_combined = labeled_rows['label']

    # Create new vectorizer with expanded vocabulary
    vectorizer_updated = TfidfVectorizer(
        max_features=4000,  # Increased features for larger dataset
        ngram_range=(1, 2),
        min_df=2,
        max_df=0.95,
        stop_words=None
    )

    X_combined_vec = vectorizer_updated.fit_transform(X_combined)

    # Train SVM on larger dataset
    base_svm_updated = SVC(
        kernel='linear',
        probability=False,
        random_state=42,
        class_weight='balanced'
    )

    # The calibration folds are stratified, so the fold count is limited by
    # the size of the rarest class, not by the number of rows.
    smallest_class_size = int(y_combined.value_counts().min())
    if smallest_class_size < 2:
        raise ValueError("every class needs at least 2 examples for calibration")
    calibration_folds = min(5, smallest_class_size)

    model_updated = CalibratedClassifierCV(
        base_svm_updated,
        method='sigmoid',
        cv=calibration_folds
    )

    model_updated.fit(X_combined_vec, y_combined)

    print("✓ Model retrained on 100% combined data!")
    print(f"Training data size: {len(labeled_rows)} comments")

    # Evaluate on training data (in-sample, so not an estimate for unseen comments)
    train_predictions = model_updated.predict(X_combined_vec)
    train_accuracy = (train_predictions == y_combined).mean()

    print(f"Training accuracy: {train_accuracy:.3f}")
    print("\nClassification Report:")
    print(classification_report(y_combined, train_predictions))

    return model_updated, vectorizer_updated

# Combine and update model
# NOTE(review): the verified labels are read from 'manual_verification_done.csv',
# a file that no visible cell writes - presumably assembled from the per-batch
# progress CSVs; confirm where it comes from.
verified_df = pd.read_csv('manual_verification_done.csv')
combined_df, model_updated, vectorizer_updated = combine_and_update_model(labeled_df, verified_df)

Combining original 30% data with verified 70% data...
Combined dataset created:
  Original manual labels (30%): 1092
  Newly verified labels (70%): 2132
  Total: 3224 comments
  Coverage: 73.0% of total data

COMBINED DATASET ANALYSIS:
Label Distribution:
  Negative: 1148 (35.6%)
  Neutral: 1735 (53.8%)
  Positive: 341 (10.6%)

Label Sources:
  manual_verified_done: 2132 (66.1%)
  manual_initial_30percent: 1092 (33.9%)

Combined dataset saved as 'combined_labeled_data.csv'

Retraining model on 100% labeled data...
✓ Model retrained on 100% combined data!
Training data size: 3224 comments
Training accuracy: 0.908

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.92      0.91      1148
           1       0.92      0.92      0.92      1735
           2       0.90      0.78      0.84       341

    accuracy                           0.91      3224
   macro avg       0.90      0.88      0.89      3224
weighted avg       0.91      0

In [15]:
# Block 6: Prepare 30% Pseudo-Labeling Set
def prepare_pseudo_labeling_set(pseudo_set, model_updated, vectorizer_updated,
                                output_path='pseudo_labeled.csv'):
    """Label the pseudo-labeling set with the model and save the annotated copy.

    The comments are vectorized with ``vectorizer_updated`` and scored by
    ``model_updated``. The predicted label, the per-class probabilities, the
    top probability and a confidence bucket are added as columns on a copy of
    ``pseudo_set``. A summary is printed and the result is written as CSV.

    Args:
        pseudo_set: DataFrame with a 'comment' column; missing comments are
            treated as empty strings. The input frame itself is not modified.
        model_updated: Fitted classifier exposing ``predict`` and
            ``predict_proba``.
        vectorizer_updated: Fitted vectorizer exposing ``transform``.
        output_path: Destination CSV path for the annotated set. Defaults to
            'pseudo_labeled.csv'.

    Returns:
        A copy of ``pseudo_set`` with the added columns 'model_label',
        'model_confidence', 'prob_negative', 'prob_neutral', 'prob_positive'
        and 'confidence_level'.
    """
    print("Preparing pseudo-labeling set...")
    print(f"Pseudo-labeling set size: {len(pseudo_set)} comments")

    # Apply the updated model to the pseudo-labeling set
    X_pseudo = pseudo_set['comment'].fillna('')
    X_pseudo_vec = vectorizer_updated.transform(X_pseudo)

    predictions = model_updated.predict(X_pseudo_vec)
    probabilities = model_updated.predict_proba(X_pseudo_vec)
    max_probs = np.max(probabilities, axis=1)

    # Add predictions to a copy so the caller's frame stays untouched
    pseudo_labeled_set = pseudo_set.copy()
    pseudo_labeled_set['model_label'] = predictions
    pseudo_labeled_set['model_confidence'] = max_probs
    # NOTE(review): positional indexing assumes predict_proba returns exactly
    # three columns ordered negative, neutral, positive (i.e. the model's
    # classes are [0, 1, 2]) -- confirm against model_updated.classes_.
    pseudo_labeled_set['prob_negative'] = probabilities[:, 0]
    pseudo_labeled_set['prob_neutral'] = probabilities[:, 1]
    pseudo_labeled_set['prob_positive'] = probabilities[:, 2]

    # Categorize by confidence. right=False makes every bin [low, high), so the
    # last edge is nudged just above 1.0; otherwise a confidence of exactly 1.0
    # would fall outside all bins and become NaN instead of 'very_high'.
    confidence_bins = [0, 0.6, 0.8, 0.9, np.nextafter(1.0, 2.0)]
    pseudo_labeled_set['confidence_level'] = pd.cut(
        max_probs,
        bins=confidence_bins,
        labels=['low', 'medium', 'high', 'very_high'],
        right=False
    )

    total_comments = len(pseudo_labeled_set)

    print("\nPseudo-labeling set statistics:")
    print(f"Total comments: {total_comments}")

    print("\nConfidence Distribution:")
    conf_counts = pseudo_labeled_set['confidence_level'].value_counts()
    for level, count in conf_counts.items():
        percentage = count / total_comments * 100
        print(f"  {level}: {count} comments ({percentage:.1f}%)")

    print("\nPredicted Label Distribution:")
    label_counts = pseudo_labeled_set['model_label'].value_counts().sort_index()
    for label, count in label_counts.items():
        sentiment = get_sentiment_name(label)
        percentage = count / total_comments * 100
        print(f"  {sentiment}: {count} comments ({percentage:.1f}%)")

    # Save pseudo-labeling set; the message reuses output_path so the reported
    # filename can never drift from the file actually written.
    pseudo_labeled_set.to_csv(output_path, index=False)
    print(f"\n✓ Pseudo-labeling set saved as '{output_path}'")

    return pseudo_labeled_set

# Run the updated model over the pseudo-labeling set and keep the annotated result
pseudo_labeled = prepare_pseudo_labeling_set(
    pseudo_set=pseudo_set,
    model_updated=model_updated,
    vectorizer_updated=vectorizer_updated,
)

Preparing pseudo-labeling set...
Pseudo-labeling set size: 915 comments

Pseudo-labeling set statistics:
Total comments: 915

Confidence Distribution:
  medium: 667 comments (72.9%)
  low: 206 comments (22.5%)
  high: 42 comments (4.6%)
  very_high: 0 comments (0.0%)

Predicted Label Distribution:
  Negative: 393 comments (43.0%)
  Neutral: 459 comments (50.2%)
  Positive: 63 comments (6.9%)

✓ Pseudo-labeling set saved as 'pseudo_labeling.csv'


In [43]:
# Keep the identifying columns, expose the model's prediction under the
# 'label' column name, and tag each row with its origin.
pseudo_for_combination = (
    pseudo_labeled[['video_id', 'author', 'comment', 'model_label']]
    .rename(columns={'model_label': 'label'})
    .assign(label_source='pseudo_labeled')
)

final_dataset = pd.concat([combined_df, pseudo_for_combination], ignore_index=True)

# Save before reporting so the "saved" message is only printed once the file
# has actually been written; the path is named once so message and file agree.
FINAL_DATASET_PATH = 'final_labeled_dataset.csv'
final_dataset.to_csv(FINAL_DATASET_PATH, index=False)

print("COMBINATION RESULTS:")
print(f"   - Verified labeled data: {len(combined_df)} comments")
# No confidence threshold is applied above, so every pseudo-label is included;
# the report says so instead of claiming the rows are high-confidence only.
print(f"   - Pseudo-labels (all confidence levels): {len(pseudo_for_combination)} comments")
print(f"   - Combined dataset: {len(final_dataset)} comments")
print(f"Final dataset saved as '{FINAL_DATASET_PATH}'")


COMBINATION RESULTS:
   - Verified labeled data: 3224 comments
   - High-confidence pseudo-labels: 915 comments
   - Combined dataset: 4139 comments
Final dataset saved as 'final_labeled_dataset.csv
