# Imports & Installs


In [None]:
# Mount Google Drive at /content/drive so files stored there can be read (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# One-time setup on a fresh Colab runtime: uncomment the next line to install gensim.
# %pip install -q gensim==4.4.0

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m68.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [None]:
import pandas as pd
import numpy as np
import time
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
from gensim.models import KeyedVectors
import warnings
import gensim.downloader as api
warnings.filterwarnings('ignore')

# Data Prep

In [None]:
# Load the preprocessed dataset from the mounted Drive into a DataFrame.
# NOTE(review): absolute Colab path — the notebook only runs where this Drive layout exists.
path="/content/drive/MyDrive/Grad Project/preprocessed_2.csv"
data = pd.read_csv(path)

In [None]:
# Count rows whose "lemmas" value repeats an earlier row.
print("Duplicates:", data.duplicated(subset=["lemmas"]).sum())


Duplicates: 26


In [None]:
# Keep one row per distinct "lemmas" value, renumber the rows from 0,
# then re-count to confirm no duplicates remain.
data = data.drop_duplicates(subset=["lemmas"], ignore_index=True)
print("Duplicates:", data.duplicated(subset=["lemmas"]).sum())

Duplicates: 0


In [None]:
# Show (rows, columns) of the current DataFrame.
# NOTE(review): the saved output below reports 2452 rows, while later saved outputs show
# 18140 train / 4536 validation samples — the stored outputs come from different runs;
# restart and run all before sharing.
print(data.shape)

(2452, 3)


In [None]:
# Shuffle every row with a fixed seed, renumber the rows from 0, and preview the first ten.
data = data.sample(frac=1, random_state=42, ignore_index=True)
data.head(10)

Unnamed: 0,claim_status,phase3_cleaned,lemmas
0,opinion,my colleagues hypothesis is that a banana is ...,my colleague hypothesis be that a banana be ...
1,Supported claim,columbia university emphasized that stated tha...,columbia university emphasize that state that ...
2,Supported claim,based on a post from reuters guatemala city gu...,base on a post from reuter guatemala city guat...
3,Supported claim,world bank emphasized that stated that inflati...,world bank emphasize that state that inflation...
4,claim,everyone is talking about how some ants can li...,everyone be talk about how some ant can lift u...
5,claim,i discovered on the radio that the queens in a...,I discover on the radio that the queen in ant ...
6,Supported claim,in its latest report united nations un reve...,in its late report united nations un rev...
7,claim,a friend discovered on the news a claim that d...,a friend discover on the news a claim that dam...
8,Supported claim,cleveland clinic stated that reported that ren...,cleveland clinic state that report that renewa...
9,claim,somalia s islamist al shabaab insurgency shot ...,somalia s islamist al shabaab insurgency shoot...


In [None]:
# Features are the cleaned text column; targets are the claim-status labels.
# NOTE(review): duplicates were removed on "lemmas" but the features come from
# "phase3_cleaned" — confirm this mismatch is intended.
X = data['phase3_cleaned']
y= data['claim_status']

# Word2Vec

In [None]:
# Hold out 20% of the rows for validation, keeping the label proportions equal
# in both splits; the seed makes the split repeatable.
X_train_text, X_val_text, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:

# Load the pretrained "word2vec-google-news-300" vectors through gensim's downloader.
word2vec_model = api.load("word2vec-google-news-300")



In [None]:
# Short alias used by the embedding and GUI cells.
word2vec = word2vec_model
# Report how many keys the loaded model contains.
print(f"Vocabulary size: {len(word2vec_model.index_to_key):,} words")

Vocabulary size: 3,000,000 words


In [None]:
def get_sentence_vector(sentence, model, vector_size=300):
    """
    Convert a sentence into a fixed-size embedding by averaging word vectors.

    Args:
        sentence: Text to embed; it is split on whitespace. Anything that is
            not a string (for example the NaN pandas produces for an empty
            CSV cell, or None) is treated as having no tokens instead of
            raising AttributeError on ``.split()``.
        model: Word-vector lookup supporting ``word in model`` and
            ``model[list_of_words]``.
        vector_size: Length of the zero vector returned when no token can be
            looked up; it should equal the model's dimensionality.

    Returns:
        A 1-D array: the element-wise mean of the vectors of all tokens found
        in ``model``, or ``np.zeros(vector_size)`` when there are none.
    """
    if not isinstance(sentence, str):
        return np.zeros(vector_size)
    # Words not in the pretrained vocabulary are ignored.
    tokens = [w for w in sentence.split() if w in model]
    if not tokens:
        return np.zeros(vector_size)
    return np.mean(model[tokens], axis=0)

In [None]:
print("\nGenerating Word2Vec sentence embeddings...")
start_embed = time.time()

# Embed every sentence of each split and stack the per-sentence vectors
# into one (n_samples, n_features) matrix.
X_train_vecs = np.stack([
    get_sentence_vector(s, word2vec)
    for s in tqdm(X_train_text, desc="Train Embeddings")
])
X_val_vecs = np.stack([
    get_sentence_vector(s, word2vec)
    for s in tqdm(X_val_text, desc="Validation Embeddings")
])

end_embed = time.time()
# Wall-clock cost of embedding both splits, in seconds.
embedding_time = round(end_embed - start_embed, 2)
print(f" Embeddings generated in {embedding_time} seconds. Shape: {X_train_vecs.shape}")


[INFO] Generating Word2Vec sentence embeddings...


Train Embeddings: 100%|██████████| 18140/18140 [00:01<00:00, 11740.98it/s]
Validation Embeddings: 100%|██████████| 4536/4536 [00:00<00:00, 10701.48it/s]

[INFO] Embeddings generated in 2.03 seconds. Shape: (18140, 300)





In [None]:
print("\nTraining Support Vector Machine (SVM) classifier...")
start_train = time.time()

# fit() returns the estimator itself, so construction and training are chained.
svm_clf_word2vec = SVC(
    kernel='linear',
    class_weight='balanced',
    random_state=42,
).fit(X_train_vecs, y_train)

end_train = time.time()
# Wall-clock training cost, in seconds.
training_time = round(end_train - start_train, 2)
print(f"Training completed in {training_time} seconds.")


Training Support Vector Machine (SVM) classifier...
Training completed in 13.16 seconds.


In [None]:
# Time only the classifier's predict() call on the validation embeddings;
# the cost of building those embeddings is tracked separately in embedding_time.
start_pred = time.time()
y_pred = svm_clf_word2vec.predict(X_val_vecs)
end_pred = time.time()
inference_time = round(end_pred - start_pred, 2)

In [None]:
# Per-class precision/recall/F1 on the validation split, to three decimals.
print("\nClassification Report for Word2Vec Embeddings:")
print(classification_report(y_val, y_pred, digits=3))

# Metrics summary: overall accuracy plus weighted-average precision/recall/F1,
# kept for the comparison table.
accuracy = accuracy_score(y_val, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_val, y_pred, average='weighted')


Classification Report for Word2Vec Embeddings:
                 precision    recall  f1-score   support

Supported claim      1.000     1.000     1.000       880
          claim      0.977     0.994     0.985      1681
        opinion      0.995     0.980     0.987      1975

       accuracy                          0.989      4536
      macro avg      0.990     0.991     0.991      4536
   weighted avg      0.989     0.989     0.989      4536



In [None]:
# Start the comparison table with one row for the Word2Vec run.
# Metrics are rounded to three decimals; the timings were already rounded when measured.
results = pd.DataFrame([{
    "Embedding": "Word2Vec (GoogleNews)",
    "Accuracy": round(accuracy, 3),
    "Precision": round(precision, 3),
    "Recall": round(recall, 3),
    "F1-Score": round(f1, 3),
    "Embedding Time (s)": embedding_time,
    "Training Time (s)": training_time,
    "Inference Time (s)": inference_time
}])

results

Unnamed: 0,Embedding,Accuracy,Precision,Recall,F1-Score,Embedding Time (s),Training Time (s),Inference Time (s)
0,Word2Vec (GoogleNews),0.989,0.989,0.989,0.989,2.03,13.16,1.26


# FastText

In [None]:
# Load the pretrained "fasttext-wiki-news-subwords-300" vectors through gensim's downloader.
fasttext = api.load("fasttext-wiki-news-subwords-300")

# Report how many keys the loaded model contains.
print(f" Vocabulary size: {len(fasttext.index_to_key):,} words")

 Vocabulary size: 999,999 words


In [None]:
def get_sentence_vector(sentence, model, vector_size=300):
    """
    Convert a sentence into a fixed-size embedding by averaging word vectors.

    Tokens for which ``w in model`` is false are skipped, so they contribute
    nothing to the average.
    NOTE(review): an earlier docstring said FastText's subword information
    handles out-of-vocabulary words; that only holds if ``model`` reports
    every word as present. Confirm which vector class the loader returns.

    Args:
        sentence: Text to embed; it is split on whitespace. Anything that is
            not a string (for example the NaN pandas produces for an empty
            CSV cell, or None) is treated as having no tokens instead of
            raising AttributeError on ``.split()``.
        model: Word-vector lookup supporting ``word in model`` and
            ``model[word]``.
        vector_size: Length of the zero vector returned when no token can be
            looked up; it should equal the model's dimensionality.

    Returns:
        A 1-D array: the element-wise mean of the vectors of all tokens found
        in ``model``, or ``np.zeros(vector_size)`` when there are none.
    """
    if not isinstance(sentence, str):
        return np.zeros(vector_size)
    tokens = sentence.split()
    word_vectors = [model[w] for w in tokens if w in model]
    if not word_vectors:
        return np.zeros(vector_size)
    return np.mean(word_vectors, axis=0)


In [None]:
print("\n[INFO] Generating FastText sentence embeddings...")
start_embed = time.time()

# Embed every sentence of each split and stack the per-sentence vectors
# into one (n_samples, n_features) matrix.
X_train_vecs = np.stack([
    get_sentence_vector(s, fasttext)
    for s in tqdm(X_train_text, desc="Train Embeddings")
])
X_val_vecs = np.stack([
    get_sentence_vector(s, fasttext)
    for s in tqdm(X_val_text, desc="Validation Embeddings")
])

end_embed = time.time()
# Wall-clock cost of embedding both splits, in seconds.
embedding_time = round(end_embed - start_embed, 2)
print(f"[INFO] Embeddings generated in {embedding_time} seconds. Shape: {X_train_vecs.shape}")


[INFO] Generating FastText sentence embeddings...


Train Embeddings: 100%|██████████| 18140/18140 [00:01<00:00, 11151.31it/s]
Validation Embeddings: 100%|██████████| 4536/4536 [00:00<00:00, 10509.59it/s]


[INFO] Embeddings generated in 2.13 seconds. Shape: (18140, 300)


In [None]:
print("\n[INFO] Training Support Vector Machine (SVM) classifier...")
start_train = time.time()

# fit() returns the estimator itself, so construction and training are chained.
svm_clf_fasttext = SVC(
    kernel='linear',
    class_weight='balanced',
    random_state=42,
).fit(X_train_vecs, y_train)

end_train = time.time()
# Wall-clock training cost, in seconds.
training_time = round(end_train - start_train, 2)
print(f"[INFO] Training completed in {training_time} seconds.")


[INFO] Training Support Vector Machine (SVM) classifier...
[INFO] Training completed in 30.96 seconds.


In [None]:
# Time only the classifier's predict() call on the validation embeddings;
# the cost of building those embeddings is tracked separately in embedding_time.
start_pred = time.time()
y_pred = svm_clf_fasttext.predict(X_val_vecs)
end_pred = time.time()
inference_time = round(end_pred - start_pred, 2)

In [None]:
# Per-class precision/recall/F1 on the validation split, to three decimals.
print("\n[RESULTS] Classification Report for FastText Embeddings:")
print(classification_report(y_val, y_pred, digits=3))


[RESULTS] Classification Report for FastText Embeddings:
                 precision    recall  f1-score   support

Supported claim      0.999     1.000     0.999       880
          claim      0.943     0.990     0.966      1681
        opinion      0.992     0.949     0.970      1975

       accuracy                          0.974      4536
      macro avg      0.978     0.980     0.979      4536
   weighted avg      0.975     0.974     0.974      4536



In [None]:
# Overall accuracy plus weighted-average precision/recall/F1 for the FastText run.
accuracy = accuracy_score(y_val, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_val, y_pred, average='weighted')

# One-row frame in the same column layout as the comparison table.
new_result = pd.DataFrame([{
    "Embedding": "FastText (Wiki-News via API)",
    "Accuracy": round(accuracy, 3),
    "Precision": round(precision, 3),
    "Recall": round(recall, 3),
    "F1-Score": round(f1, 3),
    "Embedding Time (s)": embedding_time,
    "Training Time (s)": training_time,
    "Inference Time (s)": inference_time
}])


In [None]:
# Append the new row to the comparison table. Any row already carrying the same
# "Embedding" label is dropped first, so re-running this cell replaces the entry
# instead of adding a duplicate.
results = pd.concat(
    [results[~results["Embedding"].isin(new_result["Embedding"])], new_result],
    ignore_index=True,
)
results

Unnamed: 0,Embedding,Accuracy,Precision,Recall,F1-Score,Embedding Time (s),Training Time (s),Inference Time (s)
0,Word2Vec (GoogleNews),0.989,0.989,0.989,0.989,2.03,13.16,1.26
1,FastText (Wiki-News via API),0.974,0.975,0.974,0.974,2.13,30.96,4.56


# Transformers

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
# Load the pretrained 'all-MiniLM-L6-v2' sentence encoder.
sbert = SentenceTransformer('all-MiniLM-L6-v2')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
print("\n[INFO] Generating SBERT sentence embeddings...")
start_embed = time.time()

# Each split is converted to a plain list and encoded in batches of 32.
X_train_vecs = sbert.encode(
    X_train_text.tolist(),
    batch_size=32,
    show_progress_bar=True,
)
X_val_vecs = sbert.encode(
    X_val_text.tolist(),
    batch_size=32,
    show_progress_bar=True,
)

end_embed = time.time()
# Wall-clock cost of embedding both splits, in seconds.
embedding_time = round(end_embed - start_embed, 2)
print(f"[INFO] Embeddings generated in {embedding_time} seconds. Shape: {X_train_vecs.shape}")


[INFO] Generating SBERT sentence embeddings...


Batches:   0%|          | 0/567 [00:00<?, ?it/s]

Batches:   0%|          | 0/142 [00:00<?, ?it/s]

[INFO] Embeddings generated in 227.18 seconds. Shape: (18140, 384)


In [None]:
print("\n[INFO] Training Support Vector Machine (SVM) classifier...")
start_train = time.time()

# fit() returns the estimator itself, so construction and training are chained.
svm_clf_sbert = SVC(
    kernel='linear',
    class_weight='balanced',
    random_state=42,
).fit(X_train_vecs, y_train)

end_train = time.time()
# Wall-clock training cost, in seconds.
training_time = round(end_train - start_train, 2)
print(f"[INFO] Training completed in {training_time} seconds.")



[INFO] Training Support Vector Machine (SVM) classifier...
[INFO] Training completed in 24.34 seconds.


In [None]:
# Time only the classifier's predict() call on the validation embeddings;
# the cost of building those embeddings is tracked separately in embedding_time.
start_pred = time.time()
y_pred = svm_clf_sbert.predict(X_val_vecs)
end_pred = time.time()
inference_time = round(end_pred - start_pred, 2)


In [None]:
# Per-class precision/recall/F1 on the validation split, to three decimals.
print("\n[RESULTS] Classification Report for Sentence Transformer Embeddings:")
print(classification_report(y_val, y_pred, digits=3))



[RESULTS] Classification Report for Sentence Transformer Embeddings:
                 precision    recall  f1-score   support

Supported claim      1.000     1.000     1.000       880
          claim      0.981     0.979     0.980      1681
        opinion      0.982     0.984     0.983      1975

       accuracy                          0.985      4536
      macro avg      0.988     0.988     0.988      4536
   weighted avg      0.985     0.985     0.985      4536



In [None]:
# Overall accuracy plus weighted-average precision/recall/F1 for the SBERT run.
accuracy = accuracy_score(y_val, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(y_val, y_pred, average='weighted')

# One-row frame in the same column layout as the comparison table.
new_result = pd.DataFrame([{
    "Embedding": "Sentence Transformers (all-MiniLM-L6-v2)",
    "Accuracy": round(accuracy, 3),
    "Precision": round(precision, 3),
    "Recall": round(recall, 3),
    "F1-Score": round(f1, 3),
    "Embedding Time (s)": embedding_time,
    "Training Time (s)": training_time,
    "Inference Time (s)": inference_time
}])

# Append the new row. Any row already carrying the same "Embedding" label is
# dropped first, so re-running this cell replaces the entry instead of adding
# a duplicate.
results = pd.concat(
    [results[~results["Embedding"].isin(new_result["Embedding"])], new_result],
    ignore_index=True,
)

In [None]:
results

Unnamed: 0,Embedding,Accuracy,Precision,Recall,F1-Score,Embedding Time (s),Training Time (s),Inference Time (s)
0,Word2Vec (GoogleNews),0.989,0.989,0.989,0.989,2.03,13.16,1.26
1,FastText (Wiki-News via API),0.974,0.975,0.974,0.974,2.13,30.96,4.56
2,Sentence Transformers (all-MiniLM-L6-v2),0.985,0.985,0.985,0.985,227.18,24.34,1.88


Reference: Sentence-Transformers documentation — https://sbert.net/

# GUI

In [None]:
# Turn on Colab's custom widget manager before any widgets are displayed (Colab only).
from google.colab import output
output.enable_custom_widget_manager()


In [None]:
import ipywidgets as widgets
from IPython.display import display, clear_output
import numpy as np

In [None]:
# Short aliases for the three trained SVMs; classify_text() looks them up by these names.
clf_w2v = svm_clf_word2vec
clf_ft = svm_clf_fasttext
clf_sbert = svm_clf_sbert

In [None]:
def get_sentence_vector(sentence, model, vector_size=300):
    """
    Convert a sentence into a fixed-size embedding by averaging word vectors.

    Args:
        sentence: Text to embed; it is split on whitespace. Anything that is
            not a string (for example None or NaN) is treated as having no
            tokens instead of raising AttributeError on ``.split()``.
        model: Word-vector lookup supporting ``word in model`` and
            ``model[list_of_words]``.
        vector_size: Length of the zero vector returned when no token can be
            looked up; it should equal the model's dimensionality.

    Returns:
        A 1-D array: the element-wise mean of the vectors of all tokens found
        in ``model``, or ``np.zeros(vector_size)`` when there are none.
    """
    if not isinstance(sentence, str):
        return np.zeros(vector_size)
    tokens = sentence.split()
    # Tokens the model does not know are skipped.
    words = [w for w in tokens if w in model]
    if not words:
        return np.zeros(vector_size)
    return np.mean(model[words], axis=0)

def embed_text(text, model_name):
    """
    Embed a single text with the embedding model selected by name.

    Args:
        text: The text to embed.
        model_name: One of "Word2Vec", "FastText" or "Sentence Transformers".

    Returns:
        The embedding vector produced by the selected model.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
            Without this the function would fall through and return None.
    """
    if model_name == "Word2Vec":
        return get_sentence_vector(text, word2vec)
    if model_name == "FastText":
        return get_sentence_vector(text, fasttext)
    if model_name == "Sentence Transformers":
        # encode() is given a one-item batch; take the single resulting vector.
        return sbert.encode([text])[0]
    raise ValueError(f"Unknown model_name: {model_name!r}")

def classify_text(text, model_name):
    """
    Predict the category of a text with the classifier paired to an embedding model.

    Args:
        text: The text to classify.
        model_name: One of "Word2Vec", "FastText" or "Sentence Transformers".

    Returns:
        The first (and only) label returned by the selected classifier's
        ``predict`` for this text.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
            Without this the function would fall through and return None.
    """
    if model_name == "Word2Vec":
        clf = clf_w2v
    elif model_name == "FastText":
        clf = clf_ft
    elif model_name == "Sentence Transformers":
        clf = clf_sbert
    else:
        raise ValueError(f"Unknown model_name: {model_name!r}")
    # predict() is called on a 2-D array, so the single vector becomes one row.
    vec = embed_text(text, model_name).reshape(1, -1)
    return clf.predict(vec)[0]


In [None]:
# Widgets
# Free-text box where the transcript to classify is pasted.
text_input = widgets.Textarea(
    value="",
    placeholder="Paste TikTok transcript here...",
    description="Transcript:",
    layout=widgets.Layout(width='100%', height='150px')
)

# Picker for the embedding/classifier pair; the option strings are the
# model names that classify_text() compares against.
model_selector = widgets.Dropdown(
    options=['Sentence Transformers', 'FastText', 'Word2Vec'],
    value='Sentence Transformers',
    description='Model:'
)

# Button that triggers a classification when clicked.
button = widgets.Button(
    description='Classify',
    button_style='success',
    icon='check'
)

# Output panel that the click handler prints into.
output_area = widgets.Output()

# Button logic
def on_button_click(b):
    """
    Classify the transcript currently in the text box and print the result.

    Everything is printed inside ``output_area``, which is cleared first.
    A blank transcript produces a warning instead of a prediction.
    ``b`` is the clicked button and is not used.
    """
    with output_area:
        clear_output()
        model_name = model_selector.value
        transcript = text_input.value.strip()
        if transcript:
            print(f"🔍 Running {model_name} model...")
            label = classify_text(transcript, model_name)
            print(f"\n✅ Predicted Category: **{label.upper()}**")
        else:
            print("⚠️ Please enter some text to classify.")

# Register the handler so each click on the button runs on_button_click.
button.on_click(on_button_click)


# Show the text box, model picker, button and output panel.
display(text_input, model_selector, button, output_area)


Textarea(value='', description='Transcript:', layout=Layout(height='150px', width='100%'), placeholder='Paste …

Dropdown(description='Model:', options=('Sentence Transformers', 'FastText', 'Word2Vec'), value='Sentence Tran…

Button(button_style='success', description='Classify', icon='check', style=ButtonStyle())

Output()