In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, f1_score
import sys
import os

In [3]:
# --- 1. LOAD DATA ---
INPUT_FILE = 'labeled_pairs.csv'
print(f"Loading {INPUT_FILE}...")

# Read the labelled pairs and derive the model input in a single chain:
#   * missing text in either column becomes an empty string, so the
#     concatenation below never produces NaN;
#   * combined_text is "text_1 + single space + text_2".
# assign() evaluates its keyword arguments in order, so combined_text sees
# the already-filled text columns.
df = pd.read_csv(INPUT_FILE).assign(
    text_1=lambda frame: frame['text_1'].fillna(''),
    text_2=lambda frame: frame['text_2'].fillna(''),
    combined_text=lambda frame: frame['text_1'] + " " + frame['text_2'],
)

# Model input (one document per pair) and target column
X, y = df['combined_text'], df['label']

Loading labeled_pairs.csv...


In [4]:
# --- 2. SPLIT DATA ---
print("Splitting data (Stratified)...")
# Hold out 20% of the pairs for testing. Stratifying on y keeps the
# 95/5 class ratio the same in the train and test partitions, and the
# fixed random_state makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

Splitting data (Stratified)...


In [5]:
# --- 3. FEATURE ENGINEERING (Standard TF-IDF) ---
print("Vectorizing text (Standard Tokenizer)...")

# No custom 'tokenizer' is supplied, so the vectorizer falls back to its
# fast built-in tokenization.
vectorizer_options = {
    'max_features': 10000,      # cap on the vocabulary size
    'ngram_range': (1, 2),      # unigrams and bigrams
    'stop_words': 'english',    # drop common words like 'the', 'is'
}
tfidf = TfidfVectorizer(**vectorizer_options)

# Learn vocabulary and IDF weights from the training split only, then
# apply that same fitted transform to the test split.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(f"Feature matrix shape: {X_train_tfidf.shape}")

Vectorizing text (Standard Tokenizer)...
Feature matrix shape: (264355, 10000)


In [6]:
# --- 4. TRAIN MODEL ---
print("Training Logistic Regression (Balanced)...")
# class_weight='balanced' is the most important parameter here: it weights
# each class inversely to its frequency so the rare positive class is not
# drowned out by the majority class.
# max_iter is raised to 1000 to give the solver more room to converge.
# n_jobs is intentionally not passed: it only parallelises across classes
# in one-vs-rest multiclass fits, so it does nothing for this binary task,
# and newer scikit-learn releases warn that the parameter is deprecated.
model = LogisticRegression(class_weight='balanced', max_iter=1000)
model.fit(X_train_tfidf, y_train)

Training Logistic Regression (Balanced)...


LogisticRegression(class_weight='balanced', max_iter=1000, n_jobs=-1)

In [7]:
# --- 5. EVALUATE ---
# Print the section banner, then produce hard class predictions for the
# held-out test features; y_pred is consumed by the metrics cell below.
print("\n--- BASELINE RESULTS ---")
y_pred = model.predict(X_test_tfidf)


--- BASELINE RESULTS ---


In [8]:
# Compute every metric up front, then print them in one place.
# Class 1 (Moved) is the class we care most about, so its F1 is the
# headline number.
f1 = f1_score(y_test, y_pred, pos_label=1)
report_text = classification_report(y_test, y_pred)
# Rows are true labels, columns are predicted labels.
confusion = confusion_matrix(y_test, y_pred)

print(f"F1-Score (Moved): {f1:.4f}")
print("\nClassification Report:", report_text, sep="\n")
print("Confusion Matrix:", confusion, sep="\n")

F1-Score (Moved): 0.1198

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.68      0.79     62806
           1       0.07      0.46      0.12      3283

    accuracy                           0.67     66089
   macro avg       0.51      0.57      0.46     66089
weighted avg       0.92      0.67      0.76     66089

Confusion Matrix:
[[42508 20298]
 [ 1780  1503]]
